corefile uploader: Updates per review comments offline (#3915)

* Updates per review comments:
  1) The core_uploader service waits for syslog.service.
  2) The core_uploader service is enabled to restart on failure.
  3) Use mtime instead of file size, plus ample wait time, to be robust.
* Avoid re-uploading an already uploaded file by marking its name with a prefix.
* Updated the failure path:
  1) If the rc file is missing, or required data in it is missing, the daemon periodically logs an error in a forever loop.
  2) If an upload fails, it retries every hour, with an error log, forever.
* Fixed a few bugs.
* The binary update_json.py will come from sonic-utilities.
This commit is contained in:
parent
6db0c76a06
commit
da7db51259
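
Of the failure-path changes above, the forever retry loop is visible in the upload hunk below, but the rc-file handling is not. A minimal sketch of the described behavior, assuming the loop shape (the rc path and PAUSE_ON_FAIL value are from this commit):

    # Hedged sketch, not the daemon's literal code: if the rc file is absent
    # (or lacks the required account data), log an error and re-check forever
    # instead of exiting.
    import os
    import time
    import syslog

    RC_FILE = "/etc/sonic/core_analyzer.rc.json"
    PAUSE_ON_FAIL = (60 * 60)

    while not os.path.exists(RC_FILE):
        syslog.syslog(syslog.LOG_ERR, "rc file missing: " + RC_FILE)
        time.sleep(PAUSE_ON_FAIL)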
@@ -196,7 +196,6 @@ sudo cp $IMAGE_CONFIGS/hostcfgd/*.j2 $FILESYSTEM_ROOT_USR_SHARE_SONIC_TEMPLATES/
 sudo cp $IMAGE_CONFIGS/corefile_uploader/core_uploader.service $FILESYSTEM_ROOT/etc/systemd/system/
 sudo LANG=C chroot $FILESYSTEM_ROOT systemctl disable core_uploader.service
 sudo cp $IMAGE_CONFIGS/corefile_uploader/core_uploader.py $FILESYSTEM_ROOT/usr/bin/
-sudo cp $IMAGE_CONFIGS/corefile_uploader/update_json.py $FILESYSTEM_ROOT/usr/bin/
 sudo cp $IMAGE_CONFIGS/corefile_uploader/core_analyzer.rc.json $FILESYSTEM_ROOT_ETC_SONIC/
 sudo chmod og-rw $FILESYSTEM_ROOT_ETC_SONIC/core_analyzer.rc.json
 sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install azure-storage
@@ -36,7 +36,11 @@ cwd = []

 HOURS_4 = (4 * 60 * 60)
 PAUSE_ON_FAIL = (60 * 60)
+WAIT_FILE_WRITE1 = (10 * 60)
+WAIT_FILE_WRITE2 = (5 * 60)
+POLL_SLEEP = (60 * 60)
 MAX_RETRIES = 5
+UPLOAD_PREFIX = "UPLOADED_"

 log_level = syslog.LOG_DEBUG
@@ -116,7 +120,7 @@ class Watcher:
         self.observer.start()
         try:
             while True:
-                time.sleep(5)
+                time.sleep(POLL_SLEEP)
         except:
             self.observer.stop()
             log_err("Error in watcher")
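
For context, a minimal self-contained sketch of the watchdog wiring this loop lives in (the handler body and watch directory are assumptions; only POLL_SLEEP is from this diff). Events arrive on the observer's own thread, so the main thread can sleep in long naps instead of the old 5-second spin:

    import time
    from watchdog.observers import Observer
    from watchdog.events import FileSystemEventHandler

    POLL_SLEEP = (60 * 60)
    CORE_FILE_PATH = "/var/core/"   # assumed watch directory

    class SketchHandler(FileSystemEventHandler):
        def on_created(self, event):
            # The real daemon waits for the dump to finish, tars, and uploads
            print("new core file:", event.src_path)

    observer = Observer()
    observer.schedule(SketchHandler(), CORE_FILE_PATH, recursive=False)
    observer.start()
    try:
        while True:
            time.sleep(POLL_SLEEP)   # main thread only needs to stay alive
    except KeyboardInterrupt:
        observer.stop()
    observer.join()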
@@ -179,29 +183,33 @@ class Handler(FileSystemEventHandler):
         elif event.event_type == 'created':
             # Take any action here when a file is first created.
             log_debug("Received create event - " + event.src_path)
-            Handler.wait_for_file_write_complete(event.src_path)
             Handler.handle_file(event.src_path)


     @staticmethod
     def wait_for_file_write_complete(path):
-        ct_size = -1
+        mtime = 0

-        while ct_size != os.path.getsize(path):
-            ct_size = os.path.getsize(path)
-            time.sleep(2)
+        # Sleep for ample time enough for file dump to complete.
+        time.sleep(WAIT_FILE_WRITE1)

-        time.sleep(2)
-        if ct_size != os.path.getsize(path):
+        # Give another chance & poll until mtime stabilizes
+        while mtime != os.stat(path).st_mtime:
+            mtime = os.stat(path).st_mtime
+            time.sleep(10)
+
+        # A safety pause for double confirmation
+        time.sleep(WAIT_FILE_WRITE2)
+        if mtime != os.stat(path).st_mtime:
             raise Exception("Dump file creation is too slow: " + path)
             # Give up as something is terribly wrong with this file.

         log_debug("File write complete - " + path)


     @staticmethod
     def handle_file(path):
+        Handler.wait_for_file_write_complete(path)
+
         lpath = "/".join(cwd)
         make_new_dir(lpath)
         os.chdir(lpath)
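
An illustration (not from the commit) of why the review asked for mtime plus ample time: a writer that pauses longer than the old 2-second size poll fools the size check, while a stabilized-mtime check with a long grace period does not:

    import os
    import time
    import threading

    def slow_writer(path):
        # Simulates a kernel dump that stalls mid-write
        with open(path, "w") as f:
            for i in range(3):
                f.write("chunk %d\n" % i)
                f.flush()
                time.sleep(5)          # longer than the old 2-second poll

    path = "/tmp/fake.core"
    threading.Thread(target=slow_writer, args=(path,)).start()
    time.sleep(1)

    # Old approach: size looks stable while the writer is merely pausing.
    s1 = os.path.getsize(path)
    time.sleep(2)
    s2 = os.path.getsize(path)
    print("old size check says complete:", s1 == s2)   # True, yet unfinished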
@@ -221,18 +229,18 @@ class Handler(FileSystemEventHandler):
             tar.close()
             log_debug("Tar file for upload created: " + tarf_name)

-            Handler.upload_file(tarf_name, tarf_name)
+            Handler.upload_file(tarf_name, tarf_name, path)

             log_debug("File uploaded - " + path)
             os.chdir(INIT_CWD)

     @staticmethod
-    def upload_file(fname, fpath):
+    def upload_file(fname, fpath, coref):
         daemonname = fname.split(".")[0]
         i = 0
-        fail_msg = ""
+
-        while i <= MAX_RETRIES:
+        while True:
             try:
                 svc = FileService(account_name=acctname, account_key=acctkey)
@@ -246,14 +254,15 @@ class Handler(FileSystemEventHandler):
                 svc.create_file_from_path(sharename, "/".join(l), fname, fpath)
                 log_debug("Remote file created: name{} path{}".format(fname, fpath))
+                newcoref = os.path.dirname(coref) + "/" + UPLOAD_PREFIX + os.path.basename(coref)
+                os.rename(coref, newcoref)
                 break

-            except Exception as e:
-                log_err("core uploader failed: Failed during upload (" + str(e) +")")
-                fail_msg = str(e)
+            except Exception as ex:
+                log_err("core uploader failed: Failed during upload (" + coref + ") err: ("+ str(ex) +") retry:" + str(i))
+                if not os.path.exists(fpath):
+                    break
                 i += 1
-                if i >= MAX_RETRIES:
-                    raise Exception("Failed while uploading. msg(" + fail_msg + ") after " + str(i) + " retries")
                 time.sleep(PAUSE_ON_FAIL)
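
For reference, the azure-storage calls used above, as a standalone hedged sketch; the account, key, share, and paths are placeholders, and the package is the legacy azure-storage installed by the build script:

    from azure.storage.file import FileService

    # Placeholder credentials; the real values come from core_analyzer.rc.json
    svc = FileService(account_name="myaccount", account_key="bXlrZXk=")

    # Ensure the destination directory exists, then push the local tarball;
    # signature: create_file_from_path(share_name, directory_name, file_name, local_path)
    svc.create_directory("myshare", "corefiles")
    svc.create_file_from_path("myshare", "corefiles", "bgp.core.tar.gz",
                              "/var/core/bgp.core.tar.gz")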
@@ -261,7 +270,7 @@ class Handler(FileSystemEventHandler):
     def scan():
         for e in os.listdir(CORE_FILE_PATH):
             fl = CORE_FILE_PATH + e
-            if os.path.isfile(fl):
+            if os.path.isfile(fl) and not e.startswith(UPLOAD_PREFIX):
                 Handler.handle_file(fl)
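
A sketch of how the UPLOADED_ prefix makes rescans idempotent (CORE_FILE_PATH's value is an assumption): upload_file renames the core on success, so scan() skips it from then on:

    import os

    UPLOAD_PREFIX = "UPLOADED_"
    CORE_FILE_PATH = "/var/core/"   # assumed core-dump directory

    def mark_uploaded(coref):
        # The rename upload_file performs after a successful upload
        newcoref = os.path.dirname(coref) + "/" + UPLOAD_PREFIX + os.path.basename(coref)
        os.rename(coref, newcoref)

    def pending_cores():
        # Exactly the files scan() would still hand to Handler.handle_file
        return [CORE_FILE_PATH + e for e in os.listdir(CORE_FILE_PATH)
                if os.path.isfile(CORE_FILE_PATH + e)
                and not e.startswith(UPLOAD_PREFIX)]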
@@ -1,11 +1,13 @@
 [Unit]
 Description=Host core file uploader daemon
 Requires=updategraph.service
 After=updategraph.service
+Requires=syslog.service
+After=syslog.service

 [Service]
 Type=simple
 ExecStart=/usr/bin/core_uploader.py
 StandardOutput=null
+Restart=on-failure

 [Install]
 WantedBy=multi-user.target
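
Ordering the unit after syslog.service matters here because syslog is effectively the daemon's only output channel (StandardOutput=null, and core_uploader.py logs via the syslog module), and Restart=on-failure lets systemd revive the uploader should its forever loop itself die.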
@@ -1,55 +0,0 @@
-#! /usr/bin/env python
-
-import os
-import sys
-import json
-import argparse
-
-TMP_SUFFIX = ".tmp"
-BAK_SUFFIX = ".bak"
-
-def dict_update(dst, patch):
-    for k in patch.keys():
-        if type(patch[k]) == dict:
-            dst[k] = dict_update(dst[k], patch[k])
-        else:
-            dst[k] = patch[k]
-    return dst
-
-def do_update(rcf, patchf):
-    dst = {}
-    patch = {}
-
-    tmpf = rcf + TMP_SUFFIX
-    bakf = rcf + BAK_SUFFIX
-
-    with open(rcf, "r") as f:
-        dst = json.load(f)
-
-    with open(patchf, "r") as f:
-        patch = json.load(f)
-
-    dst = dict_update(dst, patch)
-
-    with open(tmpf, "w") as f:
-        json.dump(dst, f, indent = 4)
-
-    os.rename(rcf, bakf)
-    os.rename(tmpf, rcf)
-
-
-def main():
-    parser=argparse.ArgumentParser(description="Update JSON based file")
-    parser.add_argument("-r", "--rc", help="JSON file to be updated")
-    parser.add_argument("-p", "--patch", help="JSON file holding patch")
-    args = parser.parse_args()
-
-    if not args.rc or not args.patch:
-        raise Exception("check usage")
-
-    do_update(args.rc, args.patch)
-
-if __name__ == '__main__':
-    main()
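
Although update_json.py now ships from sonic-utilities, the merge semantics of the removed dict_update are worth recording; a runnable illustration with made-up rc contents:

    def dict_update(dst, patch):
        # Recursive merge: dict values recurse, scalars overwrite. Note that a
        # dict-valued patch key must already exist in dst, else dst[k] raises
        # KeyError.
        for k in patch.keys():
            if type(patch[k]) == dict:
                dst[k] = dict_update(dst[k], patch[k])
            else:
                dst[k] = patch[k]
        return dst

    rc = {"azure": {"account_name": "", "account_key": ""}, "log_level": "DEBUG"}
    patch = {"azure": {"account_key": "SECRET"}}
    print(dict_update(rc, patch))
    # {'azure': {'account_name': '', 'account_key': 'SECRET'}, 'log_level': 'DEBUG'}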