corefile uploader: Updates per review comments offline (#3915)

* Updates per review comments
1) core_uploader service waits for syslog.service
2) core_uploader service is enabled to restart on failure
3) Use mtime instead of file size, plus an ample wait, to be robust.

* Avoid re-uploading an already uploaded file by marking its name with a prefix.

* Updated failure path.
1) If the rc file is missing or required data is missing, it periodically logs an error in a forever loop.
2) If an upload fails, retry every hour with an error log, forever.

* Fix a few bugs

* The update_json.py binary will come from sonic-utilities.
Renuka Manavalan 2019-12-30 13:01:03 -08:00 committed by Ying Xie
parent 6db0c76a06
commit da7db51259
4 changed files with 32 additions and 77 deletions

files/build_templates/sonic_debian_extension.j2

@@ -196,7 +196,6 @@ sudo cp $IMAGE_CONFIGS/hostcfgd/*.j2 $FILESYSTEM_ROOT_USR_SHARE_SONIC_TEMPLATES/
sudo cp $IMAGE_CONFIGS/corefile_uploader/core_uploader.service $FILESYSTEM_ROOT/etc/systemd/system/
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl disable core_uploader.service
sudo cp $IMAGE_CONFIGS/corefile_uploader/core_uploader.py $FILESYSTEM_ROOT/usr/bin/
sudo cp $IMAGE_CONFIGS/corefile_uploader/update_json.py $FILESYSTEM_ROOT/usr/bin/
sudo cp $IMAGE_CONFIGS/corefile_uploader/core_analyzer.rc.json $FILESYSTEM_ROOT_ETC_SONIC/
sudo chmod og-rw $FILESYSTEM_ROOT_ETC_SONIC/core_analyzer.rc.json
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install azure-storage
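The line removed above is the install step for update_json.py; per the commit message, that script now comes from sonic-utilities instead of being staged into the image here.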

files/image_config/corefile_uploader/core_uploader.py

@@ -36,7 +36,11 @@ cwd = []
HOURS_4 = (4 * 60 * 60)
PAUSE_ON_FAIL = (60 * 60)
WAIT_FILE_WRITE1 = (10 * 60)
WAIT_FILE_WRITE2 = (5 * 60)
POLL_SLEEP = (60 * 60)
MAX_RETRIES = 5
UPLOAD_PREFIX = "UPLOADED_"
log_level = syslog.LOG_DEBUG
@@ -116,7 +120,7 @@ class Watcher:
self.observer.start()
try:
while True:
time.sleep(5)
time.sleep(POLL_SLEEP)
except:
self.observer.stop()
log_err("Error in watcher")
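The jump from a 5-second to an hourly sleep is safe because this loop only keeps the main thread alive: the watchdog observer delivers events on its own thread, so the handler fires immediately regardless of POLL_SLEEP. A minimal sketch of the pattern, assuming the watchdog package; CoreHandler and the /var/core path are illustrative:

    import time

    from watchdog.events import FileSystemEventHandler
    from watchdog.observers import Observer

    POLL_SLEEP = (60 * 60)

    class CoreHandler(FileSystemEventHandler):
        def on_created(self, event):
            # Runs on the observer's thread for each newly created file.
            print("created: " + event.src_path)

    observer = Observer()
    observer.schedule(CoreHandler(), "/var/core", recursive=False)
    observer.start()
    try:
        while True:
            time.sleep(POLL_SLEEP)  # idles the main thread only
    except KeyboardInterrupt:
        observer.stop()
    observer.join()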
@@ -179,29 +183,33 @@ class Handler(FileSystemEventHandler):
elif event.event_type == 'created':
# Take any action here when a file is first created.
log_debug("Received create event - " + event.src_path)
Handler.wait_for_file_write_complete(event.src_path)
Handler.handle_file(event.src_path)
@staticmethod
def wait_for_file_write_complete(path):
ct_size = -1
mtime = 0
while ct_size != os.path.getsize(path):
ct_size = os.path.getsize(path)
time.sleep(2)
# Sleep for ample time enough for file dump to complete.
time.sleep(WAIT_FILE_WRITE1)
time.sleep(2)
if ct_size != os.path.getsize(path):
# Give another chance & poll until mtime stabilizes
while mtime != os.stat(path).st_mtime:
mtime = os.stat(path).st_mtime
time.sleep(10)
# A safety pause for double confirmation
time.sleep(WAIT_FILE_WRITE2)
if mtime != os.stat(path).st_mtime:
raise Exception("Dump file creation is too slow: " + path)
# Give up as something is terribly wrong with this file.
log_debug("File write complete - " + path)
@staticmethod
def handle_file(path):
Handler.wait_for_file_write_complete(path)
lpath = "/".join(cwd)
make_new_dir(lpath)
os.chdir(lpath)
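Per review item 3, write-completion detection now keys off mtime stabilization instead of size polling: sleep a generous fixed period, poll until st_mtime stops changing, then confirm once more after a final pause. The scheme in isolation as a runnable sketch, with constants mirroring the diff:

    import os
    import time

    WAIT_FILE_WRITE1 = (10 * 60)  # ample head start for the dump to finish
    WAIT_FILE_WRITE2 = (5 * 60)   # safety pause for double confirmation

    def wait_for_file_write_complete(path):
        mtime = 0
        time.sleep(WAIT_FILE_WRITE1)
        # Poll until the modification time stops moving.
        while mtime != os.stat(path).st_mtime:
            mtime = os.stat(path).st_mtime
            time.sleep(10)
        # Re-check once after a pause; a still-changing file is abandoned.
        time.sleep(WAIT_FILE_WRITE2)
        if mtime != os.stat(path).st_mtime:
            raise Exception("Dump file creation is too slow: " + path)

An mtime check also catches in-place rewrites that leave the size unchanged, which a size-only poll can miss.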
@@ -221,18 +229,18 @@
tar.close()
log_debug("Tar file for upload created: " + tarf_name)
Handler.upload_file(tarf_name, tarf_name)
Handler.upload_file(tarf_name, tarf_name, path)
log_debug("File uploaded - " + path)
os.chdir(INIT_CWD)
@staticmethod
def upload_file(fname, fpath):
def upload_file(fname, fpath, coref):
daemonname = fname.split(".")[0]
i = 0
fail_msg = ""
while i <= MAX_RETRIES:
while True:
try:
svc = FileService(account_name=acctname, account_key=acctkey)
@@ -246,14 +254,15 @@
svc.create_file_from_path(sharename, "/".join(l), fname, fpath)
log_debug("Remote file created: name{} path{}".format(fname, fpath))
newcoref = os.path.dirname(coref) + "/" + UPLOAD_PREFIX + os.path.basename(coref)
os.rename(coref, newcoref)
break
except Exception as e:
log_err("core uploader failed: Failed during upload (" + str(e) +")")
fail_msg = str(e)
except Exception as ex:
log_err("core uploader failed: Failed during upload (" + coref + ") err: ("+ str(ex) +") retry:" + str(i))
if not os.path.exists(fpath):
break
i += 1
if i >= MAX_RETRIES:
raise Exception("Failed while uploading. msg(" + fail_msg + ") after " + str(i) + " retries")
time.sleep(PAUSE_ON_FAIL)
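With MAX_RETRIES gone, the loop retries forever, pausing PAUSE_ON_FAIL (an hour) between attempts, and on success renames the core file with UPLOAD_PREFIX so later scans skip it. A condensed sketch of that control flow; do_upload is a placeholder for the FileService.create_file_from_path() transfer:

    import os
    import time

    UPLOAD_PREFIX = "UPLOADED_"
    PAUSE_ON_FAIL = (60 * 60)  # hourly retry, forever

    def upload_with_retry(coref, do_upload):
        # do_upload is a hypothetical callable wrapping the Azure upload.
        while True:
            try:
                do_upload(coref)
                # Mark success by renaming, so this file is never re-sent.
                newcoref = os.path.join(os.path.dirname(coref),
                                        UPLOAD_PREFIX + os.path.basename(coref))
                os.rename(coref, newcoref)
                break
            except Exception as ex:
                # Log and retry hourly; give up only if the file vanished.
                print("upload failed: " + str(ex))
                if not os.path.exists(coref):
                    break
                time.sleep(PAUSE_ON_FAIL)

This matches failure-path item 2 in the commit message: a transient storage outage delays an upload but never permanently strands a core file.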
@@ -261,7 +270,7 @@
def scan():
for e in os.listdir(CORE_FILE_PATH):
fl = CORE_FILE_PATH + e
if os.path.isfile(fl):
if os.path.isfile(fl) and not e.startswith(UPLOAD_PREFIX):
Handler.handle_file(fl)
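Together with the rename in upload_file, the prefix makes this startup scan idempotent: core files uploaded by a previous run are skipped after a daemon restart instead of being sent again.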

files/image_config/corefile_uploader/core_uploader.service

@@ -1,11 +1,13 @@
[Unit]
Description=Host core file uploader daemon
Requires=updategraph.service
After=updategraph.service
Requires=syslog.service
After=syslog.service
[Service]
Type=simple
ExecStart=/usr/bin/core_uploader.py
StandardOutput=null
Restart=on-failure
[Install]
WantedBy=multi-user.target
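These unit changes track review items 1 and 2: ordering after syslog.service (the alias for the system syslog daemon, e.g. rsyslog) means the daemon's syslog-based logging works from its first message, and Restart=on-failure has systemd relaunch the daemon automatically if it exits with an error.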

files/image_config/corefile_uploader/update_json.py

@@ -1,55 +0,0 @@
#! /usr/bin/env python
import os
import sys
import json
import argparse
TMP_SUFFIX = ".tmp"
BAK_SUFFIX = ".bak"
def dict_update(dst, patch):
for k in patch.keys():
if type(patch[k]) == dict:
dst[k] = dict_update(dst[k], patch[k])
else:
dst[k] = patch[k]
return dst
def do_update(rcf, patchf):
dst = {}
patch = {}
tmpf = rcf + TMP_SUFFIX
bakf = rcf + BAK_SUFFIX
with open(rcf, "r") as f:
dst = json.load(f)
with open(patchf, "r") as f:
patch = json.load(f)
dst = dict_update(dst, patch)
with open(tmpf, "w") as f:
json.dump(dst, f, indent = 4)
os.rename(rcf, bakf)
os.rename(tmpf, rcf)
def main():
parser=argparse.ArgumentParser(description="Update JSON based file")
parser.add_argument("-r", "--rc", help="JSON file to be updated")
parser.add_argument("-p", "--patch", help="JSON file holding patch")
args = parser.parse_args()
if not args.rc or not args.patch:
raise Exception("check usage")
do_update(args.rc, args.patch)
if __name__ == '__main__':
main()
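For reference, the deleted helper deep-merges a patch into the rc JSON (e.g. /etc/sonic/core_analyzer.rc.json) and swaps the result in via .tmp/.bak renames. Note that dict_update indexes dst[k] before checking the key exists, so a patch introducing a brand-new nested section would raise KeyError. A more defensive sketch of the same merge, for illustration only since the script now lives in sonic-utilities:

    def dict_update(dst, patch):
        # Deep-merge patch into dst, creating nested dicts dst lacks
        # instead of assuming they already exist.
        for k, v in patch.items():
            if isinstance(v, dict):
                dst[k] = dict_update(dst.get(k, {}), v)
            else:
                dst[k] = v
        return dst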