From 78db0804d3a0ab9cf54a7ddf4c4d00c7ecd8751d Mon Sep 17 00:00:00 2001 From: Renuka Manavalan <47282725+renukamanavalan@users.noreply.github.com> Date: Mon, 30 Dec 2019 13:01:03 -0800 Subject: [PATCH] corefile uploader: Updates per review comments offline (#3915) * Updates per review comments 1) core_uploader service waits for syslog.service 2) core_uploader service enabled for restart on failure 3) Use mtime instead of file size + ample time to be robust. * Avoid reloading already uploaded file, by marking the names with a prefix. * Updated failing path. 1) If rc file is missing or required data missing, it periodically logs error in forever loop. 2) If upload fails, retry every hour with a error log, forever. * Fix few bugs * The binary update_json.py will come from sonic-utilities. --- .../build_templates/sonic_debian_extension.j2 | 1 - .../corefile_uploader/core_uploader.py | 47 +++++++++------- .../corefile_uploader/core_uploader.service | 6 +- .../corefile_uploader/update_json.py | 55 ------------------- 4 files changed, 32 insertions(+), 77 deletions(-) delete mode 100755 files/image_config/corefile_uploader/update_json.py diff --git a/files/build_templates/sonic_debian_extension.j2 b/files/build_templates/sonic_debian_extension.j2 index b63b5addac..a007745406 100644 --- a/files/build_templates/sonic_debian_extension.j2 +++ b/files/build_templates/sonic_debian_extension.j2 @@ -225,7 +225,6 @@ sudo cp $IMAGE_CONFIGS/hostcfgd/*.j2 $FILESYSTEM_ROOT_USR_SHARE_SONIC_TEMPLATES/ sudo cp $IMAGE_CONFIGS/corefile_uploader/core_uploader.service $FILESYSTEM_ROOT/etc/systemd/system/ sudo LANG=C chroot $FILESYSTEM_ROOT systemctl disable core_uploader.service sudo cp $IMAGE_CONFIGS/corefile_uploader/core_uploader.py $FILESYSTEM_ROOT/usr/bin/ -sudo cp $IMAGE_CONFIGS/corefile_uploader/update_json.py $FILESYSTEM_ROOT/usr/bin/ sudo cp $IMAGE_CONFIGS/corefile_uploader/core_analyzer.rc.json $FILESYSTEM_ROOT_ETC_SONIC/ sudo chmod og-rw $FILESYSTEM_ROOT_ETC_SONIC/core_analyzer.rc.json sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install azure-storage diff --git a/files/image_config/corefile_uploader/core_uploader.py b/files/image_config/corefile_uploader/core_uploader.py index 676ff9583b..2ae91ce089 100755 --- a/files/image_config/corefile_uploader/core_uploader.py +++ b/files/image_config/corefile_uploader/core_uploader.py @@ -36,7 +36,11 @@ cwd = [] HOURS_4 = (4 * 60 * 60) PAUSE_ON_FAIL = (60 * 60) +WAIT_FILE_WRITE1 = (10 * 60) +WAIT_FILE_WRITE2= (5 * 60) +POLL_SLEEP = (60 * 60) MAX_RETRIES = 5 +UPLOAD_PREFIX = "UPLOADED_" log_level = syslog.LOG_DEBUG @@ -116,7 +120,7 @@ class Watcher: self.observer.start() try: while True: - time.sleep(5) + time.sleep(POLL_SLEEP) except: self.observer.stop() log_err("Error in watcher") @@ -179,29 +183,33 @@ class Handler(FileSystemEventHandler): elif event.event_type == 'created': # Take any action here when a file is first created. log_debug("Received create event - " + event.src_path) + Handler.wait_for_file_write_complete(event.src_path) Handler.handle_file(event.src_path) @staticmethod def wait_for_file_write_complete(path): - ct_size = -1 + mtime = 0 - while ct_size != os.path.getsize(path): - ct_size = os.path.getsize(path) - time.sleep(2) + # Sleep for ample time enough for file dump to complete. + time.sleep(WAIT_FILE_WRITE1) - time.sleep(2) - if ct_size != os.path.getsize(path): + # Give another chance & poll until mtime stabilizes + while mtime != os.stat(path).st_mtime: + mtime = os.stat(path).st_mtime + time.sleep(10) + + # A safety pause for double confirmation + time.sleep(WAIT_FILE_WRITE2) + if mtime != os.stat(path).st_mtime: raise Exception("Dump file creation is too slow: " + path) + # Give up as something is terribly wrong with this file. log_debug("File write complete - " + path) @staticmethod def handle_file(path): - - Handler.wait_for_file_write_complete(path) - lpath = "/".join(cwd) make_new_dir(lpath) os.chdir(lpath) @@ -221,18 +229,18 @@ class Handler(FileSystemEventHandler): tar.close() log_debug("Tar file for upload created: " + tarf_name) - Handler.upload_file(tarf_name, tarf_name) + Handler.upload_file(tarf_name, tarf_name, path) log_debug("File uploaded - " + path) os.chdir(INIT_CWD) @staticmethod - def upload_file(fname, fpath): + def upload_file(fname, fpath, coref): daemonname = fname.split(".")[0] i = 0 fail_msg = "" - while i <= MAX_RETRIES: + while True: try: svc = FileService(account_name=acctname, account_key=acctkey) @@ -246,14 +254,15 @@ class Handler(FileSystemEventHandler): svc.create_file_from_path(sharename, "/".join(l), fname, fpath) log_debug("Remote file created: name{} path{}".format(fname, fpath)) + newcoref = os.path.dirname(coref) + "/" + UPLOAD_PREFIX + os.path.basename(coref) + os.rename(coref, newcoref) break - except Exception as e: - log_err("core uploader failed: Failed during upload (" + str(e) +")") - fail_msg = str(e) + except Exception as ex: + log_err("core uploader failed: Failed during upload (" + coref + ") err: ("+ str(ex) +") retry:" + str(i)) + if not os.path.exists(fpath): + break i += 1 - if i >= MAX_RETRIES: - raise Exception("Failed while uploading. msg(" + fail_msg + ") after " + str(i) + " retries") time.sleep(PAUSE_ON_FAIL) @@ -261,7 +270,7 @@ class Handler(FileSystemEventHandler): def scan(): for e in os.listdir(CORE_FILE_PATH): fl = CORE_FILE_PATH + e - if os.path.isfile(fl): + if os.path.isfile(fl) and not e.startswith(UPLOAD_PREFIX): Handler.handle_file(fl) diff --git a/files/image_config/corefile_uploader/core_uploader.service b/files/image_config/corefile_uploader/core_uploader.service index 5c061e72a1..09ac1bd519 100644 --- a/files/image_config/corefile_uploader/core_uploader.service +++ b/files/image_config/corefile_uploader/core_uploader.service @@ -1,11 +1,13 @@ [Unit] Description=Host core file uploader daemon -Requires=updategraph.service -After=updategraph.service +Requires=syslog.service +After=syslog.service [Service] Type=simple ExecStart=/usr/bin/core_uploader.py +StandardOutput=null +Restart=on-failure [Install] WantedBy=multi-user.target diff --git a/files/image_config/corefile_uploader/update_json.py b/files/image_config/corefile_uploader/update_json.py deleted file mode 100755 index 03bb39aa4e..0000000000 --- a/files/image_config/corefile_uploader/update_json.py +++ /dev/null @@ -1,55 +0,0 @@ -#! /usr/bin/env python - -import os -import sys -import json -import argparse - -TMP_SUFFIX = ".tmp" -BAK_SUFFIX = ".bak" - -def dict_update(dst, patch): - for k in patch.keys(): - if type(patch[k]) == dict: - dst[k] = dict_update(dst[k], patch[k]) - else: - dst[k] = patch[k] - return dst - -def do_update(rcf, patchf): - dst = {} - patch = {} - - tmpf = rcf + TMP_SUFFIX - bakf = rcf + BAK_SUFFIX - - with open(rcf, "r") as f: - dst = json.load(f) - - with open(patchf, "r") as f: - patch = json.load(f) - - dst = dict_update(dst, patch) - - with open(tmpf, "w") as f: - json.dump(dst, f, indent = 4) - - os.rename(rcf, bakf) - os.rename(tmpf, rcf) - - -def main(): - parser=argparse.ArgumentParser(description="Update JSON based file") - parser.add_argument("-r", "--rc", help="JSON file to be updated") - parser.add_argument("-p", "--patch", help="JSON file holding patch") - args = parser.parse_args() - - if not args.rc or not args.patch: - raise Exception("check usage") - - do_update(args.rc, args.patch) - -if __name__ == '__main__': - main() - -