From a4633d32ed3ee0e91e9fc698b3378f41295ac3e0 Mon Sep 17 00:00:00 2001 From: mssonicbld <79238446+mssonicbld@users.noreply.github.com> Date: Sat, 27 May 2023 05:27:55 +0800 Subject: [PATCH] Clean up the old version container images (#14978) (#15232) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Why I did it Our k8s feature will pull new version container images for each upgrade, the container images inside sonic will be more and more, but for now we don’t have a way to clean up the old version container images, the disk may be filled up. Need to add cleaning up the old version container images logic. Work item tracking Microsoft ADO (number only): 17979809 How I did it Remove the old version container images besides the feature's current version and last version image, last version image is saved for supporting fallback. How to verify it Check whether the old version images are removed Co-authored-by: lixiaoyuner <35456895+lixiaoyuner@users.noreply.github.com> --- src/sonic-ctrmgrd/ctrmgr/container | 26 +++++ src/sonic-ctrmgrd/ctrmgr/container_startup.py | 13 ++- src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py | 38 ++++++- src/sonic-ctrmgrd/ctrmgr/kube_commands.py | 64 ++++++++++++ .../ctrmgr/remote_ctr.config.json | 3 +- src/sonic-ctrmgrd/tests/common_test.py | 14 +++ .../tests/container_startup_test.py | 6 +- src/sonic-ctrmgrd/tests/container_test.py | 3 +- src/sonic-ctrmgrd/tests/ctrmgrd_test.py | 21 +++- src/sonic-ctrmgrd/tests/kube_commands_test.py | 98 +++++++++++++++++++ 10 files changed, 270 insertions(+), 16 deletions(-) diff --git a/src/sonic-ctrmgrd/ctrmgr/container b/src/sonic-ctrmgrd/ctrmgr/container index ca2394b057..9be2a5b3fe 100755 --- a/src/sonic-ctrmgrd/ctrmgr/container +++ b/src/sonic-ctrmgrd/ctrmgr/container @@ -27,6 +27,7 @@ REMOTE_STATE = "remote_state" VERSION = "container_version" SYSTEM_STATE = "system_state" STATE = "state" +ST_FEAT_CTR_STABLE_VER = "container_stable_version" KUBE_LABEL_TABLE = "KUBE_LABELS" KUBE_LABEL_SET_KEY = "SET" @@ -132,6 +133,26 @@ def docker_action(action, feature, **kwargs): return FAILURE +def container_version(feature): + """ Get container image version """ + version = None + try: + client = docker.from_env() + container = client.containers.get(feature) + envs = container.attrs['Config']['Env'] + for env in envs: + if env.startswith("IMAGE_VERSION="): + version = env.split('=')[1] + + syslog.syslog(syslog.LOG_INFO, "docker get image version for {}".format(feature)) + + except (docker.errors.NotFound, docker.errors.APIError) as err: + syslog.syslog(syslog.LOG_ERR, "docker get image version for {} failed with {}". + format(feature, str(err))) + + return version + + def set_label(feature, create): """ Set/drop label as required Update is done in state-db. @@ -374,6 +395,11 @@ def container_wait(feature, **kwargs): pend_wait_secs = 0 ret = SUCCESS + if docker_id == feature: + version = container_version(feature) + if version: + update_data(feature, {ST_FEAT_CTR_STABLE_VER: version}) + if not docker_id and fallback: pend_wait_secs = get_config_data( SONIC_CTR_CONFIG_PEND_SECS, DEFAULT_PEND_SECS) diff --git a/src/sonic-ctrmgrd/ctrmgr/container_startup.py b/src/sonic-ctrmgrd/ctrmgr/container_startup.py index 2af8e0b384..d94b4ef97b 100755 --- a/src/sonic-ctrmgrd/ctrmgr/container_startup.py +++ b/src/sonic-ctrmgrd/ctrmgr/container_startup.py @@ -36,6 +36,11 @@ def _get_version_key(feature, version): return "{}_{}_enabled".format(feature, version) +def _get_local_version_key(feature): + # Coin label for track laster local version + return "{}_local".format(feature) + + def read_data(feature): state_data = { CURRENT_OWNER: "none", @@ -87,8 +92,8 @@ def check_version_blocked(state_db, feature, version): # tbl = swsscommon.Table(state_db, KUBE_LABEL_TABLE) labels = dict(tbl.get(KUBE_LABEL_SET_KEY)[1]) - key = _get_version_key(feature, version) - return (key in labels) and (labels[key].lower() == "false") + key = _get_local_version_key(feature) + return (key in labels) and (labels[key].lower() == version.lower()) def drop_label(state_db, feature, version): @@ -97,8 +102,8 @@ def drop_label(state_db, feature, version): # ctrmgrd sets it with kube API server per reaschability tbl = swsscommon.Table(state_db, KUBE_LABEL_TABLE) - name = _get_version_key(feature, version) - tbl.set(KUBE_LABEL_SET_KEY, [ (name, "false")]) + name = _get_local_version_key(feature) + tbl.set(KUBE_LABEL_SET_KEY, [(name, version)]) def update_data(state_db, feature, data): diff --git a/src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py b/src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py index b6b3d0bba2..27648dd22d 100755 --- a/src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py +++ b/src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py @@ -48,6 +48,8 @@ ST_FEAT_OWNER = "current_owner" ST_FEAT_UPDATE_TS = "update_time" ST_FEAT_CTR_ID = "container_id" ST_FEAT_CTR_VER = "container_version" +ST_FEAT_CTR_STABLE_VER = "container_stable_version" +ST_FEAT_CTR_LAST_VER = "container_last_version" ST_FEAT_REMOTE_STATE = "remote_state" ST_FEAT_SYS_STATE = "system_state" @@ -59,6 +61,7 @@ MODE_LOCAL = "local" OWNER_KUBE = "kube" OWNER_LOCAL = "local" OWNER_NONE = "none" +REMOTE_RUNNING = "running" REMOTE_READY = "ready" REMOTE_PENDING = "pending" REMOTE_STOPPED = "stopped" @@ -91,6 +94,8 @@ dflt_st_feat= { ST_FEAT_UPDATE_TS: "", ST_FEAT_CTR_ID: "", ST_FEAT_CTR_VER: "", + ST_FEAT_CTR_STABLE_VER: "", + ST_FEAT_CTR_LAST_VER: "", ST_FEAT_REMOTE_STATE: "none", ST_FEAT_SYS_STATE: "" } @@ -100,18 +105,20 @@ JOIN_RETRY = "retry_join_interval_seconds" LABEL_RETRY = "retry_labels_update_seconds" TAG_IMAGE_LATEST = "tag_latest_image_on_wait_seconds" TAG_RETRY = "retry_tag_latest_seconds" +CLEAN_IMAGE_RETRY = "retry_clean_image_seconds" USE_K8S_PROXY = "use_k8s_as_http_proxy" remote_ctr_config = { JOIN_LATENCY: 10, JOIN_RETRY: 10, LABEL_RETRY: 2, - TAG_IMAGE_LATEST: 30, + TAG_IMAGE_LATEST: 5, TAG_RETRY: 5, + CLEAN_IMAGE_RETRY: 5, USE_K8S_PROXY: "" } -ENABLED_FEATURE_SET = {"telemetry", "snmp"} +DISABLED_FEATURE_SET = {"database"} def log_debug(m): msg = "{}: {}".format(inspect.stack()[1][3], m) @@ -271,7 +278,7 @@ class MainServer: key, op, fvs = subscriber.pop() if not key: continue - if subscriber.getTableName() == FEATURE_TABLE and key not in ENABLED_FEATURE_SET: + if subscriber.getTableName() == FEATURE_TABLE and key in DISABLED_FEATURE_SET: continue log_debug("Received message : '%s'" % str((key, op, fvs))) for callback in (self.callbacks @@ -545,7 +552,7 @@ class FeatureTransitionHandler: self.st_data[key] = _update_entry(dflt_st_feat, data) remote_state = self.st_data[key][ST_FEAT_REMOTE_STATE] - if (old_remote_state != remote_state) and (remote_state == "running"): + if (remote_state == REMOTE_RUNNING) and (old_remote_state != remote_state): # Tag latest start_time = datetime.datetime.now() + datetime.timedelta( seconds=remote_ctr_config[TAG_IMAGE_LATEST]) @@ -583,6 +590,29 @@ class FeatureTransitionHandler: log_debug("Tag latest as local failed retry after {} seconds @{}". format(remote_ctr_config[TAG_RETRY], self.start_time)) + else: + last_version = self.st_data[feat][ST_FEAT_CTR_STABLE_VER] + if last_version == image_ver: + last_version = self.st_data[feat][ST_FEAT_CTR_LAST_VER] + self.server.mod_db_entry(STATE_DB_NAME, FEATURE_TABLE, feat, + {ST_FEAT_CTR_STABLE_VER: image_ver, + ST_FEAT_CTR_LAST_VER: last_version}) + self.st_data[ST_FEAT_CTR_LAST_VER] = last_version + self.st_data[ST_FEAT_CTR_STABLE_VER] = image_ver + self.do_clean_image(feat, image_ver, last_version) + + def do_clean_image(self, feat, current_version, last_version): + ret = kube_commands.clean_image(feat, current_version, last_version) + if ret != 0: + # Clean up old version images failed. Retry after an interval + self.start_time = datetime.datetime.now() + self.start_time += datetime.timedelta( + seconds=remote_ctr_config[CLEAN_IMAGE_RETRY]) + self.server.register_timer(self.start_time, self.do_clean_image, (feat, current_version, last_version)) + + log_debug("Clean up old version images failed retry after {} seconds @{}". + format(remote_ctr_config[CLEAN_IMAGE_RETRY], self.start_time)) + # # Label re-sync diff --git a/src/sonic-ctrmgrd/ctrmgr/kube_commands.py b/src/sonic-ctrmgrd/ctrmgr/kube_commands.py index 6576f25800..fe37c0c842 100755 --- a/src/sonic-ctrmgrd/ctrmgr/kube_commands.py +++ b/src/sonic-ctrmgrd/ctrmgr/kube_commands.py @@ -483,6 +483,70 @@ def tag_latest(feat, docker_id, image_ver): log_error(err) return ret +def _do_clean(feat, current_version, last_version): + err = "" + out = "" + ret = 0 + DOCKER_ID = "docker_id" + REPO = "repo" + _, image_info, err = _run_command("docker images |grep {} |grep -v latest |awk '{{print $1,$2,$3}}'".format(feat)) + if image_info: + version_dict = {} + version_dict_default = {} + for info in image_info.split("\n"): + rep, version, docker_id = info.split() + if len(rep.split("/")) == 1: + version_dict_default[version] = {DOCKER_ID: docker_id, REPO: rep} + else: + version_dict[version] = {DOCKER_ID: docker_id, REPO: rep} + + if current_version in version_dict: + image_prefix = version_dict[current_version][REPO] + del version_dict[current_version] + else: + out = "Current version {} doesn't exist.".format(current_version) + ret = 0 + return ret, out, err + # should be only one item in version_dict_default + for k, v in version_dict_default.items(): + local_version, local_repo, local_docker_id = k, v[REPO], v[DOCKER_ID] + tag_res, _, err = _run_command("docker tag {} {}:{} && docker rmi {}:{}".format( + local_docker_id, image_prefix, local_version, local_repo, local_version)) + if tag_res == 0: + msg = "Tag {} local version images successfully".format(feat) + log_debug(msg) + else: + ret = 1 + err = "Failed to tag {} local version images. Err: {}".format(feat, err) + return ret, out, err + + if last_version in version_dict: + del version_dict[last_version] + + versions = [item[DOCKER_ID] for item in version_dict.values()] + if versions: + clean_res, _, err = _run_command("docker rmi {} --force".format(" ".join(versions))) + else: + clean_res = 0 + if clean_res == 0: + out = "Clean {} old version images successfully".format(feat) + else: + err = "Failed to clean {} old version images. Err: {}".format(feat, err) + ret = 1 + else: + err = "Failed to docker images |grep {} |awk '{{print $3}}'".format(feat) + ret = 1 + + return ret, out, err + +def clean_image(feat, current_version, last_version): + ret, out, err = _do_clean(feat, current_version, last_version) + if ret == 0: + log_debug(out) + else: + log_error(err) + return ret + def main(): syslog.openlog("kube_commands") parser=argparse.ArgumentParser(description= diff --git a/src/sonic-ctrmgrd/ctrmgr/remote_ctr.config.json b/src/sonic-ctrmgrd/ctrmgr/remote_ctr.config.json index b6d2633486..45701430ff 100644 --- a/src/sonic-ctrmgrd/ctrmgr/remote_ctr.config.json +++ b/src/sonic-ctrmgrd/ctrmgr/remote_ctr.config.json @@ -2,9 +2,10 @@ "join_latency_on_boot_seconds": 300, "retry_join_interval_seconds": 30, "retry_labels_update_seconds": 5, - "revert_to_local_on_wait_seconds": 60, + "revert_to_local_on_wait_seconds": 360, "tag_latest_image_on_wait_seconds": 600, "retry_tag_latest_seconds": 30, + "retry_clean_image_seconds": 30, "use_k8s_as_http_proxy": "n" } diff --git a/src/sonic-ctrmgrd/tests/common_test.py b/src/sonic-ctrmgrd/tests/common_test.py index 59f774596a..45f81d150d 100755 --- a/src/sonic-ctrmgrd/tests/common_test.py +++ b/src/sonic-ctrmgrd/tests/common_test.py @@ -185,6 +185,7 @@ class mock_container: self.actions = [] self.name = name self.image = mock_image(self.actions) + self.attrs = {"Config": {"Env": ["IMAGE_VERSION=20201231.11"]}} def start(self): @@ -547,6 +548,19 @@ def set_mock_kube(kube_labels, kube_join, kube_reset): kube_reset.side_effect = kube_reset_side_effect +def clean_image_side_effect(feat, current_version, last_version): + return 0 + + +def tag_latest_side_effect(feat, docker_id, image_ver): + return 0 + + +def set_mock_image_op(clean_image, tag_latest): + clean_image.side_effect = clean_image_side_effect + tag_latest.side_effect = tag_latest_side_effect + + def str_comp(needle, hay): nlen = len(needle) hlen = len(hay) diff --git a/src/sonic-ctrmgrd/tests/container_startup_test.py b/src/sonic-ctrmgrd/tests/container_startup_test.py index b21fe85566..23523f66b1 100755 --- a/src/sonic-ctrmgrd/tests/container_startup_test.py +++ b/src/sonic-ctrmgrd/tests/container_startup_test.py @@ -36,7 +36,7 @@ startup_test_data = { }, common_test.KUBE_LABEL_TABLE: { "SET": { - "snmp_20201230.11_enabled": "false" + "snmp_local": "20201230.11" } } } @@ -345,7 +345,7 @@ startup_test_data = { }, common_test.KUBE_LABEL_TABLE: { "SET": { - "snmp_20201230.11_enabled": "false" + "snmp_local": "20201230.11" } } } @@ -363,7 +363,7 @@ startup_test_data = { }, common_test.KUBE_LABEL_TABLE: { "SET": { - "snmp_20201230.11_enabled": "false" + "snmp_local": "20201230.11" } } } diff --git a/src/sonic-ctrmgrd/tests/container_test.py b/src/sonic-ctrmgrd/tests/container_test.py index 4581111015..8d1534f8b0 100755 --- a/src/sonic-ctrmgrd/tests/container_test.py +++ b/src/sonic-ctrmgrd/tests/container_test.py @@ -416,7 +416,8 @@ wait_test_data = { "remote_state": "none", "system_state": "up", "current_owner": "local", - "container_id": "snmp" + "container_id": "snmp", + "container_stable_version": "20201231.11" } } } diff --git a/src/sonic-ctrmgrd/tests/ctrmgrd_test.py b/src/sonic-ctrmgrd/tests/ctrmgrd_test.py index 0a298dd603..0304985224 100755 --- a/src/sonic-ctrmgrd/tests/ctrmgrd_test.py +++ b/src/sonic-ctrmgrd/tests/ctrmgrd_test.py @@ -291,7 +291,7 @@ feature_test_data = { common_test.STATE_DB_NO: { common_test.FEATURE_TABLE: { "snmp": { - "remote_state": "pending" + "remote_state": "ready" } } } @@ -307,7 +307,19 @@ feature_test_data = { common_test.STATE_DB_NO: { common_test.FEATURE_TABLE: { "snmp": { - "remote_state": "running" + "remote_state": "running", + "container_version": "20201231.74", + "container_stable_version": "20201231.64" + } + } + } + }, + common_test.POST: { + common_test.STATE_DB_NO: { + common_test.FEATURE_TABLE: { + "snmp": { + "container_last_version": "20201231.64", + "container_stable_version": "20201231.74" } } } @@ -473,13 +485,16 @@ class TestContainerStartup(object): @patch("ctrmgrd.kube_commands.kube_reset_master") @patch("ctrmgrd.kube_commands.kube_join_master") @patch("ctrmgrd.kube_commands.kube_write_labels") - def test_feature(self, mock_kube_wr, mock_kube_join, mock_kube_rst, mock_subs, + @patch("ctrmgrd.kube_commands.tag_latest") + @patch("ctrmgrd.kube_commands.clean_image") + def test_feature(self, mock_clean_image, mock_tag_latest, mock_kube_wr, mock_kube_join, mock_kube_rst, mock_subs, mock_select, mock_table, mock_conn): self.init() ret = 0 common_test.set_mock(mock_table, mock_conn) common_test.set_mock_sel(mock_select, mock_subs) common_test.set_mock_kube(mock_kube_wr, mock_kube_join, mock_kube_rst) + common_test.set_mock_image_op(mock_clean_image, mock_tag_latest) for (i, ct_data) in feature_test_data.items(): common_test.do_start_test("ctrmgrd:feature", i, ct_data) diff --git a/src/sonic-ctrmgrd/tests/kube_commands_test.py b/src/sonic-ctrmgrd/tests/kube_commands_test.py index 9b4dd6842b..40b8ab180b 100755 --- a/src/sonic-ctrmgrd/tests/kube_commands_test.py +++ b/src/sonic-ctrmgrd/tests/kube_commands_test.py @@ -309,6 +309,93 @@ tag_latest_test_data = { } } +clean_image_test_data = { + 0: { + common_test.DESCR: "Clean image successfuly(kube to kube)", + common_test.RETVAL: 0, + common_test.ARGS: ["snmp", "20201231.84", "20201231.74"], + common_test.PROC_CMD: [ + "docker images |grep snmp |grep -v latest |awk '{print $1,$2,$3}'", + "docker rmi 744d3a09062f --force" + ], + common_test.PROC_OUT: [ + "sonick8scue.azurecr.io/docker-sonic-telemetry 20201231.74 507f8d28bf6e\n\ + sonick8scue.azurecr.io/docker-sonic-telemetry 20201231.96 744d3a09062f\n\ + sonick8scue.azurecr.io/docker-sonic-telemetry 20201231.84 507f8d28bf6e", + "" + ], + common_test.PROC_CODE: [ + 0, + 0 + ] + }, + 1: { + common_test.DESCR: "Clean image failed(delete image failed)", + common_test.RETVAL: 1, + common_test.ARGS: ["snmp", "20201231.84", "20201231.74"], + common_test.PROC_CMD: [ + "docker images |grep snmp |grep -v latest |awk '{print $1,$2,$3}'", + "docker rmi 744d3a09062f --force" + ], + common_test.PROC_OUT: [ + "sonick8scue.azurecr.io/docker-sonic-telemetry 20201231.74 507f8d28bf6e\n\ + sonick8scue.azurecr.io/docker-sonic-telemetry 20201231.96 744d3a09062f\n\ + sonick8scue.azurecr.io/docker-sonic-telemetry 20201231.84 507f8d28bf6e", + "" + ], + common_test.PROC_CODE: [ + 0, + 1 + ] + }, + 2: { + common_test.DESCR: "Clean image failed(no image found)", + common_test.RETVAL: 1, + common_test.ARGS: ["snmp", "20201231.84", "20201231.74"], + common_test.PROC_CMD: [ + "docker images |grep snmp |grep -v latest |awk '{print $1,$2,$3}'" + ], + common_test.PROC_OUT: [ + "" + ] + }, + 3: { + common_test.DESCR: "Clean image failed(current image doesn't exist)", + common_test.RETVAL: 0, + common_test.ARGS: ["snmp", "20201231.84", "20201231.74"], + common_test.PROC_CMD: [ + "docker images |grep snmp |grep -v latest |awk '{print $1,$2,$3}'", + "" + ], + common_test.PROC_OUT: [ + "sonick8scue.azurecr.io/docker-sonic-telemetry 20201231.74 507f8d28bf6e\n\ + sonick8scue.azurecr.io/docker-sonic-telemetry 20201231.96 744d3a09062f", + "" + ], + common_test.PROC_CODE: [ + 0 + ] + }, + 4: { + common_test.DESCR: "Clean image successfuly(local to kube)", + common_test.RETVAL: 0, + common_test.ARGS: ["snmp", "20201231.84", ""], + common_test.PROC_CMD: [ + "docker images |grep snmp |grep -v latest |awk '{print $1,$2,$3}'", + "docker tag 507f8d28bf6e sonick8scue.azurecr.io/docker-sonic-telemetry:20201231.74 && docker rmi docker-sonic-telemetry:20201231.74" + ], + common_test.PROC_OUT: [ + "docker-sonic-telemetry 20201231.74 507f8d28bf6e\n\ + sonick8scue.azurecr.io/docker-sonic-telemetry 20201231.84 507f8d28bf6e", + "" + ], + common_test.PROC_CODE: [ + 0, + 0 + ] + }, +} + class TestKubeCommands(object): def init(self): @@ -467,3 +554,14 @@ clusters:\n\ ret = kube_commands.tag_latest(*ct_data[common_test.ARGS]) if common_test.RETVAL in ct_data: assert ret == ct_data[common_test.RETVAL] + + @patch("kube_commands.subprocess.Popen") + def test_clean_image(self, mock_subproc): + common_test.set_kube_mock(mock_subproc) + + for (i, ct_data) in clean_image_test_data.items(): + common_test.do_start_test("clean:image", i, ct_data) + + ret = kube_commands.clean_image(*ct_data[common_test.ARGS]) + if common_test.RETVAL in ct_data: + assert ret == ct_data[common_test.RETVAL]