From 14de0a15489daca71ddb75a81d7c8d6b2d9efa93 Mon Sep 17 00:00:00 2001 From: Sudharsan Dhamal Gopalarathnam Date: Wed, 2 Mar 2022 19:08:06 -0800 Subject: [PATCH] [containerd]Fixing container commands when mode is local and state is disabled (#9986) Why I did it During warm-reboot and fast-reboot the below error logs appear Feb 3 22:05:15.187408 r-lionfish-13 ERR container: docker cmd: kill for nat failed with 404 Client Error for http+docker://localhost/v1.41/containers/nat/json: Not Found ("No such container: nat") The container command when called for local mode doesn't check if it is enabled before calling docker kill which throws the above errors. https://github.com/Azure/sonic-utilities/blob/b6ca76b4821453171d9607383b808385968eeeeb/scripts/fast-reboot#L699 How I did it Checking feature state if local mode and returning error exit code along with valid debug message. How to verify it Manually tested with warm-reboot and fast-reboot Added UT to verify it. --- src/sonic-ctrmgrd/ctrmgr/container | 54 +++++++++++++++-------- src/sonic-ctrmgrd/tests/container_test.py | 45 ++++++++++++++++++- 2 files changed, 79 insertions(+), 20 deletions(-) diff --git a/src/sonic-ctrmgrd/ctrmgr/container b/src/sonic-ctrmgrd/ctrmgr/container index 666c967540..db6ded635e 100755 --- a/src/sonic-ctrmgrd/ctrmgr/container +++ b/src/sonic-ctrmgrd/ctrmgr/container @@ -26,6 +26,7 @@ CONTAINER_ID = "container_id" REMOTE_STATE = "remote_state" VERSION = "container_version" SYSTEM_STATE = "system_state" +STATE = "state" KUBE_LABEL_TABLE = "KUBE_LABELS" KUBE_LABEL_SET_KEY = "SET" @@ -38,6 +39,9 @@ SONIC_CTR_CONFIG_PEND_SECS = "revert_to_local_on_wait_seconds" DEFAULT_PEND_SECS = ( 5 * 60 ) WAIT_POLL_SECS = 2 +SUCCESS = 0 +FAILURE = -1 + remote_ctr_enabled = False def debug_msg(m): @@ -87,10 +91,10 @@ def read_data(is_config, feature, fields): def read_config(feature): """ Read requried feature config """ - set_owner, no_fallback = read_data(True, feature, - [(SET_OWNER, "local"), (NO_FALLBACK, False)]) + set_owner, no_fallback, state = read_data(True, feature, + [(SET_OWNER, "local"), (NO_FALLBACK, False), (STATE, "disabled")]) - return (set_owner, not no_fallback) + return (set_owner, not no_fallback, state) def read_state(feature): @@ -107,12 +111,12 @@ def docker_action(action, feature, **kwargs): container = client.containers.get(feature) getattr(container, action)(**kwargs) syslog.syslog(syslog.LOG_INFO, "docker cmd: {} for {}".format(action, feature)) - return 0 + return SUCCESS except (docker.errors.NotFound, docker.errors.APIError) as err: syslog.syslog(syslog.LOG_ERR, "docker cmd: {} for {} failed with {}". format(action, feature, str(err))) - return -1 + return FAILURE def set_label(feature, create): @@ -186,7 +190,7 @@ def container_start(feature, **kwargs): init() - set_owner, fallback = read_config(feature) + set_owner, fallback, _ = read_config(feature) _, remote_state, _ = read_state(feature) debug_msg("{}: set_owner:{} fallback:{} remote_state:{}".format( @@ -244,8 +248,8 @@ def container_stop(feature, **kwargs): debug_msg("BEGIN") init() - - set_owner, _ = read_config(feature) + ret = SUCCESS + set_owner, _ , _ = read_config(feature) current_owner, remote_state, _ = read_state(feature) docker_id = container_id(feature) remove_label = (remote_state != "pending") or (set_owner == "local") @@ -257,7 +261,7 @@ def container_stop(feature, **kwargs): set_label(feature, False) if docker_id: - docker_action("stop", docker_id, **kwargs) + ret = docker_action("stop", docker_id, **kwargs) else: syslog.syslog( syslog.LOG_ERR if current_owner != "none" else syslog.LOG_INFO, @@ -286,6 +290,7 @@ def container_stop(feature, **kwargs): update_data(feature, data) debug_msg("END") + return ret def container_kill(feature, **kwargs): @@ -301,19 +306,25 @@ def container_kill(feature, **kwargs): init() - set_owner, _ = read_config(feature) + ret = SUCCESS + set_owner, _ , state = read_config(feature) current_owner, remote_state, _ = read_state(feature) docker_id = container_id(feature) remove_label = (set_owner != "local") or (current_owner != "local") - debug_msg("{}: set_owner:{} current_owner:{} remote_state:{} docker_id:{}".format( - feature, set_owner, current_owner, remote_state, docker_id)) + debug_msg("{}: set_owner:{} current_owner:{} remote_state:{} docker_id:{} state:{}".format( + feature, set_owner, current_owner, remote_state, docker_id, state)) if remove_label: set_label(feature, False) + if set_owner == "local": + if state not in ["enabled", "always_enabled"]: + debug_msg("{} is not enabled".format(feature)) + return FAILURE + if docker_id: - docker_action("kill", docker_id, **kwargs) + ret = docker_action("kill", docker_id, **kwargs) else: syslog.syslog( @@ -322,6 +333,7 @@ def container_kill(feature, **kwargs): debug_msg("END") + return ret def container_wait(feature, **kwargs): @@ -341,10 +353,11 @@ def container_wait(feature, **kwargs): init() - set_owner, fallback = read_config(feature) + set_owner, fallback, _ = read_config(feature) current_owner, remote_state, _ = read_state(feature) docker_id = container_id(feature) pend_wait_secs = 0 + ret = SUCCESS if not docker_id and fallback: pend_wait_secs = get_config_data( @@ -377,8 +390,9 @@ def container_wait(feature, **kwargs): format(feature)) else: debug_msg("END -- transitioning to docker wait") - docker_action("wait", docker_id, **kwargs) + ret = docker_action("wait", docker_id, **kwargs) + return ret def main(): parser=argparse.ArgumentParser(description="container commands for start/stop/wait/kill/id") @@ -389,24 +403,26 @@ def main(): args = parser.parse_args() kwargs = {} + ret = 0 if args.action == "start": - container_start(args.name, **kwargs) + ret = container_start(args.name, **kwargs) elif args.action == "stop": if args.timeout is not None: kwargs['timeout'] = args.timeout - container_stop(args.name, **kwargs) + ret = container_stop(args.name, **kwargs) elif args.action == "kill": - container_kill(args.name, **kwargs) + ret = container_kill(args.name, **kwargs) elif args.action == "wait": - container_wait(args.name, **kwargs) + ret = container_wait(args.name, **kwargs) elif args.action == "id": id = container_id(args.name, **kwargs) print(id) + return ret if __name__ == "__main__": main() diff --git a/src/sonic-ctrmgrd/tests/container_test.py b/src/sonic-ctrmgrd/tests/container_test.py index cc5f89e8d9..4738597c72 100755 --- a/src/sonic-ctrmgrd/tests/container_test.py +++ b/src/sonic-ctrmgrd/tests/container_test.py @@ -269,7 +269,8 @@ kill_test_data = { common_test.CONFIG_DB_NO: { common_test.FEATURE_TABLE: { "snmp": { - "set_owner": "local" + "set_owner": "local", + "state": "enabled" } } }, @@ -348,6 +349,30 @@ kill_test_data = { } } +# container_kill test cases +# test case 0 -- container kill local disabled container +# -- no change in state-db +# -- no label update +# +invalid_kill_test_data = { + 0: { + common_test.DESCR: "container kill for local disabled container", + common_test.PRE: { + common_test.CONFIG_DB_NO: { + common_test.FEATURE_TABLE: { + "sflow": { + "set_owner": "local" + } + } + } + }, + common_test.POST: { + }, + common_test.ACTIONS: { + } + } +} + # container_wait test cases # test case 0 -- container wait local @@ -498,6 +523,24 @@ class TestContainer(object): ret = common_test.check_mock_containers() assert ret == 0 + @patch("container.swsscommon.DBConnector") + @patch("container.swsscommon.Table") + @patch("container.docker.from_env") + def test_invalid_kill(self, mock_docker, mock_table, mock_conn): + self.init() + common_test.set_mock(mock_table, mock_conn, mock_docker) + + for (i, ct_data) in invalid_kill_test_data.items(): + common_test.do_start_test("container_test:container_kill", i, ct_data) + + ret = container.container_kill("sflow") + assert ret != 0 + + ret = common_test.check_tables_returned() + assert ret == 0 + + ret = common_test.check_mock_containers() + assert ret == 0 @patch("container.swsscommon.DBConnector") @patch("container.swsscommon.Table")