[containerd]Fixing container commands when mode is local and state is disabled (#9986)

Why I did it
During warm-reboot and fast-reboot the below error logs appear
Feb 3 22:05:15.187408 r-lionfish-13 ERR container: docker cmd: kill for nat failed with 404 Client Error for http+docker://localhost/v1.41/containers/nat/json: Not Found ("No such container: nat")

The container command when called for local mode doesn't check if it is enabled before calling docker kill which throws the above errors.
b6ca76b482/scripts/fast-reboot (L699)

How I did it
Checking feature state if local mode and returning error exit code along with valid debug message.

How to verify it
Manually tested with warm-reboot and fast-reboot
Added UT to verify it.
This commit is contained in:
Sudharsan Dhamal Gopalarathnam 2022-03-02 19:08:06 -08:00 committed by Arvindsrinivasan Lakshmi Narasimhan
parent 43c32dfa5d
commit 4da20e7ff9
2 changed files with 79 additions and 20 deletions

View File

@ -26,6 +26,7 @@ CONTAINER_ID = "container_id"
REMOTE_STATE = "remote_state"
VERSION = "container_version"
SYSTEM_STATE = "system_state"
STATE = "state"
KUBE_LABEL_TABLE = "KUBE_LABELS"
KUBE_LABEL_SET_KEY = "SET"
@ -38,6 +39,9 @@ SONIC_CTR_CONFIG_PEND_SECS = "revert_to_local_on_wait_seconds"
DEFAULT_PEND_SECS = ( 5 * 60 )
WAIT_POLL_SECS = 2
SUCCESS = 0
FAILURE = -1
remote_ctr_enabled = False
def debug_msg(m):
@ -87,10 +91,10 @@ def read_data(is_config, feature, fields):
def read_config(feature):
""" Read requried feature config """
set_owner, no_fallback = read_data(True, feature,
[(SET_OWNER, "local"), (NO_FALLBACK, False)])
set_owner, no_fallback, state = read_data(True, feature,
[(SET_OWNER, "local"), (NO_FALLBACK, False), (STATE, "disabled")])
return (set_owner, not no_fallback)
return (set_owner, not no_fallback, state)
def read_state(feature):
@ -107,12 +111,12 @@ def docker_action(action, feature, **kwargs):
container = client.containers.get(feature)
getattr(container, action)(**kwargs)
syslog.syslog(syslog.LOG_INFO, "docker cmd: {} for {}".format(action, feature))
return 0
return SUCCESS
except (docker.errors.NotFound, docker.errors.APIError) as err:
syslog.syslog(syslog.LOG_ERR, "docker cmd: {} for {} failed with {}".
format(action, feature, str(err)))
return -1
return FAILURE
def set_label(feature, create):
@ -186,7 +190,7 @@ def container_start(feature, **kwargs):
init()
set_owner, fallback = read_config(feature)
set_owner, fallback, _ = read_config(feature)
_, remote_state, _ = read_state(feature)
debug_msg("{}: set_owner:{} fallback:{} remote_state:{}".format(
@ -244,8 +248,8 @@ def container_stop(feature, **kwargs):
debug_msg("BEGIN")
init()
set_owner, _ = read_config(feature)
ret = SUCCESS
set_owner, _ , _ = read_config(feature)
current_owner, remote_state, _ = read_state(feature)
docker_id = container_id(feature)
remove_label = (remote_state != "pending") or (set_owner == "local")
@ -257,7 +261,7 @@ def container_stop(feature, **kwargs):
set_label(feature, False)
if docker_id:
docker_action("stop", docker_id, **kwargs)
ret = docker_action("stop", docker_id, **kwargs)
else:
syslog.syslog(
syslog.LOG_ERR if current_owner != "none" else syslog.LOG_INFO,
@ -286,6 +290,7 @@ def container_stop(feature, **kwargs):
update_data(feature, data)
debug_msg("END")
return ret
def container_kill(feature, **kwargs):
@ -301,19 +306,25 @@ def container_kill(feature, **kwargs):
init()
set_owner, _ = read_config(feature)
ret = SUCCESS
set_owner, _ , state = read_config(feature)
current_owner, remote_state, _ = read_state(feature)
docker_id = container_id(feature)
remove_label = (set_owner != "local") or (current_owner != "local")
debug_msg("{}: set_owner:{} current_owner:{} remote_state:{} docker_id:{}".format(
feature, set_owner, current_owner, remote_state, docker_id))
debug_msg("{}: set_owner:{} current_owner:{} remote_state:{} docker_id:{} state:{}".format(
feature, set_owner, current_owner, remote_state, docker_id, state))
if remove_label:
set_label(feature, False)
if set_owner == "local":
if state not in ["enabled", "always_enabled"]:
debug_msg("{} is not enabled".format(feature))
return FAILURE
if docker_id:
docker_action("kill", docker_id, **kwargs)
ret = docker_action("kill", docker_id, **kwargs)
else:
syslog.syslog(
@ -322,6 +333,7 @@ def container_kill(feature, **kwargs):
debug_msg("END")
return ret
def container_wait(feature, **kwargs):
@ -341,10 +353,11 @@ def container_wait(feature, **kwargs):
init()
set_owner, fallback = read_config(feature)
set_owner, fallback, _ = read_config(feature)
current_owner, remote_state, _ = read_state(feature)
docker_id = container_id(feature)
pend_wait_secs = 0
ret = SUCCESS
if not docker_id and fallback:
pend_wait_secs = get_config_data(
@ -377,8 +390,9 @@ def container_wait(feature, **kwargs):
format(feature))
else:
debug_msg("END -- transitioning to docker wait")
docker_action("wait", docker_id, **kwargs)
ret = docker_action("wait", docker_id, **kwargs)
return ret
def main():
parser=argparse.ArgumentParser(description="container commands for start/stop/wait/kill/id")
@ -389,24 +403,26 @@ def main():
args = parser.parse_args()
kwargs = {}
ret = 0
if args.action == "start":
container_start(args.name, **kwargs)
ret = container_start(args.name, **kwargs)
elif args.action == "stop":
if args.timeout is not None:
kwargs['timeout'] = args.timeout
container_stop(args.name, **kwargs)
ret = container_stop(args.name, **kwargs)
elif args.action == "kill":
container_kill(args.name, **kwargs)
ret = container_kill(args.name, **kwargs)
elif args.action == "wait":
container_wait(args.name, **kwargs)
ret = container_wait(args.name, **kwargs)
elif args.action == "id":
id = container_id(args.name, **kwargs)
print(id)
return ret
if __name__ == "__main__":
main()

View File

@ -269,7 +269,8 @@ kill_test_data = {
common_test.CONFIG_DB_NO: {
common_test.FEATURE_TABLE: {
"snmp": {
"set_owner": "local"
"set_owner": "local",
"state": "enabled"
}
}
},
@ -348,6 +349,30 @@ kill_test_data = {
}
}
# container_kill test cases
# test case 0 -- container kill local disabled container
# -- no change in state-db
# -- no label update
#
invalid_kill_test_data = {
0: {
common_test.DESCR: "container kill for local disabled container",
common_test.PRE: {
common_test.CONFIG_DB_NO: {
common_test.FEATURE_TABLE: {
"sflow": {
"set_owner": "local"
}
}
}
},
common_test.POST: {
},
common_test.ACTIONS: {
}
}
}
# container_wait test cases
# test case 0 -- container wait local
@ -498,6 +523,24 @@ class TestContainer(object):
ret = common_test.check_mock_containers()
assert ret == 0
@patch("container.swsscommon.DBConnector")
@patch("container.swsscommon.Table")
@patch("container.docker.from_env")
def test_invalid_kill(self, mock_docker, mock_table, mock_conn):
self.init()
common_test.set_mock(mock_table, mock_conn, mock_docker)
for (i, ct_data) in invalid_kill_test_data.items():
common_test.do_start_test("container_test:container_kill", i, ct_data)
ret = container.container_kill("sflow")
assert ret != 0
ret = common_test.check_tables_returned()
assert ret == 0
ret = common_test.check_mock_containers()
assert ret == 0
@patch("container.swsscommon.DBConnector")
@patch("container.swsscommon.Table")