Why I did it: To reduce the container's dependency on the host system. Work item tracking: Microsoft ADO (number only): 17713469. How I did it: Moved the k8s container startup script into the config engine container, rather than mounting it from the host. How to verify it: Check the file path (/usr/share/sonic/scripts/container_startup.py) inside the config engine container. Signed-off-by: Yun Li <yunli1@microsoft.com> Co-authored-by: Qi Luo <qiluo-msft@users.noreply.github.com>
470 lines
13 KiB
Python
Executable File
470 lines
13 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import argparse
|
|
import os
|
|
import inspect
|
|
import json
|
|
import syslog
|
|
import time
|
|
import datetime
|
|
|
|
import docker
|
|
from swsscommon import swsscommon
|
|
|
|
# init() treats the presence of this unit file as the switch that enables
# remote-container handling (see remote_ctr_enabled below).
CTRMGRD_SERVICE_PATH = '/lib/systemd/system/ctrmgrd.service'

# DB connections. Both are populated by init(); defining both here keeps the
# `global` declaration in init() backed by real module-level names.
state_db = None
cfg_db = None

# DB field names
FEATURE_TABLE = "FEATURE"
SET_OWNER = "set_owner"
NO_FALLBACK = "no_fallback_to_local"

CURRENT_OWNER = "current_owner"
UPD_TIMESTAMP = "update_time"
CONTAINER_ID = "container_id"
REMOTE_STATE = "remote_state"
VERSION = "container_version"
SYSTEM_STATE = "system_state"
STATE = "state"
ST_FEAT_CTR_STABLE_VER = "container_stable_version"

KUBE_LABEL_TABLE = "KUBE_LABELS"
KUBE_LABEL_SET_KEY = "SET"
SERVER_TABLE = "KUBERNETES_MASTER"
SERVER_KEY = "SERVER"
ST_SER_CONNECTED = "connected"
ST_SER_UPDATE_TS = "update_time"

# How many seconds container_wait() waits for a remote container ID to show
# up before giving up so that a fallback to local can happen.
# The value is read from SONIC_CTR_CONFIG under the key
# SONIC_CTR_CONFIG_PEND_SECS; DEFAULT_PEND_SECS is used when it is not set.
# WAIT_POLL_SECS is the sleep between two polls of the feature state.
SONIC_CTR_CONFIG = "/etc/sonic/remote_ctr.config.json"
SONIC_CTR_CONFIG_PEND_SECS = "revert_to_local_on_wait_seconds"
DEFAULT_PEND_SECS = 5 * 60
WAIT_POLL_SECS = 2

# Return codes used by the container_* actions.
SUCCESS = 0
FAILURE = -1

# Set by init(): True when CTRMGRD_SERVICE_PATH exists.
remote_ctr_enabled = False
|
|
|
|
def debug_msg(m):
    """Send *m* to syslog at debug priority, prefixed with the caller's name.

    :param m: Text to log.
    """
    # Stack entry 0 is debug_msg itself; entry 1 is the function that
    # called it, whose name becomes the message prefix.
    caller = inspect.stack()[1].function
    syslog.syslog(syslog.LOG_DEBUG, "{}: {}".format(caller, m))
|
|
|
|
|
|
def init():
    """Create the CONFIG_DB / STATE_DB connections and detect ctrmgrd.

    Rebinds the module globals ``cfg_db``, ``state_db`` and
    ``remote_ctr_enabled``; the latter is True only when the ctrmgrd
    service file exists on disk.
    """
    global state_db, cfg_db, remote_ctr_enabled

    connect = swsscommon.DBConnector
    cfg_db = connect("CONFIG_DB", 0)
    state_db = connect("STATE_DB", 0)

    remote_ctr_enabled = os.path.exists(CTRMGRD_SERVICE_PATH)
|
|
|
|
|
|
def get_config_data(fld, dflt, path=None):
    """Read one top-level entry from the remote-container JSON config file.

    The file is optional: when it is absent, unreadable, not valid JSON,
    not a JSON object, or does not contain *fld*, *dflt* is returned.

    :param fld: Name of the top-level key to read.
    :param dflt: Value to return when the key cannot be obtained.
    :param path: File to read; defaults to SONIC_CTR_CONFIG when None.
    :returns: The configured value, or *dflt*.
    """
    cfg_path = SONIC_CTR_CONFIG if path is None else path

    # Open directly instead of testing for existence first, so that a file
    # removed between the check and the open cannot raise.
    try:
        with open(cfg_path, "r") as cfg_file:
            cfg = json.load(cfg_file)
    except FileNotFoundError:
        # A missing file is the normal "not configured" case.
        return dflt
    except (OSError, ValueError) as err:
        # json.JSONDecodeError is a ValueError; a broken config must not
        # take the caller down, so report it and use the default.
        syslog.syslog(syslog.LOG_ERR,
                      "Failed to read {}: {}".format(cfg_path, err))
        return dflt

    if isinstance(cfg, dict) and fld in cfg:
        return cfg[fld]
    return dflt
|
|
|
|
|
|
def read_data(is_config, feature, fields):
    """Fetch selected fields of one DB entry, substituting defaults.

    :param is_config: True to read via cfg_db, False to read via state_db.
    :param feature: Key to read. SERVER_KEY selects SERVER_TABLE; any other
        key is looked up in FEATURE_TABLE.
    :param fields: Iterable of (field_name, default) pairs.
    :returns: Tuple of values, in the same order as *fields*.
    """
    if is_config:
        db = cfg_db
    else:
        db = state_db

    table_name = SERVER_TABLE if feature == SERVER_KEY else FEATURE_TABLE
    tbl = swsscommon.Table(db, table_name)

    # Only element [1] of the get() result (the field/value pairs) is used.
    entry = dict(tbl.get(feature)[1])
    values = [entry.get(name, fallback) for (name, fallback) in fields]

    debug_msg("config:{} feature:{} fields:{} val:{}".format(
        is_config, feature, str(fields), str(values)))

    return tuple(values)
|
|
|
|
|
|
def read_config(feature):
    """Read the required feature configuration.

    :param feature: Name of the feature.
    :returns: Tuple ``(set_owner, fallback, state)`` where *fallback* is
        True when falling back to the local image is allowed, i.e. the
        ``no_fallback_to_local`` field is absent or false.
    """
    set_owner, no_fallback, state = read_data(True, feature,
            [(SET_OWNER, "local"), (NO_FALLBACK, False), (STATE, "disabled")])

    # When the field is present it is presumably a string, as with the
    # "connected" flag that this file compares against "false". Any
    # non-empty string is truthy, so a stored "false" must be decoded
    # explicitly; other non-empty values keep meaning "no fallback".
    if isinstance(no_fallback, str):
        no_fallback = no_fallback.strip().lower() not in ("", "false")

    return (set_owner, not no_fallback, state)
|
|
|
|
|
|
def read_state(feature):
    """Read the feature's runtime state via the state DB connection.

    :param feature: Name of the feature.
    :returns: Tuple ``(current_owner, remote_state, container_id)``; missing
        fields default to ``"none"``, ``"none"`` and ``""`` respectively.
    """
    wanted = [
        (CURRENT_OWNER, "none"),
        (REMOTE_STATE, "none"),
        (CONTAINER_ID, ""),
    ]
    return read_data(False, feature, wanted)
|
|
|
|
|
|
def read_server_state():
    """Read the SERVER_KEY entry of SERVER_TABLE via the state DB connection.

    :returns: Tuple ``(connected, update_time)``; missing fields default to
        ``"false"`` and ``""`` respectively.
    """
    wanted = [
        (ST_SER_CONNECTED, "false"),
        (ST_SER_UPDATE_TS, ""),
    ]
    return read_data(False, SERVER_KEY, wanted)
|
|
|
|
|
|
def docker_action(action, feature, **kwargs):
    """Invoke a method of the docker container object by name.

    :param action: Name of the container method to call (e.g. "start").
    :param feature: Container name or ID handed to ``containers.get``.
    :param kwargs: Passed through unchanged to the container method.
    :returns: SUCCESS, or FAILURE when docker reports NotFound / APIError.
    """
    try:
        target = docker.from_env().containers.get(feature)
        operation = getattr(target, action)
        operation(**kwargs)
    except (docker.errors.NotFound, docker.errors.APIError) as err:
        syslog.syslog(syslog.LOG_ERR, "docker cmd: {} for {} failed with {}".
                format(action, feature, str(err)))
        return FAILURE

    syslog.syslog(syslog.LOG_INFO, "docker cmd: {} for {}".format(action, feature))
    return SUCCESS
|
|
|
|
|
|
def container_version(feature):
    """Get the image version recorded in the container's environment.

    The version is the value of the ``IMAGE_VERSION`` variable in the
    container's configured environment. If the variable appears more than
    once, the last occurrence wins.

    :param feature: Container name or ID handed to ``containers.get``.
    :returns: The version string, or None when the variable is not set or
        docker reports NotFound / APIError.
    """
    prefix = "IMAGE_VERSION="
    version = None
    try:
        client = docker.from_env()
        container = client.containers.get(feature)

        # "Config" / "Env" may be missing or None for a container that has
        # no environment; treat that as an empty list instead of raising.
        envs = (container.attrs.get('Config') or {}).get('Env') or []
        for env in envs:
            if env.startswith(prefix):
                # Keep everything after the first '=', so that a value
                # which itself contains '=' is not truncated.
                version = env[len(prefix):]

        syslog.syslog(syslog.LOG_INFO, "docker get image version for {}".format(feature))

    except (docker.errors.NotFound, docker.errors.APIError) as err:
        syslog.syslog(syslog.LOG_ERR, "docker get image version for {} failed with {}".
                format(feature, str(err)))

    return version
|
|
|
|
|
|
def set_label(feature, create):
    """Record the wanted state of the feature's kube label in state-db.

    Nothing is done unless remote containers are enabled. The original
    notes state that ctrmgrd picks the entry up and applies it with the
    kube API server as required.

    :param feature: Name of the feature; the field is "<feature>_enabled".
    :param create: True writes "true", False writes "false".
    """
    if not remote_ctr_enabled:
        return

    label_field = "{}_enabled".format(feature)
    label_value = "true" if create else "false"

    # The value is written unconditionally; the existing value is not
    # checked first, so a redundant write is possible.
    swsscommon.Table(state_db, KUBE_LABEL_TABLE).set(
        KUBE_LABEL_SET_KEY, [(label_field, label_value)])
|
|
|
|
|
|
def update_data(feature, data):
    """Write *data* into the feature's FEATURE_TABLE entry via state_db.

    Does nothing when remote containers are not enabled.

    :param feature: Key of the entry to update.
    :param data: Dict of field name -> value to set.
    """
    if not remote_ctr_enabled:
        return

    debug_msg("feature:{} data:{}".format(feature, str(data)))
    swsscommon.Table(state_db, FEATURE_TABLE).set(feature, list(data.items()))
|
|
|
|
|
|
def container_id(feature):
    """
    Return the container ID to use for the feature.

    When the recorded current_owner is "local" (case-insensitive), the
    feature name itself is the ID. Otherwise the container_id stored in the
    feature's state-db entry is returned, falling back to the feature name
    when that field is absent.

    :param feature: Name of the feature.
    :returns: Container name or ID as a string.
    """
    init()

    entry = dict(swsscommon.Table(state_db, "FEATURE").get(feature)[1])
    owner = entry.get(CURRENT_OWNER, "").lower()

    return feature if owner == "local" else entry.get(CONTAINER_ID, feature)
|
|
|
|
|
|
def container_start(feature, **kwargs):
    """
    Start a container for the given feature.

    Decides between a local docker start, a kube deployment, or both, and
    marks the feature "up" in the state-db FEATURE table.

    * set_owner == "local": only the local image is started, and the kube
      label is set to false first to block a kube deployment.
    * any other set_owner: the kube label is set to true. If fallback is
      allowed and either remote_state is "none" or the server is not
      connected, the local image is started as well.

    :param feature: Name of the feature to start.
    :param kwargs: Passed through to the docker "start" call.
    :returns: Result of the local docker start, or 0 when no local start
        was attempted.
    """
    debug_msg("BEGIN")

    init()

    set_owner, fallback, _ = read_config(feature)
    _, remote_state, _ = read_state(feature)
    server_connected, _ = read_server_state()

    debug_msg("{}: set_owner:{} fallback:{} remote_state:{} server_connected:{}".format(
        feature, set_owner, fallback, remote_state, server_connected))

    data = {
        SYSTEM_STATE: "up",
        UPD_TIMESTAMP: datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    launch_kube = set_owner != "local"
    launch_local = (not launch_kube) or bool(
        fallback and (remote_state == "none" or server_connected == "false"))

    if launch_local:
        if not launch_kube:
            # Local only: make sure the label cannot trigger a deployment.
            set_label(feature, False)
        data[REMOTE_STATE] = "none"
        data[CURRENT_OWNER] = "local"
        data[CONTAINER_ID] = feature

    # State is published before the container is actually started.
    update_data(feature, data)

    status = 0
    if launch_local:
        status = docker_action("start", feature, **kwargs)

    if launch_kube:
        set_label(feature, True)

    debug_msg("END")
    return status
|
|
|
|
|
|
def container_stop(feature, **kwargs):
    """
    Stop the running container for this feature.

    The kube label is set to false unless remote_state is "pending" while
    set_owner is not "local" (the original notes describe this case as a
    kube upgrade in progress). The container ID is then looked up and
    docker stop is called on it. Finally the feature is marked "down" in
    the state-db FEATURE table.

    :param feature: Name of the feature to stop.
    :param kwargs: Passed through to the docker "stop" call.
    :returns: Result of the docker stop, or SUCCESS when it was skipped.
    """
    debug_msg("BEGIN")

    init()

    set_owner, _, _ = read_config(feature)
    current_owner, remote_state, _ = read_state(feature)
    docker_id = container_id(feature)

    debug_msg("{}: set_owner:{} current_owner:{} remote_state:{} docker_id:{}".format(
        feature, set_owner, current_owner, remote_state, docker_id))

    keep_label = (remote_state == "pending") and (set_owner != "local")
    if not keep_label:
        set_label(feature, False)

    if docker_id:
        ret = docker_action("stop", docker_id, **kwargs)
    else:
        ret = SUCCESS
        # A missing ID is only an error if something claimed ownership.
        level = syslog.LOG_INFO if current_owner == "none" else syslog.LOG_ERR
        syslog.syslog(
            level,
            "docker stop skipped as no docker-id for {}".format(feature))

    # Author's rationale, kept from the original: a container that is
    # killed, crashes, or is aborted shortly after SIGTERM has no chance to
    # mark itself down, whereas the systemctl stop path is always run for
    # the feature. So the "down" state is recorded here.
    data = {
        CURRENT_OWNER: "none",
        UPD_TIMESTAMP: datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        CONTAINER_ID: "",
        SYSTEM_STATE: "down",
    }
    if remote_state == "running":
        data[REMOTE_STATE] = "stopped"

    update_data(feature, data)

    debug_msg("END")
    return ret
|
|
|
|
|
|
def container_kill(feature, **kwargs):
    """
    Kill the running container for this feature.

    The kube label is set to false unless both set_owner and current_owner
    are "local". For a locally-owned feature the kill is refused (FAILURE)
    when its configured state is neither "enabled" nor "always_enabled";
    note that the label update above has already happened by then.

    :param feature: Name of the feature to kill.
    :param kwargs: Passed through to the docker "kill" call.
    :returns: Result of the docker kill, SUCCESS when there was no
        container ID to kill, or FAILURE when the feature is not enabled.
    """
    debug_msg("BEGIN")

    init()

    ret = SUCCESS
    set_owner, _, state = read_config(feature)
    current_owner, remote_state, _ = read_state(feature)
    docker_id = container_id(feature)
    remove_label = (set_owner != "local") or (current_owner != "local")

    debug_msg("{}: set_owner:{} current_owner:{} remote_state:{} docker_id:{} state:{}".format(
        feature, set_owner, current_owner, remote_state, docker_id, state))

    if remove_label:
        set_label(feature, False)

    if set_owner == "local" and state not in ("enabled", "always_enabled"):
        debug_msg("{} is not enabled".format(feature))
        return FAILURE

    if docker_id:
        ret = docker_action("kill", docker_id, **kwargs)
    else:
        # This is the kill path, so the message names the action that was
        # actually skipped (it previously said "stop").
        syslog.syslog(
            syslog.LOG_ERR if current_owner != "none" else syslog.LOG_INFO,
            "docker kill skipped as no docker-id for {}".format(feature))

    debug_msg("END")
    return ret
|
|
|
|
|
|
def container_wait(feature, **kwargs):
    """
    Wait on the running container for this feature.

    Looks up the container ID and calls docker wait on it. While no ID is
    known, the feature state is polled every WAIT_POLL_SECS seconds; a
    remote_state of "pending" seen during polling is moved to "ready".
    With fallback allowed, polling gives up after the configured number of
    seconds, remote_state is reset to "none", and the function returns.
    Without fallback, polling continues until an ID appears.

    When the ID equals the feature name, the container's image version (if
    any) is recorded as the stable version before waiting.

    :param feature: Name of the feature to wait for.
    :param kwargs: Passed through to the docker "wait" call.
    :returns: Result of the docker wait, or SUCCESS when giving up.
    """
    debug_msg("BEGIN")

    init()

    set_owner, fallback, _ = read_config(feature)
    current_owner, remote_state, _ = read_state(feature)
    docker_id = container_id(feature)

    if docker_id == feature:
        version = container_version(feature)
        if version:
            update_data(feature, {ST_FEAT_CTR_STABLE_VER: version})

    if fallback and not docker_id:
        pend_wait_secs = get_config_data(
            SONIC_CTR_CONFIG_PEND_SECS, DEFAULT_PEND_SECS)
    else:
        pend_wait_secs = 0

    debug_msg("{}: set_owner:{} ct_owner:{} state:{} id:{} pend={}".format(
        feature, set_owner, current_owner, remote_state, docker_id,
        pend_wait_secs))

    while not docker_id:
        if fallback:
            # The wait budget only applies when falling back is allowed.
            pend_wait_secs -= WAIT_POLL_SECS
            if pend_wait_secs < 0:
                break

        time.sleep(WAIT_POLL_SECS)

        current_owner, remote_state, docker_id = read_state(feature)

        debug_msg("wait_loop: {} = {} {} {}".format(feature, current_owner, remote_state, docker_id))

        if remote_state == "pending":
            update_data(feature, {REMOTE_STATE: "ready"})

    if docker_id:
        debug_msg("END -- transitioning to docker wait")
        return docker_action("wait", docker_id, **kwargs)

    # No container appeared in time: clear the remote state and exit.
    # Per the original note, systemd would restart and fall back to local.
    update_data(feature, {REMOTE_STATE: "none"})
    debug_msg("{}: Exiting to fallback as remote is *not* starting".
            format(feature))
    return SUCCESS
|
|
|
|
def main():
    """Parse the command line and run the requested container action.

    Usage: ``<action> [-t TIMEOUT] <name>`` with action one of
    start / stop / wait / kill / id. The timeout is forwarded only to the
    stop action. The "id" action prints the container ID.

    :returns: The action's return value, or 0 for the "id" action.
    """
    parser = argparse.ArgumentParser(description="container commands for start/stop/wait/kill/id")
    parser.add_argument("action", choices=["start", "stop", "wait", "kill", "id"])
    parser.add_argument('-t', '--timeout', type=int, help='container action timeout value', default=None)
    parser.add_argument("name")

    args = parser.parse_args()
    action = args.action
    name = args.name

    if action == "id":
        print(container_id(name))
        return 0

    kwargs = {}
    if action == "stop" and args.timeout is not None:
        kwargs['timeout'] = args.timeout

    # argparse has already restricted action to the keys below.
    handlers = {
        "start": container_start,
        "stop": container_stop,
        "kill": container_kill,
        "wait": container_wait,
    }
    return handlers[action](name, **kwargs)
|
|
|
|
# Run the command-line interface when executed as a script.
# NOTE(review): main()'s return value is discarded here, so it is not
# passed on as the process exit status -- confirm this is intended.
if __name__ == "__main__":
    main()
|