[Mellanox] implement platform wait in python code (#17398)
- Why I did it: A new implementation of the Nvidia platform_wait is needed because: 1. sysfs nodes were deprecated by hw-mgmt; 2. there are new dependencies on the SDK; 3. CMIS host management mode must be supported. - How I did it: Wait for hw-management to be ready; wait for the SDK sysfs nodes to be ready. - How to verify it: Manual test; unit test; sonic-mgmt regression.
This commit is contained in:
parent
f373a16e95
commit
c1cb292310
@ -1,68 +1,32 @@
|
|||||||
#!/bin/bash
|
#!/usr/bin/python3
|
||||||
|
|
||||||
declare -r SYSLOG_LOGGER="/usr/bin/logger"
|
#
|
||||||
declare -r SYSLOG_IDENTIFIER="platform_wait"
|
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
|
||||||
declare -r SYSLOG_ERROR="error"
|
# Apache-2.0
|
||||||
declare -r SYSLOG_NOTICE="notice"
|
#
|
||||||
declare -r SYSLOG_INFO="info"
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
|
||||||
declare -r HW_MGMT_CONFIG="/var/run/hw-management/config"
|
import sys
|
||||||
|
from sonic_platform.device_data import DeviceDataManager
|
||||||
|
from sonic_py_common.logger import Logger
|
||||||
|
|
||||||
declare -r ASIC_INIT_DONE="${HW_MGMT_CONFIG}/asics_init_done"
|
|
||||||
declare -r NUM_ASICS="${HW_MGMT_CONFIG}/asic_num"
|
|
||||||
declare -r ASIC_CHIPUP_COMPLETED="${HW_MGMT_CONFIG}/asic_chipup_completed"
|
|
||||||
|
|
||||||
declare -r EXIT_SUCCESS="0"
|
logger = Logger(log_identifier='platform_wait')
|
||||||
declare -r EXIT_TIMEOUT="1"
|
logger.log_notice('Nvidia: Wait for PMON dependencies to be ready')
|
||||||
|
if DeviceDataManager.wait_platform_ready():
|
||||||
function log_error() {
|
logger.log_notice('Nvidia: PMON dependencies are ready')
|
||||||
eval "${SYSLOG_LOGGER} -t ${SYSLOG_IDENTIFIER} -p ${SYSLOG_ERROR} $@"
|
sys.exit(0)
|
||||||
}
|
else:
|
||||||
|
logger.log_error('Nvidia: PMON dependencies are not ready: timeout')
|
||||||
function log_notice() {
|
sys.exit(-1)
|
||||||
eval "${SYSLOG_LOGGER} -t ${SYSLOG_IDENTIFIER} -p ${SYSLOG_NOTICE} $@"
|
|
||||||
}
|
|
||||||
|
|
||||||
function log_info() {
|
|
||||||
eval "${SYSLOG_LOGGER} -t ${SYSLOG_IDENTIFIER} -p ${SYSLOG_INFO} $@"
|
|
||||||
}
|
|
||||||
|
|
||||||
function wait_for_asic_chipup() {
    # Poll the hw-management config files until the ASIC reports ready.
    #
    # Ready means: ASIC_INIT_DONE reads 1 and the value in NUM_ASICS equals
    # the value in ASIC_CHIPUP_COMPLETED.
    #
    # Returns EXIT_SUCCESS once ready, or EXIT_TIMEOUT after 300 attempts
    # spaced one second apart (an error is logged in that case).

    # Last values read from the three files; also reported on timeout.
    local _ASIC_INIT="0"
    local _ASIC_COUNT="0"
    local _ASICS_CHIPUP="0"

    local -ir _MAX_ATTEMPTS="300"
    local -r _POLL_INTERVAL="1s"
    local -i _ATTEMPT

    for (( _ATTEMPT = 1; _ATTEMPT <= _MAX_ATTEMPTS; _ATTEMPT++ )); do
        # stderr is captured as well, so a failed read shows up in the
        # timeout message below instead of being lost.
        _ASIC_INIT="$(cat ${ASIC_INIT_DONE} 2>&1)"
        _ASIC_COUNT="$(cat ${NUM_ASICS} 2>&1)"
        _ASICS_CHIPUP="$(cat ${ASIC_CHIPUP_COMPLETED} 2>&1)"

        if [[ "${_ASIC_INIT}" -eq 1 && "${_ASIC_COUNT}" -eq "${_ASICS_CHIPUP}" ]]; then
            return "${EXIT_SUCCESS}"
        fi

        sleep "${_POLL_INTERVAL}"
    done

    log_error "Mellanox ASIC is not ready: INIT: ${_ASIC_INIT}, NUM_ASIC: ${_ASIC_COUNT}, CHIPUP: ${_ASICS_CHIPUP} timeout...."
    return "${EXIT_TIMEOUT}"
}
|
|
||||||
|
|
||||||
# Script entry point: block until the ASIC is up, then report the result
# through the exit status.
log_info "Wait for Mellanox ASIC to be ready"

wait_for_asic_chipup
EXIT_CODE="$?"

# Propagate the failure status of the wait unchanged.
[[ "${EXIT_CODE}" == "${EXIT_SUCCESS}" ]] || exit "${EXIT_CODE}"

log_notice "Mellanox ASIC is ready"

exit "${EXIT_SUCCESS}"
|
|
||||||
|
@ -17,6 +17,7 @@
|
|||||||
|
|
||||||
import glob
|
import glob
|
||||||
import os
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
from . import utils
|
from . import utils
|
||||||
|
|
||||||
@ -167,8 +168,11 @@ class DeviceDataManager:
|
|||||||
@classmethod
|
@classmethod
|
||||||
@utils.read_only_cache()
|
@utils.read_only_cache()
|
||||||
def get_sfp_count(cls):
|
def get_sfp_count(cls):
|
||||||
sfp_count = utils.read_int_from_file('/run/hw-management/config/sfp_counter')
|
from sonic_py_common import device_info
|
||||||
return sfp_count if sfp_count > 0 else len(glob.glob('/sys/module/sx_core/asic0/module*'))
|
platform_path = device_info.get_path_to_platform_dir()
|
||||||
|
platform_json_path = os.path.join(platform_path, 'platform.json')
|
||||||
|
platform_data = utils.load_json_file(platform_json_path)
|
||||||
|
return len(platform_data['chassis']['sfps'])
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_linecard_sfp_count(cls, lc_index):
|
def get_linecard_sfp_count(cls, lc_index):
|
||||||
@ -244,3 +248,23 @@ class DeviceDataManager:
|
|||||||
sai_profile_file = os.path.join(hwsku_dir, 'sai.profile')
|
sai_profile_file = os.path.join(hwsku_dir, 'sai.profile')
|
||||||
data = utils.read_key_value_file(sai_profile_file, delimeter='=')
|
data = utils.read_key_value_file(sai_profile_file, delimeter='=')
|
||||||
return data.get('SAI_INDEPENDENT_MODULE_MODE') == '1'
|
return data.get('SAI_INDEPENDENT_MODULE_MODE') == '1'
|
||||||
|
|
||||||
|
@classmethod
def wait_platform_ready(cls):
    """
    Wait for Nvidia platform related services(SDK, hw-management) ready

    A list of zero-argument checks is built and handed to
    utils.wait_until_conditions with a 300 second timeout and a 1 second
    polling interval:

    * For every SFP index in range(get_sfp_count()), each expected sysfs
      node under /sys/module/sx_core/asic0/module<index>/ must exist.
    * In independent module mode an extended set of sysfs nodes is
      required.
    * Otherwise, /var/run/hw-management/config/asics_init_done must
      additionally read 1.

    Returns:
        bool: True if wait success else timeout
    """
    conditions = []
    sysfs_nodes = ['power_mode', 'power_mode_policy', 'present', 'reset', 'status', 'statuserror']
    if cls.is_independent_mode():
        sysfs_nodes.extend(['control', 'frequency', 'frequency_support', 'hw_present', 'hw_reset',
                            'power_good', 'power_limit', 'power_on', 'temperature/input'])
    else:
        conditions.append(lambda: utils.read_int_from_file('/var/run/hw-management/config/asics_init_done') == 1)
    sfp_count = cls.get_sfp_count()
    for sfp_index in range(sfp_count):
        for sysfs_node in sysfs_nodes:
            node_path = f'/sys/module/sx_core/asic0/module{sfp_index}/{sysfs_node}'
            # Bind the path as a default argument. A bare closure would look
            # up sfp_index/sysfs_node only when called, i.e. after the loops
            # finished, so every check would test the last module's last node.
            conditions.append(lambda path=node_path: os.path.exists(path))
    return utils.wait_until_conditions(conditions, 300, 1)
|
||||||
|
@ -290,6 +290,30 @@ def wait_until(predict, timeout, interval=1, *args, **kwargs):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def wait_until_conditions(conditions, timeout, interval=1):
    """
    Poll a set of checks until every one of them has passed

    A check that has returned a truthy value once is dropped and not
    evaluated again; only the checks still failing are retried after each
    sleep. Nothing is evaluated when timeout is not positive.

    Args:
        conditions (list): a list of callable which generate True|False
        timeout (int): wait time in seconds
        interval (int, optional): seconds to sleep between polling rounds.
            Defaults to 1.

    Returns:
        bool: True if wait success else False
    """
    unmet = conditions
    remaining = timeout
    while remaining > 0:
        # Keep only the checks that are still failing in this round
        unmet = [check for check in unmet if not check()]
        if len(unmet) == 0:
            return True
        time.sleep(interval)
        remaining -= interval
    return False
|
||||||
|
|
||||||
|
|
||||||
class TimerEvent:
|
class TimerEvent:
|
||||||
def __init__(self, interval, cb, repeat):
|
def __init__(self, interval, cb, repeat):
|
||||||
self.interval = interval
|
self.interval = interval
|
||||||
|
@ -60,6 +60,26 @@ class TestDeviceData:
|
|||||||
mock_read.return_value = {'SAI_INDEPENDENT_MODULE_MODE': '1'}
|
mock_read.return_value = {'SAI_INDEPENDENT_MODULE_MODE': '1'}
|
||||||
assert DeviceDataManager.is_independent_mode()
|
assert DeviceDataManager.is_independent_mode()
|
||||||
|
|
||||||
|
@mock.patch('sonic_py_common.device_info.get_path_to_platform_dir', mock.MagicMock(return_value='/tmp'))
@mock.patch('sonic_platform.device_data.utils.load_json_file')
def test_get_sfp_count(self, mock_load_json):
    """get_sfp_count must report one SFP per entry under chassis/sfps of the loaded platform JSON."""
    sfp_entries = [1, 2, 3]
    mock_load_json.return_value = {'chassis': {'sfps': sfp_entries}}
    assert DeviceDataManager.get_sfp_count() == len(sfp_entries)
|
||||||
|
|
||||||
|
@mock.patch('sonic_platform.device_data.time.sleep', mock.MagicMock())
@mock.patch('sonic_platform.device_data.DeviceDataManager.get_sfp_count', mock.MagicMock(return_value=3))
@mock.patch('sonic_platform.device_data.utils.read_int_from_file', mock.MagicMock(return_value=1))
@mock.patch('sonic_platform.device_data.os.path.exists')
@mock.patch('sonic_platform.device_data.DeviceDataManager.is_independent_mode')
def test_wait_platform_ready(self, mock_is_indep, mock_exists):
    """wait_platform_ready succeeds when all sysfs nodes exist and times out when they do not."""
    # All sysfs nodes reported present: the wait succeeds in both modes
    mock_exists.return_value = True
    for independent_mode in (True, False):
        mock_is_indep.return_value = independent_mode
        assert DeviceDataManager.wait_platform_ready()

    # Nodes never show up (mode is still non-independent): the wait times out
    mock_exists.return_value = False
    assert not DeviceDataManager.wait_platform_ready()
|
||||||
|
@ -196,6 +196,13 @@ class TestUtils:
|
|||||||
with mock.patch('sonic_platform.utils.open', mock_os_open):
|
with mock.patch('sonic_platform.utils.open', mock_os_open):
|
||||||
assert utils.read_key_value_file('some_file', delimeter='=') == {'a':'b'}
|
assert utils.read_key_value_file('some_file', delimeter='=') == {'a':'b'}
|
||||||
|
|
||||||
|
@mock.patch('sonic_platform.utils.time.sleep', mock.MagicMock())
def test_wait_until_conditions(self):
    """wait_until_conditions returns True for a passing check and False for one that never passes."""
    always_true = [lambda: True]
    never_true = [lambda: False]
    assert utils.wait_until_conditions(always_true, 1)
    assert not utils.wait_until_conditions(never_true, 1)
|
||||||
|
|
||||||
def test_timer(self):
|
def test_timer(self):
|
||||||
timer = utils.Timer()
|
timer = utils.Timer()
|
||||||
timer.start()
|
timer.start()
|
||||||
|
Reference in New Issue
Block a user