From d19d904f6a47ea244e93c57f644e95061c3ab308 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Sat, 26 Aug 2023 08:01:37 +0800 Subject: [PATCH] [Mellanox] Fix issue: watchdogutil command does not work (#16091) (#16260) - Why I did it watchdogutil uses platform API watchdog instance to control/query watchdog status. In Nvidia watchdog status, it caches "armed" status in a object member "WatchdogImplBase.armed". This is not working for CLI infrastructure because each CLI will create a new watchdog instance, the status cached in previous instance will totally lose. Consider following commands: admin@sonic:~$ sudo watchdogutil arm -s 100 =====> watchdog instance1, armed=True Watchdog armed for 100 seconds admin@sonic:~$ sudo watchdogutil status ======> watchdog instance2, armed=False Status: Unarmed admin@sonic:~$ sudo watchdogutil disarm =======> watchdog instance3, armed=False Failed to disarm Watchdog - How I did it Use sysfs to query watchdog status - How to verify it Manual test Unit test Conflicts: platform/mellanox/mlnx-platform-api/sonic_platform/watchdog.py platform/mellanox/mlnx-platform-api/tests/test_watchdog.py --- .../sonic_platform/watchdog.py | 51 ++++--- .../mlnx-platform-api/tests/test_watchdog.py | 141 ++++++++++++++++++ 2 files changed, 166 insertions(+), 26 deletions(-) create mode 100644 platform/mellanox/mlnx-platform-api/tests/test_watchdog.py diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/watchdog.py b/platform/mellanox/mlnx-platform-api/sonic_platform/watchdog.py index 879aabfd35..5635b18692 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/watchdog.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/watchdog.py @@ -27,6 +27,7 @@ import array import time from sonic_platform_base.watchdog_base import WatchdogBase +from . import utils """ ioctl constants """ IO_WRITE = 0x40000000 @@ -80,16 +81,18 @@ class WatchdogImplBase(WatchdogBase): super(WatchdogImplBase, self).__init__() self.watchdog_path = wd_device_path - self.watchdog = os.open(self.watchdog_path, os.O_WRONLY) - - # Opening a watchdog descriptor starts - # watchdog timer; - # by default it should be stopped - self._disablecard() - self.armed = False - + self._watchdog = None self.timeout = self._gettimeout() + @property + def watchdog(self): + if self._watchdog is None: + self._watchdog = self.open_handle() + return self._watchdog + + def open_handle(self): + return os.open(self.watchdog_path, os.O_WRONLY) + def _enablecard(self): """ Turn on the watchdog timer @@ -131,10 +134,7 @@ class WatchdogImplBase(WatchdogBase): @return watchdog timeout """ - req = array.array('I', [0]) - fcntl.ioctl(self.watchdog, WDIOC_GETTIMEOUT, req, True) - - return int(req[0]) + return utils.read_int_from_file('/run/hw-management/watchdog/main/timeout') def _gettimeleft(self): """ @@ -142,10 +142,7 @@ class WatchdogImplBase(WatchdogBase): @return time left in seconds """ - req = array.array('I', [0]) - fcntl.ioctl(self.watchdog, WDIOC_GETTIMELEFT, req, True) - - return int(req[0]) + return utils.read_int_from_file('/run/hw-management/watchdog/main/timeleft') def arm(self, seconds): """ @@ -159,11 +156,10 @@ class WatchdogImplBase(WatchdogBase): try: if self.timeout != seconds: self.timeout = self._settimeout(seconds) - if self.armed: + if self.is_armed(): self._keepalive() else: self._enablecard() - self.armed = True ret = self.timeout except IOError: pass @@ -176,10 +172,9 @@ class WatchdogImplBase(WatchdogBase): """ disarmed = False - if self.armed: + if self.is_armed(): try: self._disablecard() - self.armed = False disarmed = True except IOError: pass @@ -191,7 +186,7 @@ class WatchdogImplBase(WatchdogBase): Implements is_armed WatchdogBase API """ - return self.armed + return utils.read_str_from_file('/run/hw-management/watchdog/main/state') == 'active' def get_remaining_time(self): """ @@ -200,7 +195,7 @@ class WatchdogImplBase(WatchdogBase): timeleft = WD_COMMON_ERROR - if self.armed: + if self.is_armed(): try: timeleft = self._gettimeleft() except IOError: @@ -213,13 +208,15 @@ class WatchdogImplBase(WatchdogBase): Close watchdog """ - os.close(self.watchdog) + if self._watchdog is not None: + os.close(self._watchdog) class WatchdogType1(WatchdogImplBase): """ Watchdog type 1 """ + TIMESTAMP_FILE = '/tmp/nvidia/watchdog_timestamp' def arm(self, seconds): """ @@ -230,7 +227,8 @@ class WatchdogType1(WatchdogImplBase): ret = WatchdogImplBase.arm(self, seconds) # Save the watchdog arm timestamp # requiered for get_remaining_time() - self.arm_timestamp = time.time() + os.makedirs('/tmp/nvidia', exist_ok=True) + utils.write_file(self.TIMESTAMP_FILE, str(time.time())) return ret @@ -243,8 +241,9 @@ class WatchdogType1(WatchdogImplBase): timeleft = WD_COMMON_ERROR - if self.armed: - timeleft = int(self.timeout - (time.time() - self.arm_timestamp)) + if self.is_armed(): + arm_timestamp = utils.read_float_from_file(self.TIMESTAMP_FILE) + timeleft = int(self.timeout - (time.time() - arm_timestamp)) return timeleft diff --git a/platform/mellanox/mlnx-platform-api/tests/test_watchdog.py b/platform/mellanox/mlnx-platform-api/tests/test_watchdog.py new file mode 100644 index 0000000000..68d29d38a6 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/test_watchdog.py @@ -0,0 +1,141 @@ +# +# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import pytest +import sys +if sys.version_info.major == 3: + from unittest import mock +else: + import mock + +test_path = os.path.dirname(os.path.abspath(__file__)) +modules_path = os.path.dirname(test_path) +sys.path.insert(0, modules_path) + +from sonic_platform.chassis import Chassis +from sonic_platform.watchdog import get_watchdog, \ + WatchdogType2, \ + WatchdogType1, \ + is_mlnx_wd_main, \ + is_wd_type2 + + +class TestWatchdog: + @mock.patch('sonic_platform.watchdog.is_mlnx_wd_main') + @mock.patch('sonic_platform.watchdog.os.listdir') + def test_get_watchdog_no_device(self, mock_listdir, mock_is_main): + mock_listdir.return_value = [] + assert get_watchdog() is None + + mock_listdir.return_value = ['invalid'] + mock_is_main.return_value = True + assert get_watchdog() is None + + mock_listdir.return_value = ['watchdog1'] + mock_is_main.return_value = False + assert get_watchdog() is None + + @mock.patch('sonic_platform.watchdog.is_mlnx_wd_main') + @mock.patch('sonic_platform.watchdog.is_wd_type2') + @mock.patch('sonic_platform.watchdog.os.listdir', mock.MagicMock(return_value=['watchdog1', 'watchdog2'])) + @mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock()) + @mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock()) + @pytest.mark.parametrize('test_para', + [(True, WatchdogType2), (False, WatchdogType1)]) + def test_get_watchdog(self, mock_is_type2, mock_is_main, test_para): + mock_is_main.side_effect = lambda dev: dev == 'watchdog2' + mock_is_type2.return_value = test_para[0] + chassis = Chassis() + watchdog = chassis.get_watchdog() + assert isinstance(watchdog, test_para[1]) + assert watchdog.watchdog_path == '/dev/watchdog2' + + def test_is_mlnx_wd_main(self): + mock_os_open = mock.mock_open(read_data='mlx-wdt-main') + with mock.patch('sonic_platform.watchdog.open', mock_os_open): + assert is_mlnx_wd_main('') + + mock_os_open = mock.mock_open(read_data='invalid') + with mock.patch('sonic_platform.watchdog.open', mock_os_open): + assert not is_mlnx_wd_main('') + mock_os_open.side_effect = IOError + with mock.patch('sonic_platform.watchdog.open', mock_os_open): + assert not is_mlnx_wd_main('') + + @mock.patch('sonic_platform.watchdog.os.path.exists') + @pytest.mark.parametrize('test_para', + [True, False]) + def test_is_wd_type2(self, mock_exists, test_para): + mock_exists.return_value = test_para + assert is_wd_type2('') is test_para + + @mock.patch('sonic_platform.utils.read_str_from_file') + def test_is_armed(self, mock_read): + watchdog = WatchdogType2('watchdog2') + mock_read.return_value = 'inactive' + assert not watchdog.is_armed() + mock_read.return_value = 'active' + assert watchdog.is_armed() + + @mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock()) + @mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock()) + @mock.patch('sonic_platform.watchdog.WatchdogImplBase.is_armed') + def test_arm_disarm_watchdog2(self, mock_is_armed): + watchdog = WatchdogType2('watchdog2') + assert watchdog.arm(-1) == -1 + mock_is_armed.return_value = False + watchdog.arm(10) + mock_is_armed.return_value = True + watchdog.arm(5) + watchdog.disarm() + + @mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock()) + @mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock()) + @mock.patch('sonic_platform.watchdog.WatchdogImplBase.is_armed') + def test_arm_disarm_watchdog1(self, mock_is_armed): + watchdog = WatchdogType1('watchdog1') + assert watchdog.arm(-1) == -1 + mock_is_armed.return_value = False + watchdog.arm(10) + mock_is_armed.return_value = True + watchdog.arm(5) + watchdog.disarm() + + @mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock()) + @mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock()) + @mock.patch('sonic_platform.watchdog.WatchdogImplBase._gettimeleft', mock.MagicMock(return_value=10)) + @mock.patch('sonic_platform.watchdog.WatchdogImplBase.is_armed') + def test_get_remaining_time_watchdog2(self, mock_is_armed): + watchdog = WatchdogType2('watchdog2') + mock_is_armed.return_value = False + assert watchdog.get_remaining_time() == -1 + watchdog.arm(10) + mock_is_armed.return_value = True + assert watchdog.get_remaining_time() == 10 + + @mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock()) + @mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock()) + @mock.patch('sonic_platform.watchdog.WatchdogImplBase._gettimeleft', mock.MagicMock(return_value=10)) + @mock.patch('sonic_platform.watchdog.WatchdogImplBase.is_armed') + def test_get_remaining_time_watchdog1(self, mock_is_armed): + watchdog = WatchdogType1('watchdog2') + mock_is_armed.return_value = False + assert watchdog.get_remaining_time() == -1 + watchdog.arm(10) + mock_is_armed.return_value = True + assert watchdog.get_remaining_time() > 0