[Mellanox] Fix race condition while creating SFP (#17544)

Why I did it
Fix issue xcvrd crashes due to cannot import name 'initialize_sfp_thermal':

Nov 27 09:47:16.388639 sonic ERR pmon#xcvrd: Exception occured at CmisManagerTask thread due to ImportError("cannot import name 'initialize_sfp_thermal' from partially initialized module 'sonic_platform.thermal' (most likely due to a circular import) (/usr/local/lib/python3.9/dist-packages/sonic_platform/thermal.py)")
Nov 27 09:47:16.392544 sonic ERR pmon#xcvrd: Traceback (most recent call last):
Nov 27 09:47:16.392643 sonic ERR pmon#xcvrd:   File "/usr/local/lib/python3.9/dist-packages/xcvrd/xcvrd.py", line 1518, in run
Nov 27 09:47:16.392757 sonic ERR pmon#xcvrd:     self.task_worker()
Nov 27 09:47:16.392757 sonic ERR pmon#xcvrd:   File "/usr/local/lib/python3.9/dist-packages/xcvrd/xcvrd.py", line 1240, in task_worker
Nov 27 09:47:16.392757 sonic ERR pmon#xcvrd:     sfp = platform_chassis.get_sfp(pport)
Nov 27 09:47:16.392793 sonic ERR pmon#xcvrd:   File "/usr/local/lib/python3.9/dist-packages/sonic_platform/chassis.py", line 346, in get_sfp
Nov 27 09:47:16.392830 sonic ERR pmon#xcvrd:     self.initialize_single_sfp(index)
Nov 27 09:47:16.392830 sonic ERR pmon#xcvrd:   File "/usr/local/lib/python3.9/dist-packages/sonic_platform/chassis.py", line 288, in initialize_single_sfp
Nov 27 09:47:16.392830 sonic ERR pmon#xcvrd:     self._sfp_list[index] = sfp_module.SFP(index)
Nov 27 09:47:16.392830 sonic ERR pmon#xcvrd:   File "/usr/local/lib/python3.9/dist-packages/sonic_platform/sfp.py", line 272, in __init__
Nov 27 09:47:16.392866 sonic ERR pmon#xcvrd:     from .thermal import initialize_sfp_thermal
Nov 27 09:47:16.392918 sonic ERR pmon#xcvrd: ImportError: cannot import name 'initialize_sfp_thermal' from partially initialized module 'sonic_platform.thermal' (most likely due to a circular import) (/usr/local/lib/python3.9/dist-packages/sonic_platform/thermal.py)
Nov 27 09:47:16.393103 sonic ERR pmon#xcvrd: Xcvrd: exception found at child thread CmisManagerTask due to ImportError("cannot import name 'initialize_sfp_thermal' from partially initialized module 'sonic_platform.thermal' (most likely due to a circular import) (/usr/local/lib/python3.9/dist-packages/sonic_platform/thermal.py)")
Nov 27 09:47:16.393103 sonic ERR pmon#xcvrd: Exiting main loop as child thread raised exception!
Work item tracking
Microsoft ADO (number only):
How I did it
Add lock for creating SFP object

How to verify it
UNIT TEST
Manual Test
This commit is contained in:
Junchao-Mellanox 2023-12-19 19:52:42 +08:00 committed by GitHub
parent 7f03584d9d
commit 9240bcb5b1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 68 additions and 28 deletions

View File

@ -32,6 +32,7 @@ try:
from .device_data import DeviceDataManager
import re
import time
import threading
except ImportError as e:
raise ImportError (str(e) + "- required module not found")
@ -119,6 +120,7 @@ class Chassis(ChassisBase):
self.reboot_cause_initialized = False
self.sfp_module = None
self.sfp_lock = threading.Lock()
# Build the RJ45 port list from platform.json and hwsku.json
self._RJ45_port_inited = False
@ -266,38 +268,49 @@ class Chassis(ChassisBase):
def initialize_single_sfp(self, index):
sfp_count = self.get_num_sfps()
# Use double checked locking mechanism for:
# 1. protect shared resource self._sfp_list
# 2. performance (avoid locking every time)
if index < sfp_count:
if not self._sfp_list:
self._sfp_list = [None] * sfp_count
if not self._sfp_list or not self._sfp_list[index]:
with self.sfp_lock:
if not self._sfp_list:
self._sfp_list = [None] * sfp_count
if not self._sfp_list[index]:
sfp_module = self._import_sfp_module()
if self.RJ45_port_list and index in self.RJ45_port_list:
self._sfp_list[index] = sfp_module.RJ45Port(index)
else:
self._sfp_list[index] = sfp_module.SFP(index)
self.sfp_initialized_count += 1
if not self._sfp_list[index]:
sfp_module = self._import_sfp_module()
if self.RJ45_port_list and index in self.RJ45_port_list:
self._sfp_list[index] = sfp_module.RJ45Port(index)
else:
self._sfp_list[index] = sfp_module.SFP(index)
self.sfp_initialized_count += 1
def initialize_sfp(self):
if not self._sfp_list:
sfp_module = self._import_sfp_module()
sfp_count = self.get_num_sfps()
for index in range(sfp_count):
if self.RJ45_port_list and index in self.RJ45_port_list:
sfp_object = sfp_module.RJ45Port(index)
else:
sfp_object = sfp_module.SFP(index)
self._sfp_list.append(sfp_object)
self.sfp_initialized_count = sfp_count
elif self.sfp_initialized_count != len(self._sfp_list):
sfp_module = self._import_sfp_module()
for index in range(len(self._sfp_list)):
if self._sfp_list[index] is None:
if self.RJ45_port_list and index in self.RJ45_port_list:
self._sfp_list[index] = sfp_module.RJ45Port(index)
else:
self._sfp_list[index] = sfp_module.SFP(index)
self.sfp_initialized_count = len(self._sfp_list)
sfp_count = self.get_num_sfps()
# Use double checked locking mechanism for:
# 1. protect shared resource self._sfp_list
# 2. performance (avoid locking every time)
if sfp_count != self.sfp_initialized_count:
with self.sfp_lock:
if sfp_count != self.sfp_initialized_count:
if not self._sfp_list:
sfp_module = self._import_sfp_module()
for index in range(sfp_count):
if self.RJ45_port_list and index in self.RJ45_port_list:
sfp_object = sfp_module.RJ45Port(index)
else:
sfp_object = sfp_module.SFP(index)
self._sfp_list.append(sfp_object)
self.sfp_initialized_count = sfp_count
elif self.sfp_initialized_count != len(self._sfp_list):
sfp_module = self._import_sfp_module()
for index in range(len(self._sfp_list)):
if self._sfp_list[index] is None:
if self.RJ45_port_list and index in self.RJ45_port_list:
self._sfp_list[index] = sfp_module.RJ45Port(index)
else:
self._sfp_list[index] = sfp_module.SFP(index)
self.sfp_initialized_count = len(self._sfp_list)
def get_num_sfps(self):
"""

View File

@ -16,8 +16,10 @@
#
import os
import random
import sys
import subprocess
import threading
from mock import MagicMock
if sys.version_info.major == 3:
@ -167,6 +169,31 @@ class TestChassis:
assert len(sfp_list) == 3
assert chassis.sfp_initialized_count == 3
def test_create_sfp_in_multi_thread(self):
DeviceDataManager.get_sfp_count = mock.MagicMock(return_value=3)
iteration_num = 100
while iteration_num > 0:
chassis = Chassis()
assert chassis.sfp_initialized_count == 0
t1 = threading.Thread(target=lambda: chassis.get_sfp(1))
t2 = threading.Thread(target=lambda: chassis.get_sfp(1))
t3 = threading.Thread(target=lambda: chassis.get_all_sfps())
t4 = threading.Thread(target=lambda: chassis.get_all_sfps())
threads = [t1, t2, t3, t4]
random.shuffle(threads)
for t in threads:
t.start()
for t in threads:
t.join()
assert len(chassis.get_all_sfps()) == 3
assert chassis.sfp_initialized_count == 3
for index, s in enumerate(chassis.get_all_sfps()):
assert s.sdk_index == index
iteration_num -= 1
@mock.patch('sonic_platform.sfp_event.sfp_event.check_sfp_status', MagicMock())
@mock.patch('sonic_platform.sfp_event.sfp_event.__init__', MagicMock(return_value=None))
@mock.patch('sonic_platform.sfp_event.sfp_event.initialize', MagicMock())