diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py
index 01f5fb1547..41d237143d 100644
--- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py
+++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py
@@ -18,6 +18,7 @@ try:
     from sonic_platform.watchdog import get_watchdog
     from sonic_daemon_base.daemon_base import Logger
     from eeprom import Eeprom
+    from sfp_event import sfp_event
     from os import listdir
     from os.path import isfile, join
     import sys
@@ -28,6 +29,8 @@ try:
 except ImportError as e:
     raise ImportError (str(e) + "- required module not found")
 
+MAX_SELECT_DELAY = 3600  # seconds; longest a single blocking check_sfp_status call may wait
+
 MLNX_NUM_PSU = 2
 
 GET_HWSKU_CMD = "sonic-cfggen -d -v DEVICE_METADATA.localhost.hwsku"
@@ -127,6 +130,15 @@ class Chassis(ChassisBase):
         self._component_name_list.append(COMPONENT_CPLD1)
         self._component_name_list.append(COMPONENT_CPLD2)
 
+        # Initialize sfp-change-listening stuff
+        self._init_sfp_change_event()
+
+    def _init_sfp_change_event(self):
+        """Create and initialize the sfp_event listener that get_change_event polls."""
+        self.sfp_event = sfp_event()
+        self.sfp_event.initialize()
+        self.MAX_SELECT_EVENT_RETURNED = self.PORT_END
+
     def _extract_num_of_fans_and_fan_drawers(self):
         num_of_fan = 0
         num_of_drawer = 0
@@ -327,3 +339,79 @@ class Chassis(ChassisBase):
                 return self._get_firmware_version()
 
         return None
+
+    def _show_capabilities(self):
+        """
+        This function is for debug purpose
+        Some features require a xSFP module to support some capabilities but it's unrealistic to
+        check those modules one by one.
+        So this function is introduced to show some capabilities of all xSFP modules mounted on the device.
+        """
+        for s in self._sfp_list:
+            try:
+                # arguments follow the placeholder order: "power (tx {} rx {})" takes tx first, then rx
+                print "index {} tx disable {} dom {} calibration {} temp {} volt {} power (tx {} rx {})".format(s.index,
+                    s.dom_tx_disable_supported,
+                    s.dom_supported,
+                    s.calibration,
+                    s.dom_temp_supported,
+                    s.dom_volt_supported,
+                    s.dom_tx_power_supported,
+                    s.dom_rx_power_supported
+                    )
+            except Exception:
+                print "fail to retrieve capabilities for module index {}".format(s.index)
+
+    def get_change_event(self, timeout=0):
+        """
+        Returns a nested dictionary containing all devices which have
+        experienced a change at chassis level
+
+        Args:
+            timeout: Timeout in milliseconds (optional). If timeout == 0,
+                this method will block until a change is detected.
+
+        Returns:
+            (bool, dict):
+                - True if call successful, False if not;
+                - A nested dictionary where key is a device type,
+                  value is a dictionary with key:value pairs in the format of
+                  {'device_id':'device_event'},
+                  where device_id is the device ID for this device and
+                        device_event,
+                             status='1' represents device inserted,
+                             status='0' represents device removed.
+                  Ex. {'fan':{'0':'0', '2':'1'}, 'sfp':{'11':'0'}}
+                      indicates that fan 0 has been removed, fan 2
+                      has been inserted and sfp 11 has been removed.
+        """
+        wait_for_ever = (timeout == 0)
+        port_dict = {}
+        if wait_for_ever:
+            # MAX_SELECT_DELAY is already in seconds; keep polling until a port change is recorded
+            while True:
+                status = self.sfp_event.check_sfp_status(port_dict, MAX_SELECT_DELAY)
+                if port_dict:
+                    break
+        else:
+            # timeout is given in milliseconds whereas check_sfp_status hands its timeout
+            # to select.select, which expects seconds
+            status = self.sfp_event.check_sfp_status(port_dict, timeout / 1000.0)
+
+        if status:
+            # get_change_event has the meaning of retrieving all the notifications through a single call.
+            # Typically this is implemented via a select framework which requires the underlay file-reading
+            # interface able to retrieve all notifications without blocking once the fd has been selected.
+            # However, sdk doesn't provide any interface that satisfies the requirement. As a result,
+            # check_sfp_status returns only one notification, which may indicate more notifications in its queue.
+            # In this sense, we have to iterate in a loop to get all the notifications in case that
+            # the first call returns at least one.
+            i = 0
+            while i < self.MAX_SELECT_EVENT_RETURNED:
+                status = self.sfp_event.check_sfp_status(port_dict, 0)
+                if not status:
+                    break
+                i = i + 1
+            return True, {'sfp':port_dict}
+        else:
+            return True, {}
diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py
index 6df3e6437b..9ea9c21899 100644
--- a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py
+++ b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py
@@ -327,6 +327,7 @@ class SFP(SfpBase):
                     self.dom_volt_supported = False
                     self.dom_rx_power_supported = False
                     self.dom_tx_power_supported = False
+                    self.calibration = 0
                 self.dom_tx_disable_supported = (int(sfp_dom_capability_raw[1], 16) & 0x40 != 0)
             else:
                 self.dom_supported = False
diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py
new file mode 100644
index 0000000000..1e57603d38
--- /dev/null
+++ b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python
+'''
+listen to the SDK for the SFP change event and return to chassis.
+'''
+
+from __future__ import print_function
+import sys, errno
+import os
+import time
+import select
+from python_sdk_api.sx_api import *
+from sonic_daemon_base.daemon_base import Logger
+
+SYSLOG_IDENTIFIER = "sfp-event"
+
+SDK_SFP_STATE_IN = 0x1
+SDK_SFP_STATE_OUT = 0x2
+STATUS_PLUGIN = '1'
+STATUS_PLUGOUT = '0'
+STATUS_UNKNOWN = '2'
+
+sfp_value_status_dict = {
+    SDK_SFP_STATE_IN: STATUS_PLUGIN,
+    SDK_SFP_STATE_OUT: STATUS_PLUGOUT,
+}
+
+PMPE_PACKET_SIZE = 2000
+
+logger = Logger(SYSLOG_IDENTIFIER)
+
+class sfp_event:
+    ''' Listen to plugin/plugout cable events '''
+
+    SX_OPEN_RETRIES = 20
+
+    def __init__(self):
+        self.swid = 0
+        self.handle = None
+
+    def initialize(self):
+        ''' Open the SDK API handle and register the rx fd for PMPE traps; raises RuntimeError on failure '''
+        # open SDK API handle.
+        # retry at most SX_OPEN_RETRIES times to wait until SDK is started during system startup
+        retry = 1
+        while True:
+            rc, self.handle = sx_api_open(None)
+            if rc == SX_STATUS_SUCCESS:
+                break
+
+            logger.log_info("failed to open SDK API handle... retrying {}".format(retry))
+
+            time.sleep(2 ** retry)
+            retry += 1
+
+            if retry > self.SX_OPEN_RETRIES:
+                raise RuntimeError("failed to open SDK API handle after {} retries".format(retry))
+
+        # Allocate SDK fd and user channel structures
+        self.rx_fd_p = new_sx_fd_t_p()
+        self.user_channel_p = new_sx_user_channel_t_p()
+
+        rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p)
+        if rc != SX_STATUS_SUCCESS:
+            raise RuntimeError("sx_api_host_ifc_open exited with error, rc {}".format(rc))
+
+        self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD
+        self.user_channel_p.channel.fd = self.rx_fd_p
+
+        rc = sx_api_host_ifc_trap_id_register_set(self.handle,
+                                                  SX_ACCESS_CMD_REGISTER,
+                                                  self.swid,
+                                                  SX_TRAP_ID_PMPE,
+                                                  self.user_channel_p)
+        if rc != SX_STATUS_SUCCESS:
+            raise RuntimeError("sx_api_host_ifc_trap_id_register_set exited with error, rc {}".format(rc))
+
+    def deinitialize(self):
+        ''' Deregister the PMPE trap, close the host interface and the SDK handle, free the SDK structures '''
+        if self.handle is None:
+            return
+
+        # unregister trap id
+        rc = sx_api_host_ifc_trap_id_register_set(self.handle,
+                                                  SX_ACCESS_CMD_DEREGISTER,
+                                                  self.swid,
+                                                  SX_TRAP_ID_PMPE,
+                                                  self.user_channel_p)
+        if rc != SX_STATUS_SUCCESS:
+            logger.log_error("sx_api_host_ifc_trap_id_register_set exited with error, rc {}".format(rc))
+
+        rc = sx_api_host_ifc_close(self.handle, self.rx_fd_p)
+        if rc != SX_STATUS_SUCCESS:
+            logger.log_error("sx_api_host_ifc_close exited with error, rc {}".format(rc))
+
+        rc = sx_api_close(self.handle)
+        if rc != SX_STATUS_SUCCESS:
+            logger.log_error("sx_api_close exited with error, rc {}".format(rc))
+
+        delete_sx_fd_t_p(self.rx_fd_p)
+        delete_sx_user_channel_t_p(self.user_channel_p)
+
+    def check_sfp_status(self, port_change, timeout):
+        """
+        the meaning of timeout is aligned with select.select, which has the following meaning:
+            0: poll, returns without blocking
+            arbitrary positive value: doesn't return until at least one fd in the set is ready or
+                                      timeout seconds have elapsed
+        Note:
+            check_sfp_status makes use of select to retrieve the notifications, which means
+            it should have the logic of reading out all the notifications in the fd selected without blocking.
+            However, it fails to do that due to some sdk API's characteristics:
+                sx_lib_host_ifc_recv can only read one notification each time and will block when no notification in that fd.
+                sx_lib_host_ifc_recv_list can return all notifications in the fd via a single reading operation but
+                  is not supported by PMPE register (I've tested it but failed)
+            as a result the only way to satisfy the logic is to call sx_lib_host_ifc_recv in a loop until all notifications
+            have been read and we have to find a way to check that. it seems the only way to check that is via using select.
+            in this sense, we return one notification each time check_sfp_status is called and let the caller, get_change_event,
+            repeat calling it with timeout = 0 in a loop until no new notification is read (in this case it returns false).
+            by doing so all the notifications in the fd can be retrieved through a single call to get_change_event.
+        """
+        found = 0
+
+        try:
+            read, _, _ = select.select([self.rx_fd_p.fd], [], [], timeout)
+        except select.error as err:
+            rc, msg = err
+            if rc == errno.EAGAIN or rc == errno.EINTR:
+                return False
+            else:
+                raise
+
+        for fd in read:
+            if fd == self.rx_fd_p.fd:
+                success, port_list, module_state = self.on_pmpe(self.rx_fd_p)
+                if not success:
+                    logger.log_error("failed to read from {}".format(fd))
+                    break
+
+                sfp_state = sfp_value_status_dict.get(module_state, STATUS_UNKNOWN)
+                if sfp_state == STATUS_UNKNOWN:
+                    # in the following sequence, STATUS_UNKNOWN can be returned.
+                    # so we shouldn't raise exception here.
+                    # 1. some sfp module is inserted
+                    # 2. sfp_event gets stuck and fails to fetch the change event instantaneously
+                    # 3. and then the sfp module is removed
+                    # 4. sfp_event starts to try fetching the change event
+                    # in this case found is increased so that True will be returned
+                    logger.log_info("unknown module state {}, maybe the port suffers two adjacent insertion/removal".format(module_state))
+                    found += 1
+                    continue
+
+                for port in port_list:
+                    logger.log_info("SFP on port {} state {}".format(port, sfp_state))
+                    port_change[port] = sfp_state
+                    found += 1
+
+        if found == 0:
+            return False
+        else:
+            return True
+
+    def on_pmpe(self, fd_p):
+        ''' on port module plug event handler
+
+        Receives one notification from fd_p and maps every logical port it carries
+        to the module port of the matching entry returned by sx_api_port_device_get.
+        Returns (status, label_port_list, module_state); status is False when the
+        notification cannot be received or the port attributes cannot be queried.
+        '''
+
+        # recv parameters
+        pkt_size = PMPE_PACKET_SIZE
+        pkt_size_p = new_uint32_t_p()
+        uint32_t_p_assign(pkt_size_p, pkt_size)
+        pkt = new_uint8_t_arr(pkt_size)
+        recv_info_p = new_sx_receive_info_t_p()
+        pmpe_t = sx_event_pmpe_t()
+        port_attributes_list = new_sx_port_attributes_t_arr(64)
+        port_cnt_p = new_uint32_t_p()
+        uint32_t_p_assign(port_cnt_p,64)
+        label_port_list = []
+        module_state = 0
+
+        rc = sx_lib_host_ifc_recv(fd_p, pkt, pkt_size_p, recv_info_p)
+        if rc != 0:
+            logger.log_error("sx_lib_host_ifc_recv exited with error, rc %d" % rc)
+            status = False
+        else:
+            status = True
+            pmpe_t = recv_info_p.event_info.pmpe
+            port_list_size = pmpe_t.list_size
+            logical_port_list = pmpe_t.log_port_list
+            module_state = pmpe_t.module_state
+
+            # the query takes the same arguments for every logical port of the event,
+            # so issue it once rather than once per port
+            rc = sx_api_port_device_get(self.handle, 1, 0, port_attributes_list, port_cnt_p)
+            if rc != SX_STATUS_SUCCESS:
+                logger.log_error("sx_api_port_device_get exited with error, rc {}".format(rc))
+                status = False
+                port_list_size = 0
+            port_cnt = uint32_t_p_value(port_cnt_p)
+
+            for port_index in xrange(port_list_size):
+                logical_port = sx_port_log_id_t_arr_getitem(logical_port_list, port_index)
+                label_port = None
+
+                for attr_index in xrange(port_cnt):
+                    port_attributes = sx_port_attributes_t_arr_getitem(port_attributes_list, attr_index)
+                    if port_attributes.log_port == logical_port:
+                        label_port = port_attributes.port_mapping.module_port
+                        break
+
+                if label_port is None:
+                    # no matching entry: skip the port instead of reusing the previous match
+                    logger.log_error("no module port found for logical port {}".format(logical_port))
+                    continue
+                label_port_list.append(label_port)
+
+        delete_uint32_t_p(pkt_size_p)
+        delete_uint8_t_arr(pkt)
+        delete_sx_receive_info_t_p(recv_info_p)
+        delete_sx_port_attributes_t_arr(port_attributes_list)
+        delete_uint32_t_p(port_cnt_p)
+
+        return status, label_port_list, module_state,