From a5de31bf4388bcd8f6bb71334c9da238b0741261 Mon Sep 17 00:00:00 2001 From: Stephen Sun <5379172+stephenxs@users.noreply.github.com> Date: Sun, 28 Jul 2019 20:18:39 +0800 Subject: [PATCH] [Mellanox]new platform api -- support get_change_event (#3142) * [Mellanox]refactor the sfp event change notification logic for new platform api remove the standalone daemon which is in charge of polling sfp change event through sdk interface and move the polling stuff to the event in the chassis daemon. * rephrase some comment * fix typo in sfp_event.sfp_event.initialize --- .../sonic_platform/chassis.py | 84 +++++++ .../mlnx-platform-api/sonic_platform/sfp.py | 1 + .../sonic_platform/sfp_event.py | 205 ++++++++++++++++++ 3 files changed, 290 insertions(+) create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index 01f5fb1547..41d237143d 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -18,6 +18,7 @@ try: from sonic_platform.watchdog import get_watchdog from sonic_daemon_base.daemon_base import Logger from eeprom import Eeprom + from sfp_event import sfp_event from os import listdir from os.path import isfile, join import sys @@ -28,6 +29,8 @@ try: except ImportError as e: raise ImportError (str(e) + "- required module not found") +MAX_SELECT_DELAY = 3600 + MLNX_NUM_PSU = 2 GET_HWSKU_CMD = "sonic-cfggen -d -v DEVICE_METADATA.localhost.hwsku" @@ -127,6 +130,14 @@ class Chassis(ChassisBase): self._component_name_list.append(COMPONENT_CPLD1) self._component_name_list.append(COMPONENT_CPLD2) + # Initialize sfp-change-listening stuff + self._init_sfp_change_event() + + def _init_sfp_change_event(self): + self.sfp_event = sfp_event() + self.sfp_event.initialize() + self.MAX_SELECT_EVENT_RETURNED = self.PORT_END + def 
_extract_num_of_fans_and_fan_drawers(self): num_of_fan = 0 num_of_drawer = 0 @@ -327,3 +338,76 @@ class Chassis(ChassisBase): return self._get_firmware_version() return None + + def _show_capabilities(self): + """ + This function is for debug purpose + Some features require a xSFP module to support some capabilities but it's unrealistic to + check those modules one by one. + So this function is introduce to show some capabilities of all xSFP modules mounted on the device. + """ + for s in self._sfp_list: + try: + print "index {} tx disable {} dom {} calibration {} temp {} volt {} power (tx {} rx {})".format(s.index, + s.dom_tx_disable_supported, + s.dom_supported, + s.calibration, + s.dom_temp_supported, + s.dom_volt_supported, + s.dom_rx_power_supported, + s.dom_tx_power_supported + ) + except: + print "fail to retrieve capabilities for module index {}".format(s.index) + + def get_change_event(self, timeout=0): + """ + Returns a nested dictionary containing all devices which have + experienced a change at chassis level + + Args: + timeout: Timeout in milliseconds (optional). If timeout == 0, + this method will block until a change is detected. + + Returns: + (bool, dict): + - True if call successful, False if not; + - A nested dictionary where key is a device type, + value is a dictionary with key:value pairs in the format of + {'device_id':'device_event'}, + where device_id is the device ID for this device and + device_event, + status='1' represents device inserted, + status='0' represents device removed. + Ex. {'fan':{'0':'0', '2':'1'}, 'sfp':{'11':'0'}} + indicates that fan 0 has been removed, fan 2 + has been inserted and sfp 11 has been removed. 
+ """ + wait_for_ever = (timeout == 0) + port_dict = {} + if wait_for_ever: + timeout = MAX_SELECT_DELAY + while True: + status = self.sfp_event.check_sfp_status(port_dict, timeout) + if not port_dict == {}: + break + else: + status = self.sfp_event.check_sfp_status(port_dict, timeout) + + if status: + # get_change_event has the meaning of retrieving all the notifications through a single call. + # Typically this is implemented via a select framework which requires the underlay file-reading + # interface able to retrieve all notifications without blocking once the fd has been selected. + # However, sdk doesn't provide any interface satisfied the requirement. as a result, + # check_sfp_status returns only one notification may indicate more notifications in its queue. + # In this sense, we have to iterate in a loop to get all the notifications in case that + # the first call returns at least one. + i = 0 + while i < self.MAX_SELECT_EVENT_RETURNED: + status = self.sfp_event.check_sfp_status(port_dict, 0) + if not status: + break + i = i + 1 + return True, {'sfp':port_dict} + else: + return True, {} diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py index 6df3e6437b..9ea9c21899 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py @@ -327,6 +327,7 @@ class SFP(SfpBase): self.dom_volt_supported = False self.dom_rx_power_supported = False self.dom_tx_power_supported = False + self.calibration = 0 self.dom_tx_disable_supported = (int(sfp_dom_capability_raw[1], 16) & 0x40 != 0) else: self.dom_supported = False diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py new file mode 100644 index 0000000000..1e57603d38 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py @@ -0,0 +1,205 @@ +#!/usr/bin/env 
python +''' +listen to the SDK for the SFP change event and return to chassis. +''' + +from __future__ import print_function +import sys, errno +import os +import time +import select +from python_sdk_api.sx_api import * +from sonic_daemon_base.daemon_base import Logger + +SYSLOG_IDENTIFIER = "sfp-event" + +SDK_SFP_STATE_IN = 0x1 +SDK_SFP_STATE_OUT = 0x2 +STATUS_PLUGIN = '1' +STATUS_PLUGOUT = '0' +STATUS_UNKNOWN = '2' + +sfp_value_status_dict = { + SDK_SFP_STATE_IN: STATUS_PLUGIN, + SDK_SFP_STATE_OUT: STATUS_PLUGOUT, +} + +PMPE_PACKET_SIZE = 2000 + +logger = Logger(SYSLOG_IDENTIFIER) + +class sfp_event: + ''' Listen to plugin/plugout cable events ''' + + SX_OPEN_RETRIES = 20 + + def __init__(self): + self.swid = 0 + self.handle = None + + def initialize(self): + # open SDK API handle. + # retry at most SX_OPEN_RETRIES times to wait until SDK is started during system startup + retry = 1 + while True: + rc, self.handle = sx_api_open(None) + if rc == SX_STATUS_SUCCESS: + break + + logger.log_info("failed to open SDK API handle... 
retrying {}".format(retry)) + + time.sleep(2 ** retry) + retry += 1 + + if retry > self.SX_OPEN_RETRIES: + raise RuntimeError("failed to open SDK API handle after {} retries".format(retry)) + + # Allocate SDK fd and user channel structures + self.rx_fd_p = new_sx_fd_t_p() + self.user_channel_p = new_sx_user_channel_t_p() + + rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p) + if rc != SX_STATUS_SUCCESS: + raise RuntimeError("sx_api_host_ifc_open exited with error, rc {}".format(rc)) + + self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD + self.user_channel_p.channel.fd = self.rx_fd_p + + rc = sx_api_host_ifc_trap_id_register_set(self.handle, + SX_ACCESS_CMD_REGISTER, + self.swid, + SX_TRAP_ID_PMPE, + self.user_channel_p) + if rc != SX_STATUS_SUCCESS: + raise RuntimeError("sx_api_host_ifc_trap_id_register_set exited with error, rc {}".format(rc)) + + def deinitialize(self): + if self.handle is None: + return + + # unregister trap id + rc = sx_api_host_ifc_trap_id_register_set(self.handle, + SX_ACCESS_CMD_DEREGISTER, + self.swid, + SX_TRAP_ID_PMPE, + self.user_channel_p) + if rc != SX_STATUS_SUCCESS: + logger.log_error("sx_api_host_ifc_trap_id_register_set exited with error, rc {}".format(rc)) + + rc = sx_api_host_ifc_close(self.handle, self.rx_fd_p) + if rc != SX_STATUS_SUCCESS: + logger.log_error("sx_api_host_ifc_close exited with error, rc {}".format(rc)) + + rc = sx_api_close(self.handle) + if rc != SX_STATUS_SUCCESS: + logger.log_error("sx_api_close exited with error, rc {}".format(rc)) + + delete_sx_fd_t_p(self.rx_fd_p) + delete_sx_user_channel_t_p(self.user_channel_p) + + def check_sfp_status(self, port_change, timeout): + """ + the meaning of timeout is aligned with select.select, which has the following meaning: + 0: poll, returns without blocked + arbitrary positive value: doesn't returns until at least fd in the set is ready or + seconds elapsed + Note: + check_sfp_status makes the use of select to retrieve the notifications, which means + it should has 
the logic of reading out all the notifications in the fd selected without blocked. + However, it fails to do that due to some sdk API's characteristics: + sx_lib_host_ifc_recv can only read one notification each time and will block when no notification in that fd. + sx_lib_host_ifc_recv_list can return all notification in the fd via a single reading operation but + not supported by PMPE register (I've tested it but failed) + as a result the only way to satisfy the logic is to call sx_lib_host_ifc_recv in a loop until all notifications + has been read and we have to find a way to check that. it seems the only way to check that is via using select. + in this sense, we return one notification each time check_sfp_status called and let the caller, get_change_event, + to repeat calling it with timeout = 0 in a loop until no new notification read (in this case it returns false). + by doing so all the notifications in the fd can be retrieved through a single call to get_change_event. + """ + found = 0 + + try: + read, _, _ = select.select([self.rx_fd_p.fd], [], [], timeout) + except select.error as err: + rc, msg = err + if rc == errno.EAGAIN or rc == errno.EINTR: + return False + else: + raise + + for fd in read: + if fd == self.rx_fd_p.fd: + success, port_list, module_state = self.on_pmpe(self.rx_fd_p) + if not success: + logger.log_error("failed to read from {}".format(fd)) + break + + sfp_state = sfp_value_status_dict.get(module_state, STATUS_UNKNOWN) + if sfp_state == STATUS_UNKNOWN: + # in the following sequence, STATUS_UNKNOWN can be returned. + # so we shouldn't raise exception here. + # 1. some sfp module is inserted + # 2. sfp_event gets stuck and fails to fetch the change event instantaneously + # 3. and then the sfp module is removed + # 4. 
sfp_event starts to try fetching the change event + # in this case found is increased so that True will be returned + logger.log_info("unknown module state {}, maybe the port suffers two adjacent insertion/removal".format(module_state)) + found += 1 + continue + + for port in port_list: + logger.log_info("SFP on port {} state {}".format(port, sfp_state)) + port_change[port] = sfp_state + found += 1 + + if found == 0: + return False + else: + return True + + def on_pmpe(self, fd_p): + ''' on port module plug event handler ''' + + # recv parameters + pkt_size = PMPE_PACKET_SIZE + pkt_size_p = new_uint32_t_p() + uint32_t_p_assign(pkt_size_p, pkt_size) + pkt = new_uint8_t_arr(pkt_size) + recv_info_p = new_sx_receive_info_t_p() + pmpe_t = sx_event_pmpe_t() + port_attributes_list = new_sx_port_attributes_t_arr(64) + port_cnt_p = new_uint32_t_p() + uint32_t_p_assign(port_cnt_p,64) + label_port_list = [] + module_state = 0 + + rc = sx_lib_host_ifc_recv(fd_p, pkt, pkt_size_p, recv_info_p) + if rc != 0: + logger.log_error("sx_lib_host_ifc_recv exited with error, rc %d" % rc) + status = False + else: + status = True + pmpe_t = recv_info_p.event_info.pmpe + port_list_size = pmpe_t.list_size + logical_port_list = pmpe_t.log_port_list + module_state = pmpe_t.module_state + + for i in xrange(port_list_size): + logical_port = sx_port_log_id_t_arr_getitem(logical_port_list, i) + rc = sx_api_port_device_get(self.handle, 1 , 0, port_attributes_list, port_cnt_p) + port_cnt = uint32_t_p_value(port_cnt_p) + + for i in xrange(port_cnt): + port_attributes = sx_port_attributes_t_arr_getitem(port_attributes_list,i) + if port_attributes.log_port == logical_port: + lable_port = port_attributes.port_mapping.module_port + break + label_port_list.append(lable_port) + + delete_uint32_t_p(pkt_size_p) + delete_uint8_t_arr(pkt) + delete_sx_receive_info_t_p(recv_info_p) + delete_sx_port_attributes_t_arr(port_attributes_list) + delete_uint32_t_p(port_cnt_p) + + return status, label_port_list, 
module_state,