Skip to content

Commit

Permalink
[Mellanox] Fix thermal control bugs (sonic-net#4298)
Browse files Browse the repository at this point in the history
* [thermal control] Fix pmon docker stop issue on 3800
* [thermal fix] Fix QA test issue
* [thermal fix] change psu._get_power_available_status to psu.get_power_available_status
* [thermal fix] adjust log for PSU absence and power absence
* [thermal fix] add unit test for loading thermal policy file with duplicate conditions in different policies
* [thermal] fix fan.get_presence for non-removable SKU
* [thermal fix] fix issue: fan direction is based on drawer
* Fix issue: when fan is not present, should not read fan direction from sysfs but directly return N/A
* [thermal fix] add unit test for get_direction for absent FAN
* A non-pluggable PSU has no FAN, so there is no need to add a FAN object for this PSU
* Update submodules
  • Loading branch information
Junchao-Mellanox authored and pphuchar committed Apr 20, 2020
1 parent 6466930 commit b8ea901
Show file tree
Hide file tree
Showing 15 changed files with 506 additions and 51 deletions.
19 changes: 17 additions & 2 deletions platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from sonic_daemon_base.daemon_base import Logger
from os import listdir
from os.path import isfile, join
from glob import glob
import sys
import io
import re
Expand All @@ -34,6 +35,10 @@

HWMGMT_SYSTEM_ROOT = '/var/run/hw-management/system/'

MST_DEVICE_NAME_PATTERN = '/dev/mst/mt[0-9]*_pciconf0'
MST_DEVICE_RE_PATTERN = '/dev/mst/mt([0-9]*)_pciconf0'
SPECTRUM1_CHIP_ID = '52100'

#reboot cause related definitions
REBOOT_CAUSE_ROOT = HWMGMT_SYSTEM_ROOT

Expand Down Expand Up @@ -93,11 +98,21 @@ def initialize_fan(self):
num_of_fan, num_of_drawer = self._extract_num_of_fans_and_fan_drawers()
multi_rotor_in_drawer = num_of_fan > num_of_drawer

# Fan's direction isn't supported on spectrum 1 devices for now
mst_dev_list = glob(MST_DEVICE_NAME_PATTERN)
if not mst_dev_list:
raise RuntimeError("Can't get chip type due to {} not found".format(MST_DEVICE_NAME_PATTERN))
m = re.search(MST_DEVICE_RE_PATTERN, mst_dev_list[0])
if m.group(1) == SPECTRUM1_CHIP_ID:
has_fan_dir = False
else:
has_fan_dir = True

for index in range(num_of_fan):
if multi_rotor_in_drawer:
fan = Fan(index, index/2)
fan = Fan(has_fan_dir, index, index/2, False, self.sku_name)
else:
fan = Fan(index, index)
fan = Fan(has_fan_dir, index, index, False, self.sku_name)
self._fan_list.append(fan)


Expand Down
162 changes: 131 additions & 31 deletions platform/mellanox/mlnx-platform-api/sonic_platform/fan.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,33 @@
except ImportError as e:
raise ImportError (str(e) + "- required module not found")

LED_ON = 1
LED_OFF = 0
LED_ON = '1'
LED_OFF = '0'

PWM_MAX = 255

FAN_PATH = "/var/run/hw-management/thermal/"
LED_PATH = "/var/run/hw-management/led/"
# fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches
FAN_DIR = "/var/run/hw-management/system/fan_dir"

# SKUs with non-removable (non-pluggable) FANs:
# 1. don't have a fanX_status file and should be treated as always present
hwsku_dict_with_unplugable_fan = ['ACS-MSN2010', 'ACS-MSN2100']

class Fan(FanBase):
"""Platform-specific Fan class"""
def __init__(self, fan_index, drawer_index = 1, psu_fan = False):

STATUS_LED_COLOR_ORANGE = "orange"

def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None):
# API index is starting from 0, Mellanox platform index is starting from 1
self.index = fan_index + 1
self.drawer_index = drawer_index + 1

self.is_psu_fan = psu_fan

self.always_presence = False if sku not in hwsku_dict_with_unplugable_fan else True

self.fan_min_speed_path = "fan{}_min".format(self.index)
if not self.is_psu_fan:
self.fan_speed_get_path = "fan{}_speed_get".format(self.index)
Expand All @@ -42,14 +52,53 @@ def __init__(self, fan_index, drawer_index = 1, psu_fan = False):
else:
self.fan_speed_get_path = "psu{}_fan1_speed_get".format(self.index)
self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index)
self._name = 'psu_{}_fan_{}'.format(self.index, fan_index)
self._name = 'psu_{}_fan_{}'.format(self.index, 1)
self.fan_max_speed_path = None
self.fan_status_path = "fan{}_fault".format(self.index)
self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index)
self.fan_red_led_path = "led_fan{}_red".format(self.drawer_index)
self.fan_orange_led_path = "led_fan{}_orange".format(self.drawer_index)
self.fan_pwm_path = "pwm1"
self.fan_led_cap_path = "led_fan{}_capability".format(self.drawer_index)
if has_fan_dir:
self.fan_dir = FAN_DIR
else:
self.fan_dir = None


def get_direction(self):
    """
    Retrieves the fan's direction

    Returns:
        A string, either FAN_DIRECTION_INTAKE or FAN_DIRECTION_EXHAUST
        depending on fan direction, or FAN_DIRECTION_NOT_APPLICABLE when
        no direction file is configured for this fan, the fan is a PSU
        fan, or the fan is not present.

    Raises:
        RuntimeError: if the direction file cannot be read or its content
        is not an integer.

    Notes:
        What Mellanox calls forward:
        Air flows from fans side to QSFP side, for example: MSN2700-CS2F
        which means intake in community
        What Mellanox calls reverse:
        Air flow from QSFP side to fans side, for example: MSN2700-CS2R
        which means exhaust in community
        According to hw-mgmt:
            1 stands for forward, in other words intake
            0 stands for reverse, in other words exhaust
    """
    # Absent fans must not be looked up in the direction file at all;
    # report N/A directly.
    if not self.fan_dir or self.is_psu_fan or not self.get_presence():
        return self.FAN_DIRECTION_NOT_APPLICABLE

    try:
        with open(self.fan_dir, 'r') as fan_dir:
            fan_dir_bits = int(fan_dir.read())
            # The file holds a bitmap with one bit per drawer; drawer_index
            # is 1-based, so drawer N is bit N-1.
            fan_mask = 1 << (self.drawer_index - 1)
            if fan_dir_bits & fan_mask:
                return self.FAN_DIRECTION_INTAKE
            else:
                return self.FAN_DIRECTION_EXHAUST
    except (ValueError, IOError) as e:
        raise RuntimeError("Failed to read fan direction status due to {}".format(repr(e)))


def get_name(self):
    """
    Retrieves the name of the fan

    Returns:
        The name stored on this fan object
    """
    fan_name = self._name
    return fan_name
Expand All @@ -63,15 +112,16 @@ def get_status(self):
"""
status = 0
if self.is_psu_fan:
status = 1
status = 0
else:
try:
with open(os.path.join(FAN_PATH, self.fan_status_path), 'r') as fault_status:
status = int(fault_status.read())
except (ValueError, IOError):
status = 0
status = 1

return status == 0

return status == 1

def get_presence(self):
"""
Expand All @@ -87,14 +137,18 @@ def get_presence(self):
else:
status = 0
else:
try:
with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status:
status = int(presence_status.read())
except (ValueError, IOError):
status = 0
if self.always_presence:
status = 1
else:
try:
with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status:
status = int(presence_status.read())
except (ValueError, IOError):
status = 0

return status == 1



def _get_min_speed_in_rpm(self):
speed = 0
try:
Expand All @@ -104,7 +158,8 @@ def _get_min_speed_in_rpm(self):
speed = 0

return speed



def _get_max_speed_in_rpm(self):
speed = 0
try:
Expand All @@ -115,6 +170,7 @@ def _get_max_speed_in_rpm(self):

return speed


def get_speed(self):
"""
Retrieves the speed of fan
Expand All @@ -135,9 +191,12 @@ def get_speed(self):

max_speed_in_rpm = self._get_max_speed_in_rpm()
speed = 100*speed_in_rpm/max_speed_in_rpm
if speed > 100:
speed = 100

return speed


def get_target_speed(self):
"""
Retrieves the expected speed of fan
Expand All @@ -159,6 +218,7 @@ def get_target_speed(self):

return speed


def set_speed(self, speed):
"""
Set fan speed to expected value
Expand All @@ -184,7 +244,8 @@ def set_speed(self, speed):
status = False

return status



def _get_led_capability(self):
cap_list = None
try:
Expand All @@ -196,6 +257,7 @@ def _get_led_capability(self):

return cap_list


def set_status_led(self, color):
"""
Set led to expected color
Expand All @@ -216,32 +278,70 @@ def set_status_led(self, color):
return False
status = False
try:
if color == 'green':
if color == self.STATUS_LED_COLOR_GREEN:
with open(os.path.join(LED_PATH, self.fan_green_led_path), 'w') as fan_led:
fan_led.write(str(LED_ON))
elif color == 'red':
fan_led.write(LED_ON)
status = True
elif color == self.STATUS_LED_COLOR_RED:
# Some fan don't support red led but support orange led, in this case we set led to orange
if 'red' in led_cap_list:
if self.STATUS_LED_COLOR_RED in led_cap_list:
led_path = os.path.join(LED_PATH, self.fan_red_led_path)
elif 'orange' in led_cap_list:
elif self.STATUS_LED_COLOR_ORANGE in led_cap_list:
led_path = os.path.join(LED_PATH, self.fan_orange_led_path)
else:
return False
with open(led_path, 'w') as fan_led:
fan_led.write(str(LED_ON))

elif color == 'off':
with open(os.path.join(LED_PATH, self.fan_green_led_path), 'w') as fan_led:
fan_led.write(str(LED_OFF))

with open(os.path.join(LED_PATH, self.fan_red_led_path), 'w') as fan_led:
fan_led.write(str(LED_OFF))
fan_led.write(LED_ON)
status = True
elif color == self.STATUS_LED_COLOR_OFF:
if self.STATUS_LED_COLOR_GREEN in led_cap_list:
with open(os.path.join(LED_PATH, self.fan_green_led_path), 'w') as fan_led:
fan_led.write(str(LED_OFF))
if self.STATUS_LED_COLOR_RED in led_cap_list:
with open(os.path.join(LED_PATH, self.fan_red_led_path), 'w') as fan_led:
fan_led.write(str(LED_OFF))
if self.STATUS_LED_COLOR_ORANGE in led_cap_list:
with open(os.path.join(LED_PATH, self.fan_orange_led_path), 'w') as fan_led:
fan_led.write(str(LED_OFF))

status = True
else:
status = False
except (ValueError, IOError):
status = False
status = False

return status


def get_status_led(self):
    """
    Gets the state of the fan status LED

    Returns:
        A string, one of the predefined STATUS_LED_COLOR_* strings.
        STATUS_LED_COLOR_OFF is returned when the LED capability cannot be
        determined or when no LED file reports a lit state.

    Raises:
        RuntimeError: if one of the LED files cannot be read.
    """
    capabilities = self._get_led_capability()
    if capabilities is None:
        return self.STATUS_LED_COLOR_OFF

    def _is_lit(led_file):
        # Any content other than LED_OFF (ignoring trailing newlines)
        # counts as "on".
        with open(os.path.join(LED_PATH, led_file), 'r') as fan_led:
            return fan_led.read().rstrip('\n') != LED_OFF

    try:
        # Green is probed unconditionally; red and orange only when the
        # capability list advertises them.
        if _is_lit(self.fan_green_led_path):
            return self.STATUS_LED_COLOR_GREEN
        if self.STATUS_LED_COLOR_RED in capabilities and _is_lit(self.fan_red_led_path):
            return self.STATUS_LED_COLOR_RED
        # A lit orange LED is reported as red.
        if self.STATUS_LED_COLOR_ORANGE in capabilities and _is_lit(self.fan_orange_led_path):
            return self.STATUS_LED_COLOR_RED
    except (ValueError, IOError) as e:
        raise RuntimeError("Failed to read led status for fan {} due to {}".format(self.index, repr(e)))

    return self.STATUS_LED_COLOR_OFF


def get_speed_tolerance(self):
"""
Retrieves the speed tolerance of the fan
Expand All @@ -251,4 +351,4 @@ def get_speed_tolerance(self):
considered tolerable
"""
# The tolerance value is fixed as 20% for all the Mellanox platform
return 20
return 20
Loading

0 comments on commit b8ea901

Please sign in to comment.