Skip to content

Commit

Permalink
[Mellanox] Fix thermal control bugs (#4298)
Browse files Browse the repository at this point in the history
* [thermal control] Fix pmon docker stop issue on 3800
* [thermal fix] Fix QA test issue
* [thermal fix] change psu._get_power_available_status to psu.get_power_available_status
* [thermal fix] adjust log for PSU absence and power absence
* [thermal fix] add unit test for loading thermal policy file with duplicate conditions in different policies
* [thermal] fix fan.get_presence for non-removable SKU
* [thermal fix] fix issue: fan direction is based on drawer
* Fix issue: when fan is not present, should not read fan direction from sysfs but directly return N/A
* [thermal fix] add unit test for get_direction for absent FAN
* Unplugable PSU has no FAN, no need add a FAN object for this PSU
* Update submodules

Co-authored-by: Stephen Sun <5379172+stephenxs@users.noreply.github.com>
  • Loading branch information
Junchao-Mellanox and stephenxs authored Mar 25, 2020
1 parent d2bab44 commit 80bf061
Show file tree
Hide file tree
Showing 15 changed files with 270 additions and 34 deletions.
4 changes: 2 additions & 2 deletions platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,9 @@ def initialize_fan(self):

for index in range(num_of_fan):
if multi_rotor_in_drawer:
fan = Fan(has_fan_dir, index, index/2)
fan = Fan(has_fan_dir, index, index/2, False, self.sku_name)
else:
fan = Fan(has_fan_dir, index, index)
fan = Fan(has_fan_dir, index, index, False, self.sku_name)
self._fan_list.append(fan)


Expand Down
34 changes: 22 additions & 12 deletions platform/mellanox/mlnx-platform-api/sonic_platform/fan.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,22 @@
# fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches
FAN_DIR = "/var/run/hw-management/system/fan_dir"

# SKUs with unplugable FANs:
# 1. don't have fanX_status and should be treated as always present
hwsku_dict_with_unplugable_fan = ['ACS-MSN2010', 'ACS-MSN2100']

class Fan(FanBase):
"""Platform-specific Fan class"""

STATUS_LED_COLOR_ORANGE = "orange"

def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False):
def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None):
# API index is starting from 0, Mellanox platform index is starting from 1
self.index = fan_index + 1
self.drawer_index = drawer_index + 1

self.is_psu_fan = psu_fan
self.always_presence = False if sku not in hwsku_dict_with_unplugable_fan else True

self.fan_min_speed_path = "fan{}_min".format(self.index)
if not self.is_psu_fan:
Expand All @@ -47,7 +52,7 @@ def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False):
else:
self.fan_speed_get_path = "psu{}_fan1_speed_get".format(self.index)
self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index)
self._name = 'psu_{}_fan_{}'.format(self.index, fan_index)
self._name = 'psu_{}_fan_{}'.format(self.index, 1)
self.fan_max_speed_path = None
self.fan_status_path = "fan{}_fault".format(self.index)
self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index)
Expand Down Expand Up @@ -80,13 +85,13 @@ def get_direction(self):
1 stands for forward, in other words intake
0 stands for reverse, in other words exhaust
"""
if not self.fan_dir or self.is_psu_fan:
if not self.fan_dir or self.is_psu_fan or not self.get_presence():
return self.FAN_DIRECTION_NOT_APPLICABLE

try:
with open(os.path.join(self.fan_dir), 'r') as fan_dir:
fan_dir_bits = int(fan_dir.read())
fan_mask = 1 << self.index - 1
fan_mask = 1 << self.drawer_index - 1
if fan_dir_bits & fan_mask:
return self.FAN_DIRECTION_INTAKE
else:
Expand All @@ -107,15 +112,15 @@ def get_status(self):
"""
status = 0
if self.is_psu_fan:
status = 1
status = 0
else:
try:
with open(os.path.join(FAN_PATH, self.fan_status_path), 'r') as fault_status:
status = int(fault_status.read())
except (ValueError, IOError):
status = 0
status = 1

return status == 1
return status == 0


def get_presence(self):
Expand All @@ -132,11 +137,14 @@ def get_presence(self):
else:
status = 0
else:
try:
with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status:
status = int(presence_status.read())
except (ValueError, IOError):
status = 0
if self.always_presence:
status = 1
else:
try:
with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status:
status = int(presence_status.read())
except (ValueError, IOError):
status = 0

return status == 1

Expand Down Expand Up @@ -183,6 +191,8 @@ def get_speed(self):

max_speed_in_rpm = self._get_max_speed_in_rpm()
speed = 100*speed_in_rpm/max_speed_in_rpm
if speed > 100:
speed = 100

return speed

Expand Down
30 changes: 25 additions & 5 deletions platform/mellanox/mlnx-platform-api/sonic_platform/psu.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,19 +99,21 @@ def __init__(self, psu_index, sku):
psu_presence = os.path.join(self.psu_path, psu_presence)
self.psu_presence = psu_presence

fan = Fan(sku, psu_index, psu_index, True)
if fan.get_presence():
# unplugable PSU has no FAN
if sku not in hwsku_dict_with_unplugable_psu:
fan = Fan(sku, psu_index, psu_index, True)
self._fan_list.append(fan)

def get_name(self):
return self._name

self.psu_green_led_path = "led_psu_green"
self.psu_red_led_path = "led_psu_red"
self.psu_orange_led_path = "led_psu_orange"
self.psu_led_cap_path = "led_psu_capability"


def get_name(self):
return self._name


def _read_generic_file(self, filename, len):
"""
Read a generic file, returns the contents of the file
Expand Down Expand Up @@ -287,3 +289,21 @@ def get_status_led(self):
raise RuntimeError("Failed to read led status for psu due to {}".format(repr(e)))

return self.STATUS_LED_COLOR_OFF


def get_power_available_status(self):
"""
Gets the power available status
Returns:
True if power is present and power on.
False and "absence of PSU" if power is not present.
False and "absence of power" if power is present but not power on.
"""
if not self.get_presence():
return False, "absence of PSU"
elif not self.get_powergood_status():
return False, "absence of power"
else:
return True, ""

30 changes: 19 additions & 11 deletions platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@
}
thermal_api_handler_gearbox = {
THERMAL_API_GET_TEMPERATURE:"gearbox{}_temp_input",
THERMAL_API_GET_HIGH_THRESHOLD:None
THERMAL_API_GET_HIGH_THRESHOLD:None,
THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:None
}
thermal_ambient_apis = {
THERMAL_DEV_ASIC_AMBIENT : "asic",
Expand Down Expand Up @@ -279,7 +280,7 @@ def initialize_thermals(sku, thermal_list, psu_list):
else:
if category == THERMAL_DEV_CATEGORY_PSU:
for index in range(count):
thermal = Thermal(category, start + index, True, psu_list[index].get_powergood_status, "power off")
thermal = Thermal(category, start + index, True, psu_list[index].get_power_available_status)
thermal_list.append(thermal)
else:
for index in range(count):
Expand All @@ -289,7 +290,7 @@ def initialize_thermals(sku, thermal_list, psu_list):


class Thermal(ThermalBase):
def __init__(self, category, index, has_index, dependency = None, hint = None):
def __init__(self, category, index, has_index, dependency = None):
"""
index should be a string for category ambient and int for other categories
"""
Expand All @@ -308,7 +309,6 @@ def __init__(self, category, index, has_index, dependency = None, hint = None):
self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD)
self.high_critical_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD)
self.dependency = dependency
self.dependent_hint = hint


def get_name(self):
Expand Down Expand Up @@ -360,13 +360,11 @@ def get_temperature(self):
A float number of current temperature in Celsius up to nearest thousandth
of one degree Celsius, e.g. 30.125
"""
if self.dependency and not self.dependency():
if self.dependent_hint:
hint = self.dependent_hint
else:
hint = "unknown reason"
logger.log_info("get_temperature for {} failed due to {}".format(self.name, hint))
return None
if self.dependency:
status, hint = self.dependency()
if not status:
logger.log_debug("get_temperature for {} failed due to {}".format(self.name, hint))
return None
value_str = self._read_generic_file(self.temperature, 0)
if value_str is None:
return None
Expand All @@ -386,6 +384,11 @@ def get_high_threshold(self):
"""
if self.high_threshold is None:
return None
if self.dependency:
status, hint = self.dependency()
if not status:
logger.log_debug("get_high_threshold for {} failed due to {}".format(self.name, hint))
return None
value_str = self._read_generic_file(self.high_threshold, 0)
if value_str is None:
return None
Expand All @@ -405,6 +408,11 @@ def get_high_critical_threshold(self):
"""
if self.high_critical_threshold is None:
return None
if self.dependency:
status, hint = self.dependency()
if not status:
logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint))
return None
value_str = self._read_generic_file(self.high_critical_threshold, 0)
if value_str is None:
return None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,12 @@ def collect(self, chassis):
"""
self._status_changed = False
for psu in chassis.get_all_psus():
if psu.get_presence() and psu not in self._presence_psus:
if psu.get_presence() and psu.get_powergood_status() and psu not in self._presence_psus:
self._presence_psus.add(psu)
self._status_changed = True
if psu in self._absence_psus:
self._absence_psus.remove(psu)
elif not psu.get_presence() and psu not in self._absence_psus:
elif (not psu.get_presence() or not psu.get_powergood_status()) and psu not in self._absence_psus:
self._absence_psus.add(psu)
self._status_changed = True
if psu in self._presence_psus:
Expand Down
18 changes: 18 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/duplicate_action.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"name": "any fan absence",
"conditions": [
{
"type": "fan.any.absence"
}
],
"actions": [
{
"type": "fan.all.set_speed",
"speed": "100"
},
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
}
17 changes: 17 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/duplicate_condition.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"name": "any fan absence",
"conditions": [
{
"type": "fan.any.absence"
},
{
"type": "fan.any.absence"
}
],
"actions": [
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
}
10 changes: 10 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/empty_action.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"name": "any fan absence",
"conditions": [
{
"type": "fan.any.absence"
}
],
"actions": [
]
}
11 changes: 11 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/empty_condition.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"name": "any fan absence",
"conditions": [
],
"actions": [
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
}
4 changes: 4 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/mock_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,14 @@ def set_speed(self, speed):
class MockPsu:
def __init__(self):
self.presence = True
self.powergood = True

def get_presence(self):
return self.presence

def get_powergood_status(self):
return self.powergood


class MockChassis:
def __init__(self):
Expand Down
Loading

0 comments on commit 80bf061

Please sign in to comment.