Skip to content

Commit

Permalink
[Mellanox] Auto correct PSU voltage threshold (WA) (#10394)
Browse files Browse the repository at this point in the history
- Why I did it
There is a hardware bug that makes the PSU voltage threshold sysfs file return an incorrect value. The workaround is to call "sensors -s" to refresh it.

- How I did it
Call "sensors -s" when the threshold value is incorrect and the PSU is a "DELTA 1100".

- How to verify it
Unit test and Manual test
  • Loading branch information
Junchao-Mellanox authored Apr 14, 2022
1 parent 812f17d commit 0191300
Show file tree
Hide file tree
Showing 10 changed files with 169 additions and 0 deletions.
6 changes: 6 additions & 0 deletions device/mellanox/x86_64-mlnx_msn3700-r0/sensors.conf
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-2 12V Rail Pwr (out)"
label curr1 "PSU-2 220V Rail Curr (in)"
label curr2 "PSU-2 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
chip "dps460-i2c-*-59"
label in1 "PSU-1 220V Rail (in)"
ignore in2
Expand All @@ -99,6 +102,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-1 12V Rail Pwr (out)"
label curr1 "PSU-1 220V Rail Curr (in)"
label curr2 "PSU-1 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952

# Chassis fans
chip "mlxreg_fan-isa-*"
Expand Down
6 changes: 6 additions & 0 deletions device/mellanox/x86_64-mlnx_msn3700c-r0/sensors.conf
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-2 12V Rail Pwr (out)"
label curr1 "PSU-2 220V Rail Curr (in)"
label curr2 "PSU-2 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
chip "dps460-i2c-*-59"
label in1 "PSU-1 220V Rail (in)"
ignore in2
Expand All @@ -99,6 +102,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-1 12V Rail Pwr (out)"
label curr1 "PSU-1 220V Rail Curr (in)"
label curr2 "PSU-1 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952

# Chassis fans
chip "mlxreg_fan-isa-*"
Expand Down
6 changes: 6 additions & 0 deletions device/mellanox/x86_64-mlnx_msn3800-r0/sensors.conf
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-2 12V Rail Pwr (out)"
label curr1 "PSU-2 220V Rail Curr (in)"
label curr2 "PSU-2 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
chip "dps460-i2c-*-59"
label in1 "PSU-1 220V Rail (in)"
ignore in2
Expand All @@ -120,6 +123,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-1 12V Rail Pwr (out)"
label curr1 "PSU-1 220V Rail Curr (in)"
label curr2 "PSU-1 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952

# Chassis fans
chip "mlxreg_fan-isa-*"
Expand Down
6 changes: 6 additions & 0 deletions device/mellanox/x86_64-mlnx_msn4600c-r0/sensors.conf
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-1(L) 12V Rail Pwr (out)"
label curr1 "PSU-1(L) 220V Rail Curr (in)"
label curr2 "PSU-1(L) 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
chip "dps460-i2c-*-59"
label in1 "PSU-2(R) 220V Rail (in)"
ignore in2
Expand All @@ -181,6 +184,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-2(R) 12V Rail Pwr (out)"
label curr1 "PSU-2(R) 220V Rail Curr (in)"
label curr2 "PSU-2(R) 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952

# Chassis fans
chip "mlxreg_fan-isa-*"
Expand Down
6 changes: 6 additions & 0 deletions device/mellanox/x86_64-mlnx_msn4600c-r0/sensors.conf.a1
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-1(L) 12V Rail Pwr (out)"
label curr1 "PSU-1(L) 220V Rail Curr (in)"
label curr2 "PSU-1(L) 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952
chip "dps460-i2c-*-59"
label in1 "PSU-2(R) 220V Rail (in)"
ignore in2
Expand All @@ -137,6 +140,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)"
label power2 "PSU-2(R) 12V Rail Pwr (out)"
label curr1 "PSU-2(R) 220V Rail Curr (in)"
label curr2 "PSU-2(R) 12V Rail Curr (out)"
set in3_lcrit in3_crit * 0.662
set in3_min in3_crit * 0.745
set in3_max in3_crit * 0.952

# Chassis fans
chip "mlxreg_fan-isa-*"
Expand Down
70 changes: 70 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/psu.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@

try:
import os
import time
from sonic_platform_base.psu_base import PsuBase
from sonic_py_common.logger import Logger
from .device_data import DeviceDataManager
from .led import PsuLed, SharedLed, ComponentFaultyIndicator
from . import utils
from .vpd_parser import VpdParser
Expand Down Expand Up @@ -411,6 +413,7 @@ def get_voltage_high_threshold(self):
capability = utils.read_str_from_file(self.psu_voltage_capability)
if 'max' in capability:
max_voltage = utils.read_int_from_file(self.psu_voltage_max, log_func=logger.log_info)
max_voltage = InvalidPsuVolWA.run(self, max_voltage, self.psu_voltage_max)
return float(max_voltage) / 1000

return None
Expand All @@ -431,6 +434,7 @@ def get_voltage_low_threshold(self):
capability = utils.read_str_from_file(self.psu_voltage_capability)
if 'min' in capability:
min_voltage = utils.read_int_from_file(self.psu_voltage_min, log_func=logger.log_info)
min_voltage = InvalidPsuVolWA.run(self, min_voltage, self.psu_voltage_min)
return float(min_voltage) / 1000

return None
Expand All @@ -448,3 +452,69 @@ def get_maximum_supplied_power(self):
return float(power_max) / 1000000
else:
return None


class InvalidPsuVolWA:
    """Workaround for PSU voltage threshold files that report a bogus value.

    A known hardware issue makes the PSU voltage threshold sysfs file return
    the invalid value 127998. Once such a value is read, ``run`` does the
    following:
        1. Check that the platform is one of EXPECT_PLATFORMS.
        2. Check the PSU vendor (DELTA) and capacity (1100). A VPD value of
           'N/A' is not treated as a mismatch.
        3. Call "sensors -s" so that the "set" statements of the sensors
           configuration are evaluated.
        4. Poll the threshold file until it no longer holds the invalid value.

    This issue is found on 3700, 3700c, 3800, 4600c
    """

    INVALID_VOLTAGE_VALUE = 127998
    EXPECT_VENDOR_NAME = 'DELTA'
    EXPECT_CAPACITY = '1100'
    EXPECT_PLATFORMS = [
        'x86_64-mlnx_msn3700-r0',
        'x86_64-mlnx_msn3700c-r0',
        'x86_64-mlnx_msn3800-r0',
        'x86_64-mlnx_msn4600c-r0',
    ]
    MFR_FIELD = 'MFR_NAME'
    CAPACITY_FIELD = 'CAPACITY'
    # Number of polling attempts; each failed attempt is followed by a 1 second sleep
    WAIT_TIME = 5
    # The lm-sensors executable is "sensors" (plural); "sensor" does not exist
    SENSORS_COMMAND = 'sensors -s'

    @classmethod
    def run(cls, psu, threshold_value, threshold_file):
        """Return a usable threshold value, applying the workaround when needed.

        Args:
            psu: PSU object; its ``index`` and ``vpd_parser`` attributes are used.
            threshold_value: Value that was already read from ``threshold_file``.
            threshold_file: Path of the threshold file, re-read while waiting.

        Returns:
            ``threshold_value`` unchanged if it is not the invalid value or if
            the platform/vendor/capacity check rejects the workaround.
            Otherwise the refreshed value, or None if the threshold file still
            holds the invalid value after WAIT_TIME attempts. Callers must be
            prepared to handle None.
        """
        if threshold_value != cls.INVALID_VOLTAGE_VALUE:
            # If the threshold value is not an invalid value, just return
            return threshold_value

        platform_name = DeviceDataManager.get_platform_name()
        # Apply the WA to specified platforms
        if platform_name not in cls.EXPECT_PLATFORMS:
            # It is unlikely to go to this branch, so we log a warning here
            logger.log_warning('PSU {} threshold file {} value {}, but platform is {}'.format(psu.index, threshold_file, threshold_value, platform_name))
            return threshold_value

        # Check PSU vendor, make sure it is DELTA
        vendor_name = psu.vpd_parser.get_entry_value(cls.MFR_FIELD)
        if vendor_name != 'N/A' and vendor_name != cls.EXPECT_VENDOR_NAME:
            # It is unlikely to go to this branch, so we log a warning here
            logger.log_warning('PSU {} threshold file {} value {}, but its vendor is {}'.format(psu.index, threshold_file, threshold_value, vendor_name))
            return threshold_value

        # Check PSU capacity, make sure it is 1100
        capacity = psu.vpd_parser.get_entry_value(cls.CAPACITY_FIELD)
        if capacity != 'N/A' and capacity != cls.EXPECT_CAPACITY:
            logger.log_warning('PSU {} threshold file {} value {}, but its capacity is {}'.format(psu.index, threshold_file, threshold_value, capacity))
            return threshold_value

        # Run "sensors -s" to trigger the hardware to expose the real threshold value
        utils.run_command(cls.SENSORS_COMMAND)

        # Wait for the threshold value change
        return cls.wait_set_done(threshold_file)

    @classmethod
    def wait_set_done(cls, threshold_file):
        """Poll ``threshold_file`` until it stops reporting the invalid value.

        Args:
            threshold_file: Path of the threshold file to read.

        Returns:
            The first value read that differs from INVALID_VOLTAGE_VALUE, or
            None if every one of the WAIT_TIME attempts read the invalid value.
        """
        for _ in range(cls.WAIT_TIME):
            value = utils.read_int_from_file(threshold_file, log_func=logger.log_info)
            if value != cls.INVALID_VOLTAGE_VALUE:
                return value

            time.sleep(1)

        logger.log_error('{} does not recover PSU threshold sensor after {} seconds'.format(cls.SENSORS_COMMAND, cls.WAIT_TIME))
        return None
13 changes: 13 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,3 +194,16 @@ def _impl(*args, **kwargs):
return return_value
return _impl
return wrapper


def run_command(command):
    """
    Utility function to run a shell command and return its output.

    The command is executed through the shell (shell=True), so callers must
    only pass trusted, internally built command strings.
    :param command: Shell command string.
    :return: Standard output of the shell command with surrounding whitespace
             stripped, or None if running the command raised an exception.
             The exit code is not checked and stderr is captured but not returned.
    """
    try:
        # The context manager closes both pipes and waits for the child even
        # if communicate() raises, so no descriptor or zombie is left behind.
        with subprocess.Popen(command, shell=True, universal_newlines=True,
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE) as process:
            stdout, _ = process.communicate()
        return stdout.strip()
    except Exception:
        # Best effort by design: callers treat None as "no output available"
        return None
15 changes: 15 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/vpd_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
SN_VPD_FIELD = "SN_VPD_FIELD"
PN_VPD_FIELD = "PN_VPD_FIELD"
REV_VPD_FIELD = "REV_VPD_FIELD"
MFR_VPD_FIELD = "MFR_NAME"


class VpdParser:
Expand Down Expand Up @@ -82,3 +83,17 @@ def get_revision(self):
logger.log_error("Fail to read revision: No key {} in VPD {}".format(REV_VPD_FIELD, self.vpd_file))
return 'N/A'
return self.vpd_data.get(REV_VPD_FIELD, 'N/A')

def get_entry_value(self, key):
    """
    Retrieves a single VPD entry of the device by key

    Args:
        key: Name of the VPD field to look up

    Returns:
        string: Value stored for the key, or 'N/A' when the key is absent
    """
    missing_value = 'N/A'
    data_loaded = self._get_data()
    if not data_loaded or key in self.vpd_data:
        # Either nothing to complain about or the key is present: plain lookup
        return self.vpd_data.get(key, missing_value)

    # VPD data is available but it does not contain the requested key
    logger.log_warning("Fail to read vpd info: No key {} in VPD {}".format(key, self.vpd_file))
    return missing_value


37 changes: 37 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/test_psu.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,40 @@ def test_psu_vpd(self):
assert psu.get_model() == 'MTEF-PSF-AC-C'
assert psu.get_serial() == 'MT1946X07684'
assert psu.get_revision() == 'A3'

assert psu.vpd_parser.get_entry_value('MFR_NAME') == 'DELTA'

@mock.patch('sonic_platform.utils.read_int_from_file', mock.MagicMock(return_value=9999))
@mock.patch('sonic_platform.utils.run_command')
@mock.patch('sonic_platform.device_data.DeviceDataManager.get_platform_name')
@mock.patch('sonic_platform.vpd_parser.VpdParser.get_entry_value')
def test_psu_workaround(self, mock_get_entry_value, mock_get_platform_name, mock_run_command):
    """Walk InvalidPsuVolWA.run through every early return, the normal path and the timeout path."""
    from sonic_platform.psu import InvalidPsuVolWA
    psu = Psu(0)
    invalid_value = InvalidPsuVolWA.INVALID_VOLTAGE_VALUE

    # Threshold value is not InvalidPsuVolWA.INVALID_VOLTAGE_VALUE
    assert InvalidPsuVolWA.run(psu, 9999, '') == 9999

    # Platform name is not in InvalidPsuVolWA.EXPECT_PLATFORMS
    mock_get_platform_name.return_value = 'some platform'
    assert InvalidPsuVolWA.run(psu, invalid_value, '') == invalid_value

    # PSU vendor is not InvalidPsuVolWA.EXPECT_VENDOR_NAME
    vpd_info = {
        InvalidPsuVolWA.MFR_FIELD: 'some psu',
        InvalidPsuVolWA.CAPACITY_FIELD: 'some capacity'
    }

    def get_entry_value(key):
        return vpd_info[key]

    mock_get_entry_value.side_effect = get_entry_value
    mock_get_platform_name.return_value = 'x86_64-mlnx_msn3700-r0'
    assert InvalidPsuVolWA.run(psu, invalid_value, '') == invalid_value

    # PSU capacity is not InvalidPsuVolWA.EXPECT_CAPACITY
    vpd_info[InvalidPsuVolWA.MFR_FIELD] = InvalidPsuVolWA.EXPECT_VENDOR_NAME
    assert InvalidPsuVolWA.run(psu, invalid_value, '') == invalid_value

    # None of the rejected cases above may have triggered the refresh command
    mock_run_command.assert_not_called()

    # Normal
    vpd_info[InvalidPsuVolWA.CAPACITY_FIELD] = InvalidPsuVolWA.EXPECT_CAPACITY
    assert InvalidPsuVolWA.run(psu, invalid_value, '') == 9999
    # Exactly one refresh command is issued and it asks for the "set" pass (-s).
    # The executable name is not pinned here so the check follows the workaround class.
    mock_run_command.assert_called_once()
    issued_command = mock_run_command.call_args[0][0]
    assert issued_command.split()[1:] == ['-s']

    # Threshold never recovers: the workaround gives up and returns None.
    # time.sleep is patched so the polling loop does not slow the test down.
    with mock.patch('sonic_platform.utils.read_int_from_file', mock.MagicMock(return_value=invalid_value)), \
            mock.patch('time.sleep'):
        assert InvalidPsuVolWA.run(psu, invalid_value, '') is None
4 changes: 4 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,7 @@ def func():

assert func() == 100
assert mock_log.call_count == 1

def test_run_command(self):
    """run_command returns the stripped standard output of the shell command."""
    # 'echo' has a fixed output, unlike 'ls' whose result depends on the
    # current working directory and is empty in an empty directory.
    output = utils.run_command('echo test')
    assert output == 'test'

0 comments on commit 0191300

Please sign in to comment.