From bdbb3d708d7a081f7a7d88c39759cbfb5781e00d Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Thu, 14 Apr 2022 13:14:40 +0800 Subject: [PATCH] [Mellanox] Auto correct PSU voltage threshold (WA) (#10394) - Why I did it There is a hardware bug where the PSU voltage threshold sysfs returns an incorrect value. The workaround is to call "sensor -s" to refresh it. - How I did it Call "sensor -s" when the threshold value is incorrect and the PSU is "DELTA 1100" - How to verify it Unit test and Manual test --- .../x86_64-mlnx_msn3700-r0/sensors.conf | 6 ++ .../x86_64-mlnx_msn3700c-r0/sensors.conf | 6 ++ .../x86_64-mlnx_msn3800-r0/sensors.conf | 6 ++ .../x86_64-mlnx_msn4600c-r0/sensors.conf | 6 ++ .../x86_64-mlnx_msn4600c-r0/sensors.conf.a1 | 6 ++ .../mlnx-platform-api/sonic_platform/psu.py | 70 +++++++++++++++++++ .../mlnx-platform-api/sonic_platform/utils.py | 13 ++++ .../sonic_platform/vpd_parser.py | 15 ++++ .../mlnx-platform-api/tests/test_psu.py | 37 ++++++++++ .../mlnx-platform-api/tests/test_utils.py | 4 ++ 10 files changed, 169 insertions(+) diff --git a/device/mellanox/x86_64-mlnx_msn3700-r0/sensors.conf b/device/mellanox/x86_64-mlnx_msn3700-r0/sensors.conf index 281f0a54dfa3..59c99ac98e39 100644 --- a/device/mellanox/x86_64-mlnx_msn3700-r0/sensors.conf +++ b/device/mellanox/x86_64-mlnx_msn3700-r0/sensors.conf @@ -85,6 +85,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)" label power2 "PSU-2 12V Rail Pwr (out)" label curr1 "PSU-2 220V Rail Curr (in)" label curr2 "PSU-2 12V Rail Curr (out)" + set in3_lcrit in3_crit * 0.662 + set in3_min in3_crit * 0.745 + set in3_max in3_crit * 0.952 chip "dps460-i2c-*-59" label in1 "PSU-1 220V Rail (in)" ignore in2 @@ -99,6 +102,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)" label power2 "PSU-1 12V Rail Pwr (out)" label curr1 "PSU-1 220V Rail Curr (in)" label curr2 "PSU-1 12V Rail Curr (out)" + set in3_lcrit in3_crit * 0.662 + set in3_min in3_crit * 0.745 + set in3_max in3_crit * 0.952 # Chassis 
fans chip "mlxreg_fan-isa-*" diff --git a/device/mellanox/x86_64-mlnx_msn3700c-r0/sensors.conf b/device/mellanox/x86_64-mlnx_msn3700c-r0/sensors.conf index 094cd78cde57..343385fcd4de 100644 --- a/device/mellanox/x86_64-mlnx_msn3700c-r0/sensors.conf +++ b/device/mellanox/x86_64-mlnx_msn3700c-r0/sensors.conf @@ -85,6 +85,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)" label power2 "PSU-2 12V Rail Pwr (out)" label curr1 "PSU-2 220V Rail Curr (in)" label curr2 "PSU-2 12V Rail Curr (out)" + set in3_lcrit in3_crit * 0.662 + set in3_min in3_crit * 0.745 + set in3_max in3_crit * 0.952 chip "dps460-i2c-*-59" label in1 "PSU-1 220V Rail (in)" ignore in2 @@ -99,6 +102,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)" label power2 "PSU-1 12V Rail Pwr (out)" label curr1 "PSU-1 220V Rail Curr (in)" label curr2 "PSU-1 12V Rail Curr (out)" + set in3_lcrit in3_crit * 0.662 + set in3_min in3_crit * 0.745 + set in3_max in3_crit * 0.952 # Chassis fans chip "mlxreg_fan-isa-*" diff --git a/device/mellanox/x86_64-mlnx_msn3800-r0/sensors.conf b/device/mellanox/x86_64-mlnx_msn3800-r0/sensors.conf index 7ba5f9c8c050..0b1cfc75548a 100644 --- a/device/mellanox/x86_64-mlnx_msn3800-r0/sensors.conf +++ b/device/mellanox/x86_64-mlnx_msn3800-r0/sensors.conf @@ -106,6 +106,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)" label power2 "PSU-2 12V Rail Pwr (out)" label curr1 "PSU-2 220V Rail Curr (in)" label curr2 "PSU-2 12V Rail Curr (out)" + set in3_lcrit in3_crit * 0.662 + set in3_min in3_crit * 0.745 + set in3_max in3_crit * 0.952 chip "dps460-i2c-*-59" label in1 "PSU-1 220V Rail (in)" ignore in2 @@ -120,6 +123,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)" label power2 "PSU-1 12V Rail Pwr (out)" label curr1 "PSU-1 220V Rail Curr (in)" label curr2 "PSU-1 12V Rail Curr (out)" + set in3_lcrit in3_crit * 0.662 + set in3_min in3_crit * 0.745 + set in3_max in3_crit * 0.952 # Chassis fans chip "mlxreg_fan-isa-*" diff --git a/device/mellanox/x86_64-mlnx_msn4600c-r0/sensors.conf b/device/mellanox/x86_64-mlnx_msn4600c-r0/sensors.conf 
index 9c80350e19ad..3ff78f15023f 100644 --- a/device/mellanox/x86_64-mlnx_msn4600c-r0/sensors.conf +++ b/device/mellanox/x86_64-mlnx_msn4600c-r0/sensors.conf @@ -167,6 +167,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)" label power2 "PSU-1(L) 12V Rail Pwr (out)" label curr1 "PSU-1(L) 220V Rail Curr (in)" label curr2 "PSU-1(L) 12V Rail Curr (out)" + set in3_lcrit in3_crit * 0.662 + set in3_min in3_crit * 0.745 + set in3_max in3_crit * 0.952 chip "dps460-i2c-*-59" label in1 "PSU-2(R) 220V Rail (in)" ignore in2 @@ -181,6 +184,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)" label power2 "PSU-2(R) 12V Rail Pwr (out)" label curr1 "PSU-2(R) 220V Rail Curr (in)" label curr2 "PSU-2(R) 12V Rail Curr (out)" + set in3_lcrit in3_crit * 0.662 + set in3_min in3_crit * 0.745 + set in3_max in3_crit * 0.952 # Chassis fans chip "mlxreg_fan-isa-*" diff --git a/device/mellanox/x86_64-mlnx_msn4600c-r0/sensors.conf.a1 b/device/mellanox/x86_64-mlnx_msn4600c-r0/sensors.conf.a1 index 44fff272e544..a0ebc677ad56 100644 --- a/device/mellanox/x86_64-mlnx_msn4600c-r0/sensors.conf.a1 +++ b/device/mellanox/x86_64-mlnx_msn4600c-r0/sensors.conf.a1 @@ -123,6 +123,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)" label power2 "PSU-1(L) 12V Rail Pwr (out)" label curr1 "PSU-1(L) 220V Rail Curr (in)" label curr2 "PSU-1(L) 12V Rail Curr (out)" + set in3_lcrit in3_crit * 0.662 + set in3_min in3_crit * 0.745 + set in3_max in3_crit * 0.952 chip "dps460-i2c-*-59" label in1 "PSU-2(R) 220V Rail (in)" ignore in2 @@ -137,6 +140,9 @@ bus "i2c-4" "i2c-1-mux (chan_id 3)" label power2 "PSU-2(R) 12V Rail Pwr (out)" label curr1 "PSU-2(R) 220V Rail Curr (in)" label curr2 "PSU-2(R) 12V Rail Curr (out)" + set in3_lcrit in3_crit * 0.662 + set in3_min in3_crit * 0.745 + set in3_max in3_crit * 0.952 # Chassis fans chip "mlxreg_fan-isa-*" diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index 64aa0166086f..d32a641d4d08 100644 --- 
a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -24,8 +24,10 @@ try: import os + import time from sonic_platform_base.psu_base import PsuBase from sonic_py_common.logger import Logger + from .device_data import DeviceDataManager from .led import PsuLed, SharedLed, ComponentFaultyIndicator from . import utils from .vpd_parser import VpdParser @@ -411,6 +413,7 @@ def get_voltage_high_threshold(self): capability = utils.read_str_from_file(self.psu_voltage_capability) if 'max' in capability: max_voltage = utils.read_int_from_file(self.psu_voltage_max, log_func=logger.log_info) + max_voltage = InvalidPsuVolWA.run(self, max_voltage, self.psu_voltage_max) return float(max_voltage) / 1000 return None @@ -431,6 +434,7 @@ def get_voltage_low_threshold(self): capability = utils.read_str_from_file(self.psu_voltage_capability) if 'min' in capability: min_voltage = utils.read_int_from_file(self.psu_voltage_min, log_func=logger.log_info) + min_voltage = InvalidPsuVolWA.run(self, min_voltage, self.psu_voltage_min) return float(min_voltage) / 1000 return None @@ -448,3 +452,69 @@ def get_maximum_supplied_power(self): return float(power_max) / 1000000 else: return None + + +class InvalidPsuVolWA: + """This class is created as a workaround for a known hardware issue that the PSU voltage threshold could be an + invalid value 127998. Once we read a voltage threshold value equal to 127998, we should do the following: + 1. Check the PSU vendor, it should be Delta + 2. Generate a temp sensor configuration file which contains a few set commands. Those set commands are the WA provided by low level team. + 3. Call "sensors -s -c " + 4. 
Wait for it to take effect + + This issue is found on 3700, 3700c, 3800, 4600c + """ + + INVALID_VOLTAGE_VALUE = 127998 + EXPECT_VENDOR_NAME = 'DELTA' + EXPECT_CAPACITY = '1100' + EXPECT_PLATFORMS = ['x86_64-mlnx_msn3700-r0', 'x86_64-mlnx_msn3700c-r0', 'x86_64-mlnx_msn3800-r0', 'x86_64-mlnx_msn4600c-r0'] + MFR_FIELD = 'MFR_NAME' + CAPACITY_FIELD = 'CAPACITY' + WAIT_TIME = 5 + + @classmethod + def run(cls, psu, threshold_value, threshold_file): + if threshold_value != cls.INVALID_VOLTAGE_VALUE: + # If the threshold value is not an invalid value, just return + return threshold_value + + platform_name = DeviceDataManager.get_platform_name() + # Apply the WA to specified platforms + if platform_name not in cls.EXPECT_PLATFORMS: + # It is unlikely to go to this branch, so we log a warning here + logger.log_warning('PSU {} threshold file {} value {}, but platform is {}'.format(psu.index, threshold_file, threshold_value, platform_name)) + return threshold_value + + # Check PSU vendor, make sure it is DELTA + vendor_name = psu.vpd_parser.get_entry_value(cls.MFR_FIELD) + if vendor_name != 'N/A' and vendor_name != cls.EXPECT_VENDOR_NAME: + # It is unlikely to go to this branch, so we log a warning here + logger.log_warning('PSU {} threshold file {} value {}, but its vendor is {}'.format(psu.index, threshold_file, threshold_value, vendor_name)) + return threshold_value + + # Check PSU capacity, make sure it is 1100 + capacity = psu.vpd_parser.get_entry_value(cls.CAPACITY_FIELD) + if capacity != 'N/A' and capacity != cls.EXPECT_CAPACITY: + logger.log_warning('PSU {} threshold file {} value {}, but its capacity is {}'.format(psu.index, threshold_file, threshold_value, capacity)) + return threshold_value + + # Run a sensor -s command to trigger hardware to get the real threshold value + utils.run_command('sensor -s') + + # Wait for the threshold value to change + return cls.wait_set_done(threshold_file) + + @classmethod + def wait_set_done(cls, threshold_file): + wait_time = 
cls.WAIT_TIME + while wait_time > 0: + value = utils.read_int_from_file(threshold_file, log_func=logger.log_info) + if value != cls.INVALID_VOLTAGE_VALUE: + return value + + wait_time -= 1 + time.sleep(1) + + logger.log_error('sensor -s does not recover PSU threshold sensor after {} seconds'.format(cls.WAIT_TIME)) + return None diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/utils.py b/platform/mellanox/mlnx-platform-api/sonic_platform/utils.py index 0650d9af1a1c..22ef4bb1f27d 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/utils.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/utils.py @@ -194,3 +194,16 @@ def _impl(*args, **kwargs): return return_value return _impl return wrapper + + +def run_command(command): + """ + Utility function to run an shell command and return the output. + :param command: Shell command string. + :return: Output of the shell command. + """ + try: + process = subprocess.Popen(command, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return process.communicate()[0].strip() + except Exception: + return None \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/vpd_parser.py b/platform/mellanox/mlnx-platform-api/sonic_platform/vpd_parser.py index ea66234e4bff..e53d825adfd6 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/vpd_parser.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/vpd_parser.py @@ -24,6 +24,7 @@ SN_VPD_FIELD = "SN_VPD_FIELD" PN_VPD_FIELD = "PN_VPD_FIELD" REV_VPD_FIELD = "REV_VPD_FIELD" +MFR_VPD_FIELD = "MFR_NAME" class VpdParser: @@ -82,3 +83,17 @@ def get_revision(self): logger.log_error("Fail to read revision: No key {} in VPD {}".format(REV_VPD_FIELD, self.vpd_file)) return 'N/A' return self.vpd_data.get(REV_VPD_FIELD, 'N/A') + + def get_entry_value(self, key): + """ + Retrieves an vpd entry of the device + + Returns: + string: Vpd entry value of device + """ + if self._get_data() 
and key not in self.vpd_data: + logger.log_warning("Fail to read vpd info: No key {} in VPD {}".format(key, self.vpd_file)) + return 'N/A' + return self.vpd_data.get(key, 'N/A') + + diff --git a/platform/mellanox/mlnx-platform-api/tests/test_psu.py b/platform/mellanox/mlnx-platform-api/tests/test_psu.py index 34fa70c7beca..db8912debff4 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_psu.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_psu.py @@ -116,3 +116,40 @@ def test_psu_vpd(self): assert psu.get_model() == 'MTEF-PSF-AC-C' assert psu.get_serial() == 'MT1946X07684' assert psu.get_revision() == 'A3' + + assert psu.vpd_parser.get_entry_value('MFR_NAME') == 'DELTA' + + @mock.patch('sonic_platform.utils.read_int_from_file', mock.MagicMock(return_value=9999)) + @mock.patch('sonic_platform.utils.run_command') + @mock.patch('sonic_platform.device_data.DeviceDataManager.get_platform_name') + @mock.patch('sonic_platform.vpd_parser.VpdParser.get_entry_value') + def test_psu_workaround(self, mock_get_entry_value, mock_get_platform_name, mock_run_command): + from sonic_platform.psu import InvalidPsuVolWA + psu = Psu(0) + # Threshold value is not InvalidPsuVolWA.INVALID_VOLTAGE_VALUE + assert InvalidPsuVolWA.run(psu, 9999, '') == 9999 + + # Platform name is not in InvalidPsuVolWA.EXPECT_PLATFORMS + mock_get_platform_name.return_value = 'some platform' + assert InvalidPsuVolWA.run(psu, InvalidPsuVolWA.INVALID_VOLTAGE_VALUE, '') == InvalidPsuVolWA.INVALID_VOLTAGE_VALUE + + # PSU vendor is not InvalidPsuVolWA.EXPECT_VENDOR_NAME + vpd_info = { + InvalidPsuVolWA.MFR_FIELD: 'some psu', + InvalidPsuVolWA.CAPACITY_FIELD: 'some capacity' + } + def get_entry_value(key): + return vpd_info[key] + + mock_get_entry_value.side_effect = get_entry_value + mock_get_platform_name.return_value = 'x86_64-mlnx_msn3700-r0' + assert InvalidPsuVolWA.run(psu, InvalidPsuVolWA.INVALID_VOLTAGE_VALUE, '') == InvalidPsuVolWA.INVALID_VOLTAGE_VALUE + + # PSU capacity is not 
InvalidPsuVolWA.EXPECT_CAPACITY + vpd_info[InvalidPsuVolWA.MFR_FIELD] = InvalidPsuVolWA.EXPECT_VENDOR_NAME + assert InvalidPsuVolWA.run(psu, InvalidPsuVolWA.INVALID_VOLTAGE_VALUE, '') == InvalidPsuVolWA.INVALID_VOLTAGE_VALUE + + # Normal + vpd_info[InvalidPsuVolWA.CAPACITY_FIELD] = InvalidPsuVolWA.EXPECT_CAPACITY + assert InvalidPsuVolWA.run(psu, InvalidPsuVolWA.INVALID_VOLTAGE_VALUE, '') == 9999 + mock_run_command.assert_called_with('sensor -s') diff --git a/platform/mellanox/mlnx-platform-api/tests/test_utils.py b/platform/mellanox/mlnx-platform-api/tests/test_utils.py index 7da17dc5e7bc..bbc3ab28e58c 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_utils.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_utils.py @@ -116,3 +116,7 @@ def func(): assert func() == 100 assert mock_log.call_count == 1 + + def test_run_command(self): + output = utils.run_command('ls') + assert output