Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mellanox] Fix issue: watchdogutil command does not work (#16091) - Why I did it watchdogutil uses platform API watchdog instance to control/query watchdog status. In Nvidia watchdog status, it caches "armed" status in a object member "WatchdogImplBase.armed". This is not working for CLI infrastructure because each CLI will create a new watchdog instance, the status cached in previous instance will totally lose. Consider following commands: admin@sonic:~$ sudo watchdogutil arm -s 100 =====> watchdog instance1, armed=True Watchdog armed for 100 seconds admin@sonic:~$ sudo watchdogutil status ======> watchdog instance2, armed=False Status: Unarmed admin@sonic:~$ sudo watchdogutil disarm =======> watchdog instance3, armed=False Failed to disarm Watchdog - How I did it Use sysfs to query watchdog status - How to verify it Manual test Unit test Conflicts: platform/mellanox/mlnx-platform-api/sonic_platform/watchdog.py platform/mellanox/mlnx-platform-api/tests/test_watchdog.py #189

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 24 additions & 25 deletions platform/mellanox/mlnx-platform-api/sonic_platform/watchdog.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import time

from sonic_platform_base.watchdog_base import WatchdogBase
from . import utils

""" ioctl constants """
IO_WRITE = 0x40000000
Expand Down Expand Up @@ -80,15 +81,17 @@ def __init__(self, wd_device_path):
super(WatchdogImplBase, self).__init__()

self.watchdog_path = wd_device_path
self.watchdog = os.open(self.watchdog_path, os.O_WRONLY)
self._watchdog = None
self.timeout = self._gettimeout()

# Opening a watchdog descriptor starts
# watchdog timer;
# by default it should be stopped
self._disablecard()
self.armed = False
@property
def watchdog(self):
if self._watchdog is None:
self._watchdog = self.open_handle()
return self._watchdog

self.timeout = self._gettimeout()
def open_handle(self):
return os.open(self.watchdog_path, os.O_WRONLY)

def _enablecard(self):
"""
Expand Down Expand Up @@ -131,21 +134,15 @@ def _gettimeout(self):
@return watchdog timeout
"""

req = array.array('I', [0])
fcntl.ioctl(self.watchdog, WDIOC_GETTIMEOUT, req, True)

return int(req[0])
return utils.read_int_from_file('/run/hw-management/watchdog/main/timeout')

def _gettimeleft(self):
"""
Get time left before watchdog timer expires
@return time left in seconds
"""

req = array.array('I', [0])
fcntl.ioctl(self.watchdog, WDIOC_GETTIMELEFT, req, True)

return int(req[0])
return utils.read_int_from_file('/run/hw-management/watchdog/main/timeleft')

def arm(self, seconds):
"""
Expand All @@ -159,11 +156,10 @@ def arm(self, seconds):
try:
if self.timeout != seconds:
self.timeout = self._settimeout(seconds)
if self.armed:
if self.is_armed():
self._keepalive()
else:
self._enablecard()
self.armed = True
ret = self.timeout
except IOError:
pass
Expand All @@ -176,10 +172,9 @@ def disarm(self):
"""

disarmed = False
if self.armed:
if self.is_armed():
try:
self._disablecard()
self.armed = False
disarmed = True
except IOError:
pass
Expand All @@ -191,7 +186,7 @@ def is_armed(self):
Implements is_armed WatchdogBase API
"""

return self.armed
return utils.read_str_from_file('/run/hw-management/watchdog/main/state') == 'active'

def get_remaining_time(self):
"""
Expand All @@ -200,7 +195,7 @@ def get_remaining_time(self):

timeleft = WD_COMMON_ERROR

if self.armed:
if self.is_armed():
try:
timeleft = self._gettimeleft()
except IOError:
Expand All @@ -213,13 +208,15 @@ def __del__(self):
Close watchdog
"""

os.close(self.watchdog)
if self._watchdog is not None:
os.close(self._watchdog)


class WatchdogType1(WatchdogImplBase):
"""
Watchdog type 1
"""
TIMESTAMP_FILE = '/tmp/nvidia/watchdog_timestamp'

def arm(self, seconds):
"""
Expand All @@ -230,7 +227,8 @@ def arm(self, seconds):
ret = WatchdogImplBase.arm(self, seconds)
# Save the watchdog arm timestamp
# requiered for get_remaining_time()
self.arm_timestamp = time.time()
os.makedirs('/tmp/nvidia', exist_ok=True)
utils.write_file(self.TIMESTAMP_FILE, str(time.time()))

return ret

Expand All @@ -243,8 +241,9 @@ def get_remaining_time(self):

timeleft = WD_COMMON_ERROR

if self.armed:
timeleft = int(self.timeout - (time.time() - self.arm_timestamp))
if self.is_armed():
arm_timestamp = utils.read_float_from_file(self.TIMESTAMP_FILE)
timeleft = int(self.timeout - (time.time() - arm_timestamp))

return timeleft

Expand Down
141 changes: 141 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/test_watchdog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import pytest
import sys
if sys.version_info.major == 3:
from unittest import mock
else:
import mock

test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)

from sonic_platform.chassis import Chassis
from sonic_platform.watchdog import get_watchdog, \
WatchdogType2, \
WatchdogType1, \
is_mlnx_wd_main, \
is_wd_type2


class TestWatchdog:
@mock.patch('sonic_platform.watchdog.is_mlnx_wd_main')
@mock.patch('sonic_platform.watchdog.os.listdir')
def test_get_watchdog_no_device(self, mock_listdir, mock_is_main):
mock_listdir.return_value = []
assert get_watchdog() is None

mock_listdir.return_value = ['invalid']
mock_is_main.return_value = True
assert get_watchdog() is None

mock_listdir.return_value = ['watchdog1']
mock_is_main.return_value = False
assert get_watchdog() is None

@mock.patch('sonic_platform.watchdog.is_mlnx_wd_main')
@mock.patch('sonic_platform.watchdog.is_wd_type2')
@mock.patch('sonic_platform.watchdog.os.listdir', mock.MagicMock(return_value=['watchdog1', 'watchdog2']))
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock())
@pytest.mark.parametrize('test_para',
[(True, WatchdogType2), (False, WatchdogType1)])
def test_get_watchdog(self, mock_is_type2, mock_is_main, test_para):
mock_is_main.side_effect = lambda dev: dev == 'watchdog2'
mock_is_type2.return_value = test_para[0]
chassis = Chassis()
watchdog = chassis.get_watchdog()
assert isinstance(watchdog, test_para[1])
assert watchdog.watchdog_path == '/dev/watchdog2'

def test_is_mlnx_wd_main(self):
mock_os_open = mock.mock_open(read_data='mlx-wdt-main')
with mock.patch('sonic_platform.watchdog.open', mock_os_open):
assert is_mlnx_wd_main('')

mock_os_open = mock.mock_open(read_data='invalid')
with mock.patch('sonic_platform.watchdog.open', mock_os_open):
assert not is_mlnx_wd_main('')
mock_os_open.side_effect = IOError
with mock.patch('sonic_platform.watchdog.open', mock_os_open):
assert not is_mlnx_wd_main('')

@mock.patch('sonic_platform.watchdog.os.path.exists')
@pytest.mark.parametrize('test_para',
[True, False])
def test_is_wd_type2(self, mock_exists, test_para):
mock_exists.return_value = test_para
assert is_wd_type2('') is test_para

@mock.patch('sonic_platform.utils.read_str_from_file')
def test_is_armed(self, mock_read):
watchdog = WatchdogType2('watchdog2')
mock_read.return_value = 'inactive'
assert not watchdog.is_armed()
mock_read.return_value = 'active'
assert watchdog.is_armed()

@mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.is_armed')
def test_arm_disarm_watchdog2(self, mock_is_armed):
watchdog = WatchdogType2('watchdog2')
assert watchdog.arm(-1) == -1
mock_is_armed.return_value = False
watchdog.arm(10)
mock_is_armed.return_value = True
watchdog.arm(5)
watchdog.disarm()

@mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.is_armed')
def test_arm_disarm_watchdog1(self, mock_is_armed):
watchdog = WatchdogType1('watchdog1')
assert watchdog.arm(-1) == -1
mock_is_armed.return_value = False
watchdog.arm(10)
mock_is_armed.return_value = True
watchdog.arm(5)
watchdog.disarm()

@mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.WatchdogImplBase._gettimeleft', mock.MagicMock(return_value=10))
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.is_armed')
def test_get_remaining_time_watchdog2(self, mock_is_armed):
watchdog = WatchdogType2('watchdog2')
mock_is_armed.return_value = False
assert watchdog.get_remaining_time() == -1
watchdog.arm(10)
mock_is_armed.return_value = True
assert watchdog.get_remaining_time() == 10

@mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock())
@mock.patch('sonic_platform.watchdog.WatchdogImplBase._gettimeleft', mock.MagicMock(return_value=10))
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.is_armed')
def test_get_remaining_time_watchdog1(self, mock_is_armed):
watchdog = WatchdogType1('watchdog2')
mock_is_armed.return_value = False
assert watchdog.get_remaining_time() == -1
watchdog.arm(10)
mock_is_armed.return_value = True
assert watchdog.get_remaining_time() > 0