From d17ec61aac532005569cfc7187e5fc2b9e3dc44b Mon Sep 17 00:00:00 2001 From: snider-nokia <76123698+snider-nokia@users.noreply.github.com> Date: Mon, 8 Jan 2024 14:38:46 -0500 Subject: [PATCH 1/2] [Nokia][sonic-platform] Update Nokia sonic-platform submodule and device data (#17378) These changes, in conjunction with NDK version >= 22.9.17 address the thermal logging issues discussed at Nokia-ION/ndk#27. While the changes contained at this PR do not require coupling to NDK version >= 22.9.17, thermal logging enhancements will not be available without updated NDK >= 22.9.17. Thus, coupling with NDK >=22.9.17 is preferred and recommended. Why I did it To address thermal logging deficiencies. Work item tracking Microsoft ADO (number only): 26365734 How I did it The following changes are included: Threshold configuration values are provided in the associated device data .json files. There is also a change included to better handle the condition where an SFP module read fails. Modify the module.py reboot to support reboot linecard from Supervisor - Modify reboot to call _reboot_imm for single IMM card reboot - Add log to the ndk_cmd to log the operation of "reboot-linecard" and "shutdown/satrtup the sfm" Add new nokia_cmd set command and modify show ndk-status output - Add a new function reboot_imm() to nokia_common.py to support reboot a single IMM slot from CPM - Added new command: nokia_cmd set reboot-linecard [forece] for CPM - Append a new column "RebootStatus" at the end of output of "nokia_cmd show ndk-status" - Provide ability for IMM to disable all transceiver module TX at reboot time - Remove defunct xcvr-resync service --- .../platform_ndk.json | 22 ++++++++++++++++++- .../platform_ndk.json | 20 +++++++++++++++++ .../broadcom/sonic-platform-modules-nokia | 2 +- 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/device/nokia/x86_64-nokia_ixr7250e_36x400g-r0/platform_ndk.json b/device/nokia/x86_64-nokia_ixr7250e_36x400g-r0/platform_ndk.json index 4fbd84df39..f444a14a84 100644 --- a/device/nokia/x86_64-nokia_ixr7250e_36x400g-r0/platform_ndk.json +++ b/device/nokia/x86_64-nokia_ixr7250e_36x400g-r0/platform_ndk.json @@ -14,7 +14,7 @@ }, { "key": "monitor_action", - "stringval": "warn" + "stringval": "reboot" }, { "key": "grpc_thermal_monitor", @@ -43,6 +43,26 @@ { "key": "sonic_log_level", "stringval": "debug" + }, + { + "key": "thermal_low_margin_threshold", + "intval": 10 + }, + { + "key": "thermal_log_current_threshold", + "intval": 2 + }, + { + "key": "thermal_log_margin_threshold", + "intval": 2 + }, + { + "key": "thermal_log_min_threshold", + "intval": 2 + }, + { + "key": "thermal_log_max_threshold", + "intval": 1 } ] } diff --git a/device/nokia/x86_64-nokia_ixr7250e_sup-r0/platform_ndk.json b/device/nokia/x86_64-nokia_ixr7250e_sup-r0/platform_ndk.json index dddefb0bcf..9d68aacc59 100644 --- a/device/nokia/x86_64-nokia_ixr7250e_sup-r0/platform_ndk.json +++ b/device/nokia/x86_64-nokia_ixr7250e_sup-r0/platform_ndk.json @@ -31,6 +31,26 @@ { "key": "sonic_log_level", "stringval": "debug" + }, + { + "key": "thermal_low_margin_threshold", + "intval": 10 + }, + { + "key": "thermal_log_current_threshold", + "intval": 3 + }, + { + "key": "thermal_log_margin_threshold", + "intval": 3 + }, + { + "key": "thermal_log_min_threshold", + "intval": 5 + }, + { + "key": "thermal_log_max_threshold", + "intval": 1 } ] } diff --git a/platform/broadcom/sonic-platform-modules-nokia b/platform/broadcom/sonic-platform-modules-nokia index 6b07af449c..e82caa40d4 160000 --- a/platform/broadcom/sonic-platform-modules-nokia +++ b/platform/broadcom/sonic-platform-modules-nokia @@ -1 +1 @@ -Subproject commit 6b07af449c4b40e2a80e3154347ca0308817ebd5 +Subproject commit e82caa40d4eec59e574bd4acffb0e92fd7187da4 From 21c97d3baef0d71a8e909e108ca8d6d91097cb1a Mon Sep 17 00:00:00 2001 From: "Marty Y. Lok" <76118573+mlok-nokia@users.noreply.github.com> Date: Mon, 8 Jan 2024 14:39:30 -0500 Subject: [PATCH 2/2] [Nokia-IXR7250E] Modify the platform_reboot on the IXR7250E for PMON API reboot and Disable all SFPs (#17483) Why I did it When Supervisor card is rebooted by using PMON API, it takes about 90 seconds to trigger the shutdown in down path. At this time linecards have been up. This delays linecards database initialization which is trying to PING/PONG the database-chassis. To address this issue, we modified the NDK to use the system call with "sudo reboot" when the request is from PMON API on Supervisor case. The NDK version is 22.9.20 and greater. This new NDK requires this modifcaiton of platform_reboot to work with. Work item tracking Microsoft ADO (number only): 26365734 How I did it Modify the platform_reboot In Supervisor not to reboot all IMMs since it has been done in the function reboot() in module.py. Also handle the reboot-cause.txt for on the Supervisor when the reboot is request from PMON API. Modify the Nokia platform specific platform_reboot in linecard to disable all SPFs. This PR works with NDK version 22.9.20 and above Signed-off-by: mlok --- .../platform_reboot | 4 +++ .../platform_reboot | 34 ++++++++++++++++--- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/device/nokia/x86_64-nokia_ixr7250e_36x400g-r0/platform_reboot b/device/nokia/x86_64-nokia_ixr7250e_36x400g-r0/platform_reboot index 43aace70f7..eb0bebef0e 100755 --- a/device/nokia/x86_64-nokia_ixr7250e_36x400g-r0/platform_reboot +++ b/device/nokia/x86_64-nokia_ixr7250e_36x400g-r0/platform_reboot @@ -17,6 +17,10 @@ update_reboot_cause() sync } +echo "Disable all SFPs" +python3 -c 'import sonic_platform.platform; platform_chassis = sonic_platform.platform.Platform().get_chassis(); platform_chassis.tx_disable_all_sfps()' +sleep 3 + # update the reboot_cuase file when reboot is trigger by device-mgr update_reboot_cause diff --git a/device/nokia/x86_64-nokia_ixr7250e_sup-r0/platform_reboot b/device/nokia/x86_64-nokia_ixr7250e_sup-r0/platform_reboot index 00ae76c9de..dc4f934339 100755 --- a/device/nokia/x86_64-nokia_ixr7250e_sup-r0/platform_reboot +++ b/device/nokia/x86_64-nokia_ixr7250e_sup-r0/platform_reboot @@ -1,15 +1,39 @@ #!/bin/bash -echo "Rebooting all Linecards" -python3 -c 'import sonic_platform.platform; platform_chassis = sonic_platform.platform.Platform().get_chassis(); platform_chassis.reboot_imms()' -sleep 3 + +DEVICE_MGR_REBOOT_FILE="/tmp/device_mgr_reboot" + +update_reboot_cause() +{ + DEVICE_MGR_REBOOT_FILE=/tmp/device_mgr_reboot + REBOOT_CAUSE_FILE=/host/reboot-cause/reboot-cause.txt + DEVICE_REBOOT_CAUSE_FILE=/etc/opt/srlinux/reboot-cause.txt + if [ -e $DEVICE_MGR_REBOOT_FILE ]; then + if [ -e $DEVICE_REBOOT_CAUSE_FILE ]; then + cp -f $DEVICE_REBOOT_CAUSE_FILE $REBOOT_CAUSE_FILE + fi + rm -f $DEVICE_MGR_REBOOT_FILE + else + touch /etc/opt/srlinux/devmgr_reboot_cause.done + rm -f $DEVICE_REBOOT_CAUSE_FILE &> /dev/null + fi + sync +} + +if [ ! -e $DEVICE_MGR_REBOOT_FILE ]; then + echo "Rebooting all Linecards" + python3 -c 'import sonic_platform.platform; platform_chassis = sonic_platform.platform.Platform().get_chassis(); platform_chassis.reboot_imms()' + sleep 3 +fi + +# update the reboot_cuase file when reboot is trigger by device-mgr +update_reboot_cause + systemctl stop nokia-watchdog.service sleep 2 echo "w" > /dev/watchdog kick_date=`date -u` echo "last watchdog kick $kick_date" > /var/log/nokia-watchdog-last.log rm -f /sys/firmware/efi/efivars/dump-* -touch /etc/opt/srlinux/devmgr_reboot_cause.done -rm -f /etc/opt/srlinux/reboot-cause.txt echo "Shutdown midplane" ifconfig xe0 down sync