Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support ASIC/SDK health event #3020

Merged
merged 4 commits into from
Apr 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions orchagent/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ CFLAGS_SAI = -I /usr/include/sai
swssdir = $(datadir)/swss

dist_swss_DATA = \
eliminate_events.lua \
rif_rates.lua \
pfc_detect_innovium.lua \
pfc_detect_mellanox.lua \
Expand Down
63 changes: 63 additions & 0 deletions orchagent/eliminate_events.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
-- KEYS - None
-- ARGV - None
stephenxs marked this conversation as resolved.
Show resolved Hide resolved

-- Eliminates stored ASIC/SDK health events that exceed the configured
-- per-severity maximum.
--
-- For every SUPPRESS_ASIC_SDK_HEALTH_EVENT key in the config database that has
-- a 'max_events' field, the matching ASIC_SDK_HEALTH_EVENT_TABLE keys in the
-- state database are grouped by their 'severity' field. When a severity holds
-- more keys than its maximum, the keys that sort first are deleted until the
-- maximum is met.
--
-- Returns a list of human-readable summary strings, one per configured
-- severity (ordered by severity name), or an empty list when there is nothing
-- configured or no events are stored.

local state_db = "6"
local config_db = "4"

local config_table = 'SUPPRESS_ASIC_SDK_HEALTH_EVENT'
local event_table = 'ASIC_SDK_HEALTH_EVENT_TABLE'

-- The severity name follows the table name and a one-character separator, so
-- it starts at #config_table + 2 (1-based). Deriving the offset keeps it in
-- sync with the table name instead of relying on a magic number.
local severity_offset = #config_table + 2

local result = {}

redis.call('SELECT', config_db)
local severity_keys = redis.call('KEYS', config_table .. '*')
if #severity_keys == 0 then
    return result
end

-- severity name -> configured maximum number of events
local max_events = {}
for i = 1, #severity_keys do
    -- HGET yields false (not nil) inside a script when the field is missing.
    local max_event = redis.call('HGET', severity_keys[i], 'max_events')
    if max_event then
        -- A non-numeric value converts to nil and therefore leaves the
        -- severity unconfigured.
        max_events[string.sub(severity_keys[i], severity_offset, -1)] = tonumber(max_event)
    end
end

if next(max_events) == nil then
    return result
end

redis.call('SELECT', state_db)

local event_keys = redis.call('KEYS', event_table .. '*')
if #event_keys == 0 then
    return result
end

-- severity name -> list of event keys carrying that severity
local events = {}
for i = 1, #event_keys do
    local severity = redis.call('HGET', event_keys[i], 'severity')
    -- Only collect events whose severity has a configured maximum.
    if max_events[severity] ~= nil then
        if events[severity] == nil then
            events[severity] = {}
        end
        table.insert(events[severity], event_keys[i])
    end
end

-- pairs() visits keys in an unspecified order; sort the severity names so the
-- returned summary is deterministic from run to run.
local severities = {}
for severity in pairs(max_events) do
    severities[#severities + 1] = severity
end
table.sort(severities)

for _, severity in ipairs(severities) do
    local limit = max_events[severity]
    local number_received_events = 0
    if events[severity] ~= nil then
        number_received_events = #events[severity]
    end
    if number_received_events > limit then
        -- Sorting the keys decides which ones are dropped: the keys that
        -- compare lowest are eliminated first.
        table.sort(events[severity])
        -- Never try to delete more keys than were collected. Without the
        -- clamp a negative maximum would index past the end of the list and
        -- pass nil to DEL, which aborts the script.
        local number_to_eliminate = math.min(number_received_events - limit, number_received_events)
        for i = 1, number_to_eliminate do
            redis.call('DEL', events[severity][i])
        end
        table.insert(result, severity .. " events: maximum " .. limit .. ", received " .. number_received_events .. ", eliminated " .. number_to_eliminate)
    else
        table.insert(result, severity .. " events: maximum " .. limit .. ", received " .. number_received_events .. ", not exceeding the maximum")
    end
end

return result
24 changes: 24 additions & 0 deletions orchagent/notifications.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ extern "C" {

#include "logger.h"
#include "notifications.h"
#include "switchorch.h"

extern SwitchOrch *gSwitchOrch;

#ifdef ASAN_ENABLED
#include <sanitizer/lsan_interface.h>
Expand Down Expand Up @@ -40,6 +43,12 @@ void on_switch_shutdown_request(sai_object_id_t switch_id)
/* TODO: Later a better restart story will be told here */
SWSS_LOG_ERROR("Syncd stopped");

if (gSwitchOrch->isFatalEventReceived())
{
SWSS_LOG_ERROR("Orchagent aborted due to fatal SAI error received");
abort();
stephenxs marked this conversation as resolved.
Show resolved Hide resolved
}

/*
The quick_exit() is used instead of the exit() to avoid a following data race:
* the exit() calls the destructors for global static variables (e.g.BufferOrch::m_buffer_type_maps)
Expand All @@ -59,3 +68,18 @@ void on_port_host_tx_ready(sai_object_id_t switch_id, sai_object_id_t port_id, s
// don't use this event handler, because it runs by libsairedis in a separate thread
// which causes concurrency access to the DB
}

/*
 * Handler for the switch ASIC/SDK health event notification.
 *
 * Forwards every argument unchanged to
 * gSwitchOrch->onSwitchAsicSdkHealthEvent(); no processing happens here.
 *
 * NOTE(review): gSwitchOrch is dereferenced without a null check - confirm
 * that it is always initialized before this handler can be invoked.
 */
void on_switch_asic_sdk_health_event(sai_object_id_t switch_id,
sai_switch_asic_sdk_health_severity_t severity,
sai_timespec_t timestamp,
sai_switch_asic_sdk_health_category_t category,
sai_switch_health_data_t data,
const sai_u8_list_t description)
{
gSwitchOrch->onSwitchAsicSdkHealthEvent(switch_id,
severity,
timestamp,
category,
data,
description);
}
8 changes: 8 additions & 0 deletions orchagent/notifications.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,12 @@ void on_twamp_session_event(uint32_t count, sai_twamp_session_event_notification
// The function prototype information can be found here:
// https://github.com/sonic-net/sonic-sairedis/blob/master/meta/NotificationSwitchShutdownRequest.cpp#L49
void on_switch_shutdown_request(sai_object_id_t switch_id);

void on_port_host_tx_ready(sai_object_id_t switch_id, sai_object_id_t port_id, sai_port_host_tx_ready_status_t m_portHostTxReadyStatus);

// Handler for the switch ASIC/SDK health event notification.
// Receives the reporting switch, the event severity, timestamp and category,
// the accompanying health data and a byte-list description.
void on_switch_asic_sdk_health_event(sai_object_id_t switch_id,
sai_switch_asic_sdk_health_severity_t severity,
sai_timespec_t timestamp,
sai_switch_asic_sdk_health_category_t category,
sai_switch_health_data_t data,
const sai_u8_list_t description);
2 changes: 2 additions & 0 deletions orchagent/orchdaemon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,12 @@ bool OrchDaemon::init()
TableConnector app_switch_table(m_applDb, APP_SWITCH_TABLE_NAME);
TableConnector conf_asic_sensors(m_configDb, CFG_ASIC_SENSORS_TABLE_NAME);
TableConnector conf_switch_hash(m_configDb, CFG_SWITCH_HASH_TABLE_NAME);
TableConnector conf_suppress_asic_sdk_health_categories(m_configDb, CFG_SUPPRESS_ASIC_SDK_HEALTH_EVENT_NAME);

vector<TableConnector> switch_tables = {
conf_switch_hash,
conf_asic_sensors,
conf_suppress_asic_sdk_health_categories,
app_switch_table
};

Expand Down
1 change: 1 addition & 0 deletions orchagent/p4orch/tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ p4orch_tests_SOURCES = $(ORCHAGENT_DIR)/orch.cpp \
$(ORCHAGENT_DIR)/flex_counter/flow_counter_handler.cpp \
$(ORCHAGENT_DIR)/port/port_capabilities.cpp \
$(ORCHAGENT_DIR)/port/porthlpr.cpp \
$(ORCHAGENT_DIR)/notifications.cpp \
$(P4ORCH_DIR)/p4oidmapper.cpp \
$(P4ORCH_DIR)/p4orch.cpp \
$(P4ORCH_DIR)/p4orch_util.cpp \
Expand Down
2 changes: 2 additions & 0 deletions orchagent/p4orch/tests/test_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ sai_object_id_t kMirrorSessionOid1 = 9001;
char *gMirrorSession2 = "mirror-session-2";
sai_object_id_t kMirrorSessionOid2 = 9002;
sai_object_id_t gUnderlayIfId;
string gMyAsicName = "";
event_handle_t g_events_handle;

#define DEFAULT_BATCH_SIZE 128
#define DEFAULT_MAX_BULK_SIZE 1000
Expand Down
Loading
Loading