From 48ae866aebec110c56eb66c99f2cff9f06ce714b Mon Sep 17 00:00:00 2001 From: Volodymyr Samotiy Date: Tue, 12 Jan 2021 19:24:04 +0200 Subject: [PATCH] [pfcwd] Update PFC storm detection logic for Mellanox platforms (#1586) Use "PFC duration" counters in microseconds instead of quanta. SONiC PFCWD logic requires the "pfc duration" value in microseconds, but in SAI it was provided as quanta of time. This required an additional conversion that used the port speed value, and it could cause PFCWD to detect a storm on an operationally down port in case of link flapping. Now there are new SAI attributes that provide "pfc duration" in microseconds, so the PFCWD storm detection logic is updated to use these new "pfc duration" counters. This algorithm change helps to avoid false PFC storm detection in case of link flapping because the conversion is not needed anymore. Signed-off-by: Volodymyr Samotiy --- orchagent/orchdaemon.cpp | 50 ++++++++++++++++++++++++++----- orchagent/pfc_detect_mellanox.lua | 20 ++----------- 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/orchagent/orchdaemon.cpp b/orchagent/orchdaemon.cpp index 493e13b6e46d..641c304e9129 100644 --- a/orchagent/orchdaemon.cpp +++ b/orchagent/orchdaemon.cpp @@ -333,10 +333,48 @@ bool OrchDaemon::init() CFG_PFC_WD_TABLE_NAME }; - if ((platform == MLNX_PLATFORM_SUBSTRING) - || (platform == INVM_PLATFORM_SUBSTRING) - || (platform == BFN_PLATFORM_SUBSTRING) - || (platform == NPS_PLATFORM_SUBSTRING)) + if (platform == MLNX_PLATFORM_SUBSTRING) + { + + static const vector portStatIds = + { + SAI_PORT_STAT_PFC_0_RX_PAUSE_DURATION_US, + SAI_PORT_STAT_PFC_1_RX_PAUSE_DURATION_US, + SAI_PORT_STAT_PFC_2_RX_PAUSE_DURATION_US, + SAI_PORT_STAT_PFC_3_RX_PAUSE_DURATION_US, + SAI_PORT_STAT_PFC_4_RX_PAUSE_DURATION_US, + SAI_PORT_STAT_PFC_5_RX_PAUSE_DURATION_US, + SAI_PORT_STAT_PFC_6_RX_PAUSE_DURATION_US, + SAI_PORT_STAT_PFC_7_RX_PAUSE_DURATION_US, + SAI_PORT_STAT_PFC_0_RX_PKTS, + SAI_PORT_STAT_PFC_1_RX_PKTS, + 
SAI_PORT_STAT_PFC_2_RX_PKTS, + SAI_PORT_STAT_PFC_3_RX_PKTS, + SAI_PORT_STAT_PFC_4_RX_PKTS, + SAI_PORT_STAT_PFC_5_RX_PKTS, + SAI_PORT_STAT_PFC_6_RX_PKTS, + SAI_PORT_STAT_PFC_7_RX_PKTS, + }; + + static const vector queueStatIds = + { + SAI_QUEUE_STAT_PACKETS, + SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES, + }; + + static const vector queueAttrIds; + + m_orchList.push_back(new PfcWdSwOrch( + m_configDb, + pfc_wd_tables, + portStatIds, + queueStatIds, + queueAttrIds, + PFC_WD_POLL_MSECS)); + } + else if ((platform == INVM_PLATFORM_SUBSTRING) + || (platform == BFN_PLATFORM_SUBSTRING) + || (platform == NPS_PLATFORM_SUBSTRING)) { static const vector portStatIds = @@ -367,9 +405,7 @@ bool OrchDaemon::init() static const vector queueAttrIds; - if ((platform == MLNX_PLATFORM_SUBSTRING) - || (platform == INVM_PLATFORM_SUBSTRING) - || (platform == NPS_PLATFORM_SUBSTRING)) + if ((platform == INVM_PLATFORM_SUBSTRING) || (platform == NPS_PLATFORM_SUBSTRING)) { m_orchList.push_back(new PfcWdSwOrch( m_configDb, diff --git a/orchagent/pfc_detect_mellanox.lua b/orchagent/pfc_detect_mellanox.lua index f436808451fa..0f4d6d4f5df4 100644 --- a/orchagent/pfc_detect_mellanox.lua +++ b/orchagent/pfc_detect_mellanox.lua @@ -8,26 +8,10 @@ local counters_db = ARGV[1] local counters_table_name = ARGV[2] local poll_time = tonumber(ARGV[3]) -local asic_db = "1" -local asic_db_port_table = "ASIC_STATE:SAI_OBJECT_TYPE_PORT" - -local quanta_size = 512 - local rets = {} redis.call('SELECT', counters_db) -local function port_speed_get(port_id) - redis.call('SELECT', asic_db) - local port_speed = redis.call('HGET', asic_db_port_table .. ':' .. 
port_id, 'SAI_PORT_ATTR_SPEED') - redis.call('SELECT', counters_db) - return tonumber(port_speed) -end - -local function quantatous(quanta, port_id) - return quanta * quanta_size / port_speed_get(port_id) -end - -- Iterate through each queue local n = table.getn(KEYS) for i = n, 1, -1 do @@ -53,7 +37,7 @@ for i = n, 1, -1 do local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i]) local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i]) local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' - local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION' + local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION_US' -- Get all counters local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') @@ -79,7 +63,7 @@ for i = n, 1, -1 do packets_last = tonumber(packets_last) pfc_rx_packets_last = tonumber(pfc_rx_packets_last) pfc_duration_last = tonumber(pfc_duration_last) - local storm_condition = ((quantatous(pfc_duration, port_id) - quantatous(pfc_duration_last, port_id)) > poll_time * 0.8) + local storm_condition = (pfc_duration - pfc_duration_last) > (poll_time * 0.8) -- Check actual condition of queue being in PFC storm if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or