From d8cfaa33bb6308fed0898e4fbb85f66f0b476ade Mon Sep 17 00:00:00 2001
From: Stephen Sun <stephens@nvidia.com>
Date: Thu, 10 Jun 2021 10:49:01 +0800
Subject: [PATCH] Bug fix in buffer pool calculation and headroom checking

- Take number of lanes instead of speed into account when determining whether it has doubled pipeline latency
  For speeds other than 400G, eg 100G, it's possible that some 100G ports have 8 lanes and others have 4 lanes
  In this case, we need to add "8_lane" to the profile name to indicate whether the profile is for 8 lane ports or normal ports
- Take advantage of "set" feature of the lua to present the profile referencing count, which also makes the code more maintainable
- Take deviation into account when checking the headroom against the limit

Signed-off-by: Stephen Sun <stephens@nvidia.com>
---
 cfgmgr/buffer_check_headroom_mellanox.lua |  40 ++++++-
 cfgmgr/buffer_headroom_mellanox.lua       |   6 +-
 cfgmgr/buffer_pool_mellanox.lua           | 135 +++++++++++++---------
 cfgmgr/buffermgrdyn.cpp                   |  22 +++-
 cfgmgr/buffermgrdyn.h                     |   7 +-
 5 files changed, 138 insertions(+), 72 deletions(-)

diff --git a/cfgmgr/buffer_check_headroom_mellanox.lua b/cfgmgr/buffer_check_headroom_mellanox.lua
index 510a967b730..08f955fe4c7 100644
--- a/cfgmgr/buffer_check_headroom_mellanox.lua
+++ b/cfgmgr/buffer_check_headroom_mellanox.lua
@@ -7,10 +7,25 @@ local port = KEYS[1]
 local input_profile_name = ARGV[1]
 local input_profile_size = ARGV[2]
 local new_pg = ARGV[3]
-local accumulative_size = 0
+
+local function is_port_with_8lanes(lanes)
+    -- On Spectrum 3, ports with 8 lanes have doubled pipeline latency
+    local number_of_lanes = 0
+    if lanes then
+        local _
+        _, number_of_lanes = string.gsub(lanes, ",", ",")
+        number_of_lanes = number_of_lanes + 1
+    end
+    return number_of_lanes == 8
+end
+
+-- Initialize the accumulative size with 4096
+-- This is to absorb the possible deviation
+local accumulative_size = 4096
 
 local appl_db = "0"
 local state_db = "6"
+local config_db = "4"
 
 local ret_true = {}
 local ret = {}
@@ -20,7 +35,18 @@ table.insert(ret_true, "result:true")
 
 default_ret = ret_true
 
-local speed = redis.call('HGET', 'PORT|' .. port, 'speed')
+-- Connect to CONFIG_DB
+redis.call('SELECT', config_db)
+
+local lanes
+
+-- On SPC3 switch, we need to know whether it's a 8-lane port because it has extra pipeline latency
+local is_spc3 = false
+local platform = redis.call('HGET', 'DEVICE_METADATA|localhost', 'platform')
+if platform and string.sub(platform, 1, 16) == "x86_64-mlnx_msn4" then
+    is_spc3 = true
+    lanes = redis.call('HGET', 'PORT|' .. port, 'lanes')
+end
 
 -- Fetch the threshold from STATE_DB
 redis.call('SELECT', state_db)
@@ -31,11 +57,12 @@ if max_headroom_size == nil then
 end
 
 local asic_keys = redis.call('KEYS', 'ASIC_TABLE*')
-local pipeline_delay = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency'))
-if speed == 400000 then
-    pipeline_delay = pipeline_delay * 2 - 1
+local pipeline_latency = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency'))
+if is_spc3 and is_port_with_8lanes(lanes) then
+    -- The pipeline latency should be adjusted accordingly for ports with 2 buffer units
+    pipeline_latency = pipeline_latency * 2 - 1
 end
-accumulative_size = accumulative_size + 2 * pipeline_delay * 1024
+accumulative_size = accumulative_size + 2 * pipeline_latency * 1024
 
 -- Fetch all keys in BUFFER_PG according to the port
 redis.call('SELECT', appl_db)
@@ -95,6 +122,7 @@ end
 
 if max_headroom_size > accumulative_size then
     table.insert(ret, "result:true")
+    table.insert(ret, "debug:Accumulative headroom on port " .. accumulative_size .. ", the maximum available headroom " .. max_headroom_size)
 else
     table.insert(ret, "result:false")
     table.insert(ret, "debug:Accumulative headroom on port " .. accumulative_size .. " exceeds the maximum available headroom which is " .. max_headroom_size)
diff --git a/cfgmgr/buffer_headroom_mellanox.lua b/cfgmgr/buffer_headroom_mellanox.lua
index 2c958140081..aa0cc53f0f5 100644
--- a/cfgmgr/buffer_headroom_mellanox.lua
+++ b/cfgmgr/buffer_headroom_mellanox.lua
@@ -3,6 +3,7 @@
 -- ARGV[2] - cable length
 -- ARGV[3] - port mtu
 -- ARGV[4] - gearbox delay
+-- ARGV[5] - whether the port is an 8-lane port
 
 -- parameters retried from databases:
 -- From CONFIG_DB.LOSSLESS_TRAFFIC_PATTERN
@@ -26,6 +27,7 @@ local port_speed = tonumber(ARGV[1])
 local cable_length = tonumber(string.sub(ARGV[2], 1, -2))
 local port_mtu = tonumber(ARGV[3])
 local gearbox_delay = tonumber(ARGV[4])
+local is_8lane = ARGV[5]
 
 local appl_db = "0"
 local config_db = "4"
@@ -100,8 +102,8 @@ local xon_value
 local headroom_size
 local speed_overhead
 
--- Adjustment for 400G
-if port_speed == 400000 then
+-- Adjustment for 8-lane port
+if is_8lane ~= nil and is_8lane == "true" then
     pipeline_latency = 37 * 1024
     speed_overhead = port_mtu
 else
diff --git a/cfgmgr/buffer_pool_mellanox.lua b/cfgmgr/buffer_pool_mellanox.lua
index f27316d2b7e..68f18147b1f 100644
--- a/cfgmgr/buffer_pool_mellanox.lua
+++ b/cfgmgr/buffer_pool_mellanox.lua
@@ -5,12 +5,12 @@ local appl_db = "0"
 local config_db = "4"
 local state_db = "6"
 
-local lossypg_reserved = 19 * 1024
-local lossypg_reserved_400g = 37 * 1024
--- Number of 400G ports
-local port_count_400g = 0
--- Number of lossy PG on 400G ports
-local lossypg_400g = 0
+-- Number of ports with 8 lanes (whose pipeline latency should be doubled)
+local port_count_8lanes = 0
+-- Number of lossy PG on ports with 8 lanes
+local lossypg_8lanes = 0
+
+local is_spc3 = false
 
 local result = {}
 local profiles = {}
@@ -20,16 +20,7 @@ local total_port = 0
 local mgmt_pool_size = 256 * 1024
 local egress_mirror_headroom = 10 * 1024
 
-local function find_profile(ref)
-    -- Remove the surrounding square bracket and the find in the list
-    local name = string.sub(ref, 2, -2)
-    for i = 1, #profiles, 1 do
-        if profiles[i][1] == name then
-            return i
-        end
-    end
-    return 0
-end
+local port_set_8lanes = {}
 
 local function iterate_all_items(all_items)
     table.sort(all_items)
@@ -43,9 +34,13 @@ local function iterate_all_items(all_items)
         port = string.match(all_items[i], "Ethernet%d+")
         if port ~= nil then
             local range = string.match(all_items[i], "Ethernet%d+:([^%s]+)$")
-            local profile = redis.call('HGET', all_items[i], 'profile')
-            local index = find_profile(profile)
-            if index == 0 then
+            local profile_name = redis.call('HGET', all_items[i], 'profile')
+            if not profile_name then
+                return 1
+            end
+            profile_name = string.sub(profile_name, 2, -2)
+            local profile_ref_count = profiles[profile_name]
+            if profile_ref_count == nil then
                 -- Indicate an error in case the referenced profile hasn't been inserted or has been removed
                 -- It's possible when the orchagent is busy
                 -- The buffermgrd will take care of it and retry later
@@ -57,13 +52,9 @@ local function iterate_all_items(all_items)
             else
                 size = 1 + tonumber(string.sub(range, -1)) - tonumber(string.sub(range, 1, 1))
             end
-            profiles[index][2] = profiles[index][2] + size
-            local speed = redis.call('HGET', 'PORT_TABLE:'..port, 'speed')
-            if speed == '400000' then
-                if profile == '[BUFFER_PROFILE_TABLE:ingress_lossy_profile]' then
-                    lossypg_400g = lossypg_400g + size
-                end
-                port_count_400g = port_count_400g + 1
+            profiles[profile_name] = profile_ref_count + size
+            if is_spc3 and port_set_8lanes[port] and profile_name == 'BUFFER_PROFILE_TABLE:ingress_lossy_profile' then
+                lossypg_8lanes = lossypg_8lanes + size
             end
         end
     end
@@ -77,6 +68,31 @@ local ports_table = redis.call('KEYS', 'PORT|*')
 
 total_port = #ports_table
 
+-- Initialize the port_set_8lanes set
+local platform = redis.call('HGET', 'DEVICE_METADATA|localhost', 'platform')
+if platform and string.sub(platform, 1, 16) == "x86_64-mlnx_msn4" then
+    is_spc3 = true
+    local lanes
+    local number_of_lanes
+    local port
+    for i = 1, total_port, 1 do
+        -- Load lanes from PORT table
+        lanes = redis.call('HGET', ports_table[i], 'lanes')
+        if lanes then
+            local _
+            _, number_of_lanes = string.gsub(lanes, ",", ",")
+            number_of_lanes = number_of_lanes + 1
+            port = string.sub(ports_table[i], 6, -1)
+            if (number_of_lanes == 8) then
+                port_set_8lanes[port] = true
+                port_count_8lanes = port_count_8lanes + 1
+            else
+                port_set_8lanes[port] = false
+            end
+        end
+    end
+end
+
 local egress_lossless_pool_size = redis.call('HGET', 'BUFFER_POOL|egress_lossless_pool', 'size')
 
 -- Whether shared headroom pool is enabled?
@@ -97,13 +113,30 @@ else
     shp_size = 0
 end
 
+-- Fetch mmu_size
+redis.call('SELECT', state_db)
+local mmu_size = tonumber(redis.call('HGET', 'BUFFER_MAX_PARAM_TABLE|global', 'mmu_size'))
+if mmu_size == nil then
+    mmu_size = tonumber(egress_lossless_pool_size)
+end
+local asic_keys = redis.call('KEYS', 'ASIC_TABLE*')
+local cell_size = tonumber(redis.call('HGET', asic_keys[1], 'cell_size'))
+local pipeline_latency = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency'))
+
+local lossypg_reserved = pipeline_latency * 1024
+local lossypg_reserved_8lanes = (2 * pipeline_latency - 1) * 1024
+
+-- Align mmu_size at cell size boundary, otherwise the sdk will complain and the syncd will fail
+local number_of_cells = math.floor(mmu_size / cell_size)
+local ceiling_mmu_size = number_of_cells * cell_size
+
 -- Switch to APPL_DB
 redis.call('SELECT', appl_db)
 
 -- Fetch names of all profiles and insert them into the look up table
 local all_profiles = redis.call('KEYS', 'BUFFER_PROFILE*')
 for i = 1, #all_profiles, 1 do
-    table.insert(profiles, {all_profiles[i], 0})
+    profiles[all_profiles[i]] = 0
 end
 
 -- Fetch all the PGs
@@ -122,56 +155,44 @@ local statistics = {}
 -- Fetch sizes of all of the profiles, accumulate them
 local accumulative_occupied_buffer = 0
 local accumulative_xoff = 0
-for i = 1, #profiles, 1 do
-    if profiles[i][1] ~= "BUFFER_PROFILE_TABLE_KEY_SET" and profiles[i][1] ~= "BUFFER_PROFILE_TABLE_DEL_SET" then
-        local size = tonumber(redis.call('HGET', profiles[i][1], 'size'))
+
+for name in pairs(profiles) do
+    if name ~= "BUFFER_PROFILE_TABLE_KEY_SET" and name ~= "BUFFER_PROFILE_TABLE_DEL_SET" then
+        local size = tonumber(redis.call('HGET', name, 'size'))
         if size ~= nil then 
-            if profiles[i][1] == "BUFFER_PROFILE_TABLE:ingress_lossy_profile" then
+            if name == "BUFFER_PROFILE_TABLE:ingress_lossy_profile" then
                 size = size + lossypg_reserved
             end
-            if profiles[i][1] == "BUFFER_PROFILE_TABLE:egress_lossy_profile" then
-                profiles[i][2] = total_port
+            if name == "BUFFER_PROFILE_TABLE:egress_lossy_profile" then
+                profiles[name] = total_port
             end
             if size ~= 0 then
                 if shp_enabled and shp_size == 0 then
-                    local xon = tonumber(redis.call('HGET', profiles[i][1], 'xon'))
-                    local xoff = tonumber(redis.call('HGET', profiles[i][1], 'xoff'))
+                    local xon = tonumber(redis.call('HGET', name, 'xon'))
+                    local xoff = tonumber(redis.call('HGET', name, 'xoff'))
                     if xon ~= nil and xoff ~= nil and xon + xoff > size then
-                        accumulative_xoff = accumulative_xoff + (xon + xoff - size) * profiles[i][2]
+                        accumulative_xoff = accumulative_xoff + (xon + xoff - size) * profiles[name]
                     end
                 end
-                accumulative_occupied_buffer = accumulative_occupied_buffer + size * profiles[i][2]
+                accumulative_occupied_buffer = accumulative_occupied_buffer + size * profiles[name]
             end
-            table.insert(statistics, {profiles[i][1], size, profiles[i][2]})
+            table.insert(statistics, {name, size, profiles[name]})
         end
     end
 end
 
--- Extra lossy xon buffer for 400G port
-local lossypg_extra_for_400g = (lossypg_reserved_400g - lossypg_reserved) * lossypg_400g
-accumulative_occupied_buffer = accumulative_occupied_buffer + lossypg_extra_for_400g
+-- Extra lossy xon buffer for ports with 8 lanes
+local lossypg_extra_for_8lanes = (lossypg_reserved_8lanes - lossypg_reserved) * lossypg_8lanes
+accumulative_occupied_buffer = accumulative_occupied_buffer + lossypg_extra_for_8lanes
 
 -- Accumulate sizes for management PGs
-local accumulative_management_pg = (total_port - port_count_400g) * lossypg_reserved + port_count_400g * lossypg_reserved_400g
+local accumulative_management_pg = (total_port - port_count_8lanes) * lossypg_reserved + port_count_8lanes * lossypg_reserved_8lanes
 accumulative_occupied_buffer = accumulative_occupied_buffer + accumulative_management_pg
 
 -- Accumulate sizes for egress mirror and management pool
 local accumulative_egress_mirror_overhead = total_port * egress_mirror_headroom
 accumulative_occupied_buffer = accumulative_occupied_buffer + accumulative_egress_mirror_overhead + mgmt_pool_size
 
--- Fetch mmu_size
-redis.call('SELECT', state_db)
-local mmu_size = tonumber(redis.call('HGET', 'BUFFER_MAX_PARAM_TABLE|global', 'mmu_size'))
-if mmu_size == nil then
-    mmu_size = tonumber(egress_lossless_pool_size)
-end
-local asic_keys = redis.call('KEYS', 'ASIC_TABLE*')
-local cell_size = tonumber(redis.call('HGET', asic_keys[1], 'cell_size'))
-
--- Align mmu_size at cell size boundary, otherwise the sdk will complain and the syncd will fail
-local number_of_cells = math.floor(mmu_size / cell_size)
-local ceiling_mmu_size = number_of_cells * cell_size
-
 -- Switch to CONFIG_DB
 redis.call('SELECT', config_db)
 
@@ -238,13 +259,13 @@ table.insert(result, "debug:accumulative size:" .. accumulative_occupied_buffer)
 for i = 1, #statistics do
     table.insert(result, "debug:" .. statistics[i][1] .. ":" .. statistics[i][2] .. ":" .. statistics[i][3])
 end
-table.insert(result, "debug:extra_400g:" .. (lossypg_reserved_400g - lossypg_reserved) .. ":" .. lossypg_400g .. ":" .. port_count_400g)
+table.insert(result, "debug:extra_8lanes:" .. (lossypg_reserved_8lanes - lossypg_reserved) .. ":" .. lossypg_8lanes .. ":" .. port_count_8lanes)
 table.insert(result, "debug:mgmt_pool:" .. mgmt_pool_size)
 table.insert(result, "debug:accumulative_mgmt_pg:" .. accumulative_management_pg)
 table.insert(result, "debug:egress_mirror:" .. accumulative_egress_mirror_overhead)
 table.insert(result, "debug:shp_enabled:" .. tostring(shp_enabled))
 table.insert(result, "debug:shp_size:" .. shp_size)
 table.insert(result, "debug:accumulative xoff:" .. accumulative_xoff)
-table.insert(result, "debug:total port:" .. total_port)
+table.insert(result, "debug:total port:" .. total_port .. " ports with 8 lanes:" .. port_count_8lanes)
 
 return result
diff --git a/cfgmgr/buffermgrdyn.cpp b/cfgmgr/buffermgrdyn.cpp
index 33ef6c3e5e9..7170522b372 100644
--- a/cfgmgr/buffermgrdyn.cpp
+++ b/cfgmgr/buffermgrdyn.cpp
@@ -242,7 +242,7 @@ string BufferMgrDynamic::parseObjectNameFromReference(const string &reference)
     return parseObjectNameFromKey(objName, 1);
 }
 
-string BufferMgrDynamic::getDynamicProfileName(const string &speed, const string &cable, const string &mtu, const string &threshold, const string &gearbox_model)
+string BufferMgrDynamic::getDynamicProfileName(const string &speed, const string &cable, const string &mtu, const string &threshold, const string &gearbox_model, bool is_8lane)
 {
     string buffer_profile_key;
 
@@ -265,6 +265,11 @@ string BufferMgrDynamic::getDynamicProfileName(const string &speed, const string
         buffer_profile_key = buffer_profile_key + "_" + gearbox_model;
     }
 
+    if (is_8lane)
+    {
+        buffer_profile_key = buffer_profile_key + "_8lane";
+    }
+
     return buffer_profile_key + "_profile";
 }
 
@@ -285,6 +290,7 @@ void BufferMgrDynamic::calculateHeadroomSize(buffer_profile_t &headroom)
     argv.emplace_back(headroom.cable_length);
     argv.emplace_back(headroom.port_mtu);
     argv.emplace_back(m_identifyGearboxDelay);
+    argv.emplace_back(headroom.is_8lane ? "true" : "false");
 
     try
     {
@@ -571,7 +577,7 @@ void BufferMgrDynamic::updateBufferPgToDb(const string &key, const string &profi
 }
 
 // We have to check the headroom ahead of applying them
-task_process_status BufferMgrDynamic::allocateProfile(const string &speed, const string &cable_len, const string &mtu, const string &threshold, const string &gearbox_model, string &profile_name)
+task_process_status BufferMgrDynamic::allocateProfile(const string &speed, const string &cable_len, const string &mtu, const string &threshold, const string &gearbox_model, bool is_8lane, string &profile_name)
 {
     // Create record in BUFFER_PROFILE table
 
@@ -595,6 +601,7 @@ task_process_status BufferMgrDynamic::allocateProfile(const string &speed, const
         profile.cable_length = cable_len;
         profile.port_mtu = mtu;
         profile.gearbox_model = gearbox_model;
+        profile.is_8lane = is_8lane;
 
         // Call vendor-specific lua plugin to calculate the xon, xoff, xon_offset, size
         // Pay attention, the threshold can contain valid value
@@ -796,6 +803,7 @@ task_process_status BufferMgrDynamic::refreshPgsForPort(const string &port, cons
 {
     port_info_t &portInfo = m_portInfoLookup[port];
     string &gearbox_model = portInfo.gearbox_model;
+    bool is8lane = portInfo.is_8lane;
     bool isHeadroomUpdated = false;
     buffer_pg_lookup_t &portPgs = m_portPgLookup[port];
     set<string> profilesToBeReleased;
@@ -834,8 +842,8 @@ task_process_status BufferMgrDynamic::refreshPgsForPort(const string &port, cons
             {
                 threshold = m_defaultThreshold;
             }
-            newProfile = getDynamicProfileName(speed, cable_length, mtu, threshold, gearbox_model);
-            auto rc = allocateProfile(speed, cable_length, mtu, threshold, gearbox_model, newProfile);
+            newProfile = getDynamicProfileName(speed, cable_length, mtu, threshold, gearbox_model, is8lane);
+            auto rc = allocateProfile(speed, cable_length, mtu, threshold, gearbox_model, is8lane, newProfile);
             if (task_process_status::task_success != rc)
                 return rc;
 
@@ -1369,6 +1377,12 @@ task_process_status BufferMgrDynamic::handlePortTable(KeyOpFieldsValuesTuple &tu
 
         for (auto i : kfvFieldsValues(tuple))
         {
+            if (fvField(i) == "lanes")
+            {
+                auto &lanes = fvValue(i);
+                portInfo.is_8lane = (count(lanes.begin(), lanes.end(), ',') + 1 == 8);
+            }
+
             if (fvField(i) == "speed" && fvValue(i) != portInfo.speed)
             {
                 speed_updated = true;
diff --git a/cfgmgr/buffermgrdyn.h b/cfgmgr/buffermgrdyn.h
index 1a683059099..ec3d0f6eb2f 100644
--- a/cfgmgr/buffermgrdyn.h
+++ b/cfgmgr/buffermgrdyn.h
@@ -53,6 +53,7 @@ typedef struct {
     std::string cable_length;
     std::string port_mtu;
     std::string gearbox_model;
+    bool is_8lane;
 
     // APPL_DB.BUFFER_PROFILE fields
     std::string name;
@@ -93,7 +94,7 @@ typedef struct {
     std::string cable_length;
     std::string mtu;
     std::string gearbox_model;
-//    std::string profile_name;
+    bool is_8lane;
 } port_info_t;
 
 //TODO:
@@ -215,7 +216,7 @@ class BufferMgrDynamic : public Orch
     void transformReference(std::string &name);
     std::string parseObjectNameFromKey(const std::string &key, size_t pos/* = 1*/);
     std::string parseObjectNameFromReference(const std::string &reference);
-    std::string getDynamicProfileName(const std::string &speed, const std::string &cable, const std::string &mtu, const std::string &threshold, const std::string &gearbox_model);
+    std::string getDynamicProfileName(const std::string &speed, const std::string &cable, const std::string &mtu, const std::string &threshold, const std::string &gearbox_model, bool is_8lane);
     inline bool isNonZero(const std::string &value) const
     {
         return !value.empty() && value != "0";
@@ -230,7 +231,7 @@ class BufferMgrDynamic : public Orch
     void calculateHeadroomSize(buffer_profile_t &headroom);
     void checkSharedBufferPoolSize(bool force_update_during_initialization);
     void recalculateSharedBufferPool();
-    task_process_status allocateProfile(const std::string &speed, const std::string &cable, const std::string &mtu, const std::string &threshold, const std::string &gearbox_model, std::string &profile_name);
+    task_process_status allocateProfile(const std::string &speed, const std::string &cable, const std::string &mtu, const std::string &threshold, const std::string &gearbox_model, bool is_8lane, std::string &profile_name);
     void releaseProfile(const std::string &profile_name);
     bool isHeadroomResourceValid(const std::string &port, const buffer_profile_t &profile, const std::string &new_pg);
     void refreshSharedHeadroomPool(bool enable_state_updated_by_ratio, bool enable_state_updated_by_size);