diff --git a/cfgmgr/buffer_check_headroom_mellanox.lua b/cfgmgr/buffer_check_headroom_mellanox.lua index 510a967b730..08f955fe4c7 100644 --- a/cfgmgr/buffer_check_headroom_mellanox.lua +++ b/cfgmgr/buffer_check_headroom_mellanox.lua @@ -7,10 +7,25 @@ local port = KEYS[1] local input_profile_name = ARGV[1] local input_profile_size = ARGV[2] local new_pg = ARGV[3] -local accumulative_size = 0 + +local function is_port_with_8lanes(lanes) + -- On Spectrum 3, ports with 8 lanes have doubled pipeline latency + local number_of_lanes = 0 + if lanes then + local _ + _, number_of_lanes = string.gsub(lanes, ",", ",") + number_of_lanes = number_of_lanes + 1 + end + return number_of_lanes == 8 +end + +-- Initialize the accumulative size with 4096 +-- This is to absorb the possible deviation +local accumulative_size = 4096 local appl_db = "0" local state_db = "6" +local config_db = "4" local ret_true = {} local ret = {} @@ -20,7 +35,18 @@ table.insert(ret_true, "result:true") default_ret = ret_true -local speed = redis.call('HGET', 'PORT|' .. port, 'speed') +-- Connect to CONFIG_DB +redis.call('SELECT', config_db) + +local lanes + +-- On SPC3 switch, we need to know whether it's a 8-lane port because it has extra pipeline latency +local is_spc3 = false +local platform = redis.call('HGET', 'DEVICE_METADATA|localhost', 'platform') +if platform and string.sub(platform, 1, 16) == "x86_64-mlnx_msn4" then + is_spc3 = true + lanes = redis.call('HGET', 'PORT|' .. port, 'lanes') +end -- Fetch the threshold from STATE_DB redis.call('SELECT', state_db) @@ -31,11 +57,12 @@ if max_headroom_size == nil then end local asic_keys = redis.call('KEYS', 'ASIC_TABLE*') -local pipeline_delay = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency')) -if speed == 400000 then - pipeline_delay = pipeline_delay * 2 - 1 +local pipeline_latency = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency')) +if is_spc3 and is_port_with_8lanes(lanes) then + -- The pipeline latency should be adjusted accordingly for ports with 2 buffer units + pipeline_latency = pipeline_latency * 2 - 1 end -accumulative_size = accumulative_size + 2 * pipeline_delay * 1024 +accumulative_size = accumulative_size + 2 * pipeline_latency * 1024 -- Fetch all keys in BUFFER_PG according to the port redis.call('SELECT', appl_db) @@ -95,6 +122,7 @@ end if max_headroom_size > accumulative_size then table.insert(ret, "result:true") + table.insert(ret, "debug:Accumulative headroom on port " .. accumulative_size .. ", the maximum available headroom " .. max_headroom_size) else table.insert(ret, "result:false") table.insert(ret, "debug:Accumulative headroom on port " .. accumulative_size .. " exceeds the maximum available headroom which is " .. max_headroom_size) diff --git a/cfgmgr/buffer_headroom_mellanox.lua b/cfgmgr/buffer_headroom_mellanox.lua index 2c958140081..aa0cc53f0f5 100644 --- a/cfgmgr/buffer_headroom_mellanox.lua +++ b/cfgmgr/buffer_headroom_mellanox.lua @@ -3,6 +3,7 @@ -- ARGV[2] - cable length -- ARGV[3] - port mtu -- ARGV[4] - gearbox delay +-- ARGV[5] - whether the port is an 8-lane port -- parameters retried from databases: -- From CONFIG_DB.LOSSLESS_TRAFFIC_PATTERN @@ -26,6 +27,7 @@ local port_speed = tonumber(ARGV[1]) local cable_length = tonumber(string.sub(ARGV[2], 1, -2)) local port_mtu = tonumber(ARGV[3]) local gearbox_delay = tonumber(ARGV[4]) +local is_8lane = ARGV[5] local appl_db = "0" local config_db = "4" @@ -100,8 +102,8 @@ local xon_value local headroom_size local speed_overhead --- Adjustment for 400G -if port_speed == 400000 then +-- Adjustment for 8-lane port +if is_8lane ~= nil and is_8lane == "true" then pipeline_latency = 37 * 1024 speed_overhead = port_mtu else diff --git a/cfgmgr/buffer_pool_mellanox.lua b/cfgmgr/buffer_pool_mellanox.lua index f27316d2b7e..68f18147b1f 100644 --- a/cfgmgr/buffer_pool_mellanox.lua +++ b/cfgmgr/buffer_pool_mellanox.lua @@ -5,12 +5,12 @@ local appl_db = "0" local config_db = "4" local state_db = "6" -local lossypg_reserved = 19 * 1024 -local lossypg_reserved_400g = 37 * 1024 --- Number of 400G ports -local port_count_400g = 0 --- Number of lossy PG on 400G ports -local lossypg_400g = 0 +-- Number of ports with 8 lanes (whose pipeline latency should be doubled) +local port_count_8lanes = 0 +-- Number of lossy PG on ports with 8 lanes +local lossypg_8lanes = 0 + +local is_spc3 = false local result = {} local profiles = {} @@ -20,16 +20,7 @@ local total_port = 0 local mgmt_pool_size = 256 * 1024 local egress_mirror_headroom = 10 * 1024 -local function find_profile(ref) - -- Remove the surrounding square bracket and the find in the list - local name = string.sub(ref, 2, -2) - for i = 1, #profiles, 1 do - if profiles[i][1] == name then - return i - end - end - return 0 -end +local port_set_8lanes = {} local function iterate_all_items(all_items) table.sort(all_items) @@ -43,9 +34,13 @@ local function iterate_all_items(all_items) port = string.match(all_items[i], "Ethernet%d+") if port ~= nil then local range = string.match(all_items[i], "Ethernet%d+:([^%s]+)$") - local profile = redis.call('HGET', all_items[i], 'profile') - local index = find_profile(profile) - if index == 0 then + local profile_name = redis.call('HGET', all_items[i], 'profile') + if not profile_name then + return 1 + end + profile_name = string.sub(profile_name, 2, -2) + local profile_ref_count = profiles[profile_name] + if profile_ref_count == nil then -- Indicate an error in case the referenced profile hasn't been inserted or has been removed -- It's possible when the orchagent is busy -- The buffermgrd will take care of it and retry later @@ -57,13 +52,9 @@ local function iterate_all_items(all_items) else size = 1 + tonumber(string.sub(range, -1)) - tonumber(string.sub(range, 1, 1)) end - profiles[index][2] = profiles[index][2] + size - local speed = redis.call('HGET', 'PORT_TABLE:'..port, 'speed') - if speed == '400000' then - if profile == '[BUFFER_PROFILE_TABLE:ingress_lossy_profile]' then - lossypg_400g = lossypg_400g + size - end - port_count_400g = port_count_400g + 1 + profiles[profile_name] = profile_ref_count + size + if is_spc3 and port_set_8lanes[port] and profile_name == 'BUFFER_PROFILE_TABLE:ingress_lossy_profile' then + lossypg_8lanes = lossypg_8lanes + size end end end @@ -77,6 +68,31 @@ local ports_table = redis.call('KEYS', 'PORT|*') total_port = #ports_table +-- Initialize the port_set_8lanes set +local platform = redis.call('HGET', 'DEVICE_METADATA|localhost', 'platform') +if platform and string.sub(platform, 1, 16) == "x86_64-mlnx_msn4" then + is_spc3 = true + local lanes + local number_of_lanes + local port + for i = 1, total_port, 1 do + -- Load lanes from PORT table + lanes = redis.call('HGET', ports_table[i], 'lanes') + if lanes then + local _ + _, number_of_lanes = string.gsub(lanes, ",", ",") + number_of_lanes = number_of_lanes + 1 + port = string.sub(ports_table[i], 6, -1) + if (number_of_lanes == 8) then + port_set_8lanes[port] = true + port_count_8lanes = port_count_8lanes + 1 + else + port_set_8lanes[port] = false + end + end + end +end + local egress_lossless_pool_size = redis.call('HGET', 'BUFFER_POOL|egress_lossless_pool', 'size') -- Whether shared headroom pool is enabled? @@ -97,13 +113,30 @@ else shp_size = 0 end +-- Fetch mmu_size +redis.call('SELECT', state_db) +local mmu_size = tonumber(redis.call('HGET', 'BUFFER_MAX_PARAM_TABLE|global', 'mmu_size')) +if mmu_size == nil then + mmu_size = tonumber(egress_lossless_pool_size) +end +local asic_keys = redis.call('KEYS', 'ASIC_TABLE*') +local cell_size = tonumber(redis.call('HGET', asic_keys[1], 'cell_size')) +local pipeline_latency = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency')) + +local lossypg_reserved = pipeline_latency * 1024 +local lossypg_reserved_8lanes = (2 * pipeline_latency - 1) * 1024 + +-- Align mmu_size at cell size boundary, otherwise the sdk will complain and the syncd will fail +local number_of_cells = math.floor(mmu_size / cell_size) +local ceiling_mmu_size = number_of_cells * cell_size + -- Switch to APPL_DB redis.call('SELECT', appl_db) -- Fetch names of all profiles and insert them into the look up table local all_profiles = redis.call('KEYS', 'BUFFER_PROFILE*') for i = 1, #all_profiles, 1 do - table.insert(profiles, {all_profiles[i], 0}) + profiles[all_profiles[i]] = 0 end -- Fetch all the PGs @@ -122,56 +155,44 @@ local statistics = {} -- Fetch sizes of all of the profiles, accumulate them local accumulative_occupied_buffer = 0 local accumulative_xoff = 0 -for i = 1, #profiles, 1 do - if profiles[i][1] ~= "BUFFER_PROFILE_TABLE_KEY_SET" and profiles[i][1] ~= "BUFFER_PROFILE_TABLE_DEL_SET" then - local size = tonumber(redis.call('HGET', profiles[i][1], 'size')) + +for name in pairs(profiles) do + if name ~= "BUFFER_PROFILE_TABLE_KEY_SET" and name ~= "BUFFER_PROFILE_TABLE_DEL_SET" then + local size = tonumber(redis.call('HGET', name, 'size')) if size ~= nil then - if profiles[i][1] == "BUFFER_PROFILE_TABLE:ingress_lossy_profile" then + if name == "BUFFER_PROFILE_TABLE:ingress_lossy_profile" then size = size + lossypg_reserved end - if profiles[i][1] == "BUFFER_PROFILE_TABLE:egress_lossy_profile" then - profiles[i][2] = total_port + if name == "BUFFER_PROFILE_TABLE:egress_lossy_profile" then + profiles[name] = total_port end if size ~= 0 then if shp_enabled and shp_size == 0 then - local xon = tonumber(redis.call('HGET', profiles[i][1], 'xon')) - local xoff = tonumber(redis.call('HGET', profiles[i][1], 'xoff')) + local xon = tonumber(redis.call('HGET', name, 'xon')) + local xoff = tonumber(redis.call('HGET', name, 'xoff')) if xon ~= nil and xoff ~= nil and xon + xoff > size then - accumulative_xoff = accumulative_xoff + (xon + xoff - size) * profiles[i][2] + accumulative_xoff = accumulative_xoff + (xon + xoff - size) * profiles[name] end end - accumulative_occupied_buffer = accumulative_occupied_buffer + size * profiles[i][2] + accumulative_occupied_buffer = accumulative_occupied_buffer + size * profiles[name] end - table.insert(statistics, {profiles[i][1], size, profiles[i][2]}) + table.insert(statistics, {name, size, profiles[name]}) end end end --- Extra lossy xon buffer for 400G port -local lossypg_extra_for_400g = (lossypg_reserved_400g - lossypg_reserved) * lossypg_400g -accumulative_occupied_buffer = accumulative_occupied_buffer + lossypg_extra_for_400g +-- Extra lossy xon buffer for ports with 8 lanes +local lossypg_extra_for_8lanes = (lossypg_reserved_8lanes - lossypg_reserved) * lossypg_8lanes +accumulative_occupied_buffer = accumulative_occupied_buffer + lossypg_extra_for_8lanes -- Accumulate sizes for management PGs -local accumulative_management_pg = (total_port - port_count_400g) * lossypg_reserved + port_count_400g * lossypg_reserved_400g +local accumulative_management_pg = (total_port - port_count_8lanes) * lossypg_reserved + port_count_8lanes * lossypg_reserved_8lanes accumulative_occupied_buffer = accumulative_occupied_buffer + accumulative_management_pg -- Accumulate sizes for egress mirror and management pool local accumulative_egress_mirror_overhead = total_port * egress_mirror_headroom accumulative_occupied_buffer = accumulative_occupied_buffer + accumulative_egress_mirror_overhead + mgmt_pool_size --- Fetch mmu_size -redis.call('SELECT', state_db) -local mmu_size = tonumber(redis.call('HGET', 'BUFFER_MAX_PARAM_TABLE|global', 'mmu_size')) -if mmu_size == nil then - mmu_size = tonumber(egress_lossless_pool_size) -end -local asic_keys = redis.call('KEYS', 'ASIC_TABLE*') -local cell_size = tonumber(redis.call('HGET', asic_keys[1], 'cell_size')) - --- Align mmu_size at cell size boundary, otherwise the sdk will complain and the syncd will fail -local number_of_cells = math.floor(mmu_size / cell_size) -local ceiling_mmu_size = number_of_cells * cell_size - -- Switch to CONFIG_DB redis.call('SELECT', config_db) @@ -238,13 +259,13 @@ table.insert(result, "debug:accumulative size:" .. accumulative_occupied_buffer) for i = 1, #statistics do table.insert(result, "debug:" .. statistics[i][1] .. ":" .. statistics[i][2] .. ":" .. statistics[i][3]) end -table.insert(result, "debug:extra_400g:" .. (lossypg_reserved_400g - lossypg_reserved) .. ":" .. lossypg_400g .. ":" .. port_count_400g) +table.insert(result, "debug:extra_8lanes:" .. (lossypg_reserved_8lanes - lossypg_reserved) .. ":" .. lossypg_8lanes .. ":" .. port_count_8lanes) table.insert(result, "debug:mgmt_pool:" .. mgmt_pool_size) table.insert(result, "debug:accumulative_mgmt_pg:" .. accumulative_management_pg) table.insert(result, "debug:egress_mirror:" .. accumulative_egress_mirror_overhead) table.insert(result, "debug:shp_enabled:" .. tostring(shp_enabled)) table.insert(result, "debug:shp_size:" .. shp_size) table.insert(result, "debug:accumulative xoff:" .. accumulative_xoff) -table.insert(result, "debug:total port:" .. total_port) +table.insert(result, "debug:total port:" .. total_port .. " ports with 8 lanes:" .. port_count_8lanes) return result diff --git a/cfgmgr/buffermgrdyn.cpp b/cfgmgr/buffermgrdyn.cpp index 33ef6c3e5e9..7170522b372 100644 --- a/cfgmgr/buffermgrdyn.cpp +++ b/cfgmgr/buffermgrdyn.cpp @@ -242,7 +242,7 @@ string BufferMgrDynamic::parseObjectNameFromReference(const string &reference) return parseObjectNameFromKey(objName, 1); } -string BufferMgrDynamic::getDynamicProfileName(const string &speed, const string &cable, const string &mtu, const string &threshold, const string &gearbox_model) +string BufferMgrDynamic::getDynamicProfileName(const string &speed, const string &cable, const string &mtu, const string &threshold, const string &gearbox_model, bool is_8lane) { string buffer_profile_key; @@ -265,6 +265,11 @@ string BufferMgrDynamic::getDynamicProfileName(const string &speed, const string buffer_profile_key = buffer_profile_key + "_" + gearbox_model; } + if (is_8lane) + { + buffer_profile_key = buffer_profile_key + "_8lane"; + } + return buffer_profile_key + "_profile"; } @@ -285,6 +290,7 @@ void BufferMgrDynamic::calculateHeadroomSize(buffer_profile_t &headroom) argv.emplace_back(headroom.cable_length); argv.emplace_back(headroom.port_mtu); argv.emplace_back(m_identifyGearboxDelay); + argv.emplace_back(headroom.is_8lane ? "true" : "false"); try { @@ -571,7 +577,7 @@ void BufferMgrDynamic::updateBufferPgToDb(const string &key, const string &profi } // We have to check the headroom ahead of applying them -task_process_status BufferMgrDynamic::allocateProfile(const string &speed, const string &cable_len, const string &mtu, const string &threshold, const string &gearbox_model, string &profile_name) +task_process_status BufferMgrDynamic::allocateProfile(const string &speed, const string &cable_len, const string &mtu, const string &threshold, const string &gearbox_model, bool is_8lane, string &profile_name) { // Create record in BUFFER_PROFILE table @@ -595,6 +601,7 @@ task_process_status BufferMgrDynamic::allocateProfile(const string &speed, const profile.cable_length = cable_len; profile.port_mtu = mtu; profile.gearbox_model = gearbox_model; + profile.is_8lane = is_8lane; // Call vendor-specific lua plugin to calculate the xon, xoff, xon_offset, size // Pay attention, the threshold can contain valid value @@ -796,6 +803,7 @@ task_process_status BufferMgrDynamic::refreshPgsForPort(const string &port, cons { port_info_t &portInfo = m_portInfoLookup[port]; string &gearbox_model = portInfo.gearbox_model; + bool is8lane = portInfo.is_8lane; bool isHeadroomUpdated = false; buffer_pg_lookup_t &portPgs = m_portPgLookup[port]; set profilesToBeReleased; @@ -834,8 +842,8 @@ task_process_status BufferMgrDynamic::refreshPgsForPort(const string &port, cons { threshold = m_defaultThreshold; } - newProfile = getDynamicProfileName(speed, cable_length, mtu, threshold, gearbox_model); - auto rc = allocateProfile(speed, cable_length, mtu, threshold, gearbox_model, newProfile); + newProfile = getDynamicProfileName(speed, cable_length, mtu, threshold, gearbox_model, is8lane); + auto rc = allocateProfile(speed, cable_length, mtu, threshold, gearbox_model, is8lane, newProfile); if (task_process_status::task_success != rc) return rc; @@ -1369,6 +1377,12 @@ task_process_status BufferMgrDynamic::handlePortTable(KeyOpFieldsValuesTuple &tu for (auto i : kfvFieldsValues(tuple)) { + if (fvField(i) == "lanes") + { + auto &lanes = fvValue(i); + portInfo.is_8lane = (count(lanes.begin(), lanes.end(), ',') + 1 == 8); + } + if (fvField(i) == "speed" && fvValue(i) != portInfo.speed) { speed_updated = true; diff --git a/cfgmgr/buffermgrdyn.h b/cfgmgr/buffermgrdyn.h index 1a683059099..ec3d0f6eb2f 100644 --- a/cfgmgr/buffermgrdyn.h +++ b/cfgmgr/buffermgrdyn.h @@ -53,6 +53,7 @@ typedef struct { std::string cable_length; std::string port_mtu; std::string gearbox_model; + bool is_8lane; // APPL_DB.BUFFER_PROFILE fields std::string name; @@ -93,7 +94,7 @@ typedef struct { std::string cable_length; std::string mtu; std::string gearbox_model; -// std::string profile_name; + bool is_8lane; } port_info_t; //TODO: @@ -215,7 +216,7 @@ class BufferMgrDynamic : public Orch void transformReference(std::string &name); std::string parseObjectNameFromKey(const std::string &key, size_t pos/* = 1*/); std::string parseObjectNameFromReference(const std::string &reference); - std::string getDynamicProfileName(const std::string &speed, const std::string &cable, const std::string &mtu, const std::string &threshold, const std::string &gearbox_model); + std::string getDynamicProfileName(const std::string &speed, const std::string &cable, const std::string &mtu, const std::string &threshold, const std::string &gearbox_model, bool is_8lane); inline bool isNonZero(const std::string &value) const { return !value.empty() && value != "0"; @@ -230,7 +231,7 @@ class BufferMgrDynamic : public Orch void calculateHeadroomSize(buffer_profile_t &headroom); void checkSharedBufferPoolSize(bool force_update_during_initialization); void recalculateSharedBufferPool(); - task_process_status allocateProfile(const std::string &speed, const std::string &cable, const std::string &mtu, const std::string &threshold, const std::string &gearbox_model, std::string &profile_name); + task_process_status allocateProfile(const std::string &speed, const std::string &cable, const std::string &mtu, const std::string &threshold, const std::string &gearbox_model, bool is_8lane, std::string &profile_name); void releaseProfile(const std::string &profile_name); bool isHeadroomResourceValid(const std::string &port, const buffer_profile_t &profile, const std::string &new_pg); void refreshSharedHeadroomPool(bool enable_state_updated_by_ratio, bool enable_state_updated_by_size);