Skip to content

Commit

Permalink
[Dynamic Buffer Calc][Mellanox] Bug fixes and enhancements for the lu…
Browse files Browse the repository at this point in the history
…a plugins for buffer pool calculation and headroom checking (#1781)

What I did
Bug fixes for buffer pool calculation and headroom checking on Mellanox platforms.

Test the number of lanes instead of the speed when determining whether special handling is required for a port.
For speeds other than 400G, e.g. 100G, it's possible that some 100G ports have 8 lanes and others have 4 lanes,
which means they cannot share the same buffer profile.
A suffix _8lane is introduced to indicate it, like pg_lossless_100000_5m_8lane_profile
Take the private headroom into account when calculating the buffer pool size
Take deviation into account when checking the headroom against the per-port limit to avoid the inaccurate result in a rare case
Use a hashtable to record the reference count of a profile in the lua plugin

Signed-off-by: Stephen Sun stephens@nvidia.com

How I verified it
Run regression and manually test

Details if related

Test the number of lanes instead of the speed when determining whether special handling (double headroom size) is required for a port.
Originally, it was determined by testing whether the ports' speed is 400G but that is not accurate. A user can configure a port with 8 lanes to 100G. In this case, special handling is still required for a port that is not 400G.
So we need to adjust the way to do that.
The variable names are also updated accordingly: xxx_400g => xxx_8lanes
Take deviation into account when checking the headroom against the per-port limit to avoid the inaccurate result in a rare case
There are some deviations that make the accumulative headroom a bit larger than the quantity calculated by the buffer manager. We need to take it into account when calculating the accumulative headroom.
  • Loading branch information
stephenxs authored and qiluo-msft committed Jun 29, 2021
1 parent f949dfe commit bb383be
Show file tree
Hide file tree
Showing 6 changed files with 190 additions and 82 deletions.
35 changes: 29 additions & 6 deletions cfgmgr/buffer_check_headroom_mellanox.lua
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,25 @@ local port = KEYS[1]
local input_profile_name = ARGV[1]
local input_profile_size = ARGV[2]
local new_pg = ARGV[3]
local accumulative_size = 0

local function is_port_with_8lanes(lanes)
    -- Tell whether a port's "lanes" field describes exactly 8 lanes.
    -- On Spectrum 3, ports with 8 lanes have doubled pipeline latency.
    --
    -- lanes: comma-separated lane list, or nil/false when the field is absent.
    -- Returns true only when the list contains exactly 8 entries.
    if not lanes then
        -- No lane information: the lane count stays at 0, which is never 8
        return false
    end
    -- The second result of gsub is the number of commas replaced;
    -- a list with N commas has N + 1 entries
    local _, comma_count = string.gsub(lanes, ",", ",")
    return comma_count + 1 == 8
end

-- Initialize the accumulative size with 4096
-- This is to absorb the possible deviation
local accumulative_size = 4096

local appl_db = "0"
local state_db = "6"
local config_db = "4"

local ret_true = {}
local ret = {}
Expand All @@ -20,7 +35,13 @@ table.insert(ret_true, "result:true")

default_ret = ret_true

local speed = redis.call('HGET', 'PORT|' .. port, 'speed')
-- Connect to CONFIG_DB
redis.call('SELECT', config_db)

local lanes

-- We need to know whether it's an 8-lane port because it has extra pipeline latency
lanes = redis.call('HGET', 'PORT|' .. port, 'lanes')

-- Fetch the threshold from STATE_DB
redis.call('SELECT', state_db)
Expand All @@ -31,11 +52,12 @@ if max_headroom_size == nil then
end

local asic_keys = redis.call('KEYS', 'ASIC_TABLE*')
local pipeline_delay = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency'))
if speed == 400000 then
pipeline_delay = pipeline_delay * 2 - 1
local pipeline_latency = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency'))
if is_port_with_8lanes(lanes) then
-- The pipeline latency should be adjusted accordingly for ports with 2 buffer units
pipeline_latency = pipeline_latency * 2 - 1
end
accumulative_size = accumulative_size + 2 * pipeline_delay * 1024
accumulative_size = accumulative_size + 2 * pipeline_latency * 1024

-- Fetch all keys in BUFFER_PG according to the port
redis.call('SELECT', appl_db)
Expand Down Expand Up @@ -95,6 +117,7 @@ end

if max_headroom_size > accumulative_size then
table.insert(ret, "result:true")
table.insert(ret, "debug:Accumulative headroom on port " .. accumulative_size .. ", the maximum available headroom " .. max_headroom_size)
else
table.insert(ret, "result:false")
table.insert(ret, "debug:Accumulative headroom on port " .. accumulative_size .. " exceeds the maximum available headroom which is " .. max_headroom_size)
Expand Down
8 changes: 5 additions & 3 deletions cfgmgr/buffer_headroom_mellanox.lua
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
-- ARGV[2] - cable length
-- ARGV[3] - port mtu
-- ARGV[4] - gearbox delay
-- ARGV[5] - lane count of the ports on which the profile will be applied

-- parameters retrieved from databases:
-- From CONFIG_DB.LOSSLESS_TRAFFIC_PATTERN
Expand All @@ -26,6 +27,7 @@ local port_speed = tonumber(ARGV[1])
local cable_length = tonumber(string.sub(ARGV[2], 1, -2))
local port_mtu = tonumber(ARGV[3])
local gearbox_delay = tonumber(ARGV[4])
local is_8lane = (ARGV[5] == "8")

local appl_db = "0"
local config_db = "4"
Expand Down Expand Up @@ -100,9 +102,9 @@ local xon_value
local headroom_size
local speed_overhead

-- Adjustment for 400G
if port_speed == 400000 then
pipeline_latency = 37 * 1024
-- Adjustment for 8-lane port
if is_8lane ~= nil and is_8lane then
pipeline_latency = pipeline_latency * 2 - 1024
speed_overhead = port_mtu
else
speed_overhead = 0
Expand Down
169 changes: 109 additions & 60 deletions cfgmgr/buffer_pool_mellanox.lua
Original file line number Diff line number Diff line change
Expand Up @@ -5,34 +5,31 @@ local appl_db = "0"
local config_db = "4"
local state_db = "6"

local lossypg_reserved = 19 * 1024
local lossypg_reserved_400g = 37 * 1024
-- Number of 400G ports
local port_count_400g = 0
-- Number of lossy PG on 400G ports
local lossypg_400g = 0
-- Number of ports with 8 lanes (whose pipeline latency should be doubled)
local port_count_8lanes = 0
-- Number of lossy PG on ports with 8 lanes
local lossypg_8lanes = 0

-- Private headroom
local private_headroom = 10 * 1024

local result = {}
local profiles = {}
local lossless_profiles = {}

local total_port = 0

local mgmt_pool_size = 256 * 1024
local egress_mirror_headroom = 10 * 1024

local function find_profile(ref)
    -- Look up a profile reference in the file-level `profiles` list.
    --
    -- ref: profile reference whose first and last characters are dropped
    --      before the comparison (the surrounding square brackets).
    -- Returns the 1-based position of the first entry whose name (element 1)
    -- equals the stripped reference, or 0 when no entry matches.
    local wanted = string.sub(ref, 2, -2)
    local found = 0
    for idx = 1, #profiles do
        if profiles[idx][1] == wanted then
            found = idx
            break
        end
    end
    return found
end
-- The set of ports with 8 lanes
local port_set_8lanes = {}
-- Number of ports with lossless profiles
local lossless_port_count = 0

local function iterate_all_items(all_items)
local function iterate_all_items(all_items, check_lossless)
table.sort(all_items)
local lossless_ports = {}
local port
local fvpairs
for i = 1, #all_items, 1 do
Expand All @@ -43,9 +40,13 @@ local function iterate_all_items(all_items)
port = string.match(all_items[i], "Ethernet%d+")
if port ~= nil then
local range = string.match(all_items[i], "Ethernet%d+:([^%s]+)$")
local profile = redis.call('HGET', all_items[i], 'profile')
local index = find_profile(profile)
if index == 0 then
local profile_name = redis.call('HGET', all_items[i], 'profile')
if not profile_name then
return 1
end
profile_name = string.sub(profile_name, 2, -2)
local profile_ref_count = profiles[profile_name]
if profile_ref_count == nil then
-- Indicate an error in case the referenced profile hasn't been inserted or has been removed
-- It's possible when the orchagent is busy
-- The buffermgrd will take care of it and retry later
Expand All @@ -57,13 +58,15 @@ local function iterate_all_items(all_items)
else
size = 1 + tonumber(string.sub(range, -1)) - tonumber(string.sub(range, 1, 1))
end
profiles[index][2] = profiles[index][2] + size
local speed = redis.call('HGET', 'PORT_TABLE:'..port, 'speed')
if speed == '400000' then
if profile == '[BUFFER_PROFILE_TABLE:ingress_lossy_profile]' then
lossypg_400g = lossypg_400g + size
profiles[profile_name] = profile_ref_count + size
if port_set_8lanes[port] and profile_name == 'BUFFER_PROFILE_TABLE:ingress_lossy_profile' then
lossypg_8lanes = lossypg_8lanes + size
end
if check_lossless and lossless_profiles[profile_name] then
if lossless_ports[port] == nil then
lossless_port_count = lossless_port_count + 1
lossless_ports[port] = true
end
port_count_400g = port_count_400g + 1
end
end
end
Expand All @@ -77,6 +80,27 @@ local ports_table = redis.call('KEYS', 'PORT|*')

total_port = #ports_table

-- Initialize the port_set_8lanes set
local lanes
local number_of_lanes
local port
for i = 1, total_port, 1 do
-- Load lanes from PORT table
lanes = redis.call('HGET', ports_table[i], 'lanes')
if lanes then
local _
_, number_of_lanes = string.gsub(lanes, ",", ",")
number_of_lanes = number_of_lanes + 1
port = string.sub(ports_table[i], 6, -1)
if (number_of_lanes == 8) then
port_set_8lanes[port] = true
port_count_8lanes = port_count_8lanes + 1
else
port_set_8lanes[port] = false
end
end
end

local egress_lossless_pool_size = redis.call('HGET', 'BUFFER_POOL|egress_lossless_pool', 'size')

-- Whether shared headroom pool is enabled?
Expand All @@ -97,22 +121,45 @@ else
shp_size = 0
end

-- Fetch mmu_size
redis.call('SELECT', state_db)
local mmu_size = tonumber(redis.call('HGET', 'BUFFER_MAX_PARAM_TABLE|global', 'mmu_size'))
if mmu_size == nil then
mmu_size = tonumber(egress_lossless_pool_size)
end
local asic_keys = redis.call('KEYS', 'ASIC_TABLE*')
local cell_size = tonumber(redis.call('HGET', asic_keys[1], 'cell_size'))
local pipeline_latency = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency'))

local lossypg_reserved = pipeline_latency * 1024
local lossypg_reserved_8lanes = (2 * pipeline_latency - 1) * 1024

-- Align mmu_size at cell size boundary, otherwise the sdk will complain and the syncd will fail
local number_of_cells = math.floor(mmu_size / cell_size)
local ceiling_mmu_size = number_of_cells * cell_size

-- Switch to APPL_DB
redis.call('SELECT', appl_db)

-- Fetch names of all profiles and insert them into the look up table
local all_profiles = redis.call('KEYS', 'BUFFER_PROFILE*')
for i = 1, #all_profiles, 1 do
table.insert(profiles, {all_profiles[i], 0})
if all_profiles[i] ~= "BUFFER_PROFILE_TABLE_KEY_SET" and all_profiles[i] ~= "BUFFER_PROFILE_TABLE_DEL_SET" then
local xoff = redis.call('HGET', all_profiles[i], 'xoff')
if xoff then
lossless_profiles[all_profiles[i]] = true
end
profiles[all_profiles[i]] = 0
end
end

-- Fetch all the PGs
local all_pgs = redis.call('KEYS', 'BUFFER_PG*')
local all_tcs = redis.call('KEYS', 'BUFFER_QUEUE*')

local fail_count = 0
fail_count = fail_count + iterate_all_items(all_pgs)
fail_count = fail_count + iterate_all_items(all_tcs)
fail_count = fail_count + iterate_all_items(all_pgs, true)
fail_count = fail_count + iterate_all_items(all_tcs, false)
if fail_count > 0 then
return {}
end
Expand All @@ -122,56 +169,55 @@ local statistics = {}
-- Fetch sizes of all of the profiles, accumulate them
local accumulative_occupied_buffer = 0
local accumulative_xoff = 0
for i = 1, #profiles, 1 do
if profiles[i][1] ~= "BUFFER_PROFILE_TABLE_KEY_SET" and profiles[i][1] ~= "BUFFER_PROFILE_TABLE_DEL_SET" then
local size = tonumber(redis.call('HGET', profiles[i][1], 'size'))

for name in pairs(profiles) do
if name ~= "BUFFER_PROFILE_TABLE_KEY_SET" and name ~= "BUFFER_PROFILE_TABLE_DEL_SET" then
local size = tonumber(redis.call('HGET', name, 'size'))
if size ~= nil then
if profiles[i][1] == "BUFFER_PROFILE_TABLE:ingress_lossy_profile" then
if name == "BUFFER_PROFILE_TABLE:ingress_lossy_profile" then
size = size + lossypg_reserved
end
if profiles[i][1] == "BUFFER_PROFILE_TABLE:egress_lossy_profile" then
profiles[i][2] = total_port
if name == "BUFFER_PROFILE_TABLE:egress_lossy_profile" then
profiles[name] = total_port
end
if size ~= 0 then
if shp_enabled and shp_size == 0 then
local xon = tonumber(redis.call('HGET', profiles[i][1], 'xon'))
local xoff = tonumber(redis.call('HGET', profiles[i][1], 'xoff'))
local xon = tonumber(redis.call('HGET', name, 'xon'))
local xoff = tonumber(redis.call('HGET', name, 'xoff'))
if xon ~= nil and xoff ~= nil and xon + xoff > size then
accumulative_xoff = accumulative_xoff + (xon + xoff - size) * profiles[i][2]
accumulative_xoff = accumulative_xoff + (xon + xoff - size) * profiles[name]
end
end
accumulative_occupied_buffer = accumulative_occupied_buffer + size * profiles[i][2]
accumulative_occupied_buffer = accumulative_occupied_buffer + size * profiles[name]
end
table.insert(statistics, {profiles[i][1], size, profiles[i][2]})
table.insert(statistics, {name, size, profiles[name]})
end
end
end

-- Extra lossy xon buffer for 400G port
local lossypg_extra_for_400g = (lossypg_reserved_400g - lossypg_reserved) * lossypg_400g
accumulative_occupied_buffer = accumulative_occupied_buffer + lossypg_extra_for_400g
-- Extra lossy xon buffer for ports with 8 lanes
local lossypg_extra_for_8lanes = (lossypg_reserved_8lanes - lossypg_reserved) * lossypg_8lanes
accumulative_occupied_buffer = accumulative_occupied_buffer + lossypg_extra_for_8lanes

-- Accumulate sizes for private headrooms
local accumulative_private_headroom = 0
if shp_enabled then
accumulative_private_headroom = lossless_port_count * private_headroom
accumulative_occupied_buffer = accumulative_occupied_buffer + accumulative_private_headroom
accumulative_xoff = accumulative_xoff - accumulative_private_headroom
if accumulative_xoff < 0 then
accumulative_xoff = 0
end
end

-- Accumulate sizes for management PGs
local accumulative_management_pg = (total_port - port_count_400g) * lossypg_reserved + port_count_400g * lossypg_reserved_400g
local accumulative_management_pg = (total_port - port_count_8lanes) * lossypg_reserved + port_count_8lanes * lossypg_reserved_8lanes
accumulative_occupied_buffer = accumulative_occupied_buffer + accumulative_management_pg

-- Accumulate sizes for egress mirror and management pool
local accumulative_egress_mirror_overhead = total_port * egress_mirror_headroom
accumulative_occupied_buffer = accumulative_occupied_buffer + accumulative_egress_mirror_overhead + mgmt_pool_size

-- Fetch mmu_size
redis.call('SELECT', state_db)
local mmu_size = tonumber(redis.call('HGET', 'BUFFER_MAX_PARAM_TABLE|global', 'mmu_size'))
if mmu_size == nil then
mmu_size = tonumber(egress_lossless_pool_size)
end
local asic_keys = redis.call('KEYS', 'ASIC_TABLE*')
local cell_size = tonumber(redis.call('HGET', asic_keys[1], 'cell_size'))

-- Align mmu_size at cell size boundary, otherwise the sdk will complain and the syncd will fail
local number_of_cells = math.floor(mmu_size / cell_size)
local ceiling_mmu_size = number_of_cells * cell_size

-- Switch to CONFIG_DB
redis.call('SELECT', config_db)

Expand Down Expand Up @@ -238,13 +284,16 @@ table.insert(result, "debug:accumulative size:" .. accumulative_occupied_buffer)
for i = 1, #statistics do
table.insert(result, "debug:" .. statistics[i][1] .. ":" .. statistics[i][2] .. ":" .. statistics[i][3])
end
table.insert(result, "debug:extra_400g:" .. (lossypg_reserved_400g - lossypg_reserved) .. ":" .. lossypg_400g .. ":" .. port_count_400g)
table.insert(result, "debug:extra_8lanes:" .. (lossypg_reserved_8lanes - lossypg_reserved) .. ":" .. lossypg_8lanes .. ":" .. port_count_8lanes)
table.insert(result, "debug:mgmt_pool:" .. mgmt_pool_size)
if shp_enabled then
table.insert(result, "debug:accumulative_private_headroom:" .. accumulative_private_headroom)
table.insert(result, "debug:accumulative xoff:" .. accumulative_xoff)
end
table.insert(result, "debug:accumulative_mgmt_pg:" .. accumulative_management_pg)
table.insert(result, "debug:egress_mirror:" .. accumulative_egress_mirror_overhead)
table.insert(result, "debug:shp_enabled:" .. tostring(shp_enabled))
table.insert(result, "debug:shp_size:" .. shp_size)
table.insert(result, "debug:accumulative xoff:" .. accumulative_xoff)
table.insert(result, "debug:total port:" .. total_port)
table.insert(result, "debug:total port:" .. total_port .. " ports with 8 lanes:" .. port_count_8lanes)

return result
Loading

0 comments on commit bb383be

Please sign in to comment.