Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[fabricportsorch] Collect counters for fabric links #1944

Merged
merged 1 commit into from
Oct 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 95 additions & 26 deletions orchagent/fabricportsorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,18 @@
#include "timer.h"

#define FABRIC_POLLING_INTERVAL_DEFAULT (30)
#define FABRIC_PORT_PREFIX "PORT"
#define FABRIC_PORT_ERROR 0
#define FABRIC_PORT_SUCCESS 1
#define FABRIC_PORT_STAT_COUNTER_FLEX_COUNTER_GROUP "FABRIC_PORT_STAT_COUNTER"
#define FABRIC_PORT_STAT_FLEX_COUNTER_POLLING_INTERVAL_MS 10000
#define FABRIC_QUEUE_STAT_COUNTER_FLEX_COUNTER_GROUP "FABRIC_QUEUE_STAT_COUNTER"
#define FABRIC_QUEUE_STAT_FLEX_COUNTER_POLLING_INTERVAL_MS 100000
#define FABRIC_PORT_TABLE "FABRIC_PORT_TABLE"

extern sai_object_id_t gSwitchId;
extern sai_switch_api_t *sai_switch_api;
extern sai_port_api_t *sai_port_api;
extern sai_queue_api_t *sai_queue_api;

const vector<sai_port_stat_t> port_stat_ids =
{
Expand All @@ -42,7 +43,8 @@ static const vector<sai_queue_stat_t> queue_stat_ids =
SAI_QUEUE_STAT_CURR_OCCUPANCY_LEVEL,
};

FabricPortsOrch::FabricPortsOrch(DBConnector *appl_db, vector<table_name_with_pri_t> &tableNames) :
FabricPortsOrch::FabricPortsOrch(DBConnector *appl_db, vector<table_name_with_pri_t> &tableNames,
bool fabricPortStatEnabled, bool fabricQueueStatEnabled) :
Orch(appl_db, tableNames),
port_stat_manager(FABRIC_PORT_STAT_COUNTER_FLEX_COUNTER_GROUP, StatsMode::READ,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have unit tests for fabric port flex counter groups similar to test_flex_counter.py?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't. Let me see if I can add FABRIC_PORT_STAT_COUNTER for test_flex_counter.py

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The VirtualChassisTopology doesn't have any fabric asics defined, so it's not a simple change to test anything for switch_type=fabric in sonic-swss.

I'll work separately from this PR into updating virtual chassis topology and adding a test_fabric_counters.py test.

FABRIC_PORT_STAT_FLEX_COUNTER_POLLING_INTERVAL_MS, true),
Expand All @@ -55,14 +57,17 @@ FabricPortsOrch::FabricPortsOrch(DBConnector *appl_db, vector<table_name_with_pr
SWSS_LOG_NOTICE( "FabricPortsOrch constructor" );

m_state_db = shared_ptr<DBConnector>(new DBConnector("STATE_DB", 0));
m_stateTable = unique_ptr<Table>(new Table(m_state_db.get(), FABRIC_PORT_TABLE));
m_stateTable = unique_ptr<Table>(new Table(m_state_db.get(), APP_FABRIC_PORT_TABLE_NAME));

m_counter_db = shared_ptr<DBConnector>(new DBConnector("COUNTERS_DB", 0));
m_laneQueueCounterTable = unique_ptr<Table>(new Table(m_counter_db.get(), COUNTERS_QUEUE_NAME_MAP));
m_lanePortCounterTable = unique_ptr<Table>(new Table(m_counter_db.get(), COUNTERS_QUEUE_PORT_MAP));
m_portNameQueueCounterTable = unique_ptr<Table>(new Table(m_counter_db.get(), COUNTERS_FABRIC_QUEUE_NAME_MAP));
m_portNamePortCounterTable = unique_ptr<Table>(new Table(m_counter_db.get(), COUNTERS_FABRIC_PORT_NAME_MAP));

m_flex_db = shared_ptr<DBConnector>(new DBConnector("FLEX_COUNTER_DB", 0));
m_flexCounterTable = unique_ptr<ProducerTable>(new ProducerTable(m_flex_db.get(), FABRIC_PORT_TABLE));
m_flexCounterTable = unique_ptr<ProducerTable>(new ProducerTable(m_flex_db.get(), APP_FABRIC_PORT_TABLE_NAME));

m_fabricPortStatEnabled = fabricPortStatEnabled;
m_fabricQueueStatEnabled = fabricQueueStatEnabled;

getFabricPortList();

Expand Down Expand Up @@ -147,32 +152,96 @@ bool FabricPortsOrch::allPortsReady()

void FabricPortsOrch::generatePortStats()
{
// FIX_ME: This function installs flex counters for port stats
// on fabric ports for fabric asics and voq asics (that connect
// to fabric asics via fabric ports). These counters will be
// installed in FLEX_COUNTER_DB, and queried by syncd and updated
// to COUNTERS_DB.
// However, currently BCM SAI doesn't update its code to query
// port stats (metrics in list port_stat_ids) yet.
// Also, BCM sets too low value for "Max logical port count" (256),
// causing syncd to crash on voq asics that now include regular front
// panel ports, fabric ports, and multiple logical ports.
// So, this function will just do nothing for now, and we will readd
// code to install port stats counters when BCM completely supports.
if (!m_fabricPortStatEnabled) return;

SWSS_LOG_NOTICE("Generate fabric port stats");

vector<FieldValueTuple> portNamePortCounterMap;
for (auto p : m_fabricLanePortMap)
{
int lane = p.first;
sai_object_id_t port = p.second;

std::ostringstream portName;
portName << FABRIC_PORT_PREFIX << lane;
portNamePortCounterMap.emplace_back(portName.str(), sai_serialize_object_id(port));

// Install flex counters for port stats
std::unordered_set<std::string> counter_stats;
for (const auto& it: port_stat_ids)
{
counter_stats.emplace(sai_serialize_port_stat(it));
}
port_stat_manager.setCounterIdList(port, CounterType::PORT, counter_stats);
}
m_portNamePortCounterTable->set("", portNamePortCounterMap);
}

void FabricPortsOrch::generateQueueStats()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is function generateQueueStats used anywhere ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

generateQueueStats is called by FlexCounterOrch. This code was already in place.

{
if (!m_fabricQueueStatEnabled) return;
if (m_isQueueStatsGenerated) return;
if (!m_getFabricPortListDone) return;

// FIX_ME: Similar to generatePortStats(), generateQueueStats() installs
// flex counters for queue stats on fabric ports for fabric asics and voq asics.
// However, currently BCM SAI doesn't fully support queue stats query.
// Query on queue type and index is not supported for fabric asics while
// voq asics are not completely supported.
// So, this function will just do nothing for now, and we will readd
// code to install queue stats counters when BCM completely supports.
SWSS_LOG_NOTICE("Generate queue map for fabric ports");

sai_status_t status;
sai_attribute_t attr;

for (auto p : m_fabricLanePortMap)
{
int lane = p.first;
sai_object_id_t port = p.second;

// Each serdes has some pipes (queues) for unicast and multicast.
// But normally fabric serdes uses only one pipe.
attr.id = SAI_PORT_ATTR_QOS_NUMBER_OF_QUEUES;
status = sai_port_api->get_port_attribute(port, 1, &attr);
if (status != SAI_STATUS_SUCCESS)
{
throw runtime_error("FabricPortsOrch get port queue number failure");
}
int num_queues = attr.value.u32;

if (num_queues > 0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will there be any chance we'll get num_queues <= 0? If yes, please add logic for error logging. If not, we do not need this check.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Definitely >= 0 as the return value of the SAI call is u32. I think we should expect a value >= 1. I'll check if that is correct and update the code accordingly.

{
vector<sai_object_id_t> m_queue_ids;
m_queue_ids.resize(num_queues);

attr.id = SAI_PORT_ATTR_QOS_QUEUE_LIST;
attr.value.objlist.count = (uint32_t) num_queues;
attr.value.objlist.list = m_queue_ids.data();

status = sai_port_api->get_port_attribute(port, 1, &attr);
if (status != SAI_STATUS_SUCCESS)
{
throw runtime_error("FabricPortsOrch get port queue list failure");
}

// Maintain queue map and install flex counters for queue stats
vector<FieldValueTuple> portNameQueueMap;

// Fabric serdes queue type is SAI_QUEUE_TYPE_FABRIC_TX. Since we always
// maintain only one queue for fabric serdes, m_queue_ids size is 1.
// And so, there is no need to query SAI_QUEUE_ATTR_TYPE and SAI_QUEUE_ATTR_INDEX
// for queue. Actually, SAI does not support query these attributes on fabric serdes.
int queueIndex = 0;
std::ostringstream portName;
portName << FABRIC_PORT_PREFIX << lane << ":" << queueIndex;
const auto queue = sai_serialize_object_id(m_queue_ids[queueIndex]);
portNameQueueMap.emplace_back(portName.str(), queue);

// We collect queue counters like occupancy level
std::unordered_set<string> counter_stats;
for (const auto& it: queue_stat_ids)
{
counter_stats.emplace(sai_serialize_queue_stat(it));
}
queue_stat_manager.setCounterIdList(m_queue_ids[queueIndex], CounterType::QUEUE, counter_stats);

m_portNameQueueCounterTable->set("", portNameQueueMap);
}
}

m_isQueueStatsGenerated = true;
}
Expand All @@ -199,7 +268,7 @@ void FabricPortsOrch::updateFabricPortState()
int lane = p.first;
sai_object_id_t port = p.second;

string key = "PORT" + to_string(lane);
string key = FABRIC_PORT_PREFIX + to_string(lane);
std::vector<FieldValueTuple> values;
uint32_t remote_peer;
uint32_t remote_port;
Expand Down
10 changes: 7 additions & 3 deletions orchagent/fabricportsorch.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,22 @@
class FabricPortsOrch : public Orch, public Subject
{
public:
FabricPortsOrch(DBConnector *appl_db, vector<table_name_with_pri_t> &tableNames);
FabricPortsOrch(DBConnector *appl_db, vector<table_name_with_pri_t> &tableNames,
bool fabricPortStatEnabled=true, bool fabricQueueStatEnabled=true);
bool allPortsReady();
void generateQueueStats();

private:
bool m_fabricPortStatEnabled;
bool m_fabricQueueStatEnabled;

shared_ptr<DBConnector> m_state_db;
shared_ptr<DBConnector> m_counter_db;
shared_ptr<DBConnector> m_flex_db;

unique_ptr<Table> m_stateTable;
unique_ptr<Table> m_laneQueueCounterTable;
unique_ptr<Table> m_lanePortCounterTable;
unique_ptr<Table> m_portNameQueueCounterTable;
unique_ptr<Table> m_portNamePortCounterTable;
unique_ptr<ProducerTable> m_flexCounterTable;

swss::SelectableTimer *m_timer = nullptr;
Expand Down
3 changes: 3 additions & 0 deletions orchagent/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -708,6 +708,9 @@ int main(int argc, char **argv)
if (gMySwitchType == "voq")
{
orchDaemon->setFabricEnabled(true);
// SAI doesn't fully support counters for non fabric asics
orchDaemon->setFabricPortStatEnabled(false);
orchDaemon->setFabricQueueStatEnabled(false);
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are initialized as "true" in declaration (line#s 77 and 78 below). For regular switch (i.e, gMySwitchType is not "fabric" and not "voq") are we going to have these m_fabricPortStatEnabled, m_fabricQueueStatEnabled unchanged?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FabricPortsOrch is only created when gMySwitchType is "voq" or "fabric". These settings are set here to be used when FabricPortsOrch is instanciated.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you check the build failure?

We will need sonic-net/sonic-swss-common#551 to merge for build to pass.

}
else
Expand Down
2 changes: 1 addition & 1 deletion orchagent/orchdaemon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ bool OrchDaemon::init()
vector<table_name_with_pri_t> fabric_port_tables = {
// empty for now
};
gFabricPortsOrch = new FabricPortsOrch(m_applDb, fabric_port_tables);
gFabricPortsOrch = new FabricPortsOrch(m_applDb, fabric_port_tables, m_fabricPortStatEnabled, m_fabricQueueStatEnabled);
m_orchList.push_back(gFabricPortsOrch);
}

Expand Down
10 changes: 10 additions & 0 deletions orchagent/orchdaemon.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,14 @@ class OrchDaemon
{
m_fabricEnabled = enabled;
}
void setFabricPortStatEnabled(bool enabled)
{
m_fabricPortStatEnabled = enabled;
}
void setFabricQueueStatEnabled(bool enabled)
{
m_fabricQueueStatEnabled = enabled;
}
void logRotate();
private:
DBConnector *m_applDb;
Expand All @@ -77,6 +85,8 @@ class OrchDaemon
DBConnector *m_chassisAppDb;

bool m_fabricEnabled = false;
bool m_fabricPortStatEnabled = true;
bool m_fabricQueueStatEnabled = true;

std::vector<Orch *> m_orchList;
Select *m_select;
Expand Down
40 changes: 32 additions & 8 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,12 @@ def random_string(size=4, chars=string.ascii_uppercase + string.digits):


class AsicDbValidator(DVSDatabase):
def __init__(self, db_id: int, connector: str):
def __init__(self, db_id: int, connector: str, switch_type: str):
DVSDatabase.__init__(self, db_id, connector)
self._wait_for_asic_db_to_initialize()
self._populate_default_asic_db_values()
self._generate_oid_to_interface_mapping()
if switch_type not in ['fabric']:
self._wait_for_asic_db_to_initialize()
self._populate_default_asic_db_values()
self._generate_oid_to_interface_mapping()

def _wait_for_asic_db_to_initialize(self) -> None:
"""Wait up to 30 seconds for the default fields to appear in ASIC DB."""
Expand Down Expand Up @@ -497,7 +498,9 @@ def _polling_function():
wait_for_result(_polling_function, service_polling_config)

def init_asic_db_validator(self) -> None:
self.asicdb = AsicDbValidator(self.ASIC_DB_ID, self.redis_sock)
self.get_config_db()
metadata = self.config_db.get_entry('DEVICE_METADATA|localhost', '')
self.asicdb = AsicDbValidator(self.ASIC_DB_ID, self.redis_sock, metadata.get("switch_type"))

def init_appl_db_validator(self) -> None:
self.appldb = ApplDbValidator(self.APPL_DB_ID, self.redis_sock)
Expand Down Expand Up @@ -526,11 +529,13 @@ def _polling_function():
port_table_keys = app_db.get_keys("PORT_TABLE")
return ("PortInitDone" in port_table_keys and "PortConfigDone" in port_table_keys, None)

wait_for_result(_polling_function, startup_polling_config)
if metadata.get('switch_type') not in ['fabric']:
wait_for_result(_polling_function, startup_polling_config)

# Verify that all ports have been created
asic_db = self.get_asic_db()
asic_db.wait_for_n_keys("ASIC_STATE:SAI_OBJECT_TYPE_PORT", num_ports + 1) # +1 CPU Port
if metadata.get('switch_type') not in ['fabric']:
asic_db = self.get_asic_db()
asic_db.wait_for_n_keys("ASIC_STATE:SAI_OBJECT_TYPE_PORT", num_ports + 1) # +1 CPU Port

# Verify that fabric ports are monitored in STATE_DB
if metadata.get('switch_type', 'npu') in ['voq', 'fabric']:
Expand Down Expand Up @@ -1802,6 +1807,25 @@ def dvs(request, manage_dvs) -> DockerVirtualSwitch:

return manage_dvs(log_path, dvs_env)

@pytest.yield_fixture(scope="module")
def vst(request):
vctns = request.config.getoption("--vctns")
topo = request.config.getoption("--topo")
forcedvs = request.config.getoption("--forcedvs")
keeptb = request.config.getoption("--keeptb")
imgname = request.config.getoption("--imgname")
max_cpu = request.config.getoption("--max_cpu")
log_path = vctns if vctns else request.module.__name__
dvs_env = getattr(request.module, "DVS_ENV", [])
if not topo:
# use ecmp topology as default
topo = "virtual_chassis/chassis_supervisor.json"
vct = DockerVirtualChassisTopology(vctns, imgname, keeptb, dvs_env, log_path, max_cpu,
forcedvs, topo)
yield vct
vct.get_logs(request.module.__name__)
vct.destroy()

@pytest.fixture(scope="module")
def vct(request):
vctns = request.config.getoption("--vctns")
Expand Down
Loading