Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle Port oper down error status notification #3350

Merged
merged 1 commit into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion orchagent/p4orch/tests/fake_portorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ extern "C"

PortsOrch::PortsOrch(DBConnector *db, DBConnector *stateDb, vector<table_name_with_pri_t> &tableNames,
DBConnector *chassisAppDb)
: Orch(db, tableNames), m_portStateTable(stateDb, STATE_PORT_TABLE_NAME),
: Orch(db, tableNames),
m_portStateTable(stateDb, STATE_PORT_TABLE_NAME),
m_portOpErrTable(stateDb, STATE_PORT_OPER_ERR_TABLE_NAME),
port_stat_manager(PORT_STAT_COUNTER_FLEX_COUNTER_GROUP, StatsMode::READ,
PORT_STAT_FLEX_COUNTER_POLLING_INTERVAL_MS, true),
port_buffer_drop_stat_manager(PORT_BUFFER_DROP_STAT_FLEX_COUNTER_GROUP, StatsMode::READ,
Expand Down
44 changes: 43 additions & 1 deletion orchagent/port.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@ extern "C" {
#include <vector>
#include <map>
#include <bitset>
#include <chrono>
#include <unordered_set>

#include <iomanip>
#include <sstream>
#include <macaddress.h>
#include <sairedis.h>

Expand Down Expand Up @@ -74,6 +76,42 @@ struct SystemLagInfo
int32_t spa_id = 0;
};

class PortOperErrorEvent
{
public:
PortOperErrorEvent() = default;
PortOperErrorEvent(const sai_port_error_status_t error, std::string key) : m_errorFlag(error), m_dbKeyError(key){}
~PortOperErrorEvent() = default;

inline void incrementErrorCount(void) { m_errorCount++; }

inline size_t getErrorCount(void) const { return m_errorCount; }

void recordEventTime(void) {
auto now = std::chrono::system_clock::now();
m_eventTime = std::chrono::system_clock::to_time_t(now);
}

std::string getEventTime(void) {
std::ostringstream oss;
oss << std::put_time(std::gmtime(&m_eventTime), "%Y-%m-%d %H:%M:%S");
return oss.str();
}

inline std::string getDbKey(void) const { return m_dbKeyError; }

// Returns true if port oper error flag in sai_port_error_status_t is set
bool isErrorSet(sai_port_error_status_t errstatus) const { return (m_errorFlag & errstatus);}

static const std::unordered_map<sai_port_error_status_t, std::string> db_key_errors;

private:
sai_port_error_status_t m_errorFlag = SAI_PORT_ERROR_STATUS_CLEAR;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we really need this member? I think the port event from redis already contains the error status.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Junchao-Mellanox probably not required but i thought in case i need to take OA coredump for future debubbing i don't need redis dump.

size_t m_errorCount = 0;
std::string m_dbKeyError; // DB key for this port error
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we really need this member? The DB key name is already in the static map

std::time_t m_eventTime = 0;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we really need this member? The event time can be get whenever there is an error event. Seems no need to save it in a class.

};

class Port
{
public:
Expand Down Expand Up @@ -155,6 +193,7 @@ class Port
sai_object_id_t m_parent_port_id = 0;
uint32_t m_dependency_bitmap = 0;
sai_port_oper_status_t m_oper_status = SAI_PORT_OPER_STATUS_UNKNOWN;
sai_port_error_status_t m_oper_error_status = SAI_PORT_ERROR_STATUS_CLEAR; //Bitmap of last port oper error status
std::set<std::string> m_members;
std::set<std::string> m_child_ports;
std::vector<sai_object_id_t> m_queue_ids;
Expand Down Expand Up @@ -193,6 +232,9 @@ class Port
sai_object_id_t m_system_side_id = 0;
sai_object_id_t m_line_side_id = 0;

/* Port oper error status to event map*/
std::unordered_map<sai_port_error_status_t, PortOperErrorEvent> m_portOperErrorToEvent;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i suppose we only need a std::unordered_map<sai_port_error_status_t, uint32_t> map here to record the occur count for each error type.


/* pre-emphasis */
std::map<sai_port_serdes_attr_t, std::vector<uint32_t>> m_preemphasis;

Expand Down
122 changes: 119 additions & 3 deletions orchagent/portsorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,25 @@ static char* hostif_vlan_tag[] = {
[SAI_HOSTIF_VLAN_TAG_ORIGINAL] = "SAI_HOSTIF_VLAN_TAG_ORIGINAL"
};

const std::unordered_map<sai_port_error_status_t, std::string> PortOperErrorEvent::db_key_errors =
{
// SAI port oper error status to error name mapping
{ SAI_PORT_ERROR_STATUS_MAC_LOCAL_FAULT, "mac_local_fault"},
{ SAI_PORT_ERROR_STATUS_MAC_REMOTE_FAULT, "mac_remote_fault"},
{ SAI_PORT_ERROR_STATUS_FEC_SYNC_LOSS, "fec_sync_loss"},
{ SAI_PORT_ERROR_STATUS_FEC_LOSS_ALIGNMENT_MARKER, "fec_alignment_loss"},
{ SAI_PORT_ERROR_STATUS_HIGH_SER, "high_ser_error"},
{ SAI_PORT_ERROR_STATUS_HIGH_BER, "high ber_error"},
{ SAI_PORT_ERROR_STATUS_CRC_RATE, "crc_rate"},
{ SAI_PORT_ERROR_STATUS_DATA_UNIT_CRC_ERROR, "data_unit_crc_error"},
{ SAI_PORT_ERROR_STATUS_DATA_UNIT_SIZE, "data_unit_size"},
{ SAI_PORT_ERROR_STATUS_DATA_UNIT_MISALIGNMENT_ERROR, "data_unit_misalignment_error"},
{ SAI_PORT_ERROR_STATUS_CODE_GROUP_ERROR, "code_group_error"},
{ SAI_PORT_ERROR_STATUS_SIGNAL_LOCAL_ERROR, "signal_local_error"},
{ SAI_PORT_ERROR_STATUS_NO_RX_REACHABILITY, "no_rx_reachability"}
};


// functions ----------------------------------------------------------------------------------------------------------

static bool isValidPortTypeForLagMember(const Port& port)
Expand Down Expand Up @@ -509,6 +528,7 @@ bool PortsOrch::checkPathTracingCapability()
PortsOrch::PortsOrch(DBConnector *db, DBConnector *stateDb, vector<table_name_with_pri_t> &tableNames, DBConnector *chassisAppDb) :
Orch(db, tableNames),
m_portStateTable(stateDb, STATE_PORT_TABLE_NAME),
m_portOpErrTable(stateDb, STATE_PORT_OPER_ERR_TABLE_NAME),
port_stat_manager(PORT_STAT_COUNTER_FLEX_COUNTER_GROUP, StatsMode::READ, PORT_STAT_FLEX_COUNTER_POLLING_INTERVAL_MS, false),
gb_port_stat_manager(true,
PORT_STAT_COUNTER_FLEX_COUNTER_GROUP, StatsMode::READ,
Expand Down Expand Up @@ -808,6 +828,26 @@ void PortsOrch::initializeCpuPort()
SWSS_LOG_NOTICE("Get CPU port pid:%" PRIx64, this->m_cpuPort.m_port_id);
}

// Creating mapping of various port oper errors for error handling
void PortsOrch::initializePortOperErrors(Port &port)
{
SWSS_LOG_ENTER();

SWSS_LOG_NOTICE("Initialize port oper errors for port %s", port.m_alias.c_str());

for (auto& error : PortOperErrorEvent::db_key_errors)
{
const sai_port_error_status_t error_status = error.first;
std::string error_name = error.second;

port.m_portOperErrorToEvent[error_status] = PortOperErrorEvent(error_status, error_name);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no sure this is required

SWSS_LOG_NOTICE("Initialize port %s error %s flag=0x%" PRIx32,
port.m_alias.c_str(),
error_name.c_str(),
error_status);
}
}

void PortsOrch::initializePorts()
{
SWSS_LOG_ENTER();
Expand Down Expand Up @@ -3351,6 +3391,26 @@ void PortsOrch::updateDbPortFlapCount(Port& port, sai_port_oper_status_t pstatus
m_portTable->set(port.m_alias, tuples);
}

void PortsOrch::updateDbPortOperError(Port& port, PortOperErrorEvent *pevent)
{
SWSS_LOG_ENTER();

auto key = pevent->getDbKey();
vector<FieldValueTuple> tuples;
FieldValueTuple tup1("oper_error_status", std::to_string(port.m_oper_error_status));

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please check the latest change to notification structure - opencomputeproject/SAI#2087
Was discussed in the SAI community meeting

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@eddyk-nvidia that is for future use case. I can't wait for that to merge and start using as its already not meeting 202411 release time.

tuples.push_back(tup1);

size_t count = pevent->getErrorCount();
FieldValueTuple tup2(key + "_count", std::to_string(count));
tuples.push_back(tup2);

auto time = pevent->getEventTime();
FieldValueTuple tup3(key + "_time", time);
tuples.push_back(tup3);

m_portOpErrTable.set(port.m_alias, tuples);
}

void PortsOrch::updateDbPortOperStatus(const Port& port, sai_port_oper_status_t status) const
{
SWSS_LOG_ENTER();
Expand Down Expand Up @@ -4613,6 +4673,8 @@ void PortsOrch::doPortTask(Consumer &consumer)
/* create host_tx_ready field in state-db */
initHostTxReadyState(p);

initializePortOperErrors(p);

// Restore admin status if the port was brought down
if (admin_status != p.m_admin_state_up)
{
Expand Down Expand Up @@ -8019,12 +8081,14 @@ void PortsOrch::doTask(NotificationConsumer &consumer)

for (uint32_t i = 0; i < count; i++)
{
Port port;
sai_object_id_t id = portoperstatus[i].port_id;
sai_port_oper_status_t status = portoperstatus[i].port_state;
sai_port_error_status_t port_oper_err = portoperstatus[i].port_error_status;

SWSS_LOG_NOTICE("Get port state change notification id:%" PRIx64 " status:%d", id, status);

Port port;
SWSS_LOG_NOTICE("Get port state change notification id:%" PRIx64 " status:%d "
"oper_error_status:0x%" PRIx32,
id, status, port_oper_err);

if (!getPort(id, port))
{
Expand Down Expand Up @@ -8061,6 +8125,11 @@ void PortsOrch::doTask(NotificationConsumer &consumer)
{
updateDbPortOperFec(port, "N/A");
}
} else {
if (port_oper_err)
{
updatePortErrorStatus(port, port_oper_err);
}
}

/* update m_portList */
Expand Down Expand Up @@ -8089,6 +8158,53 @@ void PortsOrch::doTask(NotificationConsumer &consumer)

}

void PortsOrch::updatePortErrorStatus(Port &port, sai_port_error_status_t errstatus)
{
size_t errors = 0;
string db_port_error_name;
PortOperErrorEvent *portOperErrorEvent = nullptr;
size_t error_count = PortOperErrorEvent::db_key_errors.size();

SWSS_LOG_NOTICE("Port %s error state set from 0x%" PRIx32 "-> 0x%" PRIx32,
port.m_alias.c_str(),
port.m_oper_error_status,
errstatus);

port.m_oper_error_status = errstatus;

// Iterate through all the port oper errors
while ((errstatus >> errors) && (errors < error_count))
{
sai_port_error_status_t error_status = static_cast<sai_port_error_status_t>(errstatus & (1 << errors));

if (port.m_portOperErrorToEvent.find(error_status) == port.m_portOperErrorToEvent.end())
{
++errors;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if error_status is 0, no need query the map. If it is not 0 and not in the map, we probably need record an error log.

continue;
}

portOperErrorEvent = &port.m_portOperErrorToEvent[error_status];

if (portOperErrorEvent->isErrorSet(errstatus))
{
SWSS_LOG_NOTICE("Port %s oper error event: %s occurred",
port.m_alias.c_str(),
portOperErrorEvent->getDbKey().c_str());
portOperErrorEvent->recordEventTime();
portOperErrorEvent->incrementErrorCount();
updateDbPortOperError(port, portOperErrorEvent);
}
else
{
SWSS_LOG_WARN("Port %s port oper error %s not updated in DB",
port.m_alias.c_str(),
portOperErrorEvent->getDbKey().c_str());
}

++errors;
}
}

void PortsOrch::updatePortOperStatus(Port &port, sai_port_oper_status_t status)
{
SWSS_LOG_NOTICE("Port %s oper state set from %s to %s",
Expand Down
4 changes: 4 additions & 0 deletions orchagent/portsorch.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,14 @@ class PortsOrch : public Orch, public Subject
void setPort(string alias, Port port);
void getCpuPort(Port &port);
void initHostTxReadyState(Port &port);
void initializePortOperErrors(Port &port);
bool getInbandPort(Port &port);
bool getVlanByVlanId(sai_vlan_id_t vlan_id, Port &vlan);

bool setHostIntfsOperStatus(const Port& port, bool up) const;
void updateDbPortOperStatus(const Port& port, sai_port_oper_status_t status) const;
void updateDbPortFlapCount(Port& port, sai_port_oper_status_t pstatus);
void updateDbPortOperError(Port& port, PortOperErrorEvent *pevent);

bool createVlanHostIntf(Port& vl, string hostif_name);
bool removeVlanHostIntf(Port vl);
Expand Down Expand Up @@ -263,6 +265,7 @@ class PortsOrch : public Orch, public Subject
unique_ptr<Table> m_pgIndexTable;
unique_ptr<Table> m_stateBufferMaximumValueTable;
Table m_portStateTable;
Table m_portOpErrTable;

std::string getQueueWatermarkFlexCounterTableKey(std::string s);
std::string getPriorityGroupWatermarkFlexCounterTableKey(std::string s);
Expand Down Expand Up @@ -502,6 +505,7 @@ class PortsOrch : public Orch, public Subject
bool initGearboxPort(Port &port);
bool getPortOperFec(const Port& port, sai_port_fec_mode_t &fec_mode) const;
void updateDbPortOperFec(Port &port, string fec_str);
void updatePortErrorStatus(Port &port, sai_port_error_status_t port_oper_eror);

map<string, Port::Role> m_recircPortRole;

Expand Down
Loading
Loading