Skip to content

Commit

Permalink
[mux]: Implement rollback for failed mux switchovers (sonic-net#2714)
Browse files Browse the repository at this point in the history
- Make all SAI API operations needed for switchover idempotent
- Implement rollback when a switchover fails

Signed-off-by: Lawrence Lee <lawlee@microsoft.com>
  • Loading branch information
theasianpianist committed May 2, 2023
1 parent 4c6fc31 commit bc4062b
Show file tree
Hide file tree
Showing 10 changed files with 764 additions and 51 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ teamsyncd/teamsyncd
tests/tests
tests/mock_tests/tests_response_publisher
tests/mock_tests/tests_fpmsyncd
tests/mock_tests/tests_intfmgrd
tests/mock_tests/tests_portsyncd


# Test Files #
Expand All @@ -87,5 +89,7 @@ tests/mock_tests/tests.trs
tests/test-suite.log
tests/tests.log
tests/tests.trs
tests/mock_tests/**/*log
tests/mock_tests/**/*trs
orchagent/p4orch/tests/**/*gcda
orchagent/p4orch/tests/**/*gcno
11 changes: 11 additions & 0 deletions orchagent/aclorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1140,6 +1140,11 @@ bool AclRule::createRule()
status = sai_acl_api->create_acl_entry(&m_ruleOid, gSwitchId, (uint32_t)rule_attrs.size(), rule_attrs.data());
if (status != SAI_STATUS_SUCCESS)
{
if (status == SAI_STATUS_ITEM_ALREADY_EXISTS)
{
SWSS_LOG_NOTICE("ACL rule %s already exists", m_id.c_str());
return true;
}
SWSS_LOG_ERROR("Failed to create ACL rule %s, rv:%d",
m_id.c_str(), status);
AclRange::remove(range_objects, range_object_list.count);
Expand Down Expand Up @@ -1201,6 +1206,12 @@ bool AclRule::removeRule()
auto status = sai_acl_api->remove_acl_entry(m_ruleOid);
if (status != SAI_STATUS_SUCCESS)
{
if (status == SAI_STATUS_ITEM_NOT_FOUND)
{
SWSS_LOG_NOTICE("ACL rule already deleted");
m_ruleOid = SAI_NULL_OBJECT_ID;
return true;
}
SWSS_LOG_ERROR("Failed to delete ACL rule, status %s", sai_serialize_status(status).c_str());
return false;
}
Expand Down
118 changes: 83 additions & 35 deletions orchagent/muxorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,10 @@ static sai_status_t create_route(IpPrefix &pfx, sai_object_id_t nh)
sai_status_t status = sai_route_api->create_route_entry(&route_entry, (uint32_t)attrs.size(), attrs.data());
if (status != SAI_STATUS_SUCCESS)
{
if (status == SAI_STATUS_ITEM_ALREADY_EXISTS) {
SWSS_LOG_NOTICE("Tunnel route to %s already exists", pfx.to_string().c_str());
return SAI_STATUS_SUCCESS;
}
SWSS_LOG_ERROR("Failed to create tunnel route %s,nh %" PRIx64 " rv:%d",
pfx.getIp().to_string().c_str(), nh, status);
return status;
Expand Down Expand Up @@ -145,6 +149,10 @@ static sai_status_t remove_route(IpPrefix &pfx)
sai_status_t status = sai_route_api->remove_route_entry(&route_entry);
if (status != SAI_STATUS_SUCCESS)
{
if (status == SAI_STATUS_ITEM_NOT_FOUND) {
SWSS_LOG_NOTICE("Tunnel route to %s already removed", pfx.to_string().c_str());
return SAI_STATUS_SUCCESS;
}
SWSS_LOG_ERROR("Failed to remove tunnel route %s, rv:%d",
pfx.getIp().to_string().c_str(), status);
return status;
Expand Down Expand Up @@ -497,15 +505,15 @@ void MuxCable::setState(string new_state)

mux_cb_orch_->updateMuxMetricState(mux_name_, new_state, true);

MuxState state = state_;
prev_state_ = state_;
state_ = ns;

st_chg_in_progress_ = true;

if (!(this->*(state_machine_handlers_[it->second]))())
{
//Reset back to original state
state_ = state;
state_ = prev_state_;
st_chg_in_progress_ = false;
st_chg_failed_ = true;
throw std::runtime_error("Failed to handle state transition");
Expand All @@ -521,6 +529,51 @@ void MuxCable::setState(string new_state)
return;
}

void MuxCable::rollbackStateChange()
{
if (prev_state_ == MuxState::MUX_STATE_FAILED || prev_state_ == MuxState::MUX_STATE_PENDING)
{
SWSS_LOG_ERROR("[%s] Rollback to %s not supported", mux_name_.c_str(),
muxStateValToString.at(prev_state_).c_str());
return;
}
SWSS_LOG_WARN("[%s] Rolling back state change to %s", mux_name_.c_str(),
muxStateValToString.at(prev_state_).c_str());
mux_cb_orch_->updateMuxMetricState(mux_name_, muxStateValToString.at(prev_state_), true);
st_chg_in_progress_ = true;
state_ = prev_state_;
bool success = false;
switch (prev_state_)
{
case MuxState::MUX_STATE_ACTIVE:
success = stateActive();
break;
case MuxState::MUX_STATE_INIT:
case MuxState::MUX_STATE_STANDBY:
success = stateStandby();
break;
case MuxState::MUX_STATE_FAILED:
case MuxState::MUX_STATE_PENDING:
// Check at the start of the function means we will never reach here
SWSS_LOG_ERROR("[%s] Rollback to %s not supported", mux_name_.c_str(),
muxStateValToString.at(prev_state_).c_str());
return;
}
st_chg_in_progress_ = false;
if (success)
{
st_chg_failed_ = false;
}
else
{
st_chg_failed_ = true;
SWSS_LOG_ERROR("[%s] Rollback to %s failed",
mux_name_.c_str(), muxStateValToString.at(prev_state_).c_str());
}
mux_cb_orch_->updateMuxMetricState(mux_name_, muxStateValToString.at(state_), false);
mux_cb_orch_->updateMuxState(mux_name_, muxStateValToString.at(state_));
}

string MuxCable::getState()
{
SWSS_LOG_INFO("Get state request for %s, state %s",
Expand Down Expand Up @@ -838,8 +891,6 @@ void MuxNbrHandler::updateTunnelRoute(NextHopKey nh, bool add)
}
}

std::map<std::string, AclTable> MuxAclHandler::acl_table_;

MuxAclHandler::MuxAclHandler(sai_object_id_t port, string alias)
{
SWSS_LOG_ENTER();
Expand All @@ -857,32 +908,21 @@ MuxAclHandler::MuxAclHandler(sai_object_id_t port, string alias)
port_ = port;
alias_ = alias;

auto found = acl_table_.find(table_name);
if (found == acl_table_.end())
{
SWSS_LOG_NOTICE("First time create for port %" PRIx64 "", port);
// Always try to create the table first. If it already exists, function will return early.
createMuxAclTable(port, table_name);

// First time handling of Mux Table, create ACL table, and bind
createMuxAclTable(port, table_name);
SWSS_LOG_NOTICE("Binding port %" PRIx64 "", port);

AclRule* rule = gAclOrch->getAclRule(table_name, rule_name);
if (rule == nullptr)
{
shared_ptr<AclRulePacket> newRule =
make_shared<AclRulePacket>(gAclOrch, rule_name, table_name, false /*no counters*/);
createMuxAclRule(newRule, table_name);
}
else
{
SWSS_LOG_NOTICE("Binding port %" PRIx64 "", port);

AclRule* rule = gAclOrch->getAclRule(table_name, rule_name);
if (rule == nullptr)
{
shared_ptr<AclRulePacket> newRule =
make_shared<AclRulePacket>(gAclOrch, rule_name, table_name, false /*no counters*/);
createMuxAclRule(newRule, table_name);
}
else
{
gAclOrch->updateAclRule(table_name, rule_name, MATCH_IN_PORTS, &port, RULE_OPER_ADD);
}
gAclOrch->updateAclRule(table_name, rule_name, MATCH_IN_PORTS, &port, RULE_OPER_ADD);
}
}

Expand Down Expand Up @@ -915,23 +955,16 @@ void MuxAclHandler::createMuxAclTable(sai_object_id_t port, string strTable)
{
SWSS_LOG_ENTER();

auto inserted = acl_table_.emplace(piecewise_construct,
std::forward_as_tuple(strTable),
std::forward_as_tuple(gAclOrch, strTable));

assert(inserted.second);

AclTable& acl_table = inserted.first->second;

sai_object_id_t table_oid = gAclOrch->getTableById(strTable);
if (table_oid != SAI_NULL_OBJECT_ID)
{
// DROP ACL table is already created
SWSS_LOG_NOTICE("ACL table %s exists, reuse the same", strTable.c_str());
acl_table = *(gAclOrch->getTableByOid(table_oid));
SWSS_LOG_INFO("ACL table %s exists, reuse the same", strTable.c_str());
return;
}

SWSS_LOG_NOTICE("First time create for port %" PRIx64 "", port);
AclTable acl_table(gAclOrch, strTable);
auto dropType = gAclOrch->getAclTableType(TABLE_TYPE_DROP);
assert(dropType);
acl_table.validateAddType(*dropType);
Expand Down Expand Up @@ -1776,10 +1809,25 @@ bool MuxCableOrch::addOperation(const Request& request)
{
mux_obj->setState(state);
}
catch(const std::runtime_error& error)
catch(const std::runtime_error& e)
{
SWSS_LOG_ERROR("Mux Error setting state %s for port %s. Error: %s",
state.c_str(), port_name.c_str(), error.what());
state.c_str(), port_name.c_str(), e.what());
mux_obj->rollbackStateChange();
return true;
}
catch (const std::logic_error& e)
{
SWSS_LOG_ERROR("Logic error while setting state %s for port %s. Error: %s",
state.c_str(), port_name.c_str(), e.what());
mux_obj->rollbackStateChange();
return true;
}
catch (const std::exception& e)
{
SWSS_LOG_ERROR("Exception caught while setting state %s for port %s. Error: %s",
state.c_str(), port_name.c_str(), e.what());
mux_obj->rollbackStateChange();
return true;
}

Expand Down
4 changes: 2 additions & 2 deletions orchagent/muxorch.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@ class MuxAclHandler
void createMuxAclRule(shared_ptr<AclRulePacket> rule, string strTable);
void bindAllPorts(AclTable &acl_table);

// class shared dict: ACL table name -> ACL table
static std::map<std::string, AclTable> acl_table_;
sai_object_id_t port_ = SAI_NULL_OBJECT_ID;
bool is_ingress_acl_ = true;
string alias_;
Expand Down Expand Up @@ -99,6 +97,7 @@ class MuxCable
using state_machine_handlers = map<MuxStateChange, bool (MuxCable::*)()>;

void setState(string state);
void rollbackStateChange();
string getState();
bool isStateChangeInProgress() { return st_chg_in_progress_; }
bool isStateChangeFailed() { return st_chg_failed_; }
Expand All @@ -123,6 +122,7 @@ class MuxCable
MuxCableType cable_type_;

MuxState state_ = MuxState::MUX_STATE_INIT;
MuxState prev_state_;
bool st_chg_in_progress_ = false;
bool st_chg_failed_ = false;

Expand Down
35 changes: 21 additions & 14 deletions orchagent/neighorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,12 @@ bool NeighOrch::addNextHop(const NextHopKey &nh)
sai_status_t status = sai_next_hop_api->create_next_hop(&next_hop_id, gSwitchId, (uint32_t)next_hop_attrs.size(), next_hop_attrs.data());
if (status != SAI_STATUS_SUCCESS)
{
if (status == SAI_STATUS_ITEM_ALREADY_EXISTS)
{
SWSS_LOG_NOTICE("Next hop %s on %s already exists",
nexthop.ip_address.to_string().c_str(), nexthop.alias.c_str());
return true;
}
SWSS_LOG_ERROR("Failed to create next hop %s on %s, rv:%d",
nexthop.ip_address.to_string().c_str(), nexthop.alias.c_str(), status);
task_process_status handle_status = handleSaiCreateStatus(SAI_API_NEXT_HOP, status);
Expand Down Expand Up @@ -1014,7 +1020,7 @@ bool NeighOrch::removeNeighbor(const NeighborEntry &neighborEntry, bool disable)
/* When next hop is not found, we continue to remove neighbor entry. */
if (status == SAI_STATUS_ITEM_NOT_FOUND)
{
SWSS_LOG_ERROR("Failed to locate next hop %s on %s, rv:%d",
SWSS_LOG_NOTICE("Next hop %s on %s doesn't exist, rv:%d",
ip_address.to_string().c_str(), alias.c_str(), status);
}
else
Expand Down Expand Up @@ -1049,9 +1055,8 @@ bool NeighOrch::removeNeighbor(const NeighborEntry &neighborEntry, bool disable)
{
if (status == SAI_STATUS_ITEM_NOT_FOUND)
{
SWSS_LOG_ERROR("Failed to locate neighbor %s on %s, rv:%d",
SWSS_LOG_NOTICE("Neighbor %s on %s already removed, rv:%d",
m_syncdNeighbors[neighborEntry].mac.to_string().c_str(), alias.c_str(), status);
return true;
}
else
{
Expand All @@ -1064,22 +1069,24 @@ bool NeighOrch::removeNeighbor(const NeighborEntry &neighborEntry, bool disable)
}
}
}

if (neighbor_entry.ip_address.addr_family == SAI_IP_ADDR_FAMILY_IPV4)
{
gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_IPV4_NEIGHBOR);
}
else
{
gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_IPV6_NEIGHBOR);
}
if (neighbor_entry.ip_address.addr_family == SAI_IP_ADDR_FAMILY_IPV4)
{
gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_IPV4_NEIGHBOR);
}
else
{
gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_IPV6_NEIGHBOR);
}

removeNextHop(ip_address, alias);
m_intfsOrch->decreaseRouterIntfsRefCount(alias);
removeNextHop(ip_address, alias);
m_intfsOrch->decreaseRouterIntfsRefCount(alias);
SWSS_LOG_NOTICE("Removed neighbor %s on %s",
m_syncdNeighbors[neighborEntry].mac.to_string().c_str(), alias.c_str());
}
}

SWSS_LOG_NOTICE("Removed neighbor %s on %s",
m_syncdNeighbors[neighborEntry].mac.to_string().c_str(), alias.c_str());

/* Do not delete entry from cache if its disable request */
if (disable)
Expand Down
1 change: 1 addition & 0 deletions tests/mock_tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ tests_SOURCES = aclorch_ut.cpp \
flowcounterrouteorch_ut.cpp \
orchdaemon_ut.cpp \
intfsorch_ut.cpp \
mux_rollback_ut.cpp \
warmrestartassist_ut.cpp \
test_failure_handling.cpp \
$(top_srcdir)/lib/gearboxutils.cpp \
Expand Down
3 changes: 3 additions & 0 deletions tests/mock_tests/mock_orchagent_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ extern VRFOrch *gVrfOrch;
extern NhgOrch *gNhgOrch;
extern Srv6Orch *gSrv6Orch;
extern BfdOrch *gBfdOrch;
extern AclOrch *gAclOrch;
extern PolicerOrch *gPolicerOrch;
extern Directory<Orch*> gDirectory;

extern sai_acl_api_t *sai_acl_api;
Expand All @@ -70,6 +72,7 @@ extern sai_route_api_t *sai_route_api;
extern sai_neighbor_api_t *sai_neighbor_api;
extern sai_tunnel_api_t *sai_tunnel_api;
extern sai_next_hop_api_t *sai_next_hop_api;
extern sai_next_hop_group_api_t *sai_next_hop_group_api;
extern sai_hostif_api_t *sai_hostif_api;
extern sai_policer_api_t *sai_policer_api;
extern sai_buffer_api_t *sai_buffer_api;
Expand Down
Loading

0 comments on commit bc4062b

Please sign in to comment.