Skip to content

Commit

Permalink
[teammgrd]: Add retry logic when enslaving member port into team (#669)
Browse files Browse the repository at this point in the history
* [teammgrd]: Add retry logic when enslaving member port into team

When a port is not set as admin down, teamd will prevent the port
from being added into the port channel. This could happen when some
other processes or users are executing commands to bring up the port.
This commit tries to enslave the port up to three times before returning false.

Example error message when adding a port which is admin up:

libteamdctl: cli_usock_process_msg:
usock: Error message received: "PortAddFail"
libteamdctl: cli_usock_process_msg: usock: Error message content: "Failed to add port."
command call failed (Invalid argument)

* [teammgrd]: Check the admin status after the teamdctl command failure

Instead of using retry logic with a time limit, it is better to
confirm the exact reason for the failure, i.e. whether the member port
is admin up or not. The function checkPortIffUp() is added to get the
netdev flags and check the IFF_UP flag.

The add member logic will only retry under the circumstance that the
teamdctl command fails and the member port is still UP.

Signed-off-by: Shuotian Cheng <shuche@microsoft.com>
  • Loading branch information
Shuotian Cheng authored Nov 6, 2018
1 parent 36e304d commit f666011
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 20 deletions.
105 changes: 86 additions & 19 deletions cfgmgr/teammgr.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#include <unistd.h>

#include "exec.h"
#include "teammgr.h"
#include "logger.h"
Expand All @@ -8,6 +10,9 @@
#include <sstream>
#include <thread>

#include <sys/ioctl.h>
#include <net/if.h>

using namespace std;
using namespace swss;

Expand Down Expand Up @@ -182,7 +187,7 @@ void TeamMgr::doLagMemberTask(Consumer &consumer)
{
KeyOpFieldsValuesTuple t = it->second;

auto tokens = tokenize(kfvKey(t), '|');
auto tokens = tokenize(kfvKey(t), config_db_key_delimiter);
auto lag = tokens[0];
auto member = tokens[1];

Expand All @@ -196,7 +201,11 @@ void TeamMgr::doLagMemberTask(Consumer &consumer)
continue;
}

addLagMember(lag, member);
if (addLagMember(lag, member) == task_need_retry)
{
it++;
continue;
}
}
else if (op == DEL_COMMAND)
{
Expand All @@ -207,6 +216,49 @@ void TeamMgr::doLagMemberTask(Consumer &consumer)
}
}

bool TeamMgr::checkPortIffUp(const string &port)
{
SWSS_LOG_ENTER();

struct ifreq ifr;
memcpy(ifr.ifr_name, port.c_str(), strlen(port.c_str()));
ifr.ifr_name[strlen(port.c_str())] = 0;

int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
if (fd == -1 || ioctl(fd, SIOCGIFFLAGS, &ifr) == -1)
{
SWSS_LOG_ERROR("Failed to get port %s flags", port.c_str());
return false;
}

SWSS_LOG_INFO("Get port %s flags %i", port.c_str(), ifr.ifr_flags);

return ifr.ifr_flags & IFF_UP;
}

// Look up the port channel that `port` is configured as a member of.
//
// Scans the keys of m_cfgLagMemberTable, splitting each key on
// config_db_key_delimiter into <lag, member>. On the first key whose member
// part equals `port`, stores the lag part in `master` and returns true.
// Returns false (leaving `master` untouched) when no key matches.
bool TeamMgr::findPortMaster(string &master, const string &port)
{
    SWSS_LOG_ENTER();

    vector<string> keys;
    m_cfgLagMemberTable.getKeys(keys);

    for (const auto &key : keys)
    {
        const auto tokens = tokenize(key, config_db_key_delimiter);

        // A well-formed key has both a lag and a member part; skip anything
        // else rather than indexing past the end of the token list.
        if (tokens.size() < 2)
        {
            SWSS_LOG_ERROR("Malformed LAG member key %s", key.c_str());
            continue;
        }

        if (tokens[1] == port)
        {
            master = tokens[0];
            return true;
        }
    }

    return false;
}

// When a port gets removed and created again, a notification is triggered
// when the state database gets updated. In this situation, the port needs
// to be enslaved into the LAG again.
Expand All @@ -226,21 +278,13 @@ void TeamMgr::doPortUpdateTask(Consumer &consumer)
{
SWSS_LOG_INFO("Received port %s state update", alias.c_str());

vector<string> keys;
m_cfgLagMemberTable.getKeys(keys);

for (auto key : keys)
string lag;
if (findPortMaster(lag, alias))
{
auto tokens = tokenize(key, '|');

auto lag = tokens[0];
auto member = tokens[1];

// Find the master of the port
if (alias == member)
if (addLagMember(lag, alias) == task_need_retry)
{
addLagMember(lag, alias);
break;
it++;
continue;
}
}
}
Expand Down Expand Up @@ -291,7 +335,7 @@ bool TeamMgr::setLagMtu(const string &alias, const string &mtu)

for (auto key : keys)
{
auto tokens = tokenize(key, '|');
auto tokens = tokenize(key, config_db_key_delimiter);
auto lag = tokens[0];
auto member = tokens[1];

Expand Down Expand Up @@ -362,7 +406,7 @@ bool TeamMgr::removeLag(const string &alias)
// Once a port is enslaved into a port channel, the port's MTU will
// be inherited from the master's MTU while the port's admin status
// will still be controlled separately.
bool TeamMgr::addLagMember(const string &lag, const string &member)
task_process_status TeamMgr::addLagMember(const string &lag, const string &member)
{
SWSS_LOG_ENTER();

Expand All @@ -373,7 +417,29 @@ bool TeamMgr::addLagMember(const string &lag, const string &member)
// ip link set dev <member> down;
// teamdctl <port_channel_name> port add <member>;
cmd << IP_CMD << " link set dev " << member << " down; ";
cmd << TEAMDCTL_CMD << " " << lag << " port add " << member << "; ";
cmd << TEAMDCTL_CMD << " " << lag << " port add " << member;

if (exec(cmd.str(), res) != 0)
{
// teamdctl port add command will fail when the member port is not
// set to admin status down; it is possible that some other processes
// or users (e.g. portmgrd) are executing the command to bring up the
// member port while adding this port into the port channel. This piece
// of code will check if the port is set to admin status up. If yes,
// it will retry to add the port into the port channel.
if (checkPortIffUp(member))
{
SWSS_LOG_INFO("Failed to add %s to port channel %s, retry...",
member.c_str(), lag.c_str());
return task_need_retry;
}
else
{
SWSS_LOG_ERROR("Failed to add %s to port channel %s",
member.c_str(), lag.c_str());
return task_failed;
}
}

vector<FieldValueTuple> fvs;
m_cfgPortTable.get(member, fvs);
Expand Down Expand Up @@ -403,6 +469,7 @@ bool TeamMgr::addLagMember(const string &lag, const string &member)
}

// ip link set dev <member> [up|down]
cmd.str(string());
cmd << IP_CMD << " link set dev " << member << " " << admin_status;
EXEC_WITH_ERROR_THROW(cmd.str(), res);

Expand All @@ -415,7 +482,7 @@ bool TeamMgr::addLagMember(const string &lag, const string &member)

SWSS_LOG_NOTICE("Add %s to port channel %s", member.c_str(), lag.c_str());

return true;
return task_success;
}

// Once a port is removed from the master, both the admin status and the
Expand Down
4 changes: 3 additions & 1 deletion cfgmgr/teammgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,14 @@ class TeamMgr : public Orch

bool addLag(const string &alias, int min_links, bool fall_back);
bool removeLag(const string &alias);
bool addLagMember(const string &lag, const string &member);
task_process_status addLagMember(const string &lag, const string &member);
bool removeLagMember(const string &lag, const string &member);

bool setLagAdminStatus(const string &alias, const string &admin_status);
bool setLagMtu(const string &alias, const string &mtu);

bool findPortMaster(string &, const string &);
bool checkPortIffUp(const string &);
bool isPortStateOk(const string&);
bool isLagStateOk(const string&);
};
Expand Down

0 comments on commit f666011

Please sign in to comment.