From 80e01c01d38017cd21ec3bc9594b418d8e7fee5a Mon Sep 17 00:00:00 2001 From: judyjoseph <53951155+judyjoseph@users.noreply.github.com> Date: Wed, 15 Jan 2020 09:43:22 -0800 Subject: [PATCH] Teamd :: fix for cleaning up the teamd processes correctly on teamd docker stop (#1159) * Send explicit signal to the teamd processes whenthe teamd docker exits. When the teamd docker receives a stop signal, only the processes started by supervisord gets the SIGTERM, so this fix is to propogate the signal to teamd processes via the signal handler in teamsyncd process. * Updates to take care of boundary conditions in the teamsyncd signal handler. * Better way of signal Handling by setting a flag in the signal handler and checking for the flag in the main loop. This way the cleanUp handler is not run in the signal Handler context and can add more Logs as we need not care for signal safety now. * Updated the logic so that teammgrd controls the lifecycle of teamd. Teammgrd tracks the PID's of the teamd processes and sents the SIGTERM signal when the teamd docker is stopped. * Minor change in the function defenition * Updates based on the comments * Minor update in teammgr.cpp --- cfgmgr/teammgr.cpp | 79 +++++++++++++++++++++++++++++++++++++++++ cfgmgr/teammgr.h | 8 +++++ cfgmgr/teammgrd.cpp | 18 ++++++++++ teamsyncd/teamsync.cpp | 26 ++++++++++++++ teamsyncd/teamsync.h | 1 + teamsyncd/teamsyncd.cpp | 18 ++++++++++ 6 files changed, 150 insertions(+) diff --git a/cfgmgr/teammgr.cpp b/cfgmgr/teammgr.cpp index 0ff3ddfdf8c7..6121c30f0c85 100644 --- a/cfgmgr/teammgr.cpp +++ b/cfgmgr/teammgr.cpp @@ -10,11 +10,15 @@ #include #include #include +#include #include #include #include #include +#include + +#define PID_FILE_PATH "/var/run/teamd/" using namespace std; using namespace swss; @@ -110,6 +114,79 @@ void TeamMgr::doTask(Consumer &consumer) } } + +pid_t TeamMgr::getTeamPid(const string &alias) +{ + SWSS_LOG_ENTER(); + pid_t pid = 0; + + string file = string(PID_FILE_PATH) + alias + string(".pid"); + ifstream infile(file); + if (!infile.is_open()) + { + SWSS_LOG_WARN("The LAG PID file: %s is not readable", file.c_str()); + return 0; + } + + string line; + getline(infile, line); + if (line.empty()) + { + SWSS_LOG_WARN("The LAG PID file: %s is empty", file.c_str()); + } + else + { + /*Store the PID value */ + pid = stoi(line, nullptr, 10); + } + + /* Close the file and return */ + infile.close(); + + return pid; +} + + +void TeamMgr::addLagPid(const string &alias) +{ + SWSS_LOG_ENTER(); + m_lagPIDList[alias] = getTeamPid(alias); +} + +void TeamMgr::removeLagPid(const string &alias) +{ + SWSS_LOG_ENTER(); + m_lagPIDList.erase(alias); +} + +void TeamMgr::cleanTeamProcesses(int signo) +{ + pid_t pid = 0; + + for (const auto& it: m_lagList) + { + pid = m_lagPIDList[it]; + if(!pid) { + SWSS_LOG_WARN("Invalid PID found for LaG %s ", it.c_str()); + + /* Try to get the PID again */ + pid = getTeamPid(it); + } + + if(pid > 0) + { + SWSS_LOG_INFO("Sending TERM Signal to (PID: %d) for LaG %s ", pid, it.c_str()); + kill(pid, signo); + } + else + { + SWSS_LOG_ERROR("Can't send TERM signal to LAG %s. PID wasn't found", it.c_str()); + } + } + + return; +} + void TeamMgr::doLagTask(Consumer &consumer) { SWSS_LOG_ENTER(); @@ -173,6 +250,7 @@ void TeamMgr::doLagTask(Consumer &consumer) } m_lagList.insert(alias); + addLagPid(alias); } setLagAdminStatus(alias, admin_status); @@ -189,6 +267,7 @@ void TeamMgr::doLagTask(Consumer &consumer) { removeLag(alias); m_lagList.erase(alias); + removeLagPid(alias); } } diff --git a/cfgmgr/teammgr.h b/cfgmgr/teammgr.h index db3b0338443a..9003d25d54ee 100644 --- a/cfgmgr/teammgr.h +++ b/cfgmgr/teammgr.h @@ -7,6 +7,7 @@ #include "netmsg.h" #include "orch.h" #include "producerstatetable.h" +#include namespace swss { @@ -17,6 +18,8 @@ class TeamMgr : public Orch const std::vector &tables); using Orch::doTask; + void cleanTeamProcesses(int signo); + private: Table m_cfgMetadataTable; // To retrieve MAC address Table m_cfgPortTable; @@ -29,6 +32,7 @@ class TeamMgr : public Orch ProducerStateTable m_appLagTable; std::set m_lagList; + std::map m_lagPIDList; MacAddress m_mac; @@ -45,6 +49,10 @@ class TeamMgr : public Orch bool setLagAdminStatus(const std::string &alias, const std::string &admin_status); bool setLagMtu(const std::string &alias, const std::string &mtu); bool setLagLearnMode(const std::string &alias, const std::string &learn_mode); + + pid_t getTeamPid(const std::string &alias); + void addLagPid(const std::string &alias); + void removeLagPid(const std::string &alias); bool isPortEnslaved(const std::string &); bool findPortMaster(std::string &, const std::string &); diff --git a/cfgmgr/teammgrd.cpp b/cfgmgr/teammgrd.cpp index 6217a0a21a2f..60828e038d6f 100644 --- a/cfgmgr/teammgrd.cpp +++ b/cfgmgr/teammgrd.cpp @@ -5,6 +5,7 @@ #include "netlink.h" #include "select.h" #include "warm_restart.h" +#include using namespace std; using namespace swss; @@ -17,6 +18,14 @@ bool gLogRotate = false; ofstream gRecordOfs; string gRecordFile; +bool received_sigterm = false; + +void sig_handler(int signo) +{ + received_sigterm = true; + return; +} + int main(int argc, char **argv) { Logger::linkToDbNative("teammgrd"); @@ -24,6 +33,9 @@ int main(int argc, char **argv) SWSS_LOG_NOTICE("--- Starting teammrgd ---"); + /* Register the signal handler for SIGTERM */ + signal(SIGTERM, sig_handler); + try { DBConnector conf_db("CONFIG_DB", 0); @@ -55,6 +67,12 @@ int main(int argc, char **argv) while (true) { + if(received_sigterm) + { + teammgr.cleanTeamProcesses(SIGTERM); + received_sigterm = false; + } + Selectable *sel; int ret; diff --git a/teamsyncd/teamsync.cpp b/teamsyncd/teamsync.cpp index 64016ff55715..2719189f8e9a 100644 --- a/teamsyncd/teamsync.cpp +++ b/teamsyncd/teamsync.cpp @@ -109,6 +109,22 @@ void TeamSync::onMsg(int nlmsg_type, struct nl_object *obj) if (!type || (strcmp(type, TEAM_DRV_NAME) != 0)) return; + unsigned int flags = rtnl_link_get_flags(link); + bool admin = flags & IFF_UP; + bool oper = flags & IFF_LOWER_UP; + unsigned int ifindex = rtnl_link_get_ifindex(link); + + if (type) + { + SWSS_LOG_INFO(" nlmsg type:%d key:%s admin:%d oper:%d ifindex:%d type:%s", + nlmsg_type, lagName.c_str(), admin, oper, ifindex, type); + } + else + { + SWSS_LOG_INFO(" nlmsg type:%d key:%s admin:%d oper:%d ifindex:%d", + nlmsg_type, lagName.c_str(), admin, oper, ifindex); + } + if (nlmsg_type == RTM_DELLINK) { if (m_teamSelectables.find(lagName) != m_teamSelectables.end()) @@ -194,6 +210,16 @@ void TeamSync::removeLag(const string &lagName) m_selectablesToRemove.insert(lagName); } +void TeamSync::cleanTeamSync() +{ + for (const auto& it: m_teamSelectables) + { + /* Cleanup LAG */ + removeLag(it.first); + } + return; +} + const struct team_change_handler TeamSync::TeamPortSync::gPortChangeHandler = { .func = TeamSync::TeamPortSync::teamdHandler, .type_mask = TEAM_PORT_CHANGE | TEAM_OPTION_CHANGE diff --git a/teamsyncd/teamsync.h b/teamsyncd/teamsync.h index f47049d1a686..406953e31210 100644 --- a/teamsyncd/teamsync.h +++ b/teamsyncd/teamsync.h @@ -25,6 +25,7 @@ class TeamSync : public NetMsg TeamSync(DBConnector *db, DBConnector *stateDb, Select *select); void periodic(); + void cleanTeamSync(); /* Listen to RTM_NEWLINK, RTM_DELLINK to track team devices */ virtual void onMsg(int nlmsg_type, struct nl_object *obj); diff --git a/teamsyncd/teamsyncd.cpp b/teamsyncd/teamsyncd.cpp index ef368ca691a8..92f59553a90a 100644 --- a/teamsyncd/teamsyncd.cpp +++ b/teamsyncd/teamsyncd.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "logger.h" #include "select.h" #include "netdispatcher.h" @@ -9,6 +10,14 @@ using namespace std; using namespace swss; +bool received_sigterm = false; + +void sig_handler(int signo) +{ + received_sigterm = true; + return; +} + int main(int argc, char **argv) { swss::Logger::linkToDbNative(TEAMSYNCD_APP_NAME); @@ -20,6 +29,9 @@ int main(int argc, char **argv) NetDispatcher::getInstance().registerMessageHandler(RTM_NEWLINK, &sync); NetDispatcher::getInstance().registerMessageHandler(RTM_DELLINK, &sync); + /* Register the signal handler for SIGTERM */ + signal(SIGTERM, sig_handler); + while (1) { try @@ -33,6 +45,12 @@ int main(int argc, char **argv) s.addSelectable(&netlink); while (true) { + if(received_sigterm) + { + sync.cleanTeamSync(); + received_sigterm = false; + } + Selectable *temps; s.select(&temps, 1000); // block for a second sync.periodic();