Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WarmRestart class changes #600

Merged
merged 6 commits into from
Sep 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cfgmgr/vlanmgrd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ int main(int argc, char **argv)
DBConnector appDb(APPL_DB, DBConnector::DEFAULT_UNIXSOCKET, 0);
DBConnector stateDb(STATE_DB, DBConnector::DEFAULT_UNIXSOCKET, 0);

WarmStart::initialize("vlanmgrd");
WarmStart::checkWarmStart("vlanmgrd");

/*
Expand Down
35 changes: 26 additions & 9 deletions doc/swss-schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,15 @@ Stores information for physical switch ports managed by the switch chip. Ports t
; and push the delta to appDB
; Valid value is 1-9999. 0 is invalid.

bgp_timer = 1*4DIGIT ; bgp_timer holds the time interval utilized by fpmsyncd during warm-restart episodes.
; During this interval fpmsyncd will recover all the routing state previously pushed to
; AppDB, as well as all the new state coming from zebra/bgpd. Upon expiration of this
; timer, fpmsyncd will execute the reconciliation logic to eliminate all the stale
; state from AppDB. This timer should match the BGP-GR restart-timer configured within
; the elected routing-stack.
; Supported range: 1-9999.


### VXLAN\_TUNNEL
Stores vxlan tunnels configuration
Status: ready
Expand Down Expand Up @@ -717,15 +726,24 @@ Stores information for physical switch ports managed by the switch chip. Ports t
;Status: work in progress

key = WARM_RESTART_TABLE:process_name ; process_name is a unique process identifier.
restart_count = 1*10DIGIT ; a number between 0 and 2147483647,
; count of warm start times.

state = "init" / "restored" / "reconciled" ; init: process init with warm start enabled.
; restored: process restored to the previous
; state using saved data.
; reconciled: process reconciled with up to date
; dynamic data like port state, neighbor, routes
; and so on.
restore_count = 1*10DIGIT ; a value between 0 and 2147483647 to keep track
; of the number of times that an application has
; 'restored' its state from its associated redis
; data-store; which is equivalent to the number
; of times an application has iterated through
; a warm-restart cycle.

state = "initialized" / "restored" / "reconciled" ; initialized: initial FSM state for processes
; with warm-restart capabilities turned on.
;
; restored: process restored the state previously
; uploaded to redis data-stores.
;
; reconciled: process reconciled 'old' and 'new'
; state collected in 'restored' phase. Examples:
; dynamic data like port state, neighbor, routes
; and so on.

## Configuration files
What configuration files should we have? Do apps, orch agent each need separate files?
Expand All @@ -735,4 +753,3 @@ What configuration files should we have? Do apps, orch agent each need separate
portsyncd reads from port_config.ini and updates PORT_TABLE in APP_DB

All other apps (intfsyncd) read from PORT_TABLE in APP_DB

2 changes: 2 additions & 0 deletions orchagent/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@ int main(int argc, char **argv)
swss::Logger::linkToDbNative("orchagent");

SWSS_LOG_ENTER();

WarmStart::initialize("orchagent");
WarmStart::checkWarmStart("orchagent");

if (signal(SIGHUP, sighup_handler) == SIG_ERR)
Expand Down
2 changes: 1 addition & 1 deletion orchagent/orchdaemon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ void OrchDaemon::start()
*/
bool OrchDaemon::warmRestoreAndSyncUp()
{
WarmStart::setWarmStartState("orchagent", WarmStart::INIT);
WarmStart::setWarmStartState("orchagent", WarmStart::INITIALIZED);

for (Orch *o : m_orchList)
{
Expand Down
1 change: 1 addition & 0 deletions portsyncd/portsyncd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ int main(int argc, char **argv)
ProducerStateTable p(&appl_db, APP_PORT_TABLE_NAME);
SubscriberStateTable portCfg(&cfgDb, CFG_PORT_TABLE_NAME);

WarmStart::initialize("portsyncd");
WarmStart::checkWarmStart("portsyncd");
if (WarmStart::isWarmStart())
{
Expand Down
78 changes: 39 additions & 39 deletions tests/test_warm_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,23 @@ def stop_swss(dvs):
supervisorctl stop buffermgrd; supervisorctl stop arp_update'])


# Get restart count of all processes supporting warm restart
def swss_get_RestartCount(state_db):
restart_count = {}
# Get restore count of all processes supporting warm restart
def swss_get_RestoreCount(state_db):
restore_count = {}
warmtbl = swsscommon.Table(state_db, swsscommon.STATE_WARM_RESTART_TABLE_NAME)
keys = warmtbl.getKeys()
assert len(keys) != 0
for key in keys:
(status, fvs) = warmtbl.get(key)
assert status == True
for fv in fvs:
if fv[0] == "restart_count":
restart_count[key] = int(fv[1])
print(restart_count)
return restart_count
if fv[0] == "restore_count":
restore_count[key] = int(fv[1])
print(restore_count)
return restore_count

# function to check the restart count incremented by 1 for all processes supporting warm restart
def swss_check_RestartCount(state_db, restart_count):
# function to check the restore count incremented by 1 for all processes supporting warm restart
def swss_check_RestoreCount(state_db, restore_count):
warmtbl = swsscommon.Table(state_db, swsscommon.STATE_WARM_RESTART_TABLE_NAME)
keys = warmtbl.getKeys()
print(keys)
Expand All @@ -42,8 +42,8 @@ def swss_check_RestartCount(state_db, restart_count):
(status, fvs) = warmtbl.get(key)
assert status == True
for fv in fvs:
if fv[0] == "restart_count":
assert int(fv[1]) == restart_count[key] + 1
if fv[0] == "restore_count":
assert int(fv[1]) == restore_count[key] + 1
elif fv[0] == "state":
assert fv[1] == "reconciled"

Expand All @@ -59,21 +59,21 @@ def check_port_oper_status(appl_db, port_name, state):
break
assert oper_status == state

# function to check the restart count incremented by 1 for a single process
def swss_app_check_RestartCount_single(state_db, restart_count, name):
# function to check the restore count incremented by 1 for a single process
def swss_app_check_RestoreCount_single(state_db, restore_count, name):
warmtbl = swsscommon.Table(state_db, swsscommon.STATE_WARM_RESTART_TABLE_NAME)
keys = warmtbl.getKeys()
print(keys)
print(restart_count)
print(restore_count)
assert len(keys) > 0
for key in keys:
if key != name:
continue
(status, fvs) = warmtbl.get(key)
assert status == True
for fv in fvs:
if fv[0] == "restart_count":
assert int(fv[1]) == restart_count[key] + 1
if fv[0] == "restore_count":
assert int(fv[1]) == restore_count[key] + 1
elif fv[0] == "state":
assert fv[1] == "reconciled"

Expand Down Expand Up @@ -174,7 +174,7 @@ def test_PortSyncdWarmRestart(dvs):
(status, fvs) = neighTbl.get("Ethernet20:11.0.0.10")
assert status == True

restart_count = swss_get_RestartCount(state_db)
restore_count = swss_get_RestoreCount(state_db)

# restart portsyncd
dvs.runcmd(['sh', '-c', 'pkill -x portsyncd; cp /var/log/swss/sairedis.rec /var/log/swss/sairedis.rec.b; echo > /var/log/swss/sairedis.rec'])
Expand Down Expand Up @@ -203,7 +203,7 @@ def test_PortSyncdWarmRestart(dvs):
check_port_oper_status(appl_db, "Ethernet24", "up")


swss_app_check_RestartCount_single(state_db, restart_count, "portsyncd")
swss_app_check_RestoreCount_single(state_db, restore_count, "portsyncd")


def test_VlanMgrdWarmRestart(dvs):
Expand Down Expand Up @@ -291,7 +291,7 @@ def test_VlanMgrdWarmRestart(dvs):
(exitcode, bv_before) = dvs.runcmd("bridge vlan")
print(bv_before)

restart_count = swss_get_RestartCount(state_db)
restore_count = swss_get_RestoreCount(state_db)

dvs.runcmd(['sh', '-c', 'pkill -x vlanmgrd; cp /var/log/swss/sairedis.rec /var/log/swss/sairedis.rec.b; echo > /var/log/swss/sairedis.rec'])
dvs.runcmd(['sh', '-c', 'supervisorctl start vlanmgrd'])
Expand All @@ -312,7 +312,7 @@ def test_VlanMgrdWarmRestart(dvs):
(status, fvs) = tbl.get("Vlan20:11.0.0.11")
assert status == True

swss_app_check_RestartCount_single(state_db, restart_count, "vlanmgrd")
swss_app_check_RestoreCount_single(state_db, restore_count, "vlanmgrd")

# function to stop neighsyncd service and clear syslog and sairedis records
def stop_neighsyncd_clear_syslog_sairedis(dvs, save_number):
Expand Down Expand Up @@ -436,8 +436,8 @@ def test_swss_neighbor_syncup(dvs):
# appDB should be kept the same.
#

# get restart_count
restart_count = swss_get_RestartCount(state_db)
# get restore_count
restore_count = swss_get_RestoreCount(state_db)

# stop neighsyncd and clear syslog and sairedis.rec
stop_neighsyncd_clear_syslog_sairedis(dvs, 1)
Expand Down Expand Up @@ -471,8 +471,8 @@ def test_swss_neighbor_syncup(dvs):
check_syslog_for_neighbor_entry(dvs, 0, 0, "ipv6")
check_sairedis_for_neighbor_entry(dvs, 0, 0, 0)

# check restart Count
swss_app_check_RestartCount_single(state_db, restart_count, "neighsyncd")
# check restore Count
swss_app_check_RestoreCount_single(state_db, restore_count, "neighsyncd")

#
# Testcase 3:
Expand All @@ -482,8 +482,8 @@ def test_swss_neighbor_syncup(dvs):
# but it will send netlink message to be removed from appDB, so it works ok here,
# just that if we want to add the same neighbor again, use "change" instead of "add"

# get restart_count
restart_count = swss_get_RestartCount(state_db)
# get restore_count
restore_count = swss_get_RestoreCount(state_db)

# stop neighsyncd and clear syslog and sairedis.rec
stop_neighsyncd_clear_syslog_sairedis(dvs, 2)
Expand Down Expand Up @@ -538,8 +538,8 @@ def test_swss_neighbor_syncup(dvs):
check_syslog_for_neighbor_entry(dvs, 0, 2, "ipv4")
check_syslog_for_neighbor_entry(dvs, 0, 2, "ipv6")
check_sairedis_for_neighbor_entry(dvs, 0, 0, 4)
# check restart Count
swss_app_check_RestartCount_single(state_db, restart_count, "neighsyncd")
# check restore Count
swss_app_check_RestoreCount_single(state_db, restore_count, "neighsyncd")


#
Expand All @@ -549,8 +549,8 @@ def test_swss_neighbor_syncup(dvs):
# The neighsyncd is supposed to sync up the entries from kernel after warm restart
# Check the timer is not retrieved from configDB since it is not configured

# get restart_count
restart_count = swss_get_RestartCount(state_db)
# get restore_count
restore_count = swss_get_RestoreCount(state_db)

# stop neighsyncd and clear syslog and sairedis.rec
stop_neighsyncd_clear_syslog_sairedis(dvs, 3)
Expand Down Expand Up @@ -594,8 +594,8 @@ def test_swss_neighbor_syncup(dvs):
check_syslog_for_neighbor_entry(dvs, 2, 0, "ipv4")
check_syslog_for_neighbor_entry(dvs, 2, 0, "ipv6")
check_sairedis_for_neighbor_entry(dvs, 4, 0, 0)
# check restart Count
swss_app_check_RestartCount_single(state_db, restart_count, "neighsyncd")
# check restore Count
swss_app_check_RestoreCount_single(state_db, restore_count, "neighsyncd")

#
# Testcase 5:
Expand All @@ -615,8 +615,8 @@ def test_swss_neighbor_syncup(dvs):
]
)

# get restart_count
restart_count = swss_get_RestartCount(state_db)
# get restore_count
restore_count = swss_get_RestoreCount(state_db)

# stop neighsyncd and clear syslog and sairedis.rec
stop_neighsyncd_clear_syslog_sairedis(dvs, 4)
Expand Down Expand Up @@ -693,8 +693,9 @@ def test_swss_neighbor_syncup(dvs):
check_syslog_for_neighbor_entry(dvs, 4, 2, "ipv4")
check_syslog_for_neighbor_entry(dvs, 4, 2, "ipv6")
check_sairedis_for_neighbor_entry(dvs, 4, 4, 4)
# check restart Count
swss_app_check_RestartCount_single(state_db, restart_count, "neighsyncd")

# check restore Count
swss_app_check_RestoreCount_single(state_db, restore_count, "neighsyncd")


# TODO: The condition of warm restart readiness check is still under discussion.
Expand Down Expand Up @@ -768,7 +769,7 @@ def test_swss_port_state_syncup(dvs):

tbl = swsscommon.Table(appl_db, swsscommon.APP_PORT_TABLE_NAME)

restart_count = swss_get_RestartCount(state_db)
restore_count = swss_get_RestoreCount(state_db)

# update port admin state
dvs.runcmd("ifconfig Ethernet0 10.0.0.0/31 up")
Expand Down Expand Up @@ -815,7 +816,7 @@ def test_swss_port_state_syncup(dvs):
start_swss(dvs)
time.sleep(10)

swss_check_RestartCount(state_db, restart_count)
swss_check_RestoreCount(state_db, restore_count)

for i in [0, 1, 2]:
(status, fvs) = tbl.get("Ethernet%d" % (i * 4))
Expand All @@ -829,4 +830,3 @@ def test_swss_port_state_syncup(dvs):
assert oper_status == "down"
else:
assert oper_status == "up"

4 changes: 2 additions & 2 deletions warmrestart/warmRestartAssist.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ AppRestartAssist::AppRestartAssist(RedisPipeline *pipeline,
m_psTable(psTable),
m_warmStartTimer(timespec{0, 0})
{
WarmStart::initialize(m_appName, m_dockerName);
WarmStart::checkWarmStart(m_appName, m_dockerName);

m_appTableName = m_appTable.getTableName();
Expand Down Expand Up @@ -60,7 +61,7 @@ AppRestartAssist::AppRestartAssist(RedisPipeline *pipeline,
// Clear the producerstate table to make sure no pending data for the AppTable
m_psTable->clear();

WarmStart::setWarmStartState(m_appName, WarmStart::INIT);
WarmStart::setWarmStartState(m_appName, WarmStart::INITIALIZED);
}
}

Expand Down Expand Up @@ -266,4 +267,3 @@ bool AppRestartAssist::checkReconcileTimer(Selectable *s)
}
return false;
}

Loading