Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new BGP Monitor Daemon to pull BGP peer state and store in State DB for MIB consumption #1429

Closed
wants to merge 8 commits into from
1 change: 1 addition & 0 deletions debian/swss.install
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
swssconfig/sample/netbouncer.json etc/swss/config.d
neighsyncd/restore_neighbors.py usr/bin
fpmsyncd/bgp_eoiu_marker.py usr/bin
fpmsyncd/bgpmon.py usr/bin
171 changes: 171 additions & 0 deletions fpmsyncd/bgpmon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
#!/usr/bin/env python
gechiang marked this conversation as resolved.
Show resolved Hide resolved

"""
Description: bgpmon.py -- populates BGP-related information in the State DB.
The script is started by supervisord in the bgp docker when the docker is started.

This daemon was initially created to assist the SNMP agent in obtaining the
BGP-related information for its MIB support. The MIB that this daemon is
assisting is the CiscoBgp4MIB (neighbor state only). If there are other
BGP-related items that need to be updated periodically in the
future, more can be added to this process.

The script checks whether there is any BGP activity by monitoring the
timestamp of the BGP frr.log file. If activity is detected, it requests the
BGP neighbor state via the vtysh CLI interface. This activity monitoring is
done periodically (every 15 seconds). When triggered, it looks specifically
for the neighbor state in the JSON output of "show bgp summary json"
and updates the State DB for each neighbor accordingly.
To avoid holding on to State DB access for too long, and to allow removal
of stale neighbors (neighbors that were present in the previous get request
but are no longer present in the current one), a "previous" neighbor
dictionary is kept and used to determine whether an update is needed or
whether the peer is stale and must be removed from the State DB.
"""

import commands
import json
import os
import syslog
import swsssdk
import time
import traceback

class BgpStateGet():
def __init__(self):
    """Create empty neighbor caches and open the State DB connection.

    Besides initialising the bookkeeping attributes, this connects to the
    State DB on 127.0.0.1 and removes every key matching
    "NEIGH_STATE_TABLE|*", so the table starts out empty each time the
    daemon is (re)started.
    """
    # Failure counter and the log-file modification time seen last.
    self.retrieve_fail_cnt = 0
    self.cached_timestamp = 0

    # Previous snapshot: neighbor lists and neighbor -> state maps.
    self.ipv4_n, self.ipv4_n_state = [], {}
    self.ipv6_n, self.ipv6_n_state = [], {}

    # Scratch area that receives the most recent snapshot.
    self.new_ipv4_n, self.new_ipv4_n_state = [], {}
    self.new_ipv6_n, self.new_ipv6_n_state = [], {}

    self.db = swsssdk.SonicV2Connector(host='127.0.0.1')
    # NOTE(review): the meaning of the second positional argument (False)
    # is defined by swsssdk and is not visible here -- confirm.
    self.db.connect(self.db.STATE_DB, False)
    # Start from a clean table.
    self.db.delete_all_by_pattern(self.db.STATE_DB, "NEIGH_STATE_TABLE|*")

def bgp_activity_detected(self, log_file="/var/log/frr/frr.log"):
    """Report whether the BGP log file changed since the previous check.

    A quick way to tell whether anything is happening within BGP is to
    look at the log file's modification time: it is compared with the
    value cached by the previous call, and any difference counts as
    activity.

    Args:
        log_file: Path of the log file whose modification time is
            watched. Defaults to the FRR log path.

    Returns:
        True when the modification time differs from the cached value
        (the cache is refreshed in that case), or when the file cannot be
        examined, e.g. because it was wiped out -- the caller then falls
        back to polling on every cycle. False when the time is unchanged.
    """
    try:
        timestamp = os.stat(log_file).st_mtime
    except OSError:
        # os.stat() signals a missing or inaccessible file with OSError;
        # without a timestamp to compare, assume there was activity.
        return True

    if timestamp == self.cached_timestamp:
        return False

    self.cached_timestamp = timestamp
    return True

# Get a new snapshot of BGP neighbors and store them in the "new" location.
#
# Empties new_ipv4_n / new_ipv4_n_state / new_ipv6_n / new_ipv6_n_state, runs
# "vtysh -c 'show bgp summary json'" and parses the output as JSON. For each
# address family present ("ipv4Unicast", "ipv6Unicast") that carries a "peers"
# object, the peer keys are stored in the matching new_*_n container and each
# peer's "state" field in the matching new_*_n_state dictionary.
#
# Nothing is returned. Errors derived from Exception are not propagated: they
# increment retrieve_fail_cnt and are written to syslog (LOG_ERR) together
# with the traceback. After such a failure the "new" containers hold only
# what had been stored before the error, which may be nothing at all.
def get_all_neigh_states(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

get_all_neigh_states [](start = 8, length = 20)

Could you add some unit test or vs test?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@qiluo-msft Will add unit test in a separate PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Prefer adding in the this PR if you can. It even benefit yourself in iterations.

# Clean up the "new" dictionaries
del self.new_ipv4_n[:]
gechiang marked this conversation as resolved.
Show resolved Hide resolved
self.new_ipv4_n_state.clear()
del self.new_ipv6_n[:]
self.new_ipv6_n_state.clear()
try:
# NOTE(review): the "commands" module exists only in Python 2.
cmd = "vtysh -c 'show bgp summary json'"
output = commands.getoutput(cmd)
gechiang marked this conversation as resolved.
Show resolved Hide resolved
peer_info = json.loads(output)
if "ipv4Unicast" in peer_info and "peers" in peer_info["ipv4Unicast"]:
# The result of keys() is indexed below, which relies on it being a
# list (Python 2 behavior).
self.new_ipv4_n = peer_info["ipv4Unicast"]["peers"].keys()
for i in range (0, len(self.new_ipv4_n)):
self.new_ipv4_n_state[self.new_ipv4_n[i]] = \
peer_info["ipv4Unicast"]["peers"][self.new_ipv4_n[i]]["state"]

# Same extraction for the IPv6 address family.
if "ipv6Unicast" in peer_info and "peers" in peer_info["ipv6Unicast"]:
self.new_ipv6_n = peer_info["ipv6Unicast"]["peers"].keys()
for i in range (0, len(self.new_ipv6_n)):
self.new_ipv6_n_state[self.new_ipv6_n[i]] = \
peer_info["ipv6Unicast"]["peers"][self.new_ipv6_n[i]]["state"]
gechiang marked this conversation as resolved.
Show resolved Hide resolved

except Exception:
gechiang marked this conversation as resolved.
Show resolved Hide resolved
# Count the failed retrieval and log it with the full traceback.
self.retrieve_fail_cnt += 1
gechiang marked this conversation as resolved.
Show resolved Hide resolved
syslog.syslog(syslog.LOG_ERR, "*ERROR* get_all_neigh_states Exception: %s"
% (traceback.format_exc()))

# Reconcile the previous neighbor snapshot with the "new" one and push the
# differences to the State DB.
#
# For each address family (IPv4 first, then IPv6):
#   * a neighbor found in both snapshots is written to the State DB only
#     when its new state differs from the cached one;
#   * a neighbor found only in the new snapshot is written and cached;
#   * every neighbor still left in the previous list afterwards is stale:
#     its "NEIGH_STATE_TABLE|<neighbor>" key is deleted from the State DB
#     and its cached state is dropped.
# Finally a copy of the new neighbor list becomes the previous list.
def update_neigh_states(self):
# handle IPV4 case
for i in range (0, len(self.new_ipv4_n)):
neighb = self.new_ipv4_n[i]
key = "NEIGH_STATE_TABLE|%s" % neighb
if neighb in self.ipv4_n:
# only update the entry if state changed
gechiang marked this conversation as resolved.
Show resolved Hide resolved
if self.ipv4_n_state[neighb] != self.new_ipv4_n_state[neighb]:
# state changed. Update state DB for this entry
state = self.new_ipv4_n_state[neighb]
self.db.set(self.db.STATE_DB, key, 'state', state)
gechiang marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

@qiluo-msft qiluo-msft Sep 8, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

set [](start = 28, length = 3)

Suggest to use https://redis.io/commands/expire in case this process crashes or always fails.

If this is the right direction, "only update the entry if sate changed" may be not that important since we use pipeline and there is only one transaction. #WontFix

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@qiluo-msft Since this StateDB is required by SNMP proxy agent and potential future consumers, if this process always fails, it will not matter what the BGP StateDB entries gets properly deleted or not... those agents will not operate correctly until bgpmon daemon is healthy again. If my understanding is correct after reading through the "expire" documentation it will require bgpmon to periodically update the expire to ensure those BGP key states does not get expired/deleted when there is no changes needed. Since we want to favor the design for the steady state condition (no more peer state changes in steady state), we should not use "expire" or else bgpmon in steady state will end up having to periodically update the state DB even there are no state changes. In this desing, bgpmon when restart will force the state DB to clean up and provide the most updated state from the new snapshot it gathered from BGP/Zebra. So StateDB clean up is taken cared of when necessary to prevent stale entries from staying in the state DB. Also, using "expire" means someone (Redis) will need to keep track of the time for each key. This operation is not free no matter how efficient Redis implements it...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's assume there is a bug inside this daemon and it is stuck somewhere. It is better to expire and propagate this error to monitors. In that situation, it's dangerous to keep as false stable.


In reply to: 484678284 [](ancestors = 484678284)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@qiluo-msft Per our phone discussion I think we agreed that this is ok for now.

self.ipv4_n_state[neighb] = state
# remove this neighbor from old list since it is accounted for
self.ipv4_n.remove(neighb)
else:
# New neighbor found case. Add to dictionary and state DB
state = self.new_ipv4_n_state[neighb]
self.db.set(self.db.STATE_DB, key, 'state', state)
self.ipv4_n_state[neighb] = state
# Check for stale state entries to be cleaned up
while len(self.ipv4_n) > 0:
# remove this from the stateDB and the current neighbor state entry
neighb = self.ipv4_n.pop(0)
del_key = "NEIGH_STATE_TABLE|%s" % neighb
self.db.delete(self.db.STATE_DB, del_key)
del self.ipv4_n_state[neighb]
# Save the new List (a copy, so later changes to one list do not affect the other)
self.ipv4_n = self.new_ipv4_n[:]

# handle IPV6 case (same steps as the IPv4 case above)
gechiang marked this conversation as resolved.
Show resolved Hide resolved
for i in range (0, len(self.new_ipv6_n)):
neighb = self.new_ipv6_n[i]
key = "NEIGH_STATE_TABLE|%s" % neighb
if neighb in self.ipv6_n:
# only update the entry if state changed
gechiang marked this conversation as resolved.
Show resolved Hide resolved
if self.ipv6_n_state[neighb] != self.new_ipv6_n_state[neighb]:
# state changed. Update state DB for this entry
state = self.new_ipv6_n_state[neighb]
self.db.set(self.db.STATE_DB, key, 'state', state)
self.ipv6_n_state[neighb] = state
# remove this neighbor from old list since it is accounted for
self.ipv6_n.remove(neighb)
else:
# New neighbor found case. Add to dictionary and state DB
state = self.new_ipv6_n_state[neighb]
self.db.set(self.db.STATE_DB, key, 'state', state)
self.ipv6_n_state[neighb] = state
# Check for stale state entries to be cleaned up
while len(self.ipv6_n) > 0:
# remove this from the stateDB and the current neighbor state entry
neighb = self.ipv6_n.pop(0)
del_key = "NEIGH_STATE_TABLE|%s" % neighb
self.db.delete(self.db.STATE_DB, del_key)
del self.ipv6_n_state[neighb]
# Save the new List (a copy, so later changes to one list do not affect the other)
self.ipv6_n = self.new_ipv6_n[:]

def main():
    """Run the bgpmon daemon loop.

    Creates the BgpStateGet helper and then, every 15 seconds, checks for
    BGP activity; when activity is detected a fresh neighbor snapshot is
    taken and the State DB is updated from it. The loop never ends on its
    own.

    Raises:
        SystemExit: with status 1 when the BgpStateGet helper cannot be
            created; the reason is written to syslog first.
    """
    # Local import: only needed for the failure exit below.
    import sys

    module_name = "bgpmon"

    print("bgpmon service started")

    try:
        bgp_state_get = BgpStateGet()
    except Exception as e:
        # Top-level boundary: whatever prevented start-up is logged and
        # turned into a non-zero exit status.
        syslog.syslog(syslog.LOG_ERR,
                      "{}: error exit 1, reason {}".format(module_name, str(e)))
        sys.exit(1)

    # Periodically obtain the new neighbor information and update if necessary
    while True:
        time.sleep(15)
        if bgp_state_get.bgp_activity_detected():
            # NOTE(review): get_all_neigh_states() logs its own failures and
            # returns normally, so the update below also runs after a failed
            # retrieval -- confirm that this is intended.
            bgp_state_get.get_all_neigh_states()
            bgp_state_get.update_neigh_states()


if __name__ == '__main__':
    main()