Add bgpmon under sonic-bgpcfgd to be started as a new daemon under BG… #5426

Merged: 1 commit, Sep 22, 2020
4 changes: 4 additions & 0 deletions dockers/docker-fpm-frr/base_image_files/monit_bgp
@@ -6,6 +6,7 @@
## bgpd
## staticd
## bgpcfgd
## bgpmon
###############################################################################
check process zebra matching "/usr/lib/frr/zebra"
if does not exist for 5 times within 5 cycles then alert
@@ -21,3 +22,6 @@ check process staticd matching "/usr/lib/frr/staticd"

check process bgpcfgd matching "python /usr/local/bin/bgpcfgd"
if does not exist for 5 times within 5 cycles then alert

check process bgpmon matching "python /usr/local/bin/bgpmon"
if does not exist for 5 times within 5 cycles then alert
1 change: 1 addition & 0 deletions dockers/docker-fpm-frr/start.sh
@@ -73,3 +73,4 @@ fi
supervisorctl start fpmsyncd

supervisorctl start bgpcfgd
supervisorctl start bgpmon
9 changes: 9 additions & 0 deletions dockers/docker-fpm-frr/supervisord.conf
@@ -82,6 +82,15 @@ startsecs=0
stdout_logfile=syslog
stderr_logfile=syslog

[program:bgpmon]
command=/usr/local/bin/bgpmon
priority=6
autostart=false
autorestart=false
startsecs=0
stdout_logfile=syslog
stderr_logfile=syslog

[program:vtysh_b]
command=/usr/bin/vtysh -b
priority=6
Empty file.
168 changes: 168 additions & 0 deletions src/sonic-bgpcfgd/bgpmon_proj/bgpmon.py
@@ -0,0 +1,168 @@
#!/usr/bin/env python2

"""
Description: bgpmon.py -- populates BGP-related information in the state DB.
The script is started by supervisord in the bgp docker when the docker starts.
This daemon was initially created to assist the SNMP agent in obtaining the
BGP-related information for its MIB support. The MIB this daemon assists with
is CiscoBgp4MIB (neighbor state only). If other BGP-related items need to be
updated periodically in the future, they can be added to this process.
The script checks for BGP activity by monitoring the timestamp of the BGP
frr.log file. If activity is detected, it requests the BGP neighbor state via
the vtysh CLI interface. This activity check is done periodically (every 15
seconds). When triggered, it looks specifically for the neighbor state in the
JSON output of "show bgp summary json" and updates the state DB for each
neighbor accordingly.
To avoid holding State DB access for too long, and to allow removal of stale
neighbors (neighbors present in a previous get request but absent from the
current one), a "previous" neighbor dictionary is kept and used to determine
whether a peer needs an update or is stale and should be removed from the
state DB.
"""
import commands
import json
import os
import syslog
import swsssdk
import time

PIPE_BATCH_MAX_COUNT = 50

class BgpStateGet():
def __init__(self):
# list peer_l stores the current neighbor peer IP addresses
# dict peer_state stores the current neighbor peer state entries
# list new_peer_l stores the new snapshot of neighbor peer IP addresses
# dict new_peer_state stores the new snapshot of neighbor peer states
self.peer_l = []
self.peer_state = {}
self.new_peer_l = []
self.new_peer_state = {}
self.cached_timestamp = 0
self.db = swsssdk.SonicV2Connector()
self.db.connect(self.db.STATE_DB, False)
client = self.db.get_redis_client(self.db.STATE_DB)
self.pipe = client.pipeline()
self.db.delete_all_by_pattern(self.db.STATE_DB, "NEIGH_STATE_TABLE|*" )

# A quick way to check whether anything is happening in BGP is to see if its
# log file has had any activity. We compare the file's modified timestamp
# against a cached timestamp; if they differ, activity has been detected. If
# the log file is missing (e.g. it got wiped out), we default back to
# unconditional polling every 15 seconds.
def bgp_activity_detected(self):
try:
timestamp = os.stat("/var/log/frr/frr.log").st_mtime
if timestamp != self.cached_timestamp:
self.cached_timestamp = timestamp
return True
else:
return False
except (IOError, OSError):
return True
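The mtime-based activity check above can be exercised in isolation. Below is a minimal Python 3 sketch of the same technique (the watched path is a stand-in, not frr.log); as in bgpmon, a missing file is treated as "activity" so the caller falls back to unconditional polling:

```python
import os
import tempfile

class ActivityDetector:
    """Report activity when a watched file's modification time changes."""

    def __init__(self, path):
        self.path = path
        self.cached_timestamp = 0

    def activity_detected(self):
        try:
            timestamp = os.stat(self.path).st_mtime
        except OSError:
            # File missing or unreadable: assume activity, poll anyway
            return True
        if timestamp != self.cached_timestamp:
            self.cached_timestamp = timestamp
            return True
        return False
```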

def update_new_peer_states(self, peer_dict):
for peer, info in peer_dict["peers"].items():
self.new_peer_l.append(peer)
self.new_peer_state[peer] = info["state"]

# Get a new snapshot of BGP neighbors and store them in the "new" location
def get_all_neigh_states(self):
cmd = "vtysh -c 'show bgp summary json'"
rc, output = commands.getstatusoutput(cmd)
if rc:
syslog.syslog(syslog.LOG_ERR, "*ERROR* Failed with rc:{} when execute: {}".format(rc, cmd))
return

peer_info = json.loads(output)
# cmd ran successfully; safe to clear the "new" list/dict for a new snapshot
del self.new_peer_l[:]
self.new_peer_state.clear()
for key, value in peer_info.items():
if key == "ipv4Unicast" or key == "ipv6Unicast":
self.update_new_peer_states(value)
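The only keys this parsing path reads are the per-address-family `peers` maps and each peer's `state`. A Python 3 sketch of the same extraction, with a trimmed, hypothetical sample of the JSON shape (addresses and states are made up for illustration):

```python
# Hypothetical, trimmed sample of "show bgp summary json" output:
# only the keys bgpmon actually reads are shown.
sample = {
    "ipv4Unicast": {
        "peers": {
            "10.0.0.1": {"state": "Established"},
            "10.0.0.5": {"state": "Active"},
        }
    },
    "ipv6Unicast": {
        "peers": {
            "fc00::2": {"state": "Established"},
        }
    },
}

def extract_peer_states(summary):
    """Collect peer -> state for the address families bgpmon cares about."""
    states = {}
    for af in ("ipv4Unicast", "ipv6Unicast"):
        for peer, info in summary.get(af, {}).get("peers", {}).items():
            states[peer] = info["state"]
    return states
```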

# This method takes the caller's dictionary containing the peer state
# operations that need to be applied to the State DB via a Redis pipeline.
# data{} is cleared at the end of this method before returning to the caller.
def flush_pipe(self, data):
"""Dump each entry in data{} into State DB via redis pipeline.
Args:
data: Neighbor state in dictionary format
{
'NEIGH_STATE_TABLE|ip_address_a': {'state':state},
'NEIGH_STATE_TABLE|ip_address_b': {'state':state},
'NEIGH_STATE_TABLE|ip_address_c': {'state':state},
'NEIGH_STATE_TABLE|ip_address_x': None,
'NEIGH_STATE_TABLE|ip_address_z': None
...
}
"""
for key, value in data.items():
if value is None:
# delete case
self.pipe.delete(key)
else:
# Add or Modify case
self.pipe.hmset(key, value)
self.pipe.execute()
data.clear()
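The delete-vs-update dispatch in flush_pipe can be demonstrated without a running Redis by substituting a stub pipeline. This is an illustrative sketch, not bgpmon's code; `FakePipeline` and `flush` are hypothetical names that mimic the two pipeline calls (`delete`, `hmset`) used above:

```python
class FakePipeline:
    """Stand-in for a Redis pipeline: queues ops, applies them on execute()."""

    def __init__(self, store):
        self.store = store  # plain dict playing the role of the State DB
        self.ops = []

    def delete(self, key):
        self.ops.append(("del", key, None))

    def hmset(self, key, value):
        self.ops.append(("set", key, value))

    def execute(self):
        for op, key, value in self.ops:
            if op == "del":
                self.store.pop(key, None)
            else:
                self.store.setdefault(key, {}).update(value)
        self.ops = []

def flush(pipe, data):
    # Same dispatch as bgpmon's flush_pipe: None means delete, else hash-set
    for key, value in data.items():
        if value is None:
            pipe.delete(key)
        else:
            pipe.hmset(key, value)
    pipe.execute()
    data.clear()
```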

def update_neigh_states(self):
data = {}
for peer in self.new_peer_l:
key = "NEIGH_STATE_TABLE|%s" % peer
if peer in self.peer_l:
# only update the entry if state changed
if self.peer_state[peer] != self.new_peer_state[peer]:
# state changed. Update state DB for this entry
state = self.new_peer_state[peer]
data[key] = {'state':state}
self.peer_state[peer] = state
# remove this neighbor from old list since it is accounted for
self.peer_l.remove(peer)
else:
# New neighbor found case. Add to dictionary and state DB
state = self.new_peer_state[peer]
data[key] = {'state':state}
self.peer_state[peer] = state
if len(data) > PIPE_BATCH_MAX_COUNT:
self.flush_pipe(data)
# Check for stale state entries to be cleaned up
while len(self.peer_l) > 0:
# remove this entry from the state DB and the current neighbor state dict
peer = self.peer_l.pop(0)
@abdosi (Contributor), Sep 22, 2020:
@gechiang Can't we just pop here? If we want to pop from the front, then a queue is a better data structure than a list from a performance perspective.

@gechiang (Collaborator, Author):
@abdosi Agreed. There is no need to pop from the front at all; the purpose is to pop one item from this list and handle it for the peer-deleted case. We should address this the next time we need to touch this file, to improve performance. In most "normal" usage, neighbors are not removed, so this code segment is not expected to be exercised regularly.

@abdosi (Contributor):
@gechiang Thanks, but why can't this be addressed now? It should be a small change with quick verification.

@gechiang (Collaborator, Author):
@abdosi Sure, I will open another PR for this shortly. This PR was already merged before your comments arrived.

@abdosi (Contributor):
@gechiang Thanks, I meant in the new PR only.

del_key = "NEIGH_STATE_TABLE|%s" % peer
data[del_key] = None
del self.peer_state[peer]
if len(data) > PIPE_BATCH_MAX_COUNT:
@abdosi (Contributor):
@gechiang Do we need this if-check inside the while loop? Even without it, the if condition below should take care of flushing the data.

@gechiang (Collaborator, Author):
@abdosi This check is required. The purpose of using a pipeline is to batch operations and reduce transaction overhead, but the server also has to batch its responses according to this batch depth, and that batching effectively increases memory use/hold on the server side.

@abdosi (Contributor):
@gechiang OK, so we want to keep flushing in batches of PIPE_BATCH_MAX_COUNT for good server performance.

self.flush_pipe(data)
# If anything in the pipeline not yet flushed, flush them now
if len(data) > 0:
self.flush_pipe(data)
# Save the new List
self.peer_l = self.new_peer_l[:]

def main():

print "bgpmon service started"

try:
bgp_state_get = BgpStateGet()
except Exception as e:
syslog.syslog(syslog.LOG_ERR, "bgpmon: error exit 1, reason {}".format(str(e)))
exit(1)

# periodically obtain the new neighbor information and update if necessary
while True:
time.sleep(15)
if bgp_state_get.bgp_activity_detected():
bgp_state_get.get_all_neigh_states()
bgp_state_get.update_neigh_states()

if __name__ == '__main__':
main()
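On the review thread's point about popping from the front of a list: `list.pop(0)` shifts every remaining element, so draining a list from the front is O(n²) overall, while `collections.deque.popleft()` is O(1) per pop. A hedged Python 3 sketch of what the stale-peer drain loop could look like with a deque (`drain_stale_peers` is an illustrative name, not part of this PR):

```python
from collections import deque

def drain_stale_peers(stale_peers):
    """Return the State DB deletion keys for every stale peer, in order."""
    pending = deque(stale_peers)  # O(1) pops from the front
    deletions = []
    while pending:
        peer = pending.popleft()
        deletions.append("NEIGH_STATE_TABLE|%s" % peer)
    return deletions
```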
5 changes: 5 additions & 0 deletions src/sonic-bgpcfgd/setup.py
@@ -10,6 +10,11 @@
url='https://github.com/Azure/sonic-buildimage',
packages=setuptools.find_packages(),
scripts=['bgpcfgd'],
entry_points={
'console_scripts': [
'bgpmon = bgpmon_proj.bgpmon:main',
]
},
install_requires=['jinja2>=2.10', 'netaddr', 'pyyaml'],
setup_requires=['pytest-runner', 'pytest'],
)