Skip to content

Commit

Permalink
[warm-reboot] add docker upgrade --warm option and roll back support (s…
Browse files Browse the repository at this point in the history
…onic-net#559)

* [warm-reboot] add docker upgrade --warm option and roll back support

Signed-off-by: Jipan Yang <jipan.yang@alibaba-inc.com>

* load docker image before disruptive operations to shorten control plane frozen time.

Signed-off-by: Jipan Yang <jipan.yang@alibaba-inc.com>
  • Loading branch information
jipanyang authored and lguohan committed Jul 24, 2019
1 parent 0fe279f commit 8810864
Showing 1 changed file with 165 additions and 62 deletions.
227 changes: 165 additions & 62 deletions sonic_installer/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import time
import click
import urllib
import syslog
import subprocess
from swsssdk import ConfigDBConnector
from swsssdk import SonicV2Connector
Expand Down Expand Up @@ -265,6 +266,37 @@ def abort_if_false(ctx, param, value):
if not value:
ctx.abort()

def get_container_image_name(container_name):
    """Return the image name (without tag) that a container was created from.

    Runs ``docker inspect`` on *container_name* to read the image reference
    it was configured with (for example ``docker-lldp-sv2:latest``) and
    returns the part before the first ``:`` (``docker-lldp-sv2``).

    Exits the process with docker's return code if ``docker inspect`` fails.
    """
    # example image: docker-lldp-sv2:latest
    cmd = "docker inspect --format '{{.Config.Image}}' " + container_name
    # universal_newlines=True captures the output as text, so the string
    # handling below behaves the same on Python 2 and Python 3.
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True,
                            universal_newlines=True)
    (out, _) = proc.communicate()
    if proc.returncode != 0:
        sys.exit(proc.returncode)
    image_latest = out.rstrip()

    # example image_name: docker-lldp-sv2
    # Keep the text before the first ':'. This gives the same result as the
    # former `echo ... | cut -d ':' -f 1` pipeline without spawning a second
    # shell whose process was never waited on.
    return image_latest.partition(':')[0]

def get_container_image_id(image_tag):
    """Return the docker image ID for a tagged image, like 'docker-teamd:latest'.

    The result is the stripped standard output of
    ``docker images --format '{{.ID}}' <image_tag>``; it is an empty string
    when the command prints nothing. The command's exit status is not checked.
    """
    # TODO: extract common docker info fetching functions
    cmd = "docker images --format '{{.ID}}' " + image_tag
    # universal_newlines=True captures the output as text on both Python 2
    # and Python 3, so the ID can be compared and formatted as a string.
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True,
                            universal_newlines=True)
    # communicate() drains the pipe, closes it and waits for the child, so
    # no unreaped process or open file descriptor is left behind.
    (out, _) = proc.communicate()
    return out.rstrip()

def get_container_image_id_all(image_name):
    """Return the set of image IDs listed under an image name, like 'docker-teamd'.

    Each line printed by ``docker images --format '{{.ID}}' <image_name>``
    becomes one element; duplicate IDs collapse because the result is a set.
    The set is empty when the command prints nothing. The command's exit
    status is not checked.
    """
    cmd = "docker images --format '{{.ID}}' " + image_name
    # universal_newlines=True captures the output as text on both Python 2
    # and Python 3, so the IDs can be compared and formatted as strings.
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True,
                            universal_newlines=True)
    # communicate() drains the pipe, closes it and waits for the child, so
    # no unreaped process or open file descriptor is left behind.
    (out, _) = proc.communicate()
    return set(out.splitlines())

# Main entrypoint
@click.group()
Expand Down Expand Up @@ -433,27 +465,19 @@ def cleanup():
@cli.command()
@click.option('-y', '--yes', is_flag=True, callback=abort_if_false,
expose_value=False, prompt='New docker image will be installed, continue?')
@click.option('--cleanup_image', is_flag=True, help="Clean up old docker image(s)")
@click.option('--enforce_check', is_flag=True, help="Enforce pending task check for docker upgrade")
@click.option('--cleanup_image', is_flag=True, help="Clean up old docker image")
@click.option('--skip_check', is_flag=True, help="Skip task check for docker upgrade")
@click.option('--tag', type=str, help="Tag for the new docker image")
@click.option('--warm', is_flag=True, help="Perform warm upgrade")
@click.argument('container_name', metavar='<container_name>', required=True,
type=click.Choice(["swss", "snmp", "lldp", "bgp", "pmon", "dhcp_relay", "telemetry", "teamd"]))
type=click.Choice(["swss", "snmp", "lldp", "bgp", "pmon", "dhcp_relay", "telemetry", "teamd", "radv", "amon"]))
@click.argument('url')
def upgrade_docker(container_name, url, cleanup_image, enforce_check, tag):
def upgrade_docker(container_name, url, cleanup_image, skip_check, tag, warm):
""" Upgrade docker image from local binary or URL"""

# example image: docker-lldp-sv2:latest
cmd = "docker inspect --format '{{.Config.Image}}' " + container_name
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
(out, err) = proc.communicate()
if proc.returncode != 0:
sys.exit(proc.returncode)
image_latest = out.rstrip()

# example image_name: docker-lldp-sv2
cmd = "echo " + image_latest + " | cut -d ':' -f 1"
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
image_name = proc.stdout.read().rstrip()
image_name = get_container_image_name(container_name)
image_latest = image_name + ":latest"
image_id_previous = get_container_image_id(image_latest)

DEFAULT_IMAGE_PATH = os.path.join("/tmp/", image_name)
if url.startswith('http://') or url.startswith('https://'):
Expand All @@ -474,87 +498,166 @@ def upgrade_docker(container_name, url, cleanup_image, enforce_check, tag):
click.echo("Image file '{}' does not exist or is not a regular file. Aborting...".format(image_path))
raise click.Abort()

warm = False
warm_configured = False
# warm restart enable/disable config is put in stateDB, not persistent across cold reboot, not saved to config_DB.json file
state_db = SonicV2Connector(host='127.0.0.1')
state_db.connect(state_db.STATE_DB, False)
TABLE_NAME_SEPARATOR = '|'
prefix = 'WARM_RESTART_ENABLE_TABLE' + TABLE_NAME_SEPARATOR
_hash = '{}{}'.format(prefix, container_name)
if state_db.get(state_db.STATE_DB, _hash, "enable") == "true":
warm = True
warm_configured = True
state_db.close(state_db.STATE_DB)

if container_name == "swss" or container_name == "bgp" or container_name == "teamd":
if warm_configured == False and warm:
run_command("config warm_restart enable %s" % container_name)

# Fetch tag of current running image
tag_previous = get_docker_tag_name(image_latest)
# Load the new image beforehand to shorten disruption time
run_command("docker load < %s" % image_path)
warm_app_names = []
# warm restart specific processing for swss, bgp and teamd dockers.
if warm == True:
if warm_configured == True or warm:
# make sure orchagent is in clean state if swss is to be upgraded
if container_name == "swss":
skipPendingTaskCheck = " -s"
if enforce_check:
skipPendingTaskCheck = ""

cmd = "docker exec -i swss orchagent_restart_check -w 1000 " + skipPendingTaskCheck
for i in range(1, 6):
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
(out, err) = proc.communicate()
if proc.returncode != 0:
if enforce_check:
click.echo("Orchagent is not in clean state, RESTARTCHECK failed {}".format(i))
if i == 5:
sys.exit(proc.returncode)
else:
click.echo("Orchagent is not in clean state, upgrading it anyway")
break
skipPendingTaskCheck = ""
if skip_check:
skipPendingTaskCheck = " -s"

cmd = "docker exec -i swss orchagent_restart_check -w 2000 -r 5 " + skipPendingTaskCheck

proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
(out, err) = proc.communicate()
if proc.returncode != 0:
if not skip_check:
click.echo("Orchagent is not in clean state, RESTARTCHECK failed")
# Restore original config before exit
if warm_configured == False and warm:
run_command("config warm_restart disable %s" % container_name)
# Clean the image loaded earlier
image_id_latest = get_container_image_id(image_latest)
run_command("docker rmi -f %s" % image_id_latest)
# Re-point latest tag to previous tag
run_command("docker tag %s:%s %s" % (image_name, tag_previous, image_latest))

sys.exit(proc.returncode)
else:
click.echo("Orchagent is in clean state and frozen for warm upgrade")
break
run_command("sleep 1")
click.echo("Orchagent is not in clean state, upgrading it anyway")
else:
click.echo("Orchagent is in clean state and frozen for warm upgrade")

warm_app_names = ["orchagent", "neighsyncd"]

elif container_name == "bgp":
# Kill bgpd to restart the bgp graceful restart procedure
click.echo("Stopping bgp ...")
run_command("docker exec -i bgp pkill -9 zebra")
run_command("docker exec -i bgp pkill -9 bgpd")
run_command("sleep 2") # wait 2 seconds for bgp to settle down
warm_app_names = ["bgp"]
click.echo("Stopped bgp ...")

elif container_name == "teamd":
click.echo("Stopping teamd ...")
# Send USR1 signal to all teamd instances to stop them
# It will prepare teamd for warm-reboot
run_command("docker exec -i teamd pkill -USR1 teamd > /dev/null")
run_command("sleep 2") # wait 2 seconds for teamd to settle down
warm_app_names = ["teamsyncd"]
click.echo("Stopped teamd ...")

run_command("systemctl stop %s" % container_name)
# clean app reconciliation state from last warm start if it exists
for warm_app_name in warm_app_names:
cmd = "docker exec -i database redis-cli -n 6 hdel 'WARM_RESTART_TABLE|" + warm_app_name + "' state"
run_command(cmd)

run_command("docker kill %s > /dev/null" % container_name)
run_command("docker rm %s " % container_name)
run_command("docker rmi %s " % image_latest)
run_command("docker load < %s" % image_path)
if tag == None:
# example image: docker-lldp-sv2:latest
tag = get_docker_tag_name(image_latest)
run_command("docker tag %s:latest %s:%s" % (image_name, image_name, tag))
run_command("systemctl restart %s" % container_name)

# Clean up old docker images
if cleanup_image:
# All images id under the image name
cmd = "docker images --format '{{.ID}}' " + image_name
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
image_id_all = proc.stdout.read()
image_id_all = image_id_all.splitlines()
image_id_all = set(image_id_all)

# this is image_id for image with "latest" tag
cmd = "docker images --format '{{.ID}}' " + image_latest
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
image_id_latest = proc.stdout.read().rstrip()

for id in image_id_all:
if id != image_id_latest:
run_command("docker rmi -f %s" % id)

run_command("sleep 5") # wait 5 seconds for application to sync
# All images id under the image name
image_id_all = get_container_image_id_all(image_name)

# this is image_id for image with "latest" tag
image_id_latest = get_container_image_id(image_latest)

for id in image_id_all:
if id != image_id_latest:
# Unless requested, the previous docker image will be preserved
if not cleanup_image and id == image_id_previous:
continue
run_command("docker rmi -f %s" % id)

exp_state = "reconciled"
state = ""
# post warm restart specific processing for swss, bgp and teamd dockers, wait for reconciliation state.
if warm_configured == True or warm:
count = 0
for warm_app_name in warm_app_names:
state = ""
cmd = "docker exec -i database redis-cli -n 6 hget 'WARM_RESTART_TABLE|" + warm_app_name + "' state"
# Wait up to 180 seconds for reconciled state
while state != exp_state and count < 90:
sys.stdout.write("\r {}: ".format(warm_app_name))
sys.stdout.write("[%-s" % ('='*count))
sys.stdout.flush()
count += 1
time.sleep(2)
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
state = proc.stdout.read().rstrip()
syslog.syslog("%s reached %s state"%(warm_app_name, state))
sys.stdout.write("]\n\r")
if state != exp_state:
click.echo("%s failed to reach %s state"%(warm_app_name, exp_state))
syslog.syslog(syslog.LOG_ERR, "%s failed to reach %s state"%(warm_app_name, exp_state))
else:
exp_state = "" # this is cold upgrade

# Restore to previous cold restart setting
if warm_configured == False and warm:
if container_name == "swss" or container_name == "bgp" or container_name == "teamd":
run_command("config warm_restart disable %s" % container_name)

if state == exp_state:
click.echo('Done')
else:
click.echo('Failed')
sys.exit(1)

# rollback docker image
@cli.command()
@click.option('-y', '--yes', is_flag=True, callback=abort_if_false,
              expose_value=False, prompt='Docker image will be rolled back, continue?')
@click.argument('container_name', metavar='<container_name>', required=True,
                type=click.Choice(["swss", "snmp", "lldp", "bgp", "pmon", "dhcp_relay", "telemetry", "teamd", "radv", "amon"]))
def rollback_docker(container_name):
    """ Rollback docker image to previous version"""
    # NOTE: the docstring above is emitted by click as the command's help
    # text, so implementation notes are kept in comments instead.
    #
    # Flow: require exactly two image IDs under the container's image name,
    # find the one that is NOT currently tagged 'latest', and re-point the
    # 'latest' tag at that image's version tag.
    image_name = get_container_image_name(container_name)
    # All images id under the image name
    image_id_all = get_container_image_id_all(image_name)
    if len(image_id_all) != 2:
        click.echo("Two images required, but there are '{}' images for '{}'. Aborting...".format(len(image_id_all), image_name))
        raise click.Abort()

    image_latest = image_name + ":latest"
    # ID of the image that currently carries the 'latest' tag.
    image_id_current = get_container_image_id(image_latest)

    # The rollback target is the single image that is not the current
    # 'latest'. If the 'latest' ID is empty or not one of the two images,
    # both would look like candidates and the choice would depend on set
    # iteration order, so refuse to guess.
    rollback_candidates = image_id_all - set([image_id_current])
    if len(rollback_candidates) != 1:
        click.echo("Unable to identify the image tagged '{}' among the images for '{}'. Aborting...".format(image_latest, image_name))
        raise click.Abort()
    version_tag = get_docker_tag_name(rollback_candidates.pop())

    # make previous image as latest
    run_command("docker tag %s:%s %s:latest" % (image_name, version_tag, image_name))
    if container_name in ("swss", "bgp", "teamd"):
        # These containers are not restarted here; the operator is told a
        # cold reboot is needed instead.
        click.echo("Cold reboot is required to restore system state after '{}' rollback !!".format(container_name))
    else:
        run_command("systemctl restart %s" % container_name)

    click.echo('Done')

if __name__ == '__main__':
Expand Down

0 comments on commit 8810864

Please sign in to comment.