From ad25c26f66170b55edc51acd0e6937715913df50 Mon Sep 17 00:00:00 2001 From: Matthias Frei Date: Mon, 26 Jun 2023 11:34:28 +0200 Subject: [PATCH] end2end: refresh to get paths discovered after initial request --- daemon/internal/servers/grpc.go | 25 +++++----- tools/end2end/main.go | 32 +++++++++---- tools/integration/revocation_test.sh | 19 +++++++- tools/topodot.py | 68 ++++++++++++++++++++++++---- 4 files changed, 110 insertions(+), 34 deletions(-) diff --git a/daemon/internal/servers/grpc.go b/daemon/internal/servers/grpc.go index b9e7af48fe..56ced80196 100644 --- a/daemon/internal/servers/grpc.go +++ b/daemon/internal/servers/grpc.go @@ -90,14 +90,16 @@ func (s *DaemonServer) paths(ctx context.Context, defer cancelF() } srcIA, dstIA := addr.IA(req.SourceIsdAs), addr.IA(req.DestinationIsdAs) - go func() { - defer log.HandlePanic() - s.backgroundPaths(ctx, srcIA, dstIA, req.Refresh) - }() paths, err := s.fetchPaths(ctx, &s.foregroundPathDedupe, srcIA, dstIA, req.Refresh) if err != nil { log.FromCtx(ctx).Debug("Fetching paths", "err", err, "src", srcIA, "dst", dstIA, "refresh", req.Refresh) + // Retry with longer timeout in background + go func() { + defer log.HandlePanic() + s.backgroundPaths(opentracing.SpanFromContext(ctx), + srcIA, dstIA, req.Refresh) + }() return nil, err } reply := &sdpb.PathsResponse{} @@ -201,20 +203,17 @@ func linkTypeToPB(lt snet.LinkType) sdpb.LinkType { } } -func (s *DaemonServer) backgroundPaths(origCtx context.Context, src, dst addr.IA, refresh bool) { - backgroundTimeout := 5 * time.Second - deadline, ok := origCtx.Deadline() - if !ok || time.Until(deadline) > backgroundTimeout { - // the original context is large enough no need to spin a background fetch. - return - } +func (s *DaemonServer) backgroundPaths(span opentracing.Span, src, dst addr.IA, refresh bool) { + + // backgroundTimeout is very long; this allows a large number of retries + backgroundTimeout := 15 * time.Second ctx, cancelF := context.WithTimeout(context.Background(), backgroundTimeout) defer cancelF() var spanOpts []opentracing.StartSpanOption - if span := opentracing.SpanFromContext(origCtx); span != nil { + if span != nil { spanOpts = append(spanOpts, opentracing.FollowsFrom(span.Context())) } - span, ctx := opentracing.StartSpanFromContext(ctx, "fetch.paths.background", spanOpts...) + span, ctx = opentracing.StartSpanFromContext(ctx, "fetch.paths.background", spanOpts...) defer span.Finish() if _, err := s.fetchPaths(ctx, &s.backgroundPathDedupe, src, dst, refresh); err != nil { log.FromCtx(ctx).Debug("Error fetching paths (background)", "err", err, diff --git a/tools/end2end/main.go b/tools/end2end/main.go index 9789c1b4dc..41deae7299 100644 --- a/tools/end2end/main.go +++ b/tools/end2end/main.go @@ -22,7 +22,6 @@ import ( "errors" "flag" "fmt" - "math" "net" "os" "time" @@ -247,7 +246,8 @@ type client struct { port uint16 sdConn daemon.Connector - errorPaths map[snet.PathFingerprint]int // number of encountered errors/timeouts per path + errorPaths map[snet.PathFingerprint]struct{} + triedAllPaths bool } func (c *client) run() int { @@ -274,7 +274,7 @@ func (c *client) run() int { fmt.Sprintf("%v,[%v]:%d", integration.Local.IA, integration.Local.Host.IP, c.port)) c.sdConn = integration.SDConn() defer c.sdConn.Close() - c.errorPaths = make(map[snet.PathFingerprint]int) + c.errorPaths = make(map[snet.PathFingerprint]struct{}) return integration.AttemptRepeatedly("End2End", c.attemptRequest) } @@ -313,7 +313,7 @@ func (c *client) attemptRequest(n int) bool { tracing.Error(span, err) logger.Error("Error receiving pong", "err", err) if path != nil { - c.errorPaths[snet.Fingerprint(path)]++ + c.errorPaths[snet.Fingerprint(path)] = struct{}{} } return false } @@ -375,20 +375,34 @@ func (c *client) getRemote(ctx context.Context, n int) (snet.Path, error) { return err } + refresh := false + if c.triedAllPaths { + // All paths have been tried, and as we're trying again it appears there was no success. + // We'll refresh and retry all available paths. + // The refresh could help in case that the beaconing has discovered new paths since the + // daemon/CS have first cached the paths to this destination. + refresh = true + c.errorPaths = make(map[snet.PathFingerprint]struct{}) + c.triedAllPaths = false + } paths, err := c.sdConn.Paths(ctx, remote.IA, integration.Local.IA, - daemon.PathReqFlags{Refresh: false}) + daemon.PathReqFlags{Refresh: refresh}) if err != nil { return nil, withTag(serrors.WrapStr("requesting paths", err)) } - // Select path that errored fewest times before + // Select first path that didn't error before. var path snet.Path - lowestErrCount := math.MaxInt + lastAvailablePath := true for _, p := range paths { - if e := c.errorPaths[snet.Fingerprint(p)]; e < lowestErrCount { + if _, ok := c.errorPaths[snet.Fingerprint(p)]; !ok { + if path != nil { + lastAvailablePath = false + break + } path = p - lowestErrCount = e } } + c.triedAllPaths = lastAvailablePath if path == nil { return nil, withTag(serrors.New("no path found", "candidates", len(paths), diff --git a/tools/integration/revocation_test.sh b/tools/integration/revocation_test.sh index 9215c9ed6a..c66dd6157c 100755 --- a/tools/integration/revocation_test.sh +++ b/tools/integration/revocation_test.sh @@ -27,7 +27,7 @@ done # Bring down routers. echo "Revocation test" -echo "Stopping routers and waiting for ${SLEEP}s." +echo "Stopping routers." ./scion.sh mstop $REV_BRS if [ $? -ne 0 ]; then echo "Failed stopping routers." @@ -36,5 +36,20 @@ fi sleep 4 # Do another round of e2e test with retries echo "Testing connectivity between all the hosts (with retries)." -bin/end2end_integration -log.console info -attempts 15 -subset 1-ff00:0:131#2-ff00:0:222 $DOCKER_ARGS +bin/end2end_integration -log.console info -attempts 15 -subset 1-ff00:0:131#2-ff00:0:222 exit $? + +# Downed interfaces: +# 1-ff00:0:110#3 +# 2-ff00:0:222#302 +# 1-ff00:0:111#105 +# 1-ff00:0:111#103 +# 1-ff00:0:111#100 +# 1-ff00:0:131#480 +# 1-ff00:0:220#502 +# 1-ff00:0:210#452 +# 1-ff00:0:212#201 +# +# Remaining good paths: +# Hops: [1-ff00:0:131 479>111 1-ff00:0:130 105>1 1-ff00:0:120 2>501 2-ff00:0:220 503>450 2-ff00:0:210 451>7 2-ff00:0:211 4>301 2-ff00:0:222] +# Hops: [1-ff00:0:131 479>111 1-ff00:0:130 104>2 1-ff00:0:110 1>6 1-ff00:0:120 2>501 2-ff00:0:220 503>450 2-ff00:0:210 451>7 2-ff00:0:211 4>301 2-ff00:0:222] diff --git a/tools/topodot.py b/tools/topodot.py index 6ef4502fa3..1731e4afa1 100755 --- a/tools/topodot.py +++ b/tools/topodot.py @@ -21,9 +21,10 @@ from collections import defaultdict from plumbum import cli, local -from typing import Dict, List, NamedTuple +from typing import Dict, List, NamedTuple, Set -from topology.topo import LinkEP, LinkType, TopoID +from topology.scion_addr import ISD_AS +from topology.topo import LinkType, TopoID graph_fmt = """digraph topo {{ \tnode [margin=0.2] @@ -52,26 +53,61 @@ """ +class LinkEP: + def __init__(self, raw): + parts = raw.split('#') + if len(parts) != 2: + raise ValueError("Invalid link endpoint, expected exactly one '#'") + isd_as_brid = parts[0] + second_dash = isd_as_brid.find("-", isd_as_brid.find("-")+1) + if second_dash > 0: + self.isd_as = ISD_AS(isd_as_brid[:second_dash]) + # self.brid = isd_as_brid[second_dash+1:] + else: + self.isd_as = ISD_AS(isd_as_brid) + self.ifid = int(parts[1]) + + def __eq__(self, other): + if isinstance(other, self.__class__): + return self.__dict__ == other.__dict__ + else: + return False + + def __hash__(self): + return hash(tuple(self.__dict__)) + + def __str__(self): + return "%s#%s" % (self.isd_as, self.ifid) + + def __repr__(self): + return "" % self + + class TopoDot(cli.Application): show = cli.Flag( ["-s", "--show"], help="Run dot and show the graph, " + "instead of only outputting the dot file.", ) + mark_interfaces = cli.SwitchAttr( + ["--mark-interfaces"], + cli.Set(LinkEP, csv=True), + ) def main(self, topofile): if self.show: + print(self.mark_interfaces) prefix = pathlib.PurePath(topofile).stem + '-' with tempfile.NamedTemporaryFile(prefix=prefix, suffix='.png') as tmp: dot = local['dot'] xdg_open = local['xdg-open'] - p = dot.popen(('-Tpng', '-o', tmp.name), encoding='utf-8') - p.stdin.write(topodot(topofile)) + p = dot.popen(('-Tpng', '-Gdpi=300', '-o', tmp.name), encoding='utf-8') + p.stdin.write(topodot(topofile, self.mark_interfaces)) p.communicate() xdg_open(tmp.name) else: - sys.stdout.write(topodot(topofile)) + sys.stdout.write(topodot(topofile, self.mark_interfaces)) class Link(NamedTuple): @@ -80,12 +116,13 @@ class Link(NamedTuple): type: LinkType -def topodot(topofile) -> str: +def topodot(topofile, mark_interfaces) -> str: with open(topofile, 'r') as f: topo_config = yaml.safe_load(f) links = topo_links(topo_config) clusters = topo_clusters(topo_config) + marked_interfaces = set(mark_interfaces) def format_nodes(indent, ases): fmt = '\t' * indent + '"%s"' @@ -93,7 +130,8 @@ def format_nodes(indent, ases): def format_links(indent, links): fmt = '\t' * indent + '"%s" -> "%s"%s' - return '\n'.join(fmt % (link.a, link.b, fmt_attrs(link_attrs(link))) + return '\n'.join(fmt % (link.a.isd_as, link.b.isd_as, + fmt_attrs(link_attrs(link, marked_interfaces))) for link in links) formatted_clusters = [] @@ -116,10 +154,12 @@ def fmt_attrs(attrs: Dict[str, str]) -> str: return '' -def link_attrs(link: Link) -> Dict[str, str]: +def link_attrs(link: Link, marked_interfaces: Set[LinkEP]) -> Dict[str, str]: + taillabel = interface_label(link.a.ifid, marked=link.a in marked_interfaces) + headlabel = interface_label(link.b.ifid, marked=link.b in marked_interfaces) attrs = { - 'taillabel': link.a.ifid, - 'headlabel': link.b.ifid, + 'taillabel': taillabel, + 'headlabel': headlabel, } if link.type == LinkType.CORE: attrs['dir'] = 'none' @@ -127,9 +167,17 @@ def link_attrs(link: Link) -> Dict[str, str]: attrs['constraint'] = 'false' attrs['dir'] = 'none' attrs['style'] = 'dotted' + if link.a in marked_interfaces or link.b in marked_interfaces: + attrs['color'] = 'red' return attrs +def interface_label(ifid: int, marked: bool) -> str: + if marked: + return "⚡" + str(ifid) + return str(ifid) + + def topo_links(topo_config) -> List[Link]: return [ Link(a=LinkEP(link['a']),