From 09850212c61dffaa72ce52d075e27df592cfdb65 Mon Sep 17 00:00:00 2001 From: Matthias Frei Date: Thu, 22 Jun 2023 14:48:27 +0200 Subject: [PATCH 1/4] ci: fix e2e revocation test flakes (WIP) --- .buildkite/pipeline.sh | 2 +- .buildkite/pipeline.yml | 185 +++++++++++++++--------------- tools/end2end/main.go | 31 ++--- tools/end2end_integration/main.go | 2 +- 4 files changed, 112 insertions(+), 108 deletions(-) diff --git a/.buildkite/pipeline.sh b/.buildkite/pipeline.sh index d0c2641e76..824e48803f 100755 --- a/.buildkite/pipeline.sh +++ b/.buildkite/pipeline.sh @@ -19,4 +19,4 @@ export PARALLELISM=1 . ./.buildkite/pipeline_lib.sh cat .buildkite/pipeline.yml -gen_bazel_test_steps +#gen_bazel_test_steps diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 1979d9ce6f..5c7a666596 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,85 +1,85 @@ env: GOPROXY: "http://localhost:3200|https://proxy.golang.org|direct" steps: - - label: "Build :bazel:" - command: - - bazel build --verbose_failures --announce_rc //:all - - bazel run --verbose_failures //docker:prod //docker:test - key: build - retry: &automatic-retry - automatic: - - exit_status: -1 # Agent was lost - - exit_status: 255 # Forced agent shutdown - timeout_in_minutes: 10 - - wait - - label: "Unit Tests :bazel:" - command: - - bazel test --config=race --config=unit_all - key: unit_tests - artifact_paths: - - "artifacts.out/**/*" - retry: *automatic-retry - timeout_in_minutes: 20 - - label: "Lint :bash:" - command: - - make lint - key: lint - retry: *automatic-retry - timeout_in_minutes: 20 - - label: "Check Generated :bash:" - command: - - echo "--- go_deps.bzl" - - mkdir -p /tmp/test-artifacts - - cp go.mod go.sum go_deps.bzl /tmp/test-artifacts/ - - make go_deps.bzl -B - - make go-mod-tidy - - diff -u /tmp/test-artifacts/go.mod go.mod - - diff -u /tmp/test-artifacts/go.sum go.sum - - diff -u /tmp/test-artifacts/go_deps.bzl go_deps.bzl - - echo "--- protobuf" - - cp -R pkg/proto/ /tmp/test-artifacts - - make protobuf - - diff -ur /tmp/test-artifacts/proto/ pkg/proto/ - - echo "--- licenses" - - mkdir -p /tmp/test-artifacts/licenses - - ./tools/licenses.sh /tmp/test-artifacts/licenses - - diff -rNu3 /tmp/test-artifacts/licenses ./licenses/data - - echo "--- gomocks" - - ./tools/gomocks.py diff - - echo "--- antlr" - - rm -rf /tmp/test-artifacts/antlr - - cp -R antlr/ /tmp/test-artifacts/antlr - - make antlr - - diff -ur /tmp/test-artifacts/antlr/ antlr/ - - echo "--- testdata" - - ./tools/update_testdata.sh - timeout_in_minutes: 20 - key: check_generated - retry: *automatic-retry + # - label: "Build :bazel:" + # command: + # - bazel build --verbose_failures --announce_rc //:all + # - bazel run --verbose_failures //docker:prod //docker:test + # key: build + # retry: &automatic-retry + # automatic: + # - exit_status: -1 # Agent was lost + # - exit_status: 255 # Forced agent shutdown + # timeout_in_minutes: 10 + # - wait + # - label: "Unit Tests :bazel:" + # command: + # - bazel test --config=race --config=unit_all + # key: unit_tests + # artifact_paths: + # - "artifacts.out/**/*" + # retry: *automatic-retry + # timeout_in_minutes: 20 + # - label: "Lint :bash:" + # command: + # - make lint + # key: lint + # retry: *automatic-retry + # timeout_in_minutes: 20 + # - label: "Check Generated :bash:" + # command: + # - echo "--- go_deps.bzl" + # - mkdir -p /tmp/test-artifacts + # - cp go.mod go.sum go_deps.bzl /tmp/test-artifacts/ + # - make go_deps.bzl -B + # - make go-mod-tidy + # - diff -u /tmp/test-artifacts/go.mod go.mod + # - diff -u /tmp/test-artifacts/go.sum go.sum + # - diff -u /tmp/test-artifacts/go_deps.bzl go_deps.bzl + # - echo "--- protobuf" + # - cp -R pkg/proto/ /tmp/test-artifacts + # - make protobuf + # - diff -ur /tmp/test-artifacts/proto/ pkg/proto/ + # - echo "--- licenses" + # - mkdir -p /tmp/test-artifacts/licenses + # - ./tools/licenses.sh /tmp/test-artifacts/licenses + # - diff -rNu3 /tmp/test-artifacts/licenses ./licenses/data + # - echo "--- gomocks" + # - ./tools/gomocks.py diff + # - echo "--- antlr" + # - rm -rf /tmp/test-artifacts/antlr + # - cp -R antlr/ /tmp/test-artifacts/antlr + # - make antlr + # - diff -ur /tmp/test-artifacts/antlr/ antlr/ + # - echo "--- testdata" + # - ./tools/update_testdata.sh + # timeout_in_minutes: 20 + # key: check_generated + # retry: *automatic-retry - group: "End to End" key: e2e steps: - - label: "E2E: default :man_in_business_suit_levitating: (scion, ping)" - command: - - echo "--- build" - - make - - echo "--- start topology" - - ./scion.sh topology -c topology/default.topo - - ./scion.sh run - - tools/await-connectivity - - ./bin/scion_integration || ( echo "^^^ +++" && false ) - - ./bin/end2end_integration || ( echo "^^^ +++" && false ) - plugins: &shutdown-scion-post-command - - scionproto/metahook#v0.3.0: - post-command: | - echo "--- Shutting down SCION topology" - ./scion.sh stop - echo "SCION topology successfully shut down" - artifact_paths: - - "artifacts.out/**/*" - timeout_in_minutes: 15 - key: e2e_integration_tests_v2 - retry: *automatic-retry + # - label: "E2E: default :man_in_business_suit_levitating: (scion, ping)" + # command: + # - echo "--- build" + # - make + # - echo "--- start topology" + # - ./scion.sh topology -c topology/default.topo + # - ./scion.sh run + # - tools/await-connectivity + # - ./bin/scion_integration || ( echo "^^^ +++" && false ) + # - ./bin/end2end_integration || ( echo "^^^ +++" && false ) + # plugins: &shutdown-scion-post-command + # - scionproto/metahook#v0.3.0: + # post-command: | + # echo "--- Shutting down SCION topology" + # ./scion.sh stop + # echo "SCION topology successfully shut down" + # artifact_paths: + # - "artifacts.out/**/*" + # timeout_in_minutes: 15 + # key: e2e_integration_tests_v2 + # retry: *automatic-retry - label: "E2E: failing links :man_in_business_suit_levitating:" command: - echo "--- build" @@ -96,19 +96,20 @@ steps: timeout_in_minutes: 15 key: e2e_revocation_test_v2 retry: *automatic-retry - - label: "E2E: default :docker: (ping)" - command: - - echo "--- build" - - make build docker-images - - echo "--- start topology" - - ./scion.sh topology -d - - ./scion.sh run - - tools/await-connectivity - - echo "--- run tests" - - ./bin/end2end_integration -d || ( echo "^^^ +++" && false ) - plugins: *shutdown-scion-post-command - artifact_paths: - - "artifacts.out/**/*" - timeout_in_minutes: 15 - key: docker_integration_e2e_default - retry: *automatic-retry + parallelism: 30 + # - label: "E2E: default :docker: (ping)" + # command: + # - echo "--- build" + # - make build docker-images + # - echo "--- start topology" + # - ./scion.sh topology -d + # - ./scion.sh run + # - tools/await-connectivity + # - echo "--- run tests" + # - ./bin/end2end_integration -d || ( echo "^^^ +++" && false ) + # plugins: *shutdown-scion-post-command + # artifact_paths: + # - "artifacts.out/**/*" + # timeout_in_minutes: 15 + # key: docker_integration_e2e_default + # retry: *automatic-retry diff --git a/tools/end2end/main.go b/tools/end2end/main.go index 4db1e04f53..9789c1b4dc 100644 --- a/tools/end2end/main.go +++ b/tools/end2end/main.go @@ -22,6 +22,7 @@ import ( "errors" "flag" "fmt" + "math" "net" "os" "time" @@ -65,7 +66,7 @@ type Pong struct { var ( remote snet.UDPAddr - timeout = &util.DurWrap{Duration: 10 * time.Second} + timeout = &util.DurWrap{Duration: 5 * time.Second} scionPacketConnMetrics = metrics.NewSCIONPacketConnMetrics() scmpErrorsCounter = scionPacketConnMetrics.SCMPErrors epic bool @@ -246,7 +247,7 @@ type client struct { port uint16 sdConn daemon.Connector - errorPaths map[snet.PathFingerprint]struct{} + errorPaths map[snet.PathFingerprint]int // number of encountered errors/timeouts per path } func (c *client) run() int { @@ -273,7 +274,7 @@ func (c *client) run() int { fmt.Sprintf("%v,[%v]:%d", integration.Local.IA, integration.Local.Host.IP, c.port)) c.sdConn = integration.SDConn() defer c.sdConn.Close() - c.errorPaths = make(map[snet.PathFingerprint]struct{}) + c.errorPaths = make(map[snet.PathFingerprint]int) return integration.AttemptRepeatedly("End2End", c.attemptRequest) } @@ -295,6 +296,12 @@ func (c *client) attemptRequest(n int) bool { span, ctx = tracing.StartSpanFromCtx(ctx, "attempt.ping") defer span.Finish() + // While fetching paths may be slow and need a long timeout, the actual ping/pong + // is always quick if it works and only needs a very low timeout. + ctxPingpong, cancelReply := context.WithTimeout(ctx, 100*time.Millisecond) + defer cancelReply() + ctx = ctxPingpong + // Send ping if err := c.ping(ctx, n, path); err != nil { tracing.Error(span, err) @@ -306,7 +313,7 @@ func (c *client) attemptRequest(n int) bool { tracing.Error(span, err) logger.Error("Error receiving pong", "err", err) if path != nil { - c.errorPaths[snet.Fingerprint(path)] = struct{}{} + c.errorPaths[snet.Fingerprint(path)]++ } return false } @@ -369,22 +376,18 @@ func (c *client) getRemote(ctx context.Context, n int) (snet.Path, error) { } paths, err := c.sdConn.Paths(ctx, remote.IA, integration.Local.IA, - daemon.PathReqFlags{Refresh: n != 0}) + daemon.PathReqFlags{Refresh: false}) if err != nil { return nil, withTag(serrors.WrapStr("requesting paths", err)) } - // If all paths had an error, let's try them again. - if len(paths) <= len(c.errorPaths) { - c.errorPaths = make(map[snet.PathFingerprint]struct{}) - } - // Select first path that didn't error before. + // Select path that errored fewest times before var path snet.Path + lowestErrCount := math.MaxInt for _, p := range paths { - if _, ok := c.errorPaths[snet.Fingerprint(p)]; ok { - continue + if e := c.errorPaths[snet.Fingerprint(p)]; e < lowestErrCount { + path = p + lowestErrCount = e } - path = p - break } if path == nil { return nil, withTag(serrors.New("no path found", diff --git a/tools/end2end_integration/main.go b/tools/end2end_integration/main.go index 757d252381..03adeb21bf 100644 --- a/tools/end2end_integration/main.go +++ b/tools/end2end_integration/main.go @@ -37,7 +37,7 @@ import ( var ( subset string attempts int - timeout = &util.DurWrap{Duration: 10 * time.Second} + timeout = &util.DurWrap{Duration: 5 * time.Second} parallelism int name string cmd string From 2fd29491d07f39c73c4f1f4d57c7c6fd59128382 Mon Sep 17 00:00:00 2001 From: Matthias Frei Date: Thu, 22 Jun 2023 14:54:57 +0200 Subject: [PATCH 2/4] fixup: pipeline yaml --- .buildkite/pipeline.yml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 5c7a666596..f4555d5fba 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -90,12 +90,20 @@ steps: - tools/await-connectivity - ./bin/end2end_integration || ( echo "^^^ +++" && false ) - ./tools/integration/revocation_test.sh - plugins: *shutdown-scion-post-command + plugins: &shutdown-scion-post-command + - scionproto/metahook#v0.3.0: + post-command: | + echo "--- Shutting down SCION topology" + ./scion.sh stop + echo "SCION topology successfully shut down" artifact_paths: - "artifacts.out/**/*" timeout_in_minutes: 15 key: e2e_revocation_test_v2 - retry: *automatic-retry + retry: &automatic-retry + automatic: + - exit_status: -1 # Agent was lost + - exit_status: 255 # Forced agent shutdown parallelism: 30 # - label: "E2E: default :docker: (ping)" # command: From 7234f03427a3897cb7742c04897f00e2fc2224fe Mon Sep 17 00:00:00 2001 From: Matthias Frei Date: Mon, 26 Jun 2023 11:34:28 +0200 Subject: [PATCH 3/4] end2end: refresh to get paths discovered after initial request --- tools/end2end/main.go | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/tools/end2end/main.go b/tools/end2end/main.go index 9789c1b4dc..41deae7299 100644 --- a/tools/end2end/main.go +++ b/tools/end2end/main.go @@ -22,7 +22,6 @@ import ( "errors" "flag" "fmt" - "math" "net" "os" "time" @@ -247,7 +246,8 @@ type client struct { port uint16 sdConn daemon.Connector - errorPaths map[snet.PathFingerprint]int // number of encountered errors/timeouts per path + errorPaths map[snet.PathFingerprint]struct{} + triedAllPaths bool } func (c *client) run() int { @@ -274,7 +274,7 @@ func (c *client) run() int { fmt.Sprintf("%v,[%v]:%d", integration.Local.IA, integration.Local.Host.IP, c.port)) c.sdConn = integration.SDConn() defer c.sdConn.Close() - c.errorPaths = make(map[snet.PathFingerprint]int) + c.errorPaths = make(map[snet.PathFingerprint]struct{}) return integration.AttemptRepeatedly("End2End", c.attemptRequest) } @@ -313,7 +313,7 @@ func (c *client) attemptRequest(n int) bool { tracing.Error(span, err) logger.Error("Error receiving pong", "err", err) if path != nil { - c.errorPaths[snet.Fingerprint(path)]++ + c.errorPaths[snet.Fingerprint(path)] = struct{}{} } return false } @@ -375,20 +375,34 @@ func (c *client) getRemote(ctx context.Context, n int) (snet.Path, error) { return err } + refresh := false + if c.triedAllPaths { + // All paths have been tried, and as we're trying again it appears there was no success. + // We'll refresh and retry all available paths. + // The refresh could help in case that the beaconing has discovered new paths since the + // daemon/CS have first cached the paths to this destination. + refresh = true + c.errorPaths = make(map[snet.PathFingerprint]struct{}) + c.triedAllPaths = false + } paths, err := c.sdConn.Paths(ctx, remote.IA, integration.Local.IA, - daemon.PathReqFlags{Refresh: false}) + daemon.PathReqFlags{Refresh: refresh}) if err != nil { return nil, withTag(serrors.WrapStr("requesting paths", err)) } - // Select path that errored fewest times before + // Select first path that didn't error before. var path snet.Path - lowestErrCount := math.MaxInt + lastAvailablePath := true for _, p := range paths { - if e := c.errorPaths[snet.Fingerprint(p)]; e < lowestErrCount { + if _, ok := c.errorPaths[snet.Fingerprint(p)]; !ok { + if path != nil { + lastAvailablePath = false + break + } path = p - lowestErrCount = e } } + c.triedAllPaths = lastAvailablePath if path == nil { return nil, withTag(serrors.New("no path found", "candidates", len(paths), From aefa15c219625853d86f4e089443215686c3baa8 Mon Sep 17 00:00:00 2001 From: Matthias Frei Date: Mon, 26 Jun 2023 12:55:31 +0200 Subject: [PATCH 4/4] e2e: revert timeout values --- tools/end2end/main.go | 2 +- tools/end2end_integration/main.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/end2end/main.go b/tools/end2end/main.go index 41deae7299..c27f3438fe 100644 --- a/tools/end2end/main.go +++ b/tools/end2end/main.go @@ -65,7 +65,7 @@ type Pong struct { var ( remote snet.UDPAddr - timeout = &util.DurWrap{Duration: 5 * time.Second} + timeout = &util.DurWrap{Duration: 10 * time.Second} scionPacketConnMetrics = metrics.NewSCIONPacketConnMetrics() scmpErrorsCounter = scionPacketConnMetrics.SCMPErrors epic bool diff --git a/tools/end2end_integration/main.go b/tools/end2end_integration/main.go index 03adeb21bf..757d252381 100644 --- a/tools/end2end_integration/main.go +++ b/tools/end2end_integration/main.go @@ -37,7 +37,7 @@ import ( var ( subset string attempts int - timeout = &util.DurWrap{Duration: 5 * time.Second} + timeout = &util.DurWrap{Duration: 10 * time.Second} parallelism int name string cmd string