Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ci: fix e2e revocation test flakes (WIP) #4356

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .buildkite/pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ export PARALLELISM=1
. ./.buildkite/pipeline_lib.sh

cat .buildkite/pipeline.yml
gen_bazel_test_steps
#gen_bazel_test_steps
191 changes: 100 additions & 91 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
@@ -1,74 +1,95 @@
env:
GOPROXY: "http://localhost:3200|https://proxy.golang.org|direct"
steps:
- label: "Build :bazel:"
command:
- bazel build --verbose_failures --announce_rc //:all
- bazel run --verbose_failures //docker:prod //docker:test
key: build
retry: &automatic-retry
automatic:
- exit_status: -1 # Agent was lost
- exit_status: 255 # Forced agent shutdown
timeout_in_minutes: 10
- wait
- label: "Unit Tests :bazel:"
command:
- bazel test --config=race --config=unit_all
key: unit_tests
artifact_paths:
- "artifacts.out/**/*"
retry: *automatic-retry
timeout_in_minutes: 20
- label: "Lint :bash:"
command:
- make lint
key: lint
retry: *automatic-retry
timeout_in_minutes: 20
- label: "Check Generated :bash:"
command:
- echo "--- go_deps.bzl"
- mkdir -p /tmp/test-artifacts
- cp go.mod go.sum go_deps.bzl /tmp/test-artifacts/
- make go_deps.bzl -B
- make go-mod-tidy
- diff -u /tmp/test-artifacts/go.mod go.mod
- diff -u /tmp/test-artifacts/go.sum go.sum
- diff -u /tmp/test-artifacts/go_deps.bzl go_deps.bzl
- echo "--- protobuf"
- cp -R pkg/proto/ /tmp/test-artifacts
- make protobuf
- diff -ur /tmp/test-artifacts/proto/ pkg/proto/
- echo "--- licenses"
- mkdir -p /tmp/test-artifacts/licenses
- ./tools/licenses.sh /tmp/test-artifacts/licenses
- diff -rNu3 /tmp/test-artifacts/licenses ./licenses/data
- echo "--- gomocks"
- ./tools/gomocks.py diff
- echo "--- antlr"
- rm -rf /tmp/test-artifacts/antlr
- cp -R antlr/ /tmp/test-artifacts/antlr
- make antlr
- diff -ur /tmp/test-artifacts/antlr/ antlr/
- echo "--- testdata"
- ./tools/update_testdata.sh
timeout_in_minutes: 20
key: check_generated
retry: *automatic-retry
# - label: "Build :bazel:"
# command:
# - bazel build --verbose_failures --announce_rc //:all
# - bazel run --verbose_failures //docker:prod //docker:test
# key: build
# retry: &automatic-retry
# automatic:
# - exit_status: -1 # Agent was lost
# - exit_status: 255 # Forced agent shutdown
# timeout_in_minutes: 10
# - wait
# - label: "Unit Tests :bazel:"
# command:
# - bazel test --config=race --config=unit_all
# key: unit_tests
# artifact_paths:
# - "artifacts.out/**/*"
# retry: *automatic-retry
# timeout_in_minutes: 20
# - label: "Lint :bash:"
# command:
# - make lint
# key: lint
# retry: *automatic-retry
# timeout_in_minutes: 20
# - label: "Check Generated :bash:"
# command:
# - echo "--- go_deps.bzl"
# - mkdir -p /tmp/test-artifacts
# - cp go.mod go.sum go_deps.bzl /tmp/test-artifacts/
# - make go_deps.bzl -B
# - make go-mod-tidy
# - diff -u /tmp/test-artifacts/go.mod go.mod
# - diff -u /tmp/test-artifacts/go.sum go.sum
# - diff -u /tmp/test-artifacts/go_deps.bzl go_deps.bzl
# - echo "--- protobuf"
# - cp -R pkg/proto/ /tmp/test-artifacts
# - make protobuf
# - diff -ur /tmp/test-artifacts/proto/ pkg/proto/
# - echo "--- licenses"
# - mkdir -p /tmp/test-artifacts/licenses
# - ./tools/licenses.sh /tmp/test-artifacts/licenses
# - diff -rNu3 /tmp/test-artifacts/licenses ./licenses/data
# - echo "--- gomocks"
# - ./tools/gomocks.py diff
# - echo "--- antlr"
# - rm -rf /tmp/test-artifacts/antlr
# - cp -R antlr/ /tmp/test-artifacts/antlr
# - make antlr
# - diff -ur /tmp/test-artifacts/antlr/ antlr/
# - echo "--- testdata"
# - ./tools/update_testdata.sh
# timeout_in_minutes: 20
# key: check_generated
# retry: *automatic-retry
- group: "End to End"
key: e2e
steps:
- label: "E2E: default :man_in_business_suit_levitating: (scion, ping)"
# - label: "E2E: default :man_in_business_suit_levitating: (scion, ping)"
# command:
# - echo "--- build"
# - make
# - echo "--- start topology"
# - ./scion.sh topology -c topology/default.topo
# - ./scion.sh run
# - tools/await-connectivity
# - ./bin/scion_integration || ( echo "^^^ +++" && false )
# - ./bin/end2end_integration || ( echo "^^^ +++" && false )
# plugins: &shutdown-scion-post-command
# - scionproto/metahook#v0.3.0:
# post-command: |
# echo "--- Shutting down SCION topology"
# ./scion.sh stop
# echo "SCION topology successfully shut down"
# artifact_paths:
# - "artifacts.out/**/*"
# timeout_in_minutes: 15
# key: e2e_integration_tests_v2
# retry: *automatic-retry
- label: "E2E: failing links :man_in_business_suit_levitating:"
command:
- echo "--- build"
- make
- echo "--- start topology"
- ./scion.sh topology -c topology/default.topo
- ./scion.sh topology -c topology/default-no-peers.topo
- ./scion.sh run
- tools/await-connectivity
- ./bin/scion_integration || ( echo "^^^ +++" && false )
- ./bin/end2end_integration || ( echo "^^^ +++" && false )
- ./tools/integration/revocation_test.sh
plugins: &shutdown-scion-post-command
- scionproto/metahook#v0.3.0:
post-command: |
Expand All @@ -78,37 +99,25 @@ steps:
artifact_paths:
- "artifacts.out/**/*"
timeout_in_minutes: 15
key: e2e_integration_tests_v2
retry: *automatic-retry
- label: "E2E: failing links :man_in_business_suit_levitating:"
command:
- echo "--- build"
- make
- echo "--- start topology"
- ./scion.sh topology -c topology/default-no-peers.topo
- ./scion.sh run
- tools/await-connectivity
- ./bin/end2end_integration || ( echo "^^^ +++" && false )
- ./tools/integration/revocation_test.sh
plugins: *shutdown-scion-post-command
artifact_paths:
- "artifacts.out/**/*"
timeout_in_minutes: 15
key: e2e_revocation_test_v2
retry: *automatic-retry
- label: "E2E: default :docker: (ping)"
command:
- echo "--- build"
- make build docker-images
- echo "--- start topology"
- ./scion.sh topology -d
- ./scion.sh run
- tools/await-connectivity
- echo "--- run tests"
- ./bin/end2end_integration -d || ( echo "^^^ +++" && false )
plugins: *shutdown-scion-post-command
artifact_paths:
- "artifacts.out/**/*"
timeout_in_minutes: 15
key: docker_integration_e2e_default
retry: *automatic-retry
retry: &automatic-retry
automatic:
- exit_status: -1 # Agent was lost
- exit_status: 255 # Forced agent shutdown
parallelism: 30
# - label: "E2E: default :docker: (ping)"
# command:
# - echo "--- build"
# - make build docker-images
# - echo "--- start topology"
# - ./scion.sh topology -d
# - ./scion.sh run
# - tools/await-connectivity
# - echo "--- run tests"
# - ./bin/end2end_integration -d || ( echo "^^^ +++" && false )
# plugins: *shutdown-scion-post-command
# artifact_paths:
# - "artifacts.out/**/*"
# timeout_in_minutes: 15
# key: docker_integration_e2e_default
# retry: *automatic-retry
37 changes: 27 additions & 10 deletions tools/end2end/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,8 @@ type client struct {
port uint16
sdConn daemon.Connector

errorPaths map[snet.PathFingerprint]struct{}
errorPaths map[snet.PathFingerprint]struct{}
triedAllPaths bool
}

func (c *client) run() int {
Expand Down Expand Up @@ -295,6 +296,12 @@ func (c *client) attemptRequest(n int) bool {
span, ctx = tracing.StartSpanFromCtx(ctx, "attempt.ping")
defer span.Finish()

// While fetching paths may be slow and need a long timeout, the actual ping/pong
// is always quick if it works and only needs a very low timeout.
ctxPingpong, cancelReply := context.WithTimeout(ctx, 100*time.Millisecond)
defer cancelReply()
ctx = ctxPingpong

// Send ping
if err := c.ping(ctx, n, path); err != nil {
tracing.Error(span, err)
Expand Down Expand Up @@ -368,24 +375,34 @@ func (c *client) getRemote(ctx context.Context, n int) (snet.Path, error) {
return err
}

refresh := false
if c.triedAllPaths {
// All paths have been tried and, since we are trying again, none of them succeeded.
// Refresh and retry all available paths.
// The refresh can help if beaconing has discovered new paths since the
// daemon/CS first cached the paths to this destination.
refresh = true
c.errorPaths = make(map[snet.PathFingerprint]struct{})
c.triedAllPaths = false
}
paths, err := c.sdConn.Paths(ctx, remote.IA, integration.Local.IA,
daemon.PathReqFlags{Refresh: n != 0})
daemon.PathReqFlags{Refresh: refresh})
if err != nil {
return nil, withTag(serrors.WrapStr("requesting paths", err))
}
// If all paths had an error, let's try them again.
if len(paths) <= len(c.errorPaths) {
c.errorPaths = make(map[snet.PathFingerprint]struct{})
}
// Select first path that didn't error before.
var path snet.Path
lastAvailablePath := true
for _, p := range paths {
if _, ok := c.errorPaths[snet.Fingerprint(p)]; ok {
continue
if _, ok := c.errorPaths[snet.Fingerprint(p)]; !ok {
if path != nil {
lastAvailablePath = false
break
}
path = p
}
path = p
break
}
c.triedAllPaths = lastAvailablePath
if path == nil {
return nil, withTag(serrors.New("no path found",
"candidates", len(paths),
Expand Down