scionproto · matzf · Jun 22, 2023 · Jun 22, 2023 · Jun 26, 2023 · Jun 26, 2023
diff --git a/.buildkite/pipeline.sh b/.buildkite/pipeline.sh
@@ -19,4 +19,4 @@ export PARALLELISM=1
 . ./.buildkite/pipeline_lib.sh
 
 cat .buildkite/pipeline.yml
-gen_bazel_test_steps
+#gen_bazel_test_steps
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -1,74 +1,95 @@
 env:
   GOPROXY: "http://localhost:3200|https://proxy.golang.org|direct"
 steps:
-  - label: "Build :bazel:"
-    command:
-      - bazel build --verbose_failures --announce_rc //:all
-      - bazel run --verbose_failures //docker:prod //docker:test
-    key: build
-    retry: &automatic-retry
-      automatic:
-        - exit_status: -1 # Agent was lost
-        - exit_status: 255 # Forced agent shutdown
-    timeout_in_minutes: 10
-  - wait
-  - label: "Unit Tests :bazel:"
-    command:
-      - bazel test --config=race --config=unit_all
-    key: unit_tests
-    artifact_paths:
-      - "artifacts.out/**/*"
-    retry: *automatic-retry
-    timeout_in_minutes: 20
-  - label: "Lint :bash:"
-    command:
-      - make lint
-    key: lint
-    retry: *automatic-retry
-    timeout_in_minutes: 20
-  - label: "Check Generated :bash:"
-    command:
-      - echo "--- go_deps.bzl"
-      - mkdir -p /tmp/test-artifacts
-      - cp go.mod go.sum go_deps.bzl /tmp/test-artifacts/
-      - make go_deps.bzl -B
-      - make go-mod-tidy
-      - diff -u /tmp/test-artifacts/go.mod go.mod
-      - diff -u /tmp/test-artifacts/go.sum go.sum
-      - diff -u /tmp/test-artifacts/go_deps.bzl go_deps.bzl
-      - echo "--- protobuf"
-      - cp -R pkg/proto/ /tmp/test-artifacts
-      - make protobuf
-      - diff -ur /tmp/test-artifacts/proto/ pkg/proto/
-      - echo "--- licenses"
-      - mkdir -p /tmp/test-artifacts/licenses
-      - ./tools/licenses.sh /tmp/test-artifacts/licenses
-      - diff -rNu3 /tmp/test-artifacts/licenses ./licenses/data
-      - echo "--- gomocks"
-      - ./tools/gomocks.py diff
-      - echo "--- antlr"
-      - rm -rf /tmp/test-artifacts/antlr
-      - cp -R antlr/ /tmp/test-artifacts/antlr
-      - make antlr
-      - diff -ur /tmp/test-artifacts/antlr/ antlr/
-      - echo "--- testdata"
-      - ./tools/update_testdata.sh
-    timeout_in_minutes: 20
-    key: check_generated
-    retry: *automatic-retry
+  # - label: "Build :bazel:"
+  #   command:
+  #     - bazel build --verbose_failures --announce_rc //:all
+  #     - bazel run --verbose_failures //docker:prod //docker:test
+  #   key: build
+  #   retry: &automatic-retry
+  #     automatic:
+  #       - exit_status: -1 # Agent was lost
+  #       - exit_status: 255 # Forced agent shutdown
+  #   timeout_in_minutes: 10
+  # - wait
+  # - label: "Unit Tests :bazel:"
+  #   command:
+  #     - bazel test --config=race --config=unit_all
+  #   key: unit_tests
+  #   artifact_paths:
+  #     - "artifacts.out/**/*"
+  #   retry: *automatic-retry
+  #   timeout_in_minutes: 20
+  # - label: "Lint :bash:"
+  #   command:
+  #     - make lint
+  #   key: lint
+  #   retry: *automatic-retry
+  #   timeout_in_minutes: 20
+  # - label: "Check Generated :bash:"
+  #   command:
+  #     - echo "--- go_deps.bzl"
+  #     - mkdir -p /tmp/test-artifacts
+  #     - cp go.mod go.sum go_deps.bzl /tmp/test-artifacts/
+  #     - make go_deps.bzl -B
+  #     - make go-mod-tidy
+  #     - diff -u /tmp/test-artifacts/go.mod go.mod
+  #     - diff -u /tmp/test-artifacts/go.sum go.sum
+  #     - diff -u /tmp/test-artifacts/go_deps.bzl go_deps.bzl
+  #     - echo "--- protobuf"
+  #     - cp -R pkg/proto/ /tmp/test-artifacts
+  #     - make protobuf
+  #     - diff -ur /tmp/test-artifacts/proto/ pkg/proto/
+  #     - echo "--- licenses"
+  #     - mkdir -p /tmp/test-artifacts/licenses
+  #     - ./tools/licenses.sh /tmp/test-artifacts/licenses
+  #     - diff -rNu3 /tmp/test-artifacts/licenses ./licenses/data
+  #     - echo "--- gomocks"
+  #     - ./tools/gomocks.py diff
+  #     - echo "--- antlr"
+  #     - rm -rf /tmp/test-artifacts/antlr
+  #     - cp -R antlr/ /tmp/test-artifacts/antlr
+  #     - make antlr
+  #     - diff -ur /tmp/test-artifacts/antlr/ antlr/
+  #     - echo "--- testdata"
+  #     - ./tools/update_testdata.sh
+  #   timeout_in_minutes: 20
+  #   key: check_generated
+  #   retry: *automatic-retry
   - group: "End to End"
     key: e2e
     steps:
-    - label: "E2E: default :man_in_business_suit_levitating: (scion, ping)"
+    # - label: "E2E: default :man_in_business_suit_levitating: (scion, ping)"
+    #   command:
+    #     - echo "--- build"
+    #     - make
+    #     - echo "--- start topology"
+    #     - ./scion.sh topology -c topology/default.topo
+    #     - ./scion.sh run
+    #     - tools/await-connectivity
+    #     - ./bin/scion_integration || ( echo "^^^ +++" && false )
+    #     - ./bin/end2end_integration || ( echo "^^^ +++" && false )
+    #   plugins: &shutdown-scion-post-command
+    #     - scionproto/metahook#v0.3.0:
+    #         post-command: |
+    #           echo "--- Shutting down SCION topology"
+    #           ./scion.sh stop
+    #           echo "SCION topology successfully shut down"
+    #   artifact_paths:
+    #     - "artifacts.out/**/*"
+    #   timeout_in_minutes: 15
+    #   key: e2e_integration_tests_v2
+    #   retry: *automatic-retry
+    - label: "E2E: failing links :man_in_business_suit_levitating:"
       command:
         - echo "--- build"
         - make
         - echo "--- start topology"
-        - ./scion.sh topology -c topology/default.topo
+        - ./scion.sh topology -c topology/default-no-peers.topo
         - ./scion.sh run
         - tools/await-connectivity
-        - ./bin/scion_integration || ( echo "^^^ +++" && false )
         - ./bin/end2end_integration || ( echo "^^^ +++" && false )
+        - ./tools/integration/revocation_test.sh
       plugins: &shutdown-scion-post-command
         - scionproto/metahook#v0.3.0:
             post-command: |
@@ -78,37 +99,25 @@ steps:
       artifact_paths:
         - "artifacts.out/**/*"
       timeout_in_minutes: 15
-      key: e2e_integration_tests_v2
-      retry: *automatic-retry
-    - label: "E2E: failing links :man_in_business_suit_levitating:"
-      command:
-        - echo "--- build"
-        - make
-        - echo "--- start topology"
-        - ./scion.sh topology -c topology/default-no-peers.topo
-        - ./scion.sh run
-        - tools/await-connectivity
-        - ./bin/end2end_integration || ( echo "^^^ +++" && false )
-        - ./tools/integration/revocation_test.sh
-      plugins: *shutdown-scion-post-command
-      artifact_paths:
-        - "artifacts.out/**/*"
-      timeout_in_minutes: 15
       key: e2e_revocation_test_v2
-      retry: *automatic-retry
-    - label: "E2E: default :docker: (ping)"
-      command:
-        - echo "--- build"
-        - make build docker-images
-        - echo "--- start topology"
-        - ./scion.sh topology -d
-        - ./scion.sh run
-        - tools/await-connectivity
-        - echo "--- run tests"
-        - ./bin/end2end_integration -d || ( echo "^^^ +++" && false )
-      plugins: *shutdown-scion-post-command
-      artifact_paths:
-        - "artifacts.out/**/*"
-      timeout_in_minutes: 15
-      key: docker_integration_e2e_default
-      retry: *automatic-retry
+      retry: &automatic-retry
+        automatic:
+          - exit_status: -1 # Agent was lost
+          - exit_status: 255 # Forced agent shutdown
+      parallelism: 30
+    # - label: "E2E: default :docker: (ping)"
+    #   command:
+    #     - echo "--- build"
+    #     - make build docker-images
+    #     - echo "--- start topology"
+    #     - ./scion.sh topology -d
+    #     - ./scion.sh run
+    #     - tools/await-connectivity
+    #     - echo "--- run tests"
+    #     - ./bin/end2end_integration -d || ( echo "^^^ +++" && false )
+    #   plugins: *shutdown-scion-post-command
+    #   artifact_paths:
+    #     - "artifacts.out/**/*"
+    #   timeout_in_minutes: 15
+    #   key: docker_integration_e2e_default
+    #   retry: *automatic-retry
diff --git a/tools/end2end/main.go b/tools/end2end/main.go
@@ -246,7 +246,8 @@ type client struct {
 	port   uint16
 	sdConn daemon.Connector
 
-	errorPaths map[snet.PathFingerprint]struct{}
+	errorPaths    map[snet.PathFingerprint]struct{}
+	triedAllPaths bool
 }
 
 func (c *client) run() int {
@@ -295,6 +296,12 @@ func (c *client) attemptRequest(n int) bool {
 	span, ctx = tracing.StartSpanFromCtx(ctx, "attempt.ping")
 	defer span.Finish()
 
+	// While fetching paths may be slow and need a long timeout, the actual ping/pong
+	// is always quick if it works and only needs a very low timeout.
+	ctxPingpong, cancelReply := context.WithTimeout(ctx, 100*time.Millisecond)
+	defer cancelReply()
+	ctx = ctxPingpong
+
 	// Send ping
 	if err := c.ping(ctx, n, path); err != nil {
 		tracing.Error(span, err)
@@ -368,24 +375,34 @@ func (c *client) getRemote(ctx context.Context, n int) (snet.Path, error) {
 		return err
 	}
 
+	refresh := false
+	if c.triedAllPaths {
+		// All paths have been tried, and as we're trying again it appears there was no success.
+		// We'll refresh and retry all available paths.
+		// The refresh could help if beaconing has discovered new paths since the
+		// daemon/CS first cached the paths to this destination.
+		refresh = true
+		c.errorPaths = make(map[snet.PathFingerprint]struct{})
+		c.triedAllPaths = false
+	}
 	paths, err := c.sdConn.Paths(ctx, remote.IA, integration.Local.IA,
-		daemon.PathReqFlags{Refresh: n != 0})
+		daemon.PathReqFlags{Refresh: refresh})
 	if err != nil {
 		return nil, withTag(serrors.WrapStr("requesting paths", err))
 	}
-	// If all paths had an error, let's try them again.
-	if len(paths) <= len(c.errorPaths) {
-		c.errorPaths = make(map[snet.PathFingerprint]struct{})
-	}
 	// Select first path that didn't error before.
 	var path snet.Path
+	lastAvailablePath := true
 	for _, p := range paths {
-		if _, ok := c.errorPaths[snet.Fingerprint(p)]; ok {
-			continue
+		if _, ok := c.errorPaths[snet.Fingerprint(p)]; !ok {
+			if path != nil {
+				lastAvailablePath = false
+				break
+			}
+			path = p
 		}
-		path = p
-		break
 	}
+	c.triedAllPaths = lastAvailablePath
 	if path == nil {
 		return nil, withTag(serrors.New("no path found",
 			"candidates", len(paths),