Skip to content

Commit

Permalink
ci: fix e2e revocation test flakes (WIP)
Browse files: browse the repository at this point in the history
  • Loading branch information
matzf committed Jun 22, 2023
1 parent 5f7802c commit 0985021
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 108 deletions.
2 changes: 1 addition & 1 deletion .buildkite/pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ export PARALLELISM=1
. ./.buildkite/pipeline_lib.sh

cat .buildkite/pipeline.yml
gen_bazel_test_steps
#gen_bazel_test_steps
185 changes: 93 additions & 92 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
@@ -1,85 +1,85 @@
env:
GOPROXY: "http://localhost:3200|https://proxy.golang.org|direct"
steps:
- label: "Build :bazel:"
command:
- bazel build --verbose_failures --announce_rc //:all
- bazel run --verbose_failures //docker:prod //docker:test
key: build
retry: &automatic-retry
automatic:
- exit_status: -1 # Agent was lost
- exit_status: 255 # Forced agent shutdown
timeout_in_minutes: 10
- wait
- label: "Unit Tests :bazel:"
command:
- bazel test --config=race --config=unit_all
key: unit_tests
artifact_paths:
- "artifacts.out/**/*"
retry: *automatic-retry
timeout_in_minutes: 20
- label: "Lint :bash:"
command:
- make lint
key: lint
retry: *automatic-retry
timeout_in_minutes: 20
- label: "Check Generated :bash:"
command:
- echo "--- go_deps.bzl"
- mkdir -p /tmp/test-artifacts
- cp go.mod go.sum go_deps.bzl /tmp/test-artifacts/
- make go_deps.bzl -B
- make go-mod-tidy
- diff -u /tmp/test-artifacts/go.mod go.mod
- diff -u /tmp/test-artifacts/go.sum go.sum
- diff -u /tmp/test-artifacts/go_deps.bzl go_deps.bzl
- echo "--- protobuf"
- cp -R pkg/proto/ /tmp/test-artifacts
- make protobuf
- diff -ur /tmp/test-artifacts/proto/ pkg/proto/
- echo "--- licenses"
- mkdir -p /tmp/test-artifacts/licenses
- ./tools/licenses.sh /tmp/test-artifacts/licenses
- diff -rNu3 /tmp/test-artifacts/licenses ./licenses/data
- echo "--- gomocks"
- ./tools/gomocks.py diff
- echo "--- antlr"
- rm -rf /tmp/test-artifacts/antlr
- cp -R antlr/ /tmp/test-artifacts/antlr
- make antlr
- diff -ur /tmp/test-artifacts/antlr/ antlr/
- echo "--- testdata"
- ./tools/update_testdata.sh
timeout_in_minutes: 20
key: check_generated
retry: *automatic-retry
# - label: "Build :bazel:"
# command:
# - bazel build --verbose_failures --announce_rc //:all
# - bazel run --verbose_failures //docker:prod //docker:test
# key: build
# retry: &automatic-retry
# automatic:
# - exit_status: -1 # Agent was lost
# - exit_status: 255 # Forced agent shutdown
# timeout_in_minutes: 10
# - wait
# - label: "Unit Tests :bazel:"
# command:
# - bazel test --config=race --config=unit_all
# key: unit_tests
# artifact_paths:
# - "artifacts.out/**/*"
# retry: *automatic-retry
# timeout_in_minutes: 20
# - label: "Lint :bash:"
# command:
# - make lint
# key: lint
# retry: *automatic-retry
# timeout_in_minutes: 20
# - label: "Check Generated :bash:"
# command:
# - echo "--- go_deps.bzl"
# - mkdir -p /tmp/test-artifacts
# - cp go.mod go.sum go_deps.bzl /tmp/test-artifacts/
# - make go_deps.bzl -B
# - make go-mod-tidy
# - diff -u /tmp/test-artifacts/go.mod go.mod
# - diff -u /tmp/test-artifacts/go.sum go.sum
# - diff -u /tmp/test-artifacts/go_deps.bzl go_deps.bzl
# - echo "--- protobuf"
# - cp -R pkg/proto/ /tmp/test-artifacts
# - make protobuf
# - diff -ur /tmp/test-artifacts/proto/ pkg/proto/
# - echo "--- licenses"
# - mkdir -p /tmp/test-artifacts/licenses
# - ./tools/licenses.sh /tmp/test-artifacts/licenses
# - diff -rNu3 /tmp/test-artifacts/licenses ./licenses/data
# - echo "--- gomocks"
# - ./tools/gomocks.py diff
# - echo "--- antlr"
# - rm -rf /tmp/test-artifacts/antlr
# - cp -R antlr/ /tmp/test-artifacts/antlr
# - make antlr
# - diff -ur /tmp/test-artifacts/antlr/ antlr/
# - echo "--- testdata"
# - ./tools/update_testdata.sh
# timeout_in_minutes: 20
# key: check_generated
# retry: *automatic-retry
- group: "End to End"
key: e2e
steps:
- label: "E2E: default :man_in_business_suit_levitating: (scion, ping)"
command:
- echo "--- build"
- make
- echo "--- start topology"
- ./scion.sh topology -c topology/default.topo
- ./scion.sh run
- tools/await-connectivity
- ./bin/scion_integration || ( echo "^^^ +++" && false )
- ./bin/end2end_integration || ( echo "^^^ +++" && false )
plugins: &shutdown-scion-post-command
- scionproto/metahook#v0.3.0:
post-command: |
echo "--- Shutting down SCION topology"
./scion.sh stop
echo "SCION topology successfully shut down"
artifact_paths:
- "artifacts.out/**/*"
timeout_in_minutes: 15
key: e2e_integration_tests_v2
retry: *automatic-retry
# - label: "E2E: default :man_in_business_suit_levitating: (scion, ping)"
# command:
# - echo "--- build"
# - make
# - echo "--- start topology"
# - ./scion.sh topology -c topology/default.topo
# - ./scion.sh run
# - tools/await-connectivity
# - ./bin/scion_integration || ( echo "^^^ +++" && false )
# - ./bin/end2end_integration || ( echo "^^^ +++" && false )
# plugins: &shutdown-scion-post-command
# - scionproto/metahook#v0.3.0:
# post-command: |
# echo "--- Shutting down SCION topology"
# ./scion.sh stop
# echo "SCION topology successfully shut down"
# artifact_paths:
# - "artifacts.out/**/*"
# timeout_in_minutes: 15
# key: e2e_integration_tests_v2
# retry: *automatic-retry
- label: "E2E: failing links :man_in_business_suit_levitating:"
command:
- echo "--- build"
Expand All @@ -96,19 +96,20 @@ steps:
timeout_in_minutes: 15
key: e2e_revocation_test_v2
retry: *automatic-retry
- label: "E2E: default :docker: (ping)"
command:
- echo "--- build"
- make build docker-images
- echo "--- start topology"
- ./scion.sh topology -d
- ./scion.sh run
- tools/await-connectivity
- echo "--- run tests"
- ./bin/end2end_integration -d || ( echo "^^^ +++" && false )
plugins: *shutdown-scion-post-command
artifact_paths:
- "artifacts.out/**/*"
timeout_in_minutes: 15
key: docker_integration_e2e_default
retry: *automatic-retry
parallelism: 30
# - label: "E2E: default :docker: (ping)"
# command:
# - echo "--- build"
# - make build docker-images
# - echo "--- start topology"
# - ./scion.sh topology -d
# - ./scion.sh run
# - tools/await-connectivity
# - echo "--- run tests"
# - ./bin/end2end_integration -d || ( echo "^^^ +++" && false )
# plugins: *shutdown-scion-post-command
# artifact_paths:
# - "artifacts.out/**/*"
# timeout_in_minutes: 15
# key: docker_integration_e2e_default
# retry: *automatic-retry
31 changes: 17 additions & 14 deletions tools/end2end/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"errors"
"flag"
"fmt"
"math"
"net"
"os"
"time"
Expand Down Expand Up @@ -65,7 +66,7 @@ type Pong struct {

var (
remote snet.UDPAddr
timeout = &util.DurWrap{Duration: 10 * time.Second}
timeout = &util.DurWrap{Duration: 5 * time.Second}
scionPacketConnMetrics = metrics.NewSCIONPacketConnMetrics()
scmpErrorsCounter = scionPacketConnMetrics.SCMPErrors
epic bool
Expand Down Expand Up @@ -246,7 +247,7 @@ type client struct {
port uint16
sdConn daemon.Connector

errorPaths map[snet.PathFingerprint]struct{}
errorPaths map[snet.PathFingerprint]int // number of encountered errors/timeouts per path
}

func (c *client) run() int {
Expand All @@ -273,7 +274,7 @@ func (c *client) run() int {
fmt.Sprintf("%v,[%v]:%d", integration.Local.IA, integration.Local.Host.IP, c.port))
c.sdConn = integration.SDConn()
defer c.sdConn.Close()
c.errorPaths = make(map[snet.PathFingerprint]struct{})
c.errorPaths = make(map[snet.PathFingerprint]int)
return integration.AttemptRepeatedly("End2End", c.attemptRequest)
}

Expand All @@ -295,6 +296,12 @@ func (c *client) attemptRequest(n int) bool {
span, ctx = tracing.StartSpanFromCtx(ctx, "attempt.ping")
defer span.Finish()

// Fetching paths may be slow and may need a long timeout, but the actual ping/pong
// exchange is always quick when it works, so it only needs a very short timeout.
ctxPingpong, cancelReply := context.WithTimeout(ctx, 100*time.Millisecond)
defer cancelReply()
ctx = ctxPingpong

// Send ping
if err := c.ping(ctx, n, path); err != nil {
tracing.Error(span, err)
Expand All @@ -306,7 +313,7 @@ func (c *client) attemptRequest(n int) bool {
tracing.Error(span, err)
logger.Error("Error receiving pong", "err", err)
if path != nil {
c.errorPaths[snet.Fingerprint(path)] = struct{}{}
c.errorPaths[snet.Fingerprint(path)]++
}
return false
}
Expand Down Expand Up @@ -369,22 +376,18 @@ func (c *client) getRemote(ctx context.Context, n int) (snet.Path, error) {
}

paths, err := c.sdConn.Paths(ctx, remote.IA, integration.Local.IA,
daemon.PathReqFlags{Refresh: n != 0})
daemon.PathReqFlags{Refresh: false})
if err != nil {
return nil, withTag(serrors.WrapStr("requesting paths", err))
}
// If all paths had an error, let's try them again.
if len(paths) <= len(c.errorPaths) {
c.errorPaths = make(map[snet.PathFingerprint]struct{})
}
// Select first path that didn't error before.
// Select the path that has errored the fewest times so far.
var path snet.Path
lowestErrCount := math.MaxInt
for _, p := range paths {
if _, ok := c.errorPaths[snet.Fingerprint(p)]; ok {
continue
if e := c.errorPaths[snet.Fingerprint(p)]; e < lowestErrCount {
path = p
lowestErrCount = e
}
path = p
break
}
if path == nil {
return nil, withTag(serrors.New("no path found",
Expand Down
2 changes: 1 addition & 1 deletion tools/end2end_integration/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ import (
var (
subset string
attempts int
timeout = &util.DurWrap{Duration: 10 * time.Second}
timeout = &util.DurWrap{Duration: 5 * time.Second}
parallelism int
name string
cmd string
Expand Down

0 comments on commit 0985021

Please sign in to comment.