integ test: snapshot mesh frozen bug test #19435

Merged
merged 4 commits on Oct 30, 2023
Changes from 3 commits
79 changes: 79 additions & 0 deletions .github/workflows/test-integrations.yml
@@ -487,6 +487,84 @@ jobs:
DD_ENV: ci
run: datadog-ci junit upload --service "$GITHUB_REPOSITORY" $TEST_RESULTS_DIR/results.xml

integration-test-with-deployer:
runs-on: ${{ fromJSON(needs.setup.outputs.compute-large) }}
needs:
- setup
permissions:
id-token: write # NOTE: this permission is explicitly required for Vault auth.
contents: read
strategy:
fail-fast: false
steps:
- name: Checkout code
uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
# NOTE: This step is specifically needed for ENT. It allows us to access the required private HashiCorp repos.
- name: Setup Git
if: ${{ endsWith(github.repository, '-enterprise') }}
run: git config --global url."https://${{ secrets.ELEVATED_GITHUB_TOKEN }}:@github.com".insteadOf "https://github.com"
- uses: actions/setup-go@fac708d6674e30b6ba41289acaab6d4b75aa0753 # v4.0.1
with:
go-version-file: 'go.mod'
- run: go env
- name: Build image
run: make test-compat-integ-setup
- name: Integration Tests
run: |
mkdir -p "${{ env.TEST_RESULTS_DIR }}"
export NOLOGBUFFER=1
cd ./test-integ/connect
go run gotest.tools/gotestsum@v${{env.GOTESTSUM_VERSION}} \
--raw-command \
--format=standard-verbose \
--debug \
--packages="./..." \
-- \
go test \
-tags "${{ env.GOTAGS }}" \
-timeout=20m \
-parallel=2 \
-json . \
--target-image ${{ env.CONSUL_LATEST_IMAGE_NAME }} \
--target-version local
env:
# this is needed because of incompatibility between RYUK container and GHA
GOTESTSUM_JUNITFILE: ${{ env.TEST_RESULTS_DIR }}/results.xml
GOTESTSUM_FORMAT: standard-verbose
COMPOSE_INTERACTIVE_NO_CLI: 1
# tput complains if this isn't set to something.
TERM: ansi
# NOTE: ENT specific step as we store secrets in Vault.
- name: Authenticate to Vault
if: ${{ endsWith(github.repository, '-enterprise') }}
id: vault-auth
run: vault-auth

# NOTE: ENT specific step as we store secrets in Vault.
- name: Fetch Secrets
if: ${{ endsWith(github.repository, '-enterprise') }}
id: secrets
uses: hashicorp/vault-action@v2.5.0
with:
url: ${{ steps.vault-auth.outputs.addr }}
caCertificate: ${{ steps.vault-auth.outputs.ca_certificate }}
token: ${{ steps.vault-auth.outputs.token }}
secrets: |
kv/data/github/${{ github.repository }}/datadog apikey | DATADOG_API_KEY;

- name: prepare datadog-ci
if: ${{ !endsWith(github.repository, '-enterprise') }}
run: |
curl -L --fail "https://github.com/DataDog/datadog-ci/releases/latest/download/datadog-ci_linux-x64" --output "/usr/local/bin/datadog-ci"
chmod +x /usr/local/bin/datadog-ci

- name: upload coverage
# do not run on forks
if: github.event.pull_request.head.repo.full_name == github.repository
env:
DATADOG_API_KEY: "${{ endsWith(github.repository, '-enterprise') && env.DATADOG_API_KEY || secrets.DATADOG_API_KEY }}"
DD_ENV: ci
run: datadog-ci junit upload --service "$GITHUB_REPOSITORY" $TEST_RESULTS_DIR/results.xml

test-integrations-success:
needs:
@@ -498,6 +576,7 @@ jobs:
- generate-envoy-job-matrices
- envoy-integration-test
- compatibility-integration-test
- integration-test-with-deployer
runs-on: ${{ fromJSON(needs.setup.outputs.compute-small) }}
if: always() && needs.conditional-skip.outputs.skip-ci != 'true'
steps:
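Note: the job forwards --target-image and --target-version to the test binary after the "--" separator. A minimal sketch of how such flags can be registered in a Go integration-test package; the names below mirror the workflow flags, but the actual wiring in the consul-container utils package may differ:

package utils

import "flag"

// Hypothetical flag wiring for --target-image/--target-version; the real
// consul-container utils package may register these differently.
var (
	targetImageName = flag.String("target-image", "hashicorp/consul",
		"Docker image name of the Consul build under test")
	targetVersion = flag.String("target-version", "local",
		"Docker image tag of the Consul build under test")
)

// TargetImageRef joins both flags into a single image reference,
// e.g. "hashicorp/consul:local".
func TargetImageRef() string {
	return *targetImageName + ":" + *targetVersion
}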
217 changes: 217 additions & 0 deletions test-integ/connect/snapshot_test.go
@@ -0,0 +1,217 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package connect

import (
"fmt"
"io"
"net/http"
"net/url"
"testing"
"time"

"github.com/stretchr/testify/require"

"github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/sdk/testutil/retry"
"github.com/hashicorp/consul/test/integration/consul-container/libs/utils"
"github.com/hashicorp/consul/testing/deployer/sprawl/sprawltest"
"github.com/hashicorp/consul/testing/deployer/topology"
)

// Test_Snapshot_Restore_Agentless verifies that the Consul server agent can
// continue to push Envoy config after restoring from a snapshot.
//
// - This test detects the server agent freezing after restoring from a snapshot
// (https://github.com/hashicorp/consul/pull/18636)
//
// - This bug only appeared in agentless mode.
//
// Steps:
// 1. The test spins up a one-server cluster with static-server and static-client.
// 2. A snapshot is taken and the cluster is restored from the snapshot.
// 3. A new static-server replaces the old one.
// 4. At the end, we assert the static-client's upstream is updated with the
// new static-server.
func Test_Snapshot_Restore_Agentless(t *testing.T) {
t.Parallel()

staticServerSID := topology.NewServiceID("static-server", "default", "default")
staticClientSID := topology.NewServiceID("static-client", "default", "default")

clu := &topology.Config{
Images: utils.TargetImages(),
Networks: []*topology.Network{
{Name: "dc1"},
},
Clusters: []*topology.Cluster{
{
Name: "dc1",
Nodes: []*topology.Node{
{
Kind: topology.NodeKindServer,
// NOTE: uncomment the following lines to reproduce the frozen-agent bug
// Images: topology.Images{
// ConsulEnterprise: "hashicorp/consul-enterprise:1.16.1-ent",
// },
Name: "dc1-server1",
Addresses: []*topology.Address{
{Network: "dc1"},
},
},
{
Kind: topology.NodeKindDataplane,
Name: "dc1-client1",
Services: []*topology.Service{
{
ID: staticServerSID,
Image: "docker.mirror.hashicorp.services/fortio/fortio",
Port: 8080,
EnvoyAdminPort: 19000,
CheckTCP: "127.0.0.1:8080",
Command: []string{
"server",
"-http-port", "8080",
"-redirect-port", "-disabled",
},
},
},
},
{
Kind: topology.NodeKindDataplane,
Name: "dc1-client2",
Services: []*topology.Service{
{
ID: staticClientSID,
Image: "docker.mirror.hashicorp.services/fortio/fortio",
Port: 8080,
EnvoyAdminPort: 19000,
CheckTCP: "127.0.0.1:8080",
Command: []string{
"server",
"-http-port", "8080",
"-redirect-port", "-disabled",
},
Upstreams: []*topology.Upstream{
{
ID: staticServerSID,
LocalPort: 5000,
},
},
},
},
},
// Client3 for second static-server
{
Kind: topology.NodeKindDataplane,
Name: "dc1-client3",
Disabled: true,
Services: []*topology.Service{
{
ID: staticServerSID,
Image: "docker.mirror.hashicorp.services/fortio/fortio",
Port: 8080,
EnvoyAdminPort: 19000,
CheckTCP: "127.0.0.1:8080",
Command: []string{
"server",
"-http-port", "8080",
"-redirect-port", "-disabled",
},
},
},
},
},
Enterprise: true,
InitialConfigEntries: []api.ConfigEntry{
&api.ProxyConfigEntry{
Kind: api.ProxyDefaults,
Name: "global",
Partition: "default",
Config: map[string]any{
"protocol": "http",
},
},
&api.ServiceConfigEntry{
Kind: api.ServiceDefaults,
Name: "static-server",
Partition: "default",
Namespace: "default",
},
&api.ServiceIntentionsConfigEntry{
Kind: api.ServiceIntentions,
Name: "static-server",
Partition: "default",
Namespace: "default",
Sources: []*api.SourceIntention{
{
Name: "static-client",
Action: api.IntentionActionAllow,
},
},
},
},
},
},
}
sp := sprawltest.Launch(t, clu)

client, err := sp.HTTPClientForCluster("dc1")
require.NoError(t, err)

staticClient := sp.Topology().Clusters["dc1"].ServiceByID(
topology.NewNodeID("dc1-client2", "default"),
staticClientSID,
)
staticClientAddress := fmt.Sprintf("%s:%d", staticClient.Node.LocalAddress(), staticClient.Port)

// The following URL causes the static-client's fortio server to
// fetch the ?url= param (the upstream static-server in our case).
url := fmt.Sprintf("http://%s/fortio/fetch2?url=%s", staticClientAddress,
url.QueryEscape("http://localhost:5000"),
)

// We retry the first request until we get 200 OK since it may take a while
// for the server to be available.
// Use a custom retry.Timer since the default one usually times out too early.
retrySendRequest := func(isSuccess bool) {
t.Log("static-client sending requests to static-server...")
retry.RunWith(&retry.Timer{Timeout: 60 * time.Second, Wait: time.Millisecond * 500}, t, func(r *retry.R) {
resp, err := client.Post(url, "text/plain", nil)
require.NoError(r, err)
defer resp.Body.Close()

if isSuccess {
require.Equal(r, http.StatusOK, resp.StatusCode)
} else {
require.NotEqual(r, http.StatusOK, resp.StatusCode)
}
body, err := io.ReadAll(resp.Body)
require.NoError(r, err)
fmt.Println("Body: ", string(body), resp.StatusCode)
})
}
retrySendRequest(true)
t.Log("...ok, got 200 responses")

t.Log("Take a snapshot of the cluster and restore ...")
err = sp.SnapshotSave("dc1")
require.NoError(t, err)

// Shutdown existing static-server
cfg := sp.Config()
cluster := cfg.Cluster("dc1")
cluster.Nodes[1].Disabled = true // client 1 -- static-server
require.NoError(t, sp.Relaunch(cfg))
retrySendRequest(false)

// Add a new static-server
cfg = sp.Config()
cluster = cfg.Cluster("dc1")
cluster.Nodes[3].Disabled = false // client 3 -- static-server
require.NoError(t, sp.Relaunch(cfg))

// Ensure the static-client connected to static-server
retrySendRequest(true)
}
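One caveat in the relaunch steps above: they toggle nodes by slice index (cluster.Nodes[1], cluster.Nodes[3]), which breaks silently if the topology ordering changes. A name-based lookup keeps the intent explicit. A sketch of a hypothetical helper (not part of the deployer API), using only fields the test already relies on:

// nodeByName returns the node with the given name from the cluster config,
// failing the test if no such node exists. Hypothetical helper; not part of
// the deployer API.
func nodeByName(t *testing.T, cluster *topology.Cluster, name string) *topology.Node {
	t.Helper()
	for _, n := range cluster.Nodes {
		if n.Name == name {
			return n
		}
	}
	t.Fatalf("no node named %q in cluster %q", name, cluster.Name)
	return nil
}

With it, the shutdown step would read nodeByName(t, cluster, "dc1-client1").Disabled = true instead of indexing into Nodes.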
7 changes: 4 additions & 3 deletions testing/deployer/sprawl/boot.go
@@ -180,6 +180,7 @@ func (s *Sprawl) assignIPAddresses() error {
return fmt.Errorf("unknown network %q", addr.Network)
}
addr.IPAddress = net.IPByIndex(node.Index)
s.logger.Info("assign addr", "node", node.Name, "addr", addr.IPAddress)
}
}
}
@@ -504,7 +505,7 @@ func (s *Sprawl) waitForClientAntiEntropyOnce(cluster *topology.Cluster) error {
logger.Debug("all nodes have posted node updates, so first anti-entropy has happened", "elapsed", dur)
return nil
}
logger.Debug("not all client nodes have posted node updates yet", "nodes", stragglers)
logger.Debug("not all nodes have posted node updates yet", "nodes", stragglers)

time.Sleep(1 * time.Second)
}
@@ -514,10 +515,10 @@ func newGossipKey() (string, error) {
key := make([]byte, 16)
n, err := rand.Reader.Read(key)
if err != nil {
return "", fmt.Errorf("Error reading random data: %s", err)
return "", fmt.Errorf("error reading random data: %s", err)
}
if n != 16 {
return "", fmt.Errorf("Couldn't read enough entropy. Generate more entropy!")
return "", fmt.Errorf("couldn't read enough entropy. Generate more entropy")
}
return base64.StdEncoding.EncodeToString(key), nil
}
26 changes: 26 additions & 0 deletions testing/deployer/sprawl/sprawl.go
@@ -235,6 +235,32 @@ func (s *Sprawl) Relaunch(
return nil
}

// SnapshotSave saves a snapshot of the named cluster and then restores the
// cluster from that snapshot.
func (s *Sprawl) SnapshotSave(clusterName string) error {
cluster, ok := s.topology.Clusters[clusterName]
if !ok {
return fmt.Errorf("no such cluster: %s", clusterName)
}
var (
client = s.clients[cluster.Name]
)
snapshot := client.Snapshot()
snap, _, err := snapshot.Save(nil)
if err != nil {
return fmt.Errorf("error saving snapshot: %w", err)
}
s.logger.Info("snapshot saved")
time.Sleep(3 * time.Second)
defer snap.Close()

// Restore the snapshot.
if err := snapshot.Restore(nil, snap); err != nil {
return fmt.Errorf("error restoring snapshot: %w", err)
}
s.logger.Info("snapshot restored")
return nil
}

// Leader returns the cluster leader agent, or an error if no leader is
// available.
func (s *Sprawl) Leader(clusterName string) (*topology.Node, error) {
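For reference, the call pattern exercised by the new test condenses to the following (taken from snapshot_test.go above):

sp := sprawltest.Launch(t, clu)

// Save a snapshot and immediately restore the cluster from it.
require.NoError(t, sp.SnapshotSave("dc1"))

// Mutate the topology, then roll the cluster to match.
cfg := sp.Config()
cfg.Cluster("dc1").Nodes[1].Disabled = true
require.NoError(t, sp.Relaunch(cfg))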
4 changes: 2 additions & 2 deletions testing/deployer/topology/default_consul.go
@@ -3,5 +3,5 @@

package topology

const DefaultConsulImage = "hashicorp/consul:1.15.2"
const DefaultConsulEnterpriseImage = "hashicorp/consul-enterprise:1.15.2-ent"
const DefaultConsulImage = "hashicorp/consul:1.16.2"
const DefaultConsulEnterpriseImage = "hashicorp/consul-enterprise:1.16.2-ent"
4 changes: 2 additions & 2 deletions testing/deployer/topology/topology.go
@@ -515,7 +515,7 @@ func (n *Node) LocalAddress() string {
for _, a := range n.Addresses {
if a.IsLocal() {
if a.IPAddress == "" {
panic("node has no assigned local address")
panic("node has no assigned local address: " + n.Name)
}
return a.IPAddress
}
@@ -538,7 +538,7 @@ func (n *Node) LocalProxyPort() int {
if a.ProxyPort > 0 {
return a.ProxyPort
}
panic("node has no assigned local address")
panic("node has no assigned local address: " + n.Name)
}
}
panic("node has no local network")