Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

spire-agent: re-attest without restarting #4991

Merged
merged 6 commits into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 27 additions & 21 deletions pkg/agent/manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,28 +192,34 @@ func (m *manager) Initialize(ctx context.Context) error {
func (m *manager) Run(ctx context.Context) error {
defer m.client.Release()

err := util.RunTasks(ctx,
m.runSynchronizer,
m.runSyncSVIDs,
m.runSVIDObserver,
m.runBundleObserver,
m.svid.Run)
for {
err := util.RunTasks(ctx,
m.runSynchronizer,
m.runSyncSVIDs,
m.runSVIDObserver,
m.runBundleObserver,
m.svid.Run)

switch {
case err == nil || errors.Is(err, context.Canceled):
m.c.Log.Info("Cache manager stopped")
return nil
case nodeutil.ShouldAgentReattest(err):
m.c.Log.WithError(err).Warn("Agent needs to re-attest; removing SVID and shutting down")
m.deleteSVID()
return err
case nodeutil.ShouldAgentShutdown(err):
m.c.Log.WithError(err).Warn("Agent is banned: removing SVID and shutting down")
m.deleteSVID()
return err
default:
m.c.Log.WithError(err).Error("Cache manager crashed")
return err
switch {
case err == nil || errors.Is(err, context.Canceled):
m.c.Log.Info("Cache manager stopped")
return nil
case nodeutil.ShouldAgentReattest(err):
m.c.Log.WithError(err).Warn("Agent needs to re-attest; will attempt to re-attest")
reattestError := m.svid.Reattest(ctx)
if reattestError != nil {
m.c.Log.WithError(reattestError).Error("Agent failed re-attestation; removing SVID and shutting down")
m.deleteSVID()
return err
}
case nodeutil.ShouldAgentShutdown(err):
m.c.Log.WithError(err).Warn("Agent is banned: removing SVID and shutting down")
m.deleteSVID()
return err
default:
m.c.Log.WithError(err).Error("Cache manager crashed")
return err
}
}
}

Expand Down
24 changes: 24 additions & 0 deletions pkg/agent/svid/rotator.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (

type Rotator interface {
Run(ctx context.Context) error
Reattest(ctx context.Context) error

State() State
Subscribe() observer.Stream
Expand Down Expand Up @@ -137,6 +138,29 @@ func (r *rotator) SetRotationFinishedHook(f func()) {
r.rotationFinishedHook = f
}

func (r *rotator) Reattest(ctx context.Context) (err error) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add unit tests for this new function?

state, ok := r.state.Value().(State)
if !ok {
return fmt.Errorf("unexpected value type: %T", r.state.Value())
}

if state.Reattestable {
if !r.c.DisableReattestToRenew {
err = r.reattest(ctx)
} else {
return errors.New("re-attestation is disabled")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure about this error, since a user must DIsableReattesttion to get into this case...
Maybe it worth to have a single if?

if state.Reattestable && !r.c.DisableReattestToRenew {

and if that is the case this code is pretty much the same that is in rotateSVIDIfNeeded and we can expose that instead?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's what i had before and @amartinezfayo asked me to change. Either way is fine with me.

rotateSVIDIfNeeded can't be used directly because that only rotates if the SVID is expired.

}
} else {
return errors.New("attestation method is not re-attestable")
}

if err == nil && r.rotationFinishedHook != nil {
r.rotationFinishedHook()
}

return err
}

func (r *rotator) rotateSVIDIfNeeded(ctx context.Context) (err error) {
state, ok := r.state.Value().(State)
if !ok {
Expand Down
23 changes: 17 additions & 6 deletions pkg/agent/svid/rotator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,11 @@ func TestRotator(t *testing.T) {
}

for _, tt := range []struct {
name string
notAfter time.Duration
shouldRotate bool
reattest bool
name string
notAfter time.Duration
shouldRotate bool
reattest bool
forceRotation bool
}{
{
name: "not expired at startup",
Expand Down Expand Up @@ -94,6 +95,13 @@ func TestRotator(t *testing.T) {
shouldRotate: true,
reattest: true,
},
{
name: "reattest when requested",
notAfter: time.Minute,
shouldRotate: false,
reattest: true,
forceRotation: true,
},
} {
t.Run(tt.name, func(t *testing.T) {
svidKM := keymanager.ForSVID(fakeagentkeymanager.New(t, ""))
Expand Down Expand Up @@ -200,6 +208,9 @@ func TestRotator(t *testing.T) {
}
t.Fatal("timed out waiting for rotation check to finish")
}
} else if tt.forceRotation {
err := rotator.Reattest(context.Background())
require.NoError(t, err)
}

// Shut down the rotator
Expand All @@ -213,7 +224,7 @@ func TestRotator(t *testing.T) {

// If rotation was supposed to happen, wait for the SVID changes
// on the state stream.
if tt.shouldRotate {
if tt.shouldRotate || tt.forceRotation {
require.True(t, stream.HasNext(), "SVID stream should have changes")
stream.Next()
} else {
Expand All @@ -224,7 +235,7 @@ func TestRotator(t *testing.T) {
// the appropriate number of times.
state := stream.Value().(State)
require.Len(t, state.SVID, 1)
if tt.shouldRotate {
if tt.shouldRotate || tt.forceRotation {
assert.NotEqual(t, svid, state.SVID)
assert.NotEqual(t, svidKey, state.Key)
assert.Equal(t, 2, mockClient.releaseCount, "client might not released after rotation")
Expand Down
14 changes: 12 additions & 2 deletions test/integration/suites/evict-agent/02-bootstrap-agent
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
#!/bin/bash

log-debug "bootstrapping agent..."
docker-compose exec -T spire-server \
/opt/spire/bin/spire-server bundle show > conf/agent/bootstrap.crt

MAXCHECKS=30
CHECKINTERVAL=1
for ((i=1;i<=MAXCHECKS;i++)); do
log-info "trying to bootstrap agent ($i of $MAXCHECKS max)..."
docker-compose logs spire-agent
if docker-compose exec -T spire-server \
/opt/spire/bin/spire-server bundle show > conf/agent/bootstrap.crt; then
exit 0
fi
sleep "${CHECKINTERVAL}"
done
13 changes: 13 additions & 0 deletions test/integration/suites/evict-agent/07-start-agent
Original file line number Diff line number Diff line change
@@ -1,4 +1,17 @@
#!/bin/bash

log-debug "starting agent again..."

docker-up spire-agent

# Check at most 30 times (with one second in between) that the agent is back up
MAXCHECKS=30
CHECKINTERVAL=1
for ((i=1;i<=MAXCHECKS;i++)); do
log-info "checking that the agent is back up ($i of $MAXCHECKS max)..."
docker-compose logs spire-agent
if docker-compose logs spire-agent | grep "Starting Workload and SDS APIs"; then
exit 0
fi
sleep "${CHECKINTERVAL}"
done
21 changes: 0 additions & 21 deletions test/integration/suites/evict-agent/08-delete-agent

This file was deleted.

44 changes: 44 additions & 0 deletions test/integration/suites/evict-agent/08-evict-agent
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash

log-debug "deleting agent..."

# Check at most 30 times (with one second in between) that we can evict the agent, it may take a while for it to start up
MAXCHECKS=30
CHECKINTERVAL=1
for ((i=1;i<=MAXCHECKS;i++)); do
log-info "attempting to evict agent ($i of $MAXCHECKS max)..."
if docker-compose exec -T spire-server \
/opt/spire/bin/spire-server agent evict \
-spiffeID "spiffe://domain.test/spire/agent/x509pop/$(fingerprint conf/agent/agent.crt.pem)"; then
exit 0
fi
sleep "${CHECKINTERVAL}"
done


# Check at most 30 times (with one second in between) that the agent has to re-attest
MAXCHECKS=30
CHECKINTERVAL=1
for ((i=1;i<=MAXCHECKS;i++)); do
log-info "checking for agent to get notification and try to reattest ($i of $MAXCHECKS max)..."
docker-compose logs spire-agent
if docker-compose logs spire-agent | grep "Agent needs to re-attest; will attempt to re-attest"; then
exit 0
fi
sleep "${CHECKINTERVAL}"
done

# Check at most 30 times (with one second in between) that the agent has re-attested
MAXCHECKS=30
CHECKINTERVAL=1
for ((i=1;i<=MAXCHECKS;i++)); do
log-info "checking for agent to get notification and try to reattest ($i of $MAXCHECKS max)..."
docker-compose logs spire-agent
if docker-compose logs spire-agent | grep "Successfully reattested node"; then
exit 0
fi
sleep "${CHECKINTERVAL}"
done

fail-now "timed out waiting for agent to shut down"

Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,7 @@ docker-compose exec -T spire-server \
/opt/spire/bin/spire-server agent evict -spiffeID $AGENT_B_SPIFFE_ID || fail-now "failed to evict agent b."

check-evict-agents $AGENT_A_SPIFFE_ID $AGENT_B_SPIFFE_ID
check-attested-agents

# spire-agent-a will re-attest but spire-agent-b won't because join_token implements trust on first use model.
AGENT_A_SPIFFE_ID_PATH="/spire/agent/x509pop/$(fingerprint conf/agent/agent.crt.pem)"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you move this PATH at the start and use it to create AGENT_A_SPIFFE_ID?

check-attested-agents $AGENT_A_SPIFFE_ID_PATH
Loading