Skip to content

Commit

Permalink
Add metrics to acs for eni provisioning workflow monitoring
Browse files Browse the repository at this point in the history
  • Loading branch information
Charles Cheng committed Nov 27, 2024
1 parent a080504 commit 0d2c701
Show file tree
Hide file tree
Showing 3 changed files with 153 additions and 2 deletions.
25 changes: 25 additions & 0 deletions ecs-agent/acs/session/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ type session struct {
disconnectJitter time.Duration
inactiveInstanceReconnectDelay time.Duration
lastConnectedTime time.Time
firstACSConnectionTime time.Time
firstDiscoverPollEndpointTime time.Time
}

// NewSession creates a new Session.
Expand Down Expand Up @@ -158,6 +160,8 @@ func NewSession(containerInstanceARN string,
disconnectJitter: wsclient.DisconnectJitterMax,
inactiveInstanceReconnectDelay: inactiveInstanceReconnectDelay,
lastConnectedTime: time.Time{},
firstACSConnectionTime: time.Time{},
firstDiscoverPollEndpointTime: time.Time{},
}
}

Expand Down Expand Up @@ -234,7 +238,14 @@ func (s *session) Start(ctx context.Context) error {
// startSessionOnce creates a session with ACS and handles requests using the passed
// in arguments.
func (s *session) startSessionOnce(ctx context.Context) error {
if s.GetFirstDiscoverPollEndpointTime().IsZero() {
s.firstDiscoverPollEndpointTime = time.Now()
}

discoverPollEndpointMetric := s.metricsFactory.New(metrics.ACSDiscoverPollEndpointDurationName)
acsEndpoint, err := s.ecsClient.DiscoverPollEndpoint(s.containerInstanceARN)
discoverPollEndpointMetric.Done(err)

if err != nil {
logger.Error("ACS: Unable to discover poll endpoint", logger.Fields{
"containerInstanceARN": s.containerInstanceARN,
Expand All @@ -253,6 +264,7 @@ func (s *session) startSessionOnce(ctx context.Context) error {

// Invoke Connect method as soon as we create client. This will ensure all the
// request handlers to be associated with this client have a valid connection.
acsConnectionMetric := s.metricsFactory.New(metrics.ACSConnectionMetricDurationName)
disconnectTimer, err := client.Connect(metrics.ACSDisconnectTimeoutMetricName, s.disconnectTimeout,
s.disconnectJitter)
if err != nil {
Expand All @@ -262,8 +274,13 @@ func (s *session) startSessionOnce(ctx context.Context) error {
})
return err
}
acsConnectionMetric.Done(err)
defer disconnectTimer.Stop()

if s.GetFirstACSConnectionTime().IsZero() {
s.firstACSConnectionTime = time.Now()
}

// Record the timestamp of the last connection to ACS.
s.lastConnectedTime = time.Now()

Expand Down Expand Up @@ -475,3 +492,11 @@ func formatDockerVersion(dockerVersionValue string) string {
func (s *session) GetLastConnectedTime() time.Time {
// lastConnectedTime starts as the zero time.Time in NewSession and is
// overwritten with time.Now() in startSessionOnce after each successful Connect.
return s.lastConnectedTime
}

// GetFirstACSConnectionTime returns the time at which this session first
// connected to ACS successfully. The value is recorded once by
// startSessionOnce, right after the first Connect call that returns no error,
// and is never updated afterwards. It is the zero time.Time until then.
// NOTE(review): the field is read here without any locking — confirm callers
// cannot race with startSessionOnce.
func (s *session) GetFirstACSConnectionTime() time.Time {
return s.firstACSConnectionTime
}

// GetFirstDiscoverPollEndpointTime returns the time at which this session
// first attempted to discover the ACS poll endpoint. The value is recorded
// once, at the start of the first startSessionOnce invocation and before the
// DiscoverPollEndpoint call is made, so it is set whether or not that call
// succeeds. It is the zero time.Time until then.
// NOTE(review): the field is read here without any locking — confirm callers
// cannot race with startSessionOnce.
func (s *session) GetFirstDiscoverPollEndpointTime() time.Time {
return s.firstDiscoverPollEndpointTime
}
125 changes: 123 additions & 2 deletions ecs-agent/acs/session/session_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
"github.com/aws/amazon-ecs-agent/ecs-agent/doctor"
"github.com/aws/amazon-ecs-agent/ecs-agent/eventstream"
metricsfactory "github.com/aws/amazon-ecs-agent/ecs-agent/metrics"
mock_metrics "github.com/aws/amazon-ecs-agent/ecs-agent/metrics/mocks"
"github.com/aws/amazon-ecs-agent/ecs-agent/utils/retry"
mock_retry "github.com/aws/amazon-ecs-agent/ecs-agent/utils/retry/mock"
"github.com/aws/amazon-ecs-agent/ecs-agent/wsclient"
Expand Down Expand Up @@ -224,6 +225,16 @@ func TestSessionReconnectsOnConnectErrors(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl)

mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).AnyTimes()
mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes()

mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes()
mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry).AnyTimes()

ecsClient := mock_ecs.NewMockECSClient(ctrl)
ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes()

Expand All @@ -243,7 +254,7 @@ func TestSessionReconnectsOnConnectErrors(t *testing.T) {
// Connect fails 10 times.
mockWsClient.EXPECT().Connect(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, io.EOF).Times(10),
// Cancel trying to connect to ACS on the 11th attempt.
// Failure to retry on Connect() errors should cause the test to time out as the context is never canceled.
// Failure to retry on Connect() errors should cause the test to time out as the context is never canceled.
mockWsClient.EXPECT().Connect(gomock.Any(), gomock.Any(), gomock.Any()).Do(func(interface{},
interface{}, interface{}) {
cancel()
Expand All @@ -253,6 +264,7 @@ func TestSessionReconnectsOnConnectErrors(t *testing.T) {
containerInstanceARN: testconst.ContainerInstanceARN,
ecsClient: ecsClient,
clientFactory: mockClientFactory,
metricsFactory: mockMetricsFactory,
heartbeatTimeout: 20 * time.Millisecond,
heartbeatJitter: 10 * time.Millisecond,
disconnectTimeout: 30 * time.Millisecond,
Expand Down Expand Up @@ -345,6 +357,16 @@ func TestSessionReconnectsWithoutBackoffOnEOFError(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl)

mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).AnyTimes()
mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes()

mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes()
mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry).AnyTimes()

ecsClient := mock_ecs.NewMockECSClient(ctrl)
ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes()

Expand Down Expand Up @@ -377,6 +399,7 @@ func TestSessionReconnectsWithoutBackoffOnEOFError(t *testing.T) {
inactiveInstanceCB: noopFunc,
backoff: mockBackoff,
clientFactory: mockClientFactory,
metricsFactory: mockMetricsFactory,
heartbeatTimeout: 20 * time.Millisecond,
heartbeatJitter: 10 * time.Millisecond,
disconnectTimeout: 30 * time.Millisecond,
Expand All @@ -394,6 +417,16 @@ func TestSessionReconnectsWithBackoffOnNonEOFError(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl)

mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).AnyTimes()
mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes()

mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes()
mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry).AnyTimes()

ecsClient := mock_ecs.NewMockECSClient(ctrl)
ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes()

Expand Down Expand Up @@ -427,6 +460,7 @@ func TestSessionReconnectsWithBackoffOnNonEOFError(t *testing.T) {
inactiveInstanceCB: noopFunc,
backoff: mockBackoff,
clientFactory: mockClientFactory,
metricsFactory: mockMetricsFactory,
heartbeatTimeout: 20 * time.Millisecond,
heartbeatJitter: 10 * time.Millisecond,
disconnectTimeout: 30 * time.Millisecond,
Expand All @@ -444,6 +478,16 @@ func TestSessionCallsInactiveInstanceCB(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl)

mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).AnyTimes()
mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes()

mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes()
mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry).AnyTimes()

ecsClient := mock_ecs.NewMockECSClient(ctrl)
ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes()

Expand Down Expand Up @@ -480,6 +524,7 @@ func TestSessionCallsInactiveInstanceCB(t *testing.T) {
ecsClient: ecsClient,
inactiveInstanceCB: inactiveInstanceCB,
clientFactory: mockClientFactory,
metricsFactory: mockMetricsFactory,
heartbeatTimeout: 20 * time.Millisecond,
heartbeatJitter: 10 * time.Millisecond,
disconnectTimeout: 30 * time.Millisecond,
Expand All @@ -499,6 +544,16 @@ func TestSessionReconnectDelayForInactiveInstanceError(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl)

mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).AnyTimes()
mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes()

mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes()
mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry).AnyTimes()

ecsClient := mock_ecs.NewMockECSClient(ctrl)
ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes()

Expand Down Expand Up @@ -540,6 +595,7 @@ func TestSessionReconnectDelayForInactiveInstanceError(t *testing.T) {
ecsClient: ecsClient,
inactiveInstanceCB: noopFunc,
clientFactory: mockClientFactory,
metricsFactory: mockMetricsFactory,
heartbeatTimeout: 20 * time.Millisecond,
heartbeatJitter: 10 * time.Millisecond,
disconnectTimeout: 30 * time.Millisecond,
Expand All @@ -559,6 +615,16 @@ func TestSessionReconnectsOnServeErrors(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl)

mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).AnyTimes()
mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes()

mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes()
mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry).AnyTimes()

ecsClient := mock_ecs.NewMockECSClient(ctrl)
ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes()

Expand Down Expand Up @@ -590,6 +656,7 @@ func TestSessionReconnectsOnServeErrors(t *testing.T) {
ecsClient: ecsClient,
inactiveInstanceCB: noopFunc,
clientFactory: mockClientFactory,
metricsFactory: mockMetricsFactory,
heartbeatTimeout: 20 * time.Millisecond,
heartbeatJitter: 10 * time.Millisecond,
disconnectTimeout: 30 * time.Millisecond,
Expand All @@ -608,6 +675,16 @@ func TestSessionStopsWhenContextIsCanceled(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl)

mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).AnyTimes()
mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes()

mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes()
mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry).AnyTimes()

ecsClient := mock_ecs.NewMockECSClient(ctrl)
ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes()

Expand Down Expand Up @@ -635,6 +712,7 @@ func TestSessionStopsWhenContextIsCanceled(t *testing.T) {
ecsClient: ecsClient,
inactiveInstanceCB: noopFunc,
clientFactory: mockClientFactory,
metricsFactory: mockMetricsFactory,
heartbeatTimeout: 20 * time.Millisecond,
heartbeatJitter: 10 * time.Millisecond,
disconnectTimeout: 30 * time.Millisecond,
Expand All @@ -653,6 +731,16 @@ func TestSessionStopsWhenContextIsErrorDueToTimeout(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl)

mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry)
mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any())

mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any())
mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry)

ecsClient := mock_ecs.NewMockECSClient(ctrl)
ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes()

Expand All @@ -677,6 +765,7 @@ func TestSessionStopsWhenContextIsErrorDueToTimeout(t *testing.T) {
ecsClient: ecsClient,
inactiveInstanceCB: noopFunc,
clientFactory: mockClientFactory,
metricsFactory: mockMetricsFactory,
heartbeatTimeout: 20 * time.Millisecond,
heartbeatJitter: 10 * time.Millisecond,
inactiveInstanceReconnectDelay: 1 * time.Hour,
Expand All @@ -694,6 +783,16 @@ func TestSessionReconnectsOnDiscoverPollEndpointError(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl)

mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).Times(2)
mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).Times(2)

mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any())
mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry)

ecsClient := mock_ecs.NewMockECSClient(ctrl)
ctx, cancel := context.WithCancel(context.Background())

Expand Down Expand Up @@ -725,6 +824,7 @@ func TestSessionReconnectsOnDiscoverPollEndpointError(t *testing.T) {
ecsClient: ecsClient,
inactiveInstanceCB: noopFunc,
clientFactory: mockClientFactory,
metricsFactory: mockMetricsFactory,
heartbeatTimeout: 20 * time.Millisecond,
heartbeatJitter: 10 * time.Millisecond,
disconnectTimeout: 30 * time.Millisecond,
Expand Down Expand Up @@ -756,6 +856,16 @@ func TestConnectionIsClosedOnIdle(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl)

mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry)
mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any())

mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry)
mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any())

ecsClient := mock_ecs.NewMockECSClient(ctrl)
ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes()
ctx, cancel := context.WithCancel(context.Background())
Expand Down Expand Up @@ -783,6 +893,7 @@ func TestConnectionIsClosedOnIdle(t *testing.T) {
ecsClient: ecsClient,
inactiveInstanceCB: noopFunc,
clientFactory: mockClientFactory,
metricsFactory: mockMetricsFactory,
heartbeatTimeout: 20 * time.Millisecond,
heartbeatJitter: 10 * time.Millisecond,
disconnectTimeout: 30 * time.Millisecond,
Expand Down Expand Up @@ -1003,6 +1114,16 @@ func TestSessionCorrectlySetsSendCredentials(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl)

mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).Times(10)
mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).Times(10)

mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl)
mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry).Times(10)
mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()).Times(10)

const numInvocations = 10
ecsClient := mock_ecs.NewMockECSClient(ctrl)
ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes()
Expand All @@ -1025,7 +1146,7 @@ func TestSessionCorrectlySetsSendCredentials(t *testing.T) {
nil,
noopFunc,
mockClientFactory,
metricsfactory.NewNopEntryFactory(),
mockMetricsFactory,
agentVersion,
agentGitShortHash,
dockerVersion,
Expand Down
5 changes: 5 additions & 0 deletions ecs-agent/metrics/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ const (
ACSDisconnectTimeoutMetricName = agentAvailabilityNamespace + ".ACSDisconnectTimeout"
TCSDisconnectTimeoutMetricName = agentAvailabilityNamespace + ".TCSDisconnectTimeout"

// ACS Session Metrics
acsStartSessionNamespace = "ACSStartSession"
ACSDiscoverPollEndpointDurationName = acsStartSessionNamespace + ".DiscoverPollEndpointDuration"
ACSConnectionMetricDurationName = acsStartSessionNamespace + ".ACSConnectEndpointDuration"

dbClientMetricNamespace = "Data"
GetNetworkConfigurationByTaskMetricName = dbClientMetricNamespace + ".GetNetworkConfigurationByTask"
SaveNetworkNamespaceMetricName = dbClientMetricNamespace + ".SaveNetworkNamespace"
Expand Down

0 comments on commit 0d2c701

Please sign in to comment.