From 22afc6ece9715514108aec8335918aebe5fd6c46 Mon Sep 17 00:00:00 2001 From: Jay Deokar Date: Wed, 16 Nov 2022 21:08:24 -0500 Subject: [PATCH] Add metrics for ec2 api calls made by CNI and expose via prometheus --- cmd/cni-metrics-helper/metrics/cni_metrics.go | 14 +++++ cmd/cni-metrics-helper/metrics/cni_test1.data | 10 ++++ .../metrics/metrics_test.go | 4 ++ pkg/awsutils/awsutils.go | 58 +++++++++++++++++++ .../metrics-helper/metric_helper_test.go | 34 +++++++++++ 5 files changed, 120 insertions(+) diff --git a/cmd/cni-metrics-helper/metrics/cni_metrics.go b/cmd/cni-metrics-helper/metrics/cni_metrics.go index c78d3f8fbcb..2d284b837df 100644 --- a/cmd/cni-metrics-helper/metrics/cni_metrics.go +++ b/cmd/cni-metrics-helper/metrics/cni_metrics.go @@ -147,6 +147,20 @@ var InterestingCNIMetrics = map[string]metricsConvert{ actionFunc: metricsAdd, data: &dataPoints{}, logToFile: true}}}, + "awscni_ec2api_req_count": { + actions: []metricsAction{ + {cwMetricName: "ec2ApiReqCount", + matchFunc: matchAny, + actionFunc: metricsAdd, + data: &dataPoints{}, + logToFile: true}}}, + "awscni_ec2api_error_count": { + actions: []metricsAction{ + {cwMetricName: "ec2ApiErrCount", + matchFunc: matchAny, + actionFunc: metricsAdd, + data: &dataPoints{}, + logToFile: true}}}, } // CNIMetricsTarget defines data structure for kube-state-metric target diff --git a/cmd/cni-metrics-helper/metrics/cni_test1.data b/cmd/cni-metrics-helper/metrics/cni_test1.data index e833b491bd7..654f845b8af 100644 --- a/cmd/cni-metrics-helper/metrics/cni_test1.data +++ b/cmd/cni-metrics-helper/metrics/cni_test1.data @@ -7,6 +7,16 @@ awscni_assigned_ip_addresses 1 # HELP awscni_aws_api_error_count The number of times AWS API returns an error # TYPE awscni_aws_api_error_count counter awscni_aws_api_error_count{api="DeleteNetworkInterface",error="InvalidParameterValue"} 14 +# HELP awscni_ec2api_req_count The number of requests made to EC2 APIs by CNI +# TYPE awscni_ec2api_req_count counter +awscni_ec2api_req_count{fn="AssignPrivateIpAddresses"} 1 +awscni_ec2api_req_count{fn="AttachNetworkInterface"} 1 +awscni_ec2api_req_count{fn="CreateNetworkInterface"} 1 +awscni_ec2api_req_count{fn="DeleteNetworkInterface"} 1 +awscni_ec2api_req_count{fn="DescribeInstances"} 1 +awscni_ec2api_req_count{fn="DescribeNetworkInterfaces"} 13 +awscni_ec2api_req_count{fn="DetachNetworkInterface"} 1 +awscni_ec2api_req_count{fn="ModifyNetworkInterfaceAttribute"} 2 # HELP awscni_aws_api_latency_ms AWS API call latency in ms # TYPE awscni_aws_api_latency_ms summary awscni_aws_api_latency_ms{api="AssignPrivateIpAddresses",error="false",quantile="0.5"} NaN diff --git a/cmd/cni-metrics-helper/metrics/metrics_test.go b/cmd/cni-metrics-helper/metrics/metrics_test.go index 849387dc6b8..bc81c53537f 100644 --- a/cmd/cni-metrics-helper/metrics/metrics_test.go +++ b/cmd/cni-metrics-helper/metrics/metrics_test.go @@ -72,6 +72,10 @@ func TestAPIServerMetric(t *testing.T) { // verify awscni_assigned_ip_addresses value assert.Equal(t, 1.0, actions[0].data.curSingleDataPoint) + // verify awscni_ec2api_req_count value + actions = InterestingCNIMetrics["awscni_ec2api_req_count"].actions + assert.Equal(t, 21.0, actions[0].data.curSingleDataPoint) + actions = InterestingCNIMetrics["awscni_total_ip_addresses"].actions // verify awscni_total_ip_addresses value assert.Equal(t, 10.0, actions[0].data.curSingleDataPoint) diff --git a/pkg/awsutils/awsutils.go b/pkg/awsutils/awsutils.go index ce798d698b1..3e9129087c9 100644 --- a/pkg/awsutils/awsutils.go +++ b/pkg/awsutils/awsutils.go @@ -105,6 +105,20 @@ var ( }, []string{"fn", "error"}, ) + ec2ApiReq = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "awscni_ec2api_req_count", + Help: "The number of requests made to EC2 APIs by CNI", + }, + []string{"fn"}, + ) + ec2ApiErr = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "awscni_ec2api_error_count", + Help: "The number of failed EC2 APIs requests", + }, + []string{"fn"}, + ) prometheusRegistered = false ) @@ -306,6 +320,8 @@ func prometheusRegister() { prometheus.MustRegister(awsAPILatency) prometheus.MustRegister(awsAPIErr) prometheus.MustRegister(awsUtilsErr) + prometheus.MustRegister(ec2ApiReq) + prometheus.MustRegister(ec2ApiErr) prometheusRegistered = true } } @@ -555,6 +571,7 @@ func (cache *EC2InstanceMetadataCache) RefreshSGIDs(mac string) error { } start := time.Now() _, err = cache.ec2SVC.ModifyNetworkInterfaceAttributeWithContext(context.Background(), attributeInput) + ec2ApiReq.WithLabelValues("ModifyNetworkInterfaceAttribute").Inc() awsAPILatency.WithLabelValues("ModifyNetworkInterfaceAttribute", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err != nil { if aerr, ok := err.(awserr.Error); ok { @@ -564,6 +581,7 @@ func (cache *EC2InstanceMetadataCache) RefreshSGIDs(mac string) error { } CheckAPIErrorAndBroadcastEvent(err, "ec2:ModifyNetworkInterfaceAttribute") awsAPIErrInc("ModifyNetworkInterfaceAttribute", err) + ec2ApiErr.WithLabelValues("ModifyNetworkInterfaceAttribute").Inc() //No need to return error here since retry will happen in 30seconds and also //If update failed due to stale ENI then returning error will prevent updating SG //for following ENIs since the list is sorted @@ -701,10 +719,12 @@ func (cache *EC2InstanceMetadataCache) awsGetFreeDeviceNumber() (int, error) { start := time.Now() result, err := cache.ec2SVC.DescribeInstancesWithContext(context.Background(), input) + ec2ApiReq.WithLabelValues("DescribeInstances").Inc() awsAPILatency.WithLabelValues("DescribeInstances", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err != nil { CheckAPIErrorAndBroadcastEvent(err, "ec2:DescribeInstances") awsAPIErrInc("DescribeInstances", err) + ec2ApiErr.WithLabelValues("DescribeInstances").Inc() log.Errorf("awsGetFreeDeviceNumber: Unable to retrieve instance data from EC2 control plane %v", err) return 0, errors.Wrap(err, "find a free device number for ENI: not able to retrieve instance data from EC2 control plane") @@ -765,10 +785,12 @@ func (cache *EC2InstanceMetadataCache) AllocENI(useCustomCfg bool, sg []*string, start := time.Now() _, err = cache.ec2SVC.ModifyNetworkInterfaceAttributeWithContext(context.Background(), attributeInput) + ec2ApiReq.WithLabelValues("ModifyNetworkInterfaceAttribute").Inc() awsAPILatency.WithLabelValues("ModifyNetworkInterfaceAttribute", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err != nil { CheckAPIErrorAndBroadcastEvent(err, "ec2:ModifyNetworkInterfaceAttribute") awsAPIErrInc("ModifyNetworkInterfaceAttribute", err) + ec2ApiErr.WithLabelValues("ModifyNetworkInterfaceAttribute").Inc() err := cache.FreeENI(eniID) if err != nil { awsUtilsErrInc("ENICleanupUponModifyNetworkErr", err) @@ -795,10 +817,12 @@ func (cache *EC2InstanceMetadataCache) attachENI(eniID string) (string, error) { } start := time.Now() attachOutput, err := cache.ec2SVC.AttachNetworkInterfaceWithContext(context.Background(), attachInput) + ec2ApiReq.WithLabelValues("AttachNetworkInterface").Inc() awsAPILatency.WithLabelValues("AttachNetworkInterface", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err != nil { CheckAPIErrorAndBroadcastEvent(err, "ec2:AttachNetworkInterface") awsAPIErrInc("AttachNetworkInterface", err) + ec2ApiErr.WithLabelValues("AttachNetworkInterface").Inc() log.Errorf("Failed to attach ENI %s: %v", eniID, err) return "", errors.Wrap(err, "attachENI: failed to attach ENI") } @@ -844,10 +868,12 @@ func (cache *EC2InstanceMetadataCache) createENI(useCustomCfg bool, sg []*string start := time.Now() result, err := cache.ec2SVC.CreateNetworkInterfaceWithContext(context.Background(), input) + ec2ApiReq.WithLabelValues("CreateNetworkInterface").Inc() awsAPILatency.WithLabelValues("CreateNetworkInterface", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err != nil { CheckAPIErrorAndBroadcastEvent(err, "ec2:CreateNetworkInterface") awsAPIErrInc("CreateNetworkInterface", err) + ec2ApiErr.WithLabelValues("CreateNetworkInterface").Inc() log.Errorf("Failed to CreateNetworkInterface %v", err) return "", errors.Wrap(err, "failed to create network interface") } @@ -894,10 +920,12 @@ func (cache *EC2InstanceMetadataCache) TagENI(eniID string, currentTags map[stri return retry.NWithBackoff(retry.NewSimpleBackoff(500*time.Millisecond, maxENIBackoffDelay, 0.3, 2), 5, func() error { start := time.Now() _, err := cache.ec2SVC.CreateTagsWithContext(context.Background(), input) + ec2ApiReq.WithLabelValues("CreateTags").Inc() awsAPILatency.WithLabelValues("CreateTags", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err != nil { CheckAPIErrorAndBroadcastEvent(err, "ec2:CreateTags") awsAPIErrInc("CreateTags", err) + ec2ApiErr.WithLabelValues("CreateTags").Inc() log.Warnf("Failed to tag the newly created ENI %s:", eniID) return err } @@ -953,10 +981,12 @@ func (cache *EC2InstanceMetadataCache) freeENI(eniName string, sleepDelayAfterDe err = retry.NWithBackoff(retry.NewSimpleBackoff(time.Millisecond*200, maxBackoffDelay, 0.15, 2.0), maxENIEC2APIRetries, func() error { start := time.Now() _, ec2Err := cache.ec2SVC.DetachNetworkInterfaceWithContext(context.Background(), detachInput) + ec2ApiReq.WithLabelValues("DetachNetworkInterface").Inc() awsAPILatency.WithLabelValues("DetachNetworkInterface", fmt.Sprint(ec2Err != nil), awsReqStatus(ec2Err)).Observe(msSince(start)) if ec2Err != nil { CheckAPIErrorAndBroadcastEvent(err, "ec2:DetachNetworkInterface") awsAPIErrInc("DetachNetworkInterface", ec2Err) + ec2ApiErr.WithLabelValues("DetachNetworkInterface").Inc() log.Errorf("Failed to detach ENI %s %v", eniName, ec2Err) return errors.New("unable to detach ENI from EC2 instance, giving up") } @@ -989,6 +1019,7 @@ func (cache *EC2InstanceMetadataCache) getENIAttachmentID(eniID string) (*string start := time.Now() result, err := cache.ec2SVC.DescribeNetworkInterfacesWithContext(context.Background(), input) + ec2ApiReq.WithLabelValues("DescribeNetworkInterfaces").Inc() awsAPILatency.WithLabelValues("DescribeNetworkInterfaces", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err != nil { if aerr, ok := err.(awserr.Error); ok { @@ -998,6 +1029,7 @@ func (cache *EC2InstanceMetadataCache) getENIAttachmentID(eniID string) (*string } CheckAPIErrorAndBroadcastEvent(err, "ec2:DescribeNetworkInterfaces") awsAPIErrInc("DescribeNetworkInterfaces", err) + ec2ApiErr.WithLabelValues("DescribeNetworkInterfaces").Inc() log.Errorf("Failed to get ENI %s information from EC2 control plane %v", eniID, err) return nil, errors.Wrap(err, "failed to describe network interface") } @@ -1025,6 +1057,7 @@ func (cache *EC2InstanceMetadataCache) deleteENI(eniName string, maxBackoffDelay err := retry.NWithBackoff(retry.NewSimpleBackoff(time.Millisecond*500, maxBackoffDelay, 0.15, 2.0), maxENIEC2APIRetries, func() error { start := time.Now() _, ec2Err := cache.ec2SVC.DeleteNetworkInterfaceWithContext(context.Background(), deleteInput) + ec2ApiReq.WithLabelValues("DeleteNetworkInterface").Inc() awsAPILatency.WithLabelValues("DeleteNetworkInterface", fmt.Sprint(ec2Err != nil), awsReqStatus(ec2Err)).Observe(msSince(start)) if ec2Err != nil { if aerr, ok := ec2Err.(awserr.Error); ok { @@ -1036,6 +1069,7 @@ func (cache *EC2InstanceMetadataCache) deleteENI(eniName string, maxBackoffDelay } CheckAPIErrorAndBroadcastEvent(ec2Err, "ec2:DeleteNetworkInterface") awsAPIErrInc("DeleteNetworkInterface", ec2Err) + ec2ApiErr.WithLabelValues("DeleteNetworkInterface").Inc() log.Debugf("Not able to delete ENI: %v ", ec2Err) return errors.Wrapf(ec2Err, "unable to delete ENI") } @@ -1053,6 +1087,7 @@ func (cache *EC2InstanceMetadataCache) GetIPv4sFromEC2(eniID string) (addrList [ start := time.Now() result, err := cache.ec2SVC.DescribeNetworkInterfacesWithContext(context.Background(), input) + ec2ApiReq.WithLabelValues("DescribeNetworkInterfaces").Inc() awsAPILatency.WithLabelValues("DescribeNetworkInterfaces", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err != nil { if aerr, ok := err.(awserr.Error); ok { @@ -1062,6 +1097,7 @@ func (cache *EC2InstanceMetadataCache) GetIPv4sFromEC2(eniID string) (addrList [ } CheckAPIErrorAndBroadcastEvent(err, "ec2:DescribeNetworkInterfaces") awsAPIErrInc("DescribeNetworkInterfaces", err) + ec2ApiErr.WithLabelValues("DescribeNetworkInterfaces").Inc() log.Errorf("Failed to get ENI %s information from EC2 control plane %v", eniID, err) return nil, errors.Wrap(err, "failed to describe network interface") } @@ -1082,6 +1118,7 @@ func (cache *EC2InstanceMetadataCache) GetIPv4PrefixesFromEC2(eniID string) (add start := time.Now() result, err := cache.ec2SVC.DescribeNetworkInterfacesWithContext(context.Background(), input) + ec2ApiReq.WithLabelValues("DescribeNetworkInterfaces").Inc() awsAPILatency.WithLabelValues("DescribeNetworkInterfaces", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err != nil { if aerr, ok := err.(awserr.Error); ok { @@ -1092,6 +1129,7 @@ func (cache *EC2InstanceMetadataCache) GetIPv4PrefixesFromEC2(eniID string) (add } CheckAPIErrorAndBroadcastEvent(err, "ec2:DescribeNetworkInterfaces") awsAPIErrInc("DescribeNetworkInterfaces", err) + ec2ApiErr.WithLabelValues("DescribeNetworkInterfaces").Inc() log.Errorf("Failed to get ENI %s information from EC2 control plane %v", eniID, err) return nil, errors.Wrap(err, "failed to describe network interface") } @@ -1112,6 +1150,7 @@ func (cache *EC2InstanceMetadataCache) GetIPv6PrefixesFromEC2(eniID string) (add start := time.Now() result, err := cache.ec2SVC.DescribeNetworkInterfacesWithContext(context.Background(), input) + ec2ApiReq.WithLabelValues("DescribeNetworkInterfaces").Inc() awsAPILatency.WithLabelValues("DescribeNetworkInterfaces", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err != nil { if aerr, ok := err.(awserr.Error); ok { @@ -1122,6 +1161,7 @@ func (cache *EC2InstanceMetadataCache) GetIPv6PrefixesFromEC2(eniID string) (add } CheckAPIErrorAndBroadcastEvent(err, "ec2:DescribeNetworkInterfaces") awsAPIErrInc("DescribeNetworkInterfaces", err) + ec2ApiErr.WithLabelValues("DescribeNetworkInterfaces").Inc() log.Errorf("Failed to get ENI %s information from EC2 control plane %v", eniID, err) return nil, errors.Wrap(err, "failed to describe network interface") } @@ -1155,12 +1195,14 @@ func (cache *EC2InstanceMetadataCache) DescribeAllENIs() (DescribeAllENIsResult, input := &ec2.DescribeNetworkInterfacesInput{NetworkInterfaceIds: aws.StringSlice(eniIDs)} start := time.Now() ec2Response, err = cache.ec2SVC.DescribeNetworkInterfacesWithContext(context.Background(), input) + ec2ApiReq.WithLabelValues("DescribeNetworkInterfaces").Inc() awsAPILatency.WithLabelValues("DescribeNetworkInterfaces", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err == nil { // No error, exit the loop break } awsAPIErrInc("DescribeNetworkInterfaces", err) + ec2ApiErr.WithLabelValues("DescribeNetworkInterfaces").Inc() CheckAPIErrorAndBroadcastEvent(err, "ec2:DescribeNetworkInterfaces") log.Errorf("Failed to call ec2:DescribeNetworkInterfaces for %v: %v", aws.StringValueSlice(input.NetworkInterfaceIds), err) if aerr, ok := err.(awserr.Error); ok { @@ -1347,10 +1389,12 @@ func (cache *EC2InstanceMetadataCache) AllocIPAddress(eniID string) error { start := time.Now() output, err := cache.ec2SVC.AssignPrivateIpAddressesWithContext(context.Background(), input) + ec2ApiReq.WithLabelValues("AssignPrivateIpAddresses").Inc() awsAPILatency.WithLabelValues("AssignPrivateIpAddresses", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err != nil { CheckAPIErrorAndBroadcastEvent(err, "ec2:AssignPrivateIpAddresses") awsAPIErrInc("AssignPrivateIpAddresses", err) + ec2ApiErr.WithLabelValues("AssignPrivateIpAddresses").Inc() log.Errorf("Failed to allocate a private IP address %v", err) return errors.Wrap(err, "failed to assign private IP addresses") } @@ -1369,7 +1413,9 @@ func (cache *EC2InstanceMetadataCache) FetchInstanceTypeLimits() error { var eniLimits InstanceTypeLimits describeInstanceTypesInput := &ec2.DescribeInstanceTypesInput{InstanceTypes: []*string{aws.String(cache.instanceType)}} output, err := cache.ec2SVC.DescribeInstanceTypesWithContext(context.Background(), describeInstanceTypesInput) + ec2ApiReq.WithLabelValues("DescribeInstanceTypes").Inc() if err != nil || len(output.InstanceTypes) != 1 { + ec2ApiErr.WithLabelValues("DescribeInstanceTypes").Inc() CheckAPIErrorAndBroadcastEvent(err, "ec2:DescribeInstanceTypes") return errors.New(fmt.Sprintf("Failed calling DescribeInstanceTypes for `%s`: %v", cache.instanceType, err)) } @@ -1473,6 +1519,7 @@ func (cache *EC2InstanceMetadataCache) AllocIPAddresses(eniID string, numIPs int start := time.Now() output, err := cache.ec2SVC.AssignPrivateIpAddressesWithContext(context.Background(), input) + ec2ApiReq.WithLabelValues("AssignPrivateIpAddresses").Inc() awsAPILatency.WithLabelValues("AssignPrivateIpAddresses", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err != nil { CheckAPIErrorAndBroadcastEvent(err, "ec2:AssignPrivateIpAddresses") @@ -1483,6 +1530,7 @@ func (cache *EC2InstanceMetadataCache) AllocIPAddresses(eniID string, numIPs int } log.Errorf("Failed to allocate a private IP/Prefix addresses on ENI %v: %v", eniID, err) awsAPIErrInc("AssignPrivateIpAddresses", err) + ec2ApiErr.WithLabelValues("AssignPrivateIpAddresses").Inc() return nil, err } if output != nil { @@ -1503,11 +1551,13 @@ func (cache *EC2InstanceMetadataCache) AllocIPv6Prefixes(eniID string) ([]*strin } start := time.Now() output, err := cache.ec2SVC.AssignIpv6AddressesWithContext(context.Background(), input) + ec2ApiReq.WithLabelValues("AssignIpv6Addresses").Inc() awsAPILatency.WithLabelValues("AssignIpv6AddressesWithContext", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err != nil { CheckAPIErrorAndBroadcastEvent(err, "ec2:AssignPrivateIpv6Addresses") log.Errorf("Failed to allocate IPv6 Prefixes on ENI %v: %v", eniID, err) awsAPIErrInc("AssignPrivateIpv6Addresses", err) + ec2ApiErr.WithLabelValues("AssignIpv6Addresses").Inc() return nil, errors.Wrap(err, "allocate IPv6 prefix: failed to allocate an IPv6 prefix address") } if output != nil { @@ -1602,10 +1652,12 @@ func (cache *EC2InstanceMetadataCache) DeallocIPAddresses(eniID string, ips []st start := time.Now() _, err := cache.ec2SVC.UnassignPrivateIpAddressesWithContext(context.Background(), input) + ec2ApiReq.WithLabelValues("UnassignPrivateIpAddresses").Inc() awsAPILatency.WithLabelValues("UnassignPrivateIpAddresses", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err != nil { CheckAPIErrorAndBroadcastEvent(err, "ec2:UnassignPrivateIpAddresses") awsAPIErrInc("UnassignPrivateIpAddresses", err) + ec2ApiErr.WithLabelValues("UnassignPrivateIpAddresses").Inc() log.Errorf("Failed to deallocate a private IP address %v", err) return errors.Wrap(err, fmt.Sprintf("deallocate IP addresses: failed to deallocate private IP addresses: %s", ips)) } @@ -1628,10 +1680,12 @@ func (cache *EC2InstanceMetadataCache) DeallocPrefixAddresses(eniID string, pref start := time.Now() _, err := cache.ec2SVC.UnassignPrivateIpAddressesWithContext(context.Background(), input) + ec2ApiReq.WithLabelValues("UnassignPrivateIpAddresses").Inc() awsAPILatency.WithLabelValues("UnassignPrivateIpAddresses", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err != nil { CheckAPIErrorAndBroadcastEvent(err, "ec2:UnassignPrivateIpAddresses") awsAPIErrInc("UnassignPrivateIpAddresses", err) + ec2ApiErr.WithLabelValues("UnassignPrivateIpAddresses").Inc() log.Errorf("Failed to deallocate a Prefixes address %v", err) return errors.Wrap(err, fmt.Sprintf("deallocate prefix: failed to deallocate Prefix addresses: %v", prefixes)) } @@ -1688,10 +1742,12 @@ func (cache *EC2InstanceMetadataCache) tagENIcreateTS(eniID string, maxBackoffDe _ = retry.NWithBackoff(retry.NewSimpleBackoff(500*time.Millisecond, maxBackoffDelay, 0.3, 2), 5, func() error { start := time.Now() _, err := cache.ec2SVC.CreateTagsWithContext(context.Background(), input) + ec2ApiReq.WithLabelValues("CreateTags").Inc() awsAPILatency.WithLabelValues("CreateTags", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start)) if err != nil { CheckAPIErrorAndBroadcastEvent(err, "ec2:CreateTags") awsAPIErrInc("CreateTags", err) + ec2ApiErr.WithLabelValues("CreateTags").Inc() log.Warnf("Failed to add tag to ENI %s: %v", eniID, err) return err } @@ -1867,8 +1923,10 @@ func (cache *EC2InstanceMetadataCache) getENIsFromPaginatedDescribeNetworkInterf if err := cache.ec2SVC.DescribeNetworkInterfacesPagesWithContext(context.TODO(), input, pageFn); err != nil { CheckAPIErrorAndBroadcastEvent(err, "ec2:DescribeNetworkInterfaces") awsAPIErrInc("DescribeNetworkInterfaces", err) + ec2ApiErr.WithLabelValues("DescribeNetworkInterfaces").Inc() return err } + ec2ApiReq.WithLabelValues("DescribeNetworkInterfaces").Inc() return innerErr } diff --git a/test/integration/metrics-helper/metric_helper_test.go b/test/integration/metrics-helper/metric_helper_test.go index 252d151dde9..1ba4b3f209d 100644 --- a/test/integration/metrics-helper/metric_helper_test.go +++ b/test/integration/metrics-helper/metric_helper_test.go @@ -15,6 +15,7 @@ package metrics_helper import ( "fmt" + "math" "time" "github.com/aws/amazon-vpc-cni-k8s/test/framework/resources/k8s/manifest" @@ -84,6 +85,39 @@ var _ = Describe("test cni-metrics-helper publishes metrics", func() { By("validating the addReqCount increased on the node after a deployment is created") Expect(addReqCountIncreased).To(BeTrue()) + + getMetricStatisticsInput = &cloudwatch.GetMetricStatisticsInput{ + Dimensions: []*cloudwatch.Dimension{ + { + Name: aws.String("CLUSTER_ID"), + Value: aws.String(ngName), + }, + }, + MetricName: aws.String("ec2ApiReqCount"), + Namespace: aws.String("Kubernetes"), + Period: aws.Int64(int64(60)), + // Start time should sync with when when this test started + StartTime: aws.Time(time.Now().Add(time.Duration(-10) * time.Minute)), + EndTime: aws.Time(time.Now()), + Statistics: aws.StringSlice([]string{"Maximum"}), + } + getMetricOutput, err = f.CloudServices.CloudWatch().GetMetricStatistics(getMetricStatisticsInput) + Expect(err).ToNot(HaveOccurred()) + + dataPoints = getMetricOutput.Datapoints + fmt.Fprintf(GinkgoWriter, "data points: %+v", dataPoints) + + ec2APIRequestsCountIncreased := false + lastVal = *dataPoints[0].Maximum + for _, dp := range dataPoints { + if math.Abs(*dp.Maximum-lastVal) > 0 { + ec2APIRequestsCountIncreased = true + } + lastVal = *dp.Maximum + } + + By("validating the ec2ApiReqCount increased after a deployment is created") + Expect(ec2APIRequestsCountIncreased).To(BeTrue()) }) }) })