Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add event recorder utils to raise aws-node pod events #1536

Merged
merged 2 commits into from
Apr 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions charts/aws-vpc-cni/templates/clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,7 @@ rules:
resources:
- '*'
verbs: ["list", "watch"]
- apiGroups: ["", "events.k8s.io"]
resources:
- events
verbs: ["create", "patch", "list", "get"]
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why does it need patch?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is required for event aggregation. For eg, without patch we wouldn't see the aggregated info [x12 over 21s] below:
Warning MissingIAMPermissions 20s (x12 over 21s) aws-node Unauthorized operation ...

FYI we removed get recently as it was not required.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for the explanation!

3 changes: 3 additions & 0 deletions cmd/aws-k8s-agent/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (

"github.com/aws/amazon-vpc-cni-k8s/pkg/ipamd"
"github.com/aws/amazon-vpc-cni-k8s/pkg/k8sapi"
"github.com/aws/amazon-vpc-cni-k8s/pkg/utils/eventrecorder"
"github.com/aws/amazon-vpc-cni-k8s/pkg/utils/logger"
"github.com/aws/amazon-vpc-cni-k8s/pkg/version"
)
Expand Down Expand Up @@ -62,6 +63,8 @@ func _main() int {
return 1
}

eventrecorder.InitEventRecorder(rawK8SClient)

ipamContext, err := ipamd.New(rawK8SClient, cacheK8SClient)
if err != nil {
log.Errorf("Initialization failure: %v", err)
Expand Down
4 changes: 4 additions & 0 deletions config/master/aws-k8s-cni-cn.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ rules:
resources:
- '*'
verbs: ["list", "watch"]
- apiGroups: ["", "events.k8s.io"]
resources:
- events
verbs: ["create", "patch", "list", "get"]
---
# Source: aws-vpc-cni/templates/clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
Expand Down
4 changes: 4 additions & 0 deletions config/master/aws-k8s-cni-us-gov-east-1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ rules:
resources:
- '*'
verbs: ["list", "watch"]
- apiGroups: ["", "events.k8s.io"]
resources:
- events
verbs: ["create", "patch", "list", "get"]
---
# Source: aws-vpc-cni/templates/clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
Expand Down
4 changes: 4 additions & 0 deletions config/master/aws-k8s-cni-us-gov-west-1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ rules:
resources:
- '*'
verbs: ["list", "watch"]
- apiGroups: ["", "events.k8s.io"]
resources:
- events
verbs: ["create", "patch", "list", "get"]
---
# Source: aws-vpc-cni/templates/clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
Expand Down
4 changes: 4 additions & 0 deletions config/master/aws-k8s-cni.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ rules:
resources:
- '*'
verbs: ["list", "watch"]
- apiGroups: ["", "events.k8s.io"]
sushrk marked this conversation as resolved.
Show resolved Hide resolved
sushrk marked this conversation as resolved.
Show resolved Hide resolved
resources:
- events
verbs: ["create", "patch", "list", "get"]
---
# Source: aws-vpc-cni/templates/clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1
Expand Down
40 changes: 40 additions & 0 deletions pkg/awsutils/awsutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (

"github.com/aws/amazon-vpc-cni-k8s/pkg/awsutils/awssession"
"github.com/aws/amazon-vpc-cni-k8s/pkg/ec2wrapper"
"github.com/aws/amazon-vpc-cni-k8s/pkg/utils/eventrecorder"
"github.com/aws/amazon-vpc-cni-k8s/pkg/utils/logger"
"github.com/aws/amazon-vpc-cni-k8s/pkg/utils/retry"
"github.com/aws/aws-sdk-go/aws"
Expand All @@ -36,6 +37,7 @@ import (
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
v1 "k8s.io/api/core/v1"
sushrk marked this conversation as resolved.
Show resolved Hide resolved
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
)
Expand Down Expand Up @@ -78,6 +80,9 @@ var (

var log = logger.Get()

// event recorder to raise aws-node pod events eg. if EC2 API calls fail due to UnauthorizedOperation error
var eventRecorder *eventrecorder.EventRecorder

var (
awsAPILatency = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Expand Down Expand Up @@ -411,6 +416,8 @@ func New(useCustomNetworking, disableENIProvisioning, v4Enabled, v6Enabled bool)
if err != nil {
return nil, err
}
// event recorder to raise events for failed EC2 API calls
eventRecorder = eventrecorder.Get()

// Clean up leaked ENIs in the background
if !disableENIProvisioning {
Expand Down Expand Up @@ -555,6 +562,7 @@ func (cache *EC2InstanceMetadataCache) RefreshSGIDs(mac string) error {
awsAPIErrInc("IMDSMetaDataOutOfSync", err)
}
}
CheckAPIErrorAndBroadcastEvent(err, "ec2:ModifyNetworkInterfaceAttribute")
awsAPIErrInc("ModifyNetworkInterfaceAttribute", err)
//No need to return error here since retry will happen in 30seconds and also
//If update failed due to stale ENI then returning error will prevent updating SG
Expand Down Expand Up @@ -695,6 +703,7 @@ func (cache *EC2InstanceMetadataCache) awsGetFreeDeviceNumber() (int, error) {
result, err := cache.ec2SVC.DescribeInstancesWithContext(context.Background(), input)
awsAPILatency.WithLabelValues("DescribeInstances", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start))
if err != nil {
CheckAPIErrorAndBroadcastEvent(err, "ec2:DescribeInstances")
awsAPIErrInc("DescribeInstances", err)
log.Errorf("awsGetFreeDeviceNumber: Unable to retrieve instance data from EC2 control plane %v", err)
return 0, errors.Wrap(err,
Expand Down Expand Up @@ -758,6 +767,7 @@ func (cache *EC2InstanceMetadataCache) AllocENI(useCustomCfg bool, sg []*string,
_, err = cache.ec2SVC.ModifyNetworkInterfaceAttributeWithContext(context.Background(), attributeInput)
awsAPILatency.WithLabelValues("ModifyNetworkInterfaceAttribute", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start))
if err != nil {
CheckAPIErrorAndBroadcastEvent(err, "ec2:ModifyNetworkInterfaceAttribute")
awsAPIErrInc("ModifyNetworkInterfaceAttribute", err)
err := cache.FreeENI(eniID)
if err != nil {
Expand Down Expand Up @@ -787,6 +797,7 @@ func (cache *EC2InstanceMetadataCache) attachENI(eniID string) (string, error) {
attachOutput, err := cache.ec2SVC.AttachNetworkInterfaceWithContext(context.Background(), attachInput)
awsAPILatency.WithLabelValues("AttachNetworkInterface", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start))
if err != nil {
CheckAPIErrorAndBroadcastEvent(err, "ec2:AttachNetworkInterface")
awsAPIErrInc("AttachNetworkInterface", err)
log.Errorf("Failed to attach ENI %s: %v", eniID, err)
return "", errors.Wrap(err, "attachENI: failed to attach ENI")
Expand Down Expand Up @@ -835,6 +846,7 @@ func (cache *EC2InstanceMetadataCache) createENI(useCustomCfg bool, sg []*string
result, err := cache.ec2SVC.CreateNetworkInterfaceWithContext(context.Background(), input)
awsAPILatency.WithLabelValues("CreateNetworkInterface", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start))
if err != nil {
CheckAPIErrorAndBroadcastEvent(err, "ec2:CreateNetworkInterface")
awsAPIErrInc("CreateNetworkInterface", err)
log.Errorf("Failed to CreateNetworkInterface %v", err)
return "", errors.Wrap(err, "failed to create network interface")
Expand Down Expand Up @@ -884,6 +896,7 @@ func (cache *EC2InstanceMetadataCache) TagENI(eniID string, currentTags map[stri
_, err := cache.ec2SVC.CreateTagsWithContext(context.Background(), input)
awsAPILatency.WithLabelValues("CreateTags", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start))
if err != nil {
CheckAPIErrorAndBroadcastEvent(err, "ec2:CreateTags")
awsAPIErrInc("CreateTags", err)
log.Warnf("Failed to tag the newly created ENI %s:", eniID)
return err
Expand Down Expand Up @@ -942,6 +955,7 @@ func (cache *EC2InstanceMetadataCache) freeENI(eniName string, sleepDelayAfterDe
_, ec2Err := cache.ec2SVC.DetachNetworkInterfaceWithContext(context.Background(), detachInput)
awsAPILatency.WithLabelValues("DetachNetworkInterface", fmt.Sprint(ec2Err != nil), awsReqStatus(ec2Err)).Observe(msSince(start))
if ec2Err != nil {
CheckAPIErrorAndBroadcastEvent(err, "ec2:DetachNetworkInterface")
awsAPIErrInc("DetachNetworkInterface", ec2Err)
log.Errorf("Failed to detach ENI %s %v", eniName, ec2Err)
return errors.New("unable to detach ENI from EC2 instance, giving up")
Expand Down Expand Up @@ -982,6 +996,7 @@ func (cache *EC2InstanceMetadataCache) getENIAttachmentID(eniID string) (*string
return nil, ErrENINotFound
}
}
CheckAPIErrorAndBroadcastEvent(err, "ec2:DescribeNetworkInterfaces")
awsAPIErrInc("DescribeNetworkInterfaces", err)
log.Errorf("Failed to get ENI %s information from EC2 control plane %v", eniID, err)
return nil, errors.Wrap(err, "failed to describe network interface")
Expand Down Expand Up @@ -1019,6 +1034,7 @@ func (cache *EC2InstanceMetadataCache) deleteENI(eniName string, maxBackoffDelay
return nil
}
}
CheckAPIErrorAndBroadcastEvent(ec2Err, "ec2:DeleteNetworkInterface")
awsAPIErrInc("DeleteNetworkInterface", ec2Err)
log.Debugf("Not able to delete ENI: %v ", ec2Err)
return errors.Wrapf(ec2Err, "unable to delete ENI")
Expand All @@ -1044,6 +1060,7 @@ func (cache *EC2InstanceMetadataCache) GetIPv4sFromEC2(eniID string) (addrList [
return nil, ErrENINotFound
}
}
CheckAPIErrorAndBroadcastEvent(err, "ec2:DescribeNetworkInterfaces")
awsAPIErrInc("DescribeNetworkInterfaces", err)
log.Errorf("Failed to get ENI %s information from EC2 control plane %v", eniID, err)
return nil, errors.Wrap(err, "failed to describe network interface")
Expand Down Expand Up @@ -1071,7 +1088,9 @@ func (cache *EC2InstanceMetadataCache) GetIPv4PrefixesFromEC2(eniID string) (add
if aerr.Code() == "InvalidNetworkInterfaceID.NotFound" {
return nil, ErrENINotFound
}

}
CheckAPIErrorAndBroadcastEvent(err, "ec2:DescribeNetworkInterfaces")
awsAPIErrInc("DescribeNetworkInterfaces", err)
log.Errorf("Failed to get ENI %s information from EC2 control plane %v", eniID, err)
return nil, errors.Wrap(err, "failed to describe network interface")
Expand Down Expand Up @@ -1099,7 +1118,9 @@ func (cache *EC2InstanceMetadataCache) GetIPv6PrefixesFromEC2(eniID string) (add
if aerr.Code() == "InvalidNetworkInterfaceID.NotFound" {
return nil, ErrENINotFound
}

}
CheckAPIErrorAndBroadcastEvent(err, "ec2:DescribeNetworkInterfaces")
awsAPIErrInc("DescribeNetworkInterfaces", err)
log.Errorf("Failed to get ENI %s information from EC2 control plane %v", eniID, err)
return nil, errors.Wrap(err, "failed to describe network interface")
Expand Down Expand Up @@ -1140,6 +1161,7 @@ func (cache *EC2InstanceMetadataCache) DescribeAllENIs() (DescribeAllENIsResult,
break
}
awsAPIErrInc("DescribeNetworkInterfaces", err)
CheckAPIErrorAndBroadcastEvent(err, "ec2:DescribeNetworkInterfaces")
log.Errorf("Failed to call ec2:DescribeNetworkInterfaces for %v: %v", aws.StringValueSlice(input.NetworkInterfaceIds), err)
if aerr, ok := err.(awserr.Error); ok {
if aerr.Code() == "InvalidNetworkInterfaceID.NotFound" {
Expand Down Expand Up @@ -1327,6 +1349,7 @@ func (cache *EC2InstanceMetadataCache) AllocIPAddress(eniID string) error {
output, err := cache.ec2SVC.AssignPrivateIpAddressesWithContext(context.Background(), input)
awsAPILatency.WithLabelValues("AssignPrivateIpAddresses", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start))
if err != nil {
CheckAPIErrorAndBroadcastEvent(err, "ec2:AssignPrivateIpAddresses")
awsAPIErrInc("AssignPrivateIpAddresses", err)
log.Errorf("Failed to allocate a private IP address %v", err)
return errors.Wrap(err, "failed to assign private IP addresses")
Expand All @@ -1347,6 +1370,7 @@ func (cache *EC2InstanceMetadataCache) FetchInstanceTypeLimits() error {
describeInstanceTypesInput := &ec2.DescribeInstanceTypesInput{InstanceTypes: []*string{aws.String(cache.instanceType)}}
output, err := cache.ec2SVC.DescribeInstanceTypesWithContext(context.Background(), describeInstanceTypesInput)
if err != nil || len(output.InstanceTypes) != 1 {
CheckAPIErrorAndBroadcastEvent(err, "ec2:DescribeInstanceTypes")
return errors.New(fmt.Sprintf("Failed calling DescribeInstanceTypes for `%s`: %v", cache.instanceType, err))
}
info := output.InstanceTypes[0]
Expand Down Expand Up @@ -1451,6 +1475,7 @@ func (cache *EC2InstanceMetadataCache) AllocIPAddresses(eniID string, numIPs int
output, err := cache.ec2SVC.AssignPrivateIpAddressesWithContext(context.Background(), input)
awsAPILatency.WithLabelValues("AssignPrivateIpAddresses", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start))
if err != nil {
CheckAPIErrorAndBroadcastEvent(err, "ec2:AssignPrivateIpAddresses")
if containsPrivateIPAddressLimitExceededError(err) {
log.Debug("AssignPrivateIpAddresses returned PrivateIpAddressLimitExceeded. This can happen if the data store is out of sync." +
"Returning without an error here since we will verify the actual state by calling EC2 to see what addresses have already assigned to this ENI.")
Expand Down Expand Up @@ -1480,6 +1505,7 @@ func (cache *EC2InstanceMetadataCache) AllocIPv6Prefixes(eniID string) ([]*strin
output, err := cache.ec2SVC.AssignIpv6AddressesWithContext(context.Background(), input)
awsAPILatency.WithLabelValues("AssignIpv6AddressesWithContext", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start))
if err != nil {
CheckAPIErrorAndBroadcastEvent(err, "ec2:AssignPrivateIpv6Addresses")
log.Errorf("Failed to allocate IPv6 Prefixes on ENI %v: %v", eniID, err)
awsAPIErrInc("AssignPrivateIpv6Addresses", err)
return nil, errors.Wrap(err, "allocate IPv6 prefix: failed to allocate an IPv6 prefix address")
Expand Down Expand Up @@ -1578,6 +1604,7 @@ func (cache *EC2InstanceMetadataCache) DeallocIPAddresses(eniID string, ips []st
_, err := cache.ec2SVC.UnassignPrivateIpAddressesWithContext(context.Background(), input)
awsAPILatency.WithLabelValues("UnassignPrivateIpAddresses", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start))
if err != nil {
CheckAPIErrorAndBroadcastEvent(err, "ec2:UnassignPrivateIpAddresses")
awsAPIErrInc("UnassignPrivateIpAddresses", err)
log.Errorf("Failed to deallocate a private IP address %v", err)
return errors.Wrap(err, fmt.Sprintf("deallocate IP addresses: failed to deallocate private IP addresses: %s", ips))
Expand All @@ -1603,6 +1630,7 @@ func (cache *EC2InstanceMetadataCache) DeallocPrefixAddresses(eniID string, pref
_, err := cache.ec2SVC.UnassignPrivateIpAddressesWithContext(context.Background(), input)
awsAPILatency.WithLabelValues("UnassignPrivateIpAddresses", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start))
if err != nil {
CheckAPIErrorAndBroadcastEvent(err, "ec2:UnassignPrivateIpAddresses")
awsAPIErrInc("UnassignPrivateIpAddresses", err)
log.Errorf("Failed to deallocate a Prefixes address %v", err)
return errors.Wrap(err, fmt.Sprintf("deallocate prefix: failed to deallocate Prefix addresses: %v", prefixes))
Expand Down Expand Up @@ -1662,6 +1690,7 @@ func (cache *EC2InstanceMetadataCache) tagENIcreateTS(eniID string, maxBackoffDe
_, err := cache.ec2SVC.CreateTagsWithContext(context.Background(), input)
awsAPILatency.WithLabelValues("CreateTags", fmt.Sprint(err != nil), awsReqStatus(err)).Observe(msSince(start))
if err != nil {
CheckAPIErrorAndBroadcastEvent(err, "ec2:CreateTags")
awsAPIErrInc("CreateTags", err)
log.Warnf("Failed to add tag to ENI %s: %v", eniID, err)
return err
Expand Down Expand Up @@ -1836,6 +1865,8 @@ func (cache *EC2InstanceMetadataCache) getENIsFromPaginatedDescribeNetworkInterf
}

if err := cache.ec2SVC.DescribeNetworkInterfacesPagesWithContext(context.TODO(), input, pageFn); err != nil {
CheckAPIErrorAndBroadcastEvent(err, "ec2:DescribeNetworkInterfaces")
awsAPIErrInc("DescribeNetworkInterfaces", err)
return err
}
return innerErr
Expand Down Expand Up @@ -1864,3 +1895,12 @@ func (cache *EC2InstanceMetadataCache) IsPrimaryENI(eniID string) bool {
}
return false
}

func CheckAPIErrorAndBroadcastEvent(err error, api string) {
if aerr, ok := err.(awserr.Error); ok {
if aerr.Code() == "UnauthorizedOperation" {
eventRecorder.BroadcastEvent(v1.EventTypeWarning, "MissingIAMPermissions",
fmt.Sprintf("Unauthorized operation: failed to call %v due to missing permissions. Please refer https://github.com/aws/amazon-vpc-cni-k8s/blob/master/docs/iam-policy.md to attach relevant policy to IAM role", api))
}
}
}
9 changes: 5 additions & 4 deletions pkg/ipamd/ipamd.go
Original file line number Diff line number Diff line change
Expand Up @@ -462,8 +462,9 @@ func (c *IPAMContext) nodeInit() error {

metadataResult, err := c.awsClient.DescribeAllENIs()
if err != nil {
return errors.New("ipamd init: failed to retrieve attached ENIs info")
return errors.Wrap(err, "ipamd init: failed to retrieve attached ENIs info")
}

log.Debugf("DescribeAllENIs success: ENIs: %d, tagged: %d", len(metadataResult.ENIMetadata), len(metadataResult.TagMap))
c.awsClient.SetCNIUnmanagedENIs(metadataResult.MultiCardENIIDs)
c.setUnmanagedENIs(metadataResult.TagMap)
Expand Down Expand Up @@ -938,7 +939,7 @@ func (c *IPAMContext) tryAssignIPs() (increasedPool bool, err error) {
err = c.awsClient.AllocIPAddresses(eni.ID, 1)
if err != nil {
ipamdErrInc("increaseIPPoolAllocIPAddressesFailed")
return false, errors.Wrap(err, fmt.Sprintf("failed to allocate one IP addresses on ENI %s, err: %v", eni.ID, err))
return false, errors.Wrap(err, fmt.Sprintf("failed to allocate one IP addresses on ENI %s, err ", eni.ID))
}
}
// This call to EC2 is needed to verify which IPs got attached to this ENI.
Expand Down Expand Up @@ -1908,13 +1909,13 @@ func (c *IPAMContext) GetPod(podName, namespace string) (*corev1.Pod, error) {
}

// AnnotatePod annotates the pod with the provided key and value
func (c *IPAMContext) AnnotatePod(podNamespace, podName, key, val string) error {
func (c *IPAMContext) AnnotatePod(podName, podNamespace, key, val string) error {
ctx := context.TODO()
var err error

err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
pod := &corev1.Pod{}
if pod, err = c.GetPod(podNamespace, podName); err != nil {
if pod, err = c.GetPod(podName, podNamespace); err != nil {
return err
}

Expand Down
Loading