Skip to content

Commit

Permalink
add logs and metrics dimensions to find sts call success/failures on …
Browse files Browse the repository at this point in the history
…global/regional endpoints
  • Loading branch information
sushanth0910 committed Nov 13, 2024
1 parent 9831b89 commit 6fe64b3
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 19 deletions.
31 changes: 17 additions & 14 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@ import (
)

const (
Namespace = "aws_iam_authenticator"
Malformed = "malformed_request"
Invalid = "invalid_token"
STSError = "sts_error"
STSThrottling = "sts_throttling"
Unknown = "uknown_user"
Success = "success"
Namespace = "aws_iam_authenticator"
Malformed = "malformed_request"
Invalid = "invalid_token"
STSError = "sts_error"
STSThrottling = "sts_throttling"
Unknown = "uknown_user"
Success = "success"
STSGlobal = "sts_global"
STSRegional = "sts_regional"
InvalidSTSEndpoint = "invalid_sts_endpoint"
)

var authenticatorMetrics Metrics
Expand All @@ -38,10 +41,10 @@ type Metrics struct {
ConfigMapWatchFailures prometheus.Counter
Latency *prometheus.HistogramVec
EC2DescribeInstanceCallCount prometheus.Counter
StsConnectionFailure prometheus.Counter
StsConnectionFailure *prometheus.CounterVec
StsResponses *prometheus.CounterVec
DynamicFileFailures prometheus.Counter
StsThrottling prometheus.Counter
StsThrottling *prometheus.CounterVec
E2ELatency *prometheus.HistogramVec
DynamicFileEnabled prometheus.Gauge
DynamicFileOnly prometheus.Gauge
Expand All @@ -65,26 +68,26 @@ func createMetrics(reg prometheus.Registerer) Metrics {
Help: "Dynamic file failures",
},
),
StsConnectionFailure: factory.NewCounter(
StsConnectionFailure: factory.NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Name: "sts_connection_failures_total",
Help: "Sts call could not succeed or timedout",
},
}, []string{"StsEndpointType"},
),
StsThrottling: factory.NewCounter(
StsThrottling: factory.NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Name: "sts_throttling_total",
Help: "Sts call got throttled",
},
}, []string{"StsEndpointType"},
),
StsResponses: factory.NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Name: "sts_responses_total",
Help: "Sts responses with error code label",
}, []string{"ResponseCode"},
}, []string{"ResponseCode", "StsEndpointType"},
),
Latency: factory.NewHistogramVec(
prometheus.HistogramOpts{
Expand Down
19 changes: 14 additions & 5 deletions pkg/token/token.go
Original file line number Diff line number Diff line change
Expand Up @@ -565,14 +565,23 @@ func (v tokenVerifier) Verify(token string) (*Identity, error) {
req.Header.Set(clusterIDHeader, v.clusterID)
req.Header.Set("accept", "application/json")

stsEndpointType := metrics.InvalidSTSEndpoint
if parsedURL.Host == "sts.amazonaws.com" {
stsEndpointType = metrics.STSGlobal
} else if strings.HasPrefix(parsedURL.Host, "sts.") {
stsEndpointType = metrics.STSRegional
}

logrus.Infof("Sending request to %s endpoint, host: %s", stsEndpointType, parsedURL.Host)

response, err := v.client.Do(req)
if err != nil {
metrics.Get().StsConnectionFailure.Inc()
metrics.Get().StsConnectionFailure.WithLabelValues(stsEndpointType).Inc()
// special case to avoid printing the full URL if possible
if urlErr, ok := err.(*url.Error); ok {
return nil, NewSTSError(fmt.Sprintf("error during GET: %v", urlErr.Err))
return nil, NewSTSError(fmt.Sprintf("error during GET: %v on %s endpoint", urlErr.Err, stsEndpointType))
}
return nil, NewSTSError(fmt.Sprintf("error during GET: %v", err))
return nil, NewSTSError(fmt.Sprintf("error during GET: %v on %s endpoint", err, stsEndpointType))
}
defer response.Body.Close()

Expand All @@ -581,13 +590,13 @@ func (v tokenVerifier) Verify(token string) (*Identity, error) {
return nil, NewSTSError(fmt.Sprintf("error reading HTTP result: %v", err))
}

metrics.Get().StsResponses.WithLabelValues(fmt.Sprint(response.StatusCode)).Inc()
metrics.Get().StsResponses.WithLabelValues(fmt.Sprint(response.StatusCode), stsEndpointType).Inc()
if response.StatusCode != 200 {
responseStr := string(responseBody[:])
// refer to https://docs.aws.amazon.com/STS/latest/APIReference/CommonErrors.html and log
// response body for STS Throttling is {"Error":{"Code":"Throttling","Message":"Rate exceeded","Type":"Sender"},"RequestId":"xxx"}
if strings.Contains(responseStr, "Throttling") {
metrics.Get().StsThrottling.Inc()
metrics.Get().StsThrottling.WithLabelValues(stsEndpointType).Inc()
return nil, NewSTSThrottling(responseStr)
}
return nil, NewSTSError(fmt.Sprintf("error from AWS (expected 200, got %d). Body: %s", response.StatusCode, responseStr))
Expand Down

0 comments on commit 6fe64b3

Please sign in to comment.