Skip to content

Commit

Permalink
Merge pull request #130 from unmarshall/metrics
Browse files Browse the repository at this point in the history
Fixed recording of erroneous metrics for driver and API requests
  • Loading branch information
unmarshall authored Feb 20, 2024
2 parents 78c7bbf + 1c564fc commit fc7faad
Show file tree
Hide file tree
Showing 18 changed files with 955 additions and 39 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ require (
github.com/gardener/machine-controller-manager v0.52.0
github.com/onsi/ginkgo/v2 v2.13.0
github.com/onsi/gomega v1.29.0
github.com/prometheus/client_golang v1.16.0 // indirect
github.com/prometheus/client_golang v1.16.0
github.com/spf13/pflag v1.0.5
golang.org/x/crypto v0.14.0
golang.org/x/exp v0.0.0-20230905200255-921286631fa9
Expand Down
3 changes: 1 addition & 2 deletions pkg/azure/access/helpers/disk.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ package helpers

import (
"context"
"time"

"github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5"
Expand All @@ -32,7 +31,7 @@ const (
// DeleteDisk deletes disk for passed in resourceGroup and diskName.
// NOTE: All calls to this Azure API are instrumented as prometheus metric.
func DeleteDisk(ctx context.Context, client *armcompute.DisksClient, resourceGroup, diskName string) (err error) {
defer instrument.RecordAzAPIMetric(err, diskDeleteServiceLabel, time.Now())
defer instrument.AZAPIMetricRecorderFn(diskDeleteServiceLabel, &err)()
var poller *runtime.Poller[armcompute.DisksClientDeleteResponse]
poller, err = client.BeginDelete(ctx, resourceGroup, diskName, nil)
if err != nil {
Expand Down
5 changes: 2 additions & 3 deletions pkg/azure/access/helpers/marketplaceagreement.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ package helpers

import (
"context"
"time"

"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5"
Expand All @@ -33,7 +32,7 @@ const (
// GetAgreementTerms fetches the agreement terms for the purchase plan.
// NOTE: All calls to this Azure API are instrumented as prometheus metric.
func GetAgreementTerms(ctx context.Context, mktPlaceAgreementAccess *armmarketplaceordering.MarketplaceAgreementsClient, purchasePlan armcompute.PurchasePlan) (agreementTerms *armmarketplaceordering.AgreementTerms, err error) {
defer instrument.RecordAzAPIMetric(err, mktPlaceAgreementGetServiceLabel, time.Now())
defer instrument.AZAPIMetricRecorderFn(mktPlaceAgreementGetServiceLabel, &err)()
resp, err := mktPlaceAgreementAccess.Get(ctx, armmarketplaceordering.OfferTypeVirtualmachine, *purchasePlan.Publisher, *purchasePlan.Product, *purchasePlan.Name, nil)
if err != nil {
errors.LogAzAPIError(err, "Failed to get marketplace agreement for PurchasePlan: %+v", purchasePlan)
Expand All @@ -46,7 +45,7 @@ func GetAgreementTerms(ctx context.Context, mktPlaceAgreementAccess *armmarketpl
// AcceptAgreement updates the agreementTerms as accepted.
// NOTE: All calls to this Azure API are instrumented as prometheus metric.
func AcceptAgreement(ctx context.Context, mktPlaceAgreementAccess *armmarketplaceordering.MarketplaceAgreementsClient, purchasePlan armcompute.PurchasePlan, existingAgreement armmarketplaceordering.AgreementTerms) (err error) {
defer instrument.RecordAzAPIMetric(err, mktPlaceAgreementCreateServiceLabel, time.Now())
defer instrument.AZAPIMetricRecorderFn(mktPlaceAgreementCreateServiceLabel, &err)()
updatedAgreement := existingAgreement
updatedAgreement.Properties.Accepted = to.Ptr(true)
_, err = mktPlaceAgreementAccess.Create(ctx, armmarketplaceordering.OfferTypeVirtualmachine, *purchasePlan.Publisher, *purchasePlan.Product, *purchasePlan.Name, updatedAgreement, nil)
Expand Down
10 changes: 6 additions & 4 deletions pkg/azure/access/helpers/nic.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ import (

// labels used for recording prometheus metrics
const (
subnetGetServiceLabel = "subnet_get"
nicGetServiceLabel = "nic_get"
nicDeleteServiceLabel = "nic_delete"
nicCreateServiceLabel = "nic_create"
Expand All @@ -40,7 +39,8 @@ const (
// DeleteNIC deletes the NIC identified by a resourceGroup and nicName.
// NOTE: All calls to this Azure API are instrumented as prometheus metric.
func DeleteNIC(ctx context.Context, client *armnetwork.InterfacesClient, resourceGroup, nicName string) (err error) {
defer instrument.RecordAzAPIMetric(err, nicDeleteServiceLabel, time.Now())
defer instrument.AZAPIMetricRecorderFn(nicDeleteServiceLabel, &err)()

var poller *runtime.Poller[armnetwork.InterfacesClientDeleteResponse]
delCtx, cancelFn := context.WithTimeout(ctx, defaultDeleteNICTimeout)
defer cancelFn()
Expand All @@ -61,7 +61,8 @@ func DeleteNIC(ctx context.Context, client *armnetwork.InterfacesClient, resourc
// GetNIC fetches a NIC identified by resourceGroup and nic name.
// NOTE: All calls to this Azure API are instrumented as prometheus metric.
func GetNIC(ctx context.Context, client *armnetwork.InterfacesClient, resourceGroup, nicName string) (nic *armnetwork.Interface, err error) {
defer instrument.RecordAzAPIMetric(err, nicGetServiceLabel, time.Now())
defer instrument.AZAPIMetricRecorderFn(nicGetServiceLabel, &err)()

resp, err := client.Get(ctx, resourceGroup, nicName, nil)
if err != nil {
if errors.IsNotFoundAzAPIError(err) {
Expand All @@ -76,7 +77,8 @@ func GetNIC(ctx context.Context, client *armnetwork.InterfacesClient, resourceGr
// CreateNIC creates a NIC given the resourceGroup, nic name and NIC creation parameters.
// NOTE: All calls to this Azure API are instrumented as prometheus metric.
func CreateNIC(ctx context.Context, nicAccess *armnetwork.InterfacesClient, resourceGroup string, nicParams armnetwork.Interface, nicName string) (nic *armnetwork.Interface, err error) {
defer instrument.RecordAzAPIMetric(err, nicCreateServiceLabel, time.Now())
defer instrument.AZAPIMetricRecorderFn(nicCreateServiceLabel, &err)()

var (
poller *runtime.Poller[armnetwork.InterfacesClientCreateOrUpdateResponse]
creationResp armnetwork.InterfacesClientCreateOrUpdateResponse
Expand Down
4 changes: 2 additions & 2 deletions pkg/azure/access/helpers/resourcegraph.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ package helpers
import (
"context"
"fmt"
"time"

"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resourcegraph/armresourcegraph"
Expand All @@ -37,7 +36,8 @@ type MapperFn[T any] func(map[string]interface{}) *T
// The result of the query are then mapped using a mapperFn and the result or an error is returned.
// NOTE: All calls to this Azure API are instrumented as prometheus metric.
func QueryAndMap[T any](ctx context.Context, client *armresourcegraph.Client, subscriptionID string, mapperFn MapperFn[T], queryTemplate string, templateArgs ...any) (results []T, err error) {
defer instrument.RecordAzAPIMetric(err, resourceGraphQueryServiceLabel, time.Now())
defer instrument.AZAPIMetricRecorderFn(resourceGraphQueryServiceLabel, &err)()

query := fmt.Sprintf(queryTemplate, templateArgs...)
resources, err := client.Resources(ctx,
armresourcegraph.QueryRequest{
Expand Down
4 changes: 2 additions & 2 deletions pkg/azure/access/helpers/resourcegroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ package helpers

import (
"context"
"time"

"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources"
"github.com/gardener/machine-controller-manager-provider-azure/pkg/azure/access/errors"
Expand All @@ -30,7 +29,8 @@ const (
// ResourceGroupExists checks if the given resourceGroup exists.
// NOTE: All calls to this Azure API are instrumented as prometheus metric.
func ResourceGroupExists(ctx context.Context, client *armresources.ResourceGroupsClient, resourceGroup string) (exists bool, err error) {
defer instrument.RecordAzAPIMetric(err, resourceGroupExistsServiceLabel, time.Now())
defer instrument.AZAPIMetricRecorderFn(resourceGroupExistsServiceLabel, &err)()

resp, err := client.CheckExistence(ctx, resourceGroup, nil)
if err != nil {
if errors.IsNotFoundAzAPIError(err) {
Expand Down
6 changes: 4 additions & 2 deletions pkg/azure/access/helpers/subnet.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,20 @@ package helpers

import (
"context"
"time"

"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v4"
"github.com/gardener/machine-controller-manager-provider-azure/pkg/azure/access/errors"
"github.com/gardener/machine-controller-manager-provider-azure/pkg/azure/instrument"
)

const subnetGetServiceLabel = "subnet_get"

// GetSubnet fetches a Subnet resource given a resourceGroup, virtualNetworkName and subnetName.
// NOTE: All calls to this Azure API are instrumented as prometheus metric.
func GetSubnet(ctx context.Context, subnetAccess *armnetwork.SubnetsClient, resourceGroup, virtualNetworkName, subnetName string) (subnet *armnetwork.Subnet, err error) {
var subnetResp armnetwork.SubnetsClientGetResponse
defer instrument.RecordAzAPIMetric(err, subnetGetServiceLabel, time.Now())
defer instrument.AZAPIMetricRecorderFn(subnetGetServiceLabel, &err)()

subnetResp, err = subnetAccess.Get(ctx, resourceGroup, virtualNetworkName, subnetName, nil)
if err != nil {
errors.LogAzAPIError(err, "Failed to GET Subnet for [resourceGroup: %s, virtualNetworkName: %s, subnetName: %s]", resourceGroup, virtualNetworkName, subnetName)
Expand Down
12 changes: 8 additions & 4 deletions pkg/azure/access/helpers/vm.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ const (
// NOTE: All calls to this Azure API are instrumented as prometheus metric.
func GetVirtualMachine(ctx context.Context, vmClient *armcompute.VirtualMachinesClient, resourceGroup, vmName string) (vm *armcompute.VirtualMachine, err error) {
var getResp armcompute.VirtualMachinesClientGetResponse
defer instrument.RecordAzAPIMetric(err, vmGetServiceLabel, time.Now())
defer instrument.AZAPIMetricRecorderFn(vmGetServiceLabel, &err)()

getResp, err = vmClient.Get(ctx, resourceGroup, vmName, nil)
if err != nil {
if errors.IsNotFoundAzAPIError(err) {
Expand All @@ -61,7 +62,8 @@ func GetVirtualMachine(ctx context.Context, vmClient *armcompute.VirtualMachines
// If cascade delete is set for associated NICs and Disks then these resources will also be deleted along with the VM.
// NOTE: All calls to this Azure API are instrumented as prometheus metric.
func DeleteVirtualMachine(ctx context.Context, vmAccess *armcompute.VirtualMachinesClient, resourceGroup, vmName string) (err error) {
defer instrument.RecordAzAPIMetric(err, vmDeleteServiceLabel, time.Now())
defer instrument.AZAPIMetricRecorderFn(vmDeleteServiceLabel, &err)()

delCtx, cancelFn := context.WithTimeout(ctx, defaultDeleteVMTimeout)
defer cancelFn()
poller, err := vmAccess.BeginDelete(delCtx, resourceGroup, vmName, nil)
Expand All @@ -80,7 +82,8 @@ func DeleteVirtualMachine(ctx context.Context, vmAccess *armcompute.VirtualMachi
// CreateVirtualMachine creates a Virtual Machine given a resourceGroup and virtual machine creation parameters.
// NOTE: All calls to this Azure API are instrumented as prometheus metric.
func CreateVirtualMachine(ctx context.Context, vmAccess *armcompute.VirtualMachinesClient, resourceGroup string, vmCreationParams armcompute.VirtualMachine) (vm *armcompute.VirtualMachine, err error) {
defer instrument.RecordAzAPIMetric(err, vmCreateServiceLabel, time.Now())
defer instrument.AZAPIMetricRecorderFn(vmCreateServiceLabel, &err)()

createCtx, cancelFn := context.WithTimeout(ctx, defaultCreateVMTimeout)
defer cancelFn()
vmName := *vmCreationParams.Name
Expand All @@ -101,7 +104,8 @@ func CreateVirtualMachine(ctx context.Context, vmAccess *armcompute.VirtualMachi
// SetCascadeDeleteForNICsAndDisks sets cascade deletion for NICs and Disks (OSDisk and DataDisks) associated to passed virtual machine.
// NOTE: All calls to this Azure API are instrumented as prometheus metric.
func SetCascadeDeleteForNICsAndDisks(ctx context.Context, vmClient *armcompute.VirtualMachinesClient, resourceGroup string, vmName string, vmUpdateParams *armcompute.VirtualMachineUpdate) (err error) {
defer instrument.RecordAzAPIMetric(err, vmUpdateServiceLabel, time.Now())
defer instrument.AZAPIMetricRecorderFn(vmUpdateServiceLabel, &err)()

updCtx, cancelFn := context.WithTimeout(ctx, defaultUpdateVMTimeout)
defer cancelFn()
poller, err := vmClient.BeginUpdate(updCtx, resourceGroup, vmName, *vmUpdateParams, nil)
Expand Down
4 changes: 2 additions & 2 deletions pkg/azure/access/helpers/vmimage.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ package helpers

import (
"context"
"time"

"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5"
"github.com/gardener/machine-controller-manager-provider-azure/pkg/azure/access/errors"
Expand All @@ -28,7 +27,8 @@ const vmImageGetServiceLabel = "virtual_machine_image_get"
// GetVMImage fetches the VM Image given a location and image reference.
// NOTE: All calls to this Azure API are instrumented as prometheus metric.
func GetVMImage(ctx context.Context, vmImagesAccess *armcompute.VirtualMachineImagesClient, location string, imageRef armcompute.ImageReference) (vmImage *armcompute.VirtualMachineImage, err error) {
defer instrument.RecordAzAPIMetric(err, vmImageGetServiceLabel, time.Now())
defer instrument.AZAPIMetricRecorderFn(vmImageGetServiceLabel, &err)()

resp, err := vmImagesAccess.Get(ctx, location, *imageRef.Publisher, *imageRef.Offer, *imageRef.SKU, *imageRef.Version, nil)
if err != nil {
errors.LogAzAPIError(err, "Failed to get VM Image [Location: %s, ImageRef: %+v]", location, imageRef)
Expand Down
5 changes: 4 additions & 1 deletion pkg/azure/api/validation/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ import (
"strings"

"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes"
"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/status"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/validation/field"
Expand All @@ -26,7 +28,8 @@ const providerAzure = "Azure"
// If it is not then it will return an error indicating that this provider implementation cannot fulfill the request.
func ValidateMachineClassProvider(mcc *v1alpha1.MachineClass) error {
if mcc.Provider != providerAzure {
return field.Invalid(field.NewPath("provider"), mcc.Provider, fmt.Sprintf("Request for provider %s cannot be fulfilled. Only %s provider is supported.", mcc.Provider, providerAzure))
err := field.Invalid(field.NewPath("provider"), mcc.Provider, fmt.Sprintf("Request for provider %s cannot be fulfilled. Only %s provider is supported.", mcc.Provider, providerAzure))
return status.Error(codes.InvalidArgument, fmt.Sprintf("error validating provider %v", err))
}
return nil
}
Expand Down
21 changes: 21 additions & 0 deletions pkg/azure/instrument/instrument.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"strconv"
"time"

"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes"
"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/status"
"github.com/gardener/machine-controller-manager/pkg/util/provider/metrics"
)
Expand Down Expand Up @@ -67,6 +68,8 @@ func RecordDriverAPIMetric(err error, operation string, invocationTime time.Time
)
if errors.As(err, &statusErr) {
labels = append(labels, strconv.Itoa(int(statusErr.Code())))
} else {
labels = append(labels, strconv.Itoa(int(codes.Internal)))
}
metrics.DriverFailedAPIRequests.
WithLabelValues(labels...).
Expand All @@ -80,3 +83,21 @@ func RecordDriverAPIMetric(err error, operation string, invocationTime time.Time
operation,
).Observe(elapsed.Seconds())
}

// AZAPIMetricRecorderFn returns a function that can be used to record a prometheus metric for Azure API calls.
// NOTE: a pointer to an error (which itself is a fat interface pointer) is necessary to enable the callers of this function to enclose this call into a `defer` statement.
func AZAPIMetricRecorderFn(azServiceName string, err *error) func() {
invocationTime := time.Now()
return func() {
RecordAzAPIMetric(*err, azServiceName, invocationTime)
}
}

// DriverAPIMetricRecorderFn returns a function that can be used to record a prometheus metric for driver API calls.
// NOTE: a pointer to an error (which itself is a fat interface pointer) is necessary to enable the callers of this function to enclose this call into a `defer` statement.
func DriverAPIMetricRecorderFn(operation string, err *error) func() {
invocationTime := time.Now()
return func() {
RecordDriverAPIMetric(*err, operation, invocationTime)
}
}
114 changes: 114 additions & 0 deletions pkg/azure/instrument/instrument_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package instrument

import (
"errors"
"strconv"
"testing"

"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes"
"github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/status"
"github.com/gardener/machine-controller-manager/pkg/util/provider/metrics"
. "github.com/onsi/gomega"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
)

var (
testErr = errors.New("test-error")
defaultErrorCode = strconv.Itoa(int(codes.Internal))
testStatusErr = status.New(codes.InvalidArgument, "test-status-error")
)

const serviceName = "test-service"

func TestAPIMetricRecorderFn(t *testing.T) {
testCases := []struct {
name string
err error
}{
{"assert that function captures failed API request count when the error is not nil", testErr},
{"assert that function captures successful API request count when the error is nil", nil},
}
g := NewWithT(t)
reg := prometheus.NewRegistry()
g.Expect(reg.Register(metrics.APIRequestCount)).To(Succeed())
g.Expect(reg.Register(metrics.APIFailedRequestCount)).To(Succeed())
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
defer metrics.APIRequestCount.Reset()
defer metrics.APIFailedRequestCount.Reset()
defer metrics.APIRequestDuration.Reset()
_ = deferredMetricsRecorderInvoker(tc.err != nil, false, AZAPIMetricRecorderFn)
if tc.err != nil {
g.Expect(testutil.CollectAndCount(metrics.APIRequestCount)).To(Equal(0))
g.Expect(testutil.CollectAndCount(metrics.APIFailedRequestCount)).To(Equal(1))
g.Expect(testutil.ToFloat64(metrics.APIFailedRequestCount.WithLabelValues(prometheusProviderLabelValue, serviceName))).To(Equal(float64(1)))
} else {
g.Expect(testutil.CollectAndCount(metrics.APIRequestCount)).To(Equal(1))
g.Expect(testutil.CollectAndCount(metrics.APIFailedRequestCount)).To(Equal(0))
g.Expect(testutil.ToFloat64(metrics.APIRequestCount.WithLabelValues(prometheusProviderLabelValue, serviceName))).To(Equal(float64(1)))
}
})
}
}

func TestDriverAPIMetricRecorderFn(t *testing.T) {
testCases := []struct {
name string
err error
}{
{"assert that function captures failed driver API request with default error code for internal error when there is an error", testErr},
{"assert that function captures failed driver API request with error code from status.Status on error", testStatusErr},
{"assert that function captures successful driver API request count when the error is nil", nil},
}
g := NewWithT(t)
reg := prometheus.NewRegistry()
g.Expect(reg.Register(metrics.DriverFailedAPIRequests)).To(Succeed())
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
defer metrics.DriverFailedAPIRequests.Reset()
_ = deferredMetricsRecorderInvoker(tc.err != nil, isStatusErr(tc.err), DriverAPIMetricRecorderFn)
if tc.err != nil {
expectedErrCode := getExpectedErrorCode(tc.err)
g.Expect(testutil.CollectAndCount(metrics.DriverFailedAPIRequests)).To(Equal(1))
g.Expect(testutil.ToFloat64(metrics.DriverFailedAPIRequests.WithLabelValues(prometheusProviderLabelValue, serviceName, expectedErrCode))).To(Equal(float64(1)))
} else {
g.Expect(testutil.CollectAndCount(metrics.DriverFailedAPIRequests)).To(Equal(0))
}
})
}
}

func isStatusErr(err error) bool {
if err == nil {
return false
}
var statusErr *status.Status
return errors.As(err, &statusErr)
}

func getExpectedErrorCode(err error) string {
if err == nil {
return ""
}
var statusErr *status.Status
if errors.As(err, &statusErr) {
return strconv.Itoa(int(statusErr.Code()))
} else {
return defaultErrorCode
}
}

type recorderFn func(string, *error) func()

func deferredMetricsRecorderInvoker(shouldReturnErr bool, isStatusErr bool, fn recorderFn) (err error) {
defer fn(serviceName, &err)()
if shouldReturnErr {
if isStatusErr {
err = testStatusErr
} else {
err = testErr
}
}
return
}
Loading

0 comments on commit fc7faad

Please sign in to comment.