From 6903963fd4ddb6d25cead655ed01bb2c71ccdd74 Mon Sep 17 00:00:00 2001 From: Yuki Tsuboi Date: Fri, 17 Jul 2020 15:07:26 +0900 Subject: [PATCH] Add support for OVS flow operations metrics on node (#866) - Number of OVS flow operations, partitioned by operation type (add, modify and delete) - Number of OVS flow operation errors, partitioned by operation type (add, modify and delete) - The latency of OVS flow operations, partitioned by operation type (add, modify and delete) Signed-off-by: Yuki Tsuboi --- pkg/agent/metrics/prometheus.go | 50 ++++++++++++++++++- pkg/agent/openflow/pipeline.go | 86 ++++++++++++++++++++++++++++++--- test/e2e/prometheus_test.go | 7 ++- 3 files changed, 133 insertions(+), 10 deletions(-) diff --git a/pkg/agent/metrics/prometheus.go b/pkg/agent/metrics/prometheus.go index 25aa6e12044..28f9a768139 100644 --- a/pkg/agent/metrics/prometheus.go +++ b/pkg/agent/metrics/prometheus.go @@ -67,6 +67,33 @@ var ( Help: "Flow count for each OVS flow table. The TableID is used as a label.", StabilityLevel: metrics.STABLE, }, []string{"table_id"}) + + OVSFlowOpsCount = metrics.NewCounterVec( + &metrics.CounterOpts{ + Name: "antrea_agent_ovs_flow_ops_count", + Help: "Number of OVS flow operations, partitioned by operation type (add, modify and delete).", + StabilityLevel: metrics.ALPHA, + }, + []string{"operation"}, + ) + + OVSFlowOpsErrorCount = metrics.NewCounterVec( + &metrics.CounterOpts{ + Name: "antrea_agent_ovs_flow_ops_error_count", + Help: "Number of OVS flow operation errors, partitioned by operation type (add, modify and delete).", + StabilityLevel: metrics.ALPHA, + }, + []string{"operation"}, + ) + + OVSFlowOpsLatency = metrics.NewHistogramVec( + &metrics.HistogramOpts{ + Name: "antrea_agent_ovs_flow_ops_latency_milliseconds", + Help: "The latency of OVS flow operations, partitioned by operation type (add, modify and delete).", + StabilityLevel: metrics.ALPHA, + }, + []string{"operation"}, + ) ) func InitializePrometheusMetrics() { @@ -78,8 +105,9 @@ func 
InitializePrometheusMetrics() { } gaugeHost := metrics.NewGauge(&metrics.GaugeOpts{ - Name: "antrea_agent_runtime_info", - Help: "Antrea agent runtime info , defined as labels. The value of the gauge is always set to 1.", + Name: "antrea_agent_runtime_info", + Help: "Antrea agent runtime info , defined as labels. The value of the gauge is always set to 1.", + ConstLabels: metrics.Labels{"k8s_nodename": nodeName, "k8s_podname": env.GetPodName()}, StabilityLevel: metrics.STABLE, }) @@ -122,4 +150,22 @@ func InitializeOVSMetrics() { if err := legacyregistry.Register(OVSFlowCount); err != nil { klog.Error("Failed to register antrea_agent_ovs_flow_count with Prometheus") } + + if err := legacyregistry.Register(OVSFlowOpsCount); err != nil { + klog.Error("Failed to register antrea_agent_ovs_flow_ops_count with Prometheus") + } + if err := legacyregistry.Register(OVSFlowOpsErrorCount); err != nil { + klog.Error("Failed to register antrea_agent_ovs_flow_ops_error_count with Prometheus") + } + if err := legacyregistry.Register(OVSFlowOpsLatency); err != nil { + klog.Error("Failed to register antrea_agent_ovs_flow_ops_latency_milliseconds with Prometheus") + } + // Initialize OpenFlow operations metrics with label add, modify and delete + // since those metrics won't come out until observation. 
+ opsArray := [3]string{"add", "modify", "delete"} + for _, ops := range opsArray { + OVSFlowOpsCount.WithLabelValues(ops) + OVSFlowOpsErrorCount.WithLabelValues(ops) + OVSFlowOpsLatency.WithLabelValues(ops) + } } diff --git a/pkg/agent/openflow/pipeline.go b/pkg/agent/openflow/pipeline.go index f7887ef5ede..eb8370256f0 100644 --- a/pkg/agent/openflow/pipeline.go +++ b/pkg/agent/openflow/pipeline.go @@ -21,10 +21,12 @@ import ( "strconv" "strings" "sync" + "time" "k8s.io/client-go/tools/cache" "github.com/vmware-tanzu/antrea/pkg/agent/config" + "github.com/vmware-tanzu/antrea/pkg/agent/metrics" "github.com/vmware-tanzu/antrea/pkg/agent/openflow/cookie" "github.com/vmware-tanzu/antrea/pkg/agent/types" "github.com/vmware-tanzu/antrea/pkg/features" @@ -257,31 +259,101 @@ func (c *client) GetTunnelVirtualMAC() net.HardwareAddr { } func (c *client) Add(flow binding.Flow) error { - return c.bridge.AddFlowsInBundle([]binding.Flow{flow}, nil, nil) + startTime := time.Now() + defer func() { + d := time.Since(startTime) + metrics.OVSFlowOpsLatency.WithLabelValues("add").Observe(float64(d.Milliseconds())) + }() + if err := c.bridge.AddFlowsInBundle([]binding.Flow{flow}, nil, nil); err != nil { + metrics.OVSFlowOpsErrorCount.WithLabelValues("add").Inc() + return err + } + metrics.OVSFlowOpsCount.WithLabelValues("add").Inc() + return nil } func (c *client) Modify(flow binding.Flow) error { - return c.bridge.AddFlowsInBundle(nil, []binding.Flow{flow}, nil) + startTime := time.Now() + defer func() { + d := time.Since(startTime) + metrics.OVSFlowOpsLatency.WithLabelValues("modify").Observe(float64(d.Milliseconds())) + }() + if err := c.bridge.AddFlowsInBundle(nil, []binding.Flow{flow}, nil); err != nil { + metrics.OVSFlowOpsErrorCount.WithLabelValues("modify").Inc() + return err + } + metrics.OVSFlowOpsCount.WithLabelValues("modify").Inc() + return nil } func (c *client) Delete(flow binding.Flow) error { - return c.bridge.AddFlowsInBundle(nil, nil, []binding.Flow{flow}) + 
startTime := time.Now() + defer func() { + d := time.Since(startTime) + metrics.OVSFlowOpsLatency.WithLabelValues("delete").Observe(float64(d.Milliseconds())) + }() + if err := c.bridge.AddFlowsInBundle(nil, nil, []binding.Flow{flow}); err != nil { + metrics.OVSFlowOpsErrorCount.WithLabelValues("delete").Inc() + return err + } + metrics.OVSFlowOpsCount.WithLabelValues("delete").Inc() + return nil } func (c *client) AddAll(flows []binding.Flow) error { - return c.bridge.AddFlowsInBundle(flows, nil, nil) + startTime := time.Now() + defer func() { + d := time.Since(startTime) + metrics.OVSFlowOpsLatency.WithLabelValues("add").Observe(float64(d.Milliseconds())) + }() + if err := c.bridge.AddFlowsInBundle(flows, nil, nil); err != nil { + metrics.OVSFlowOpsErrorCount.WithLabelValues("add").Inc() + return err + } + metrics.OVSFlowOpsCount.WithLabelValues("add").Inc() + return nil } func (c *client) DeleteAll(flows []binding.Flow) error { - return c.bridge.AddFlowsInBundle(nil, nil, flows) + startTime := time.Now() + defer func() { + d := time.Since(startTime) + metrics.OVSFlowOpsLatency.WithLabelValues("delete").Observe(float64(d.Milliseconds())) + }() + if err := c.bridge.AddFlowsInBundle(nil, nil, flows); err != nil { + metrics.OVSFlowOpsErrorCount.WithLabelValues("delete").Inc() + return err + } + metrics.OVSFlowOpsCount.WithLabelValues("delete").Inc() + return nil } func (c *client) AddOFEntries(ofEntries []binding.OFEntry) error { - return c.bridge.AddOFEntriesInBundle(ofEntries, nil, nil) + startTime := time.Now() + defer func() { + d := time.Since(startTime) + metrics.OVSFlowOpsLatency.WithLabelValues("add").Observe(float64(d.Milliseconds())) + }() + if err := c.bridge.AddOFEntriesInBundle(ofEntries, nil, nil); err != nil { + metrics.OVSFlowOpsErrorCount.WithLabelValues("add").Inc() + return err + } + metrics.OVSFlowOpsCount.WithLabelValues("add").Inc() + return nil } func (c *client) DeleteOFEntries(ofEntries []binding.OFEntry) error { - return 
c.bridge.AddOFEntriesInBundle(nil, nil, ofEntries) + startTime := time.Now() + defer func() { + d := time.Since(startTime) + metrics.OVSFlowOpsLatency.WithLabelValues("delete").Observe(float64(d.Milliseconds())) + }() + if err := c.bridge.AddOFEntriesInBundle(nil, nil, ofEntries); err != nil { + metrics.OVSFlowOpsErrorCount.WithLabelValues("delete").Inc() + return err + } + metrics.OVSFlowOpsCount.WithLabelValues("delete").Inc() + return nil } // defaultFlows generates the default flows of all tables. diff --git a/test/e2e/prometheus_test.go b/test/e2e/prometheus_test.go index 06b3fa346b0..d8f44573268 100644 --- a/test/e2e/prometheus_test.go +++ b/test/e2e/prometheus_test.go @@ -39,8 +39,13 @@ var antreaAgentMetrics = []string{ "antrea_agent_ingress_networkpolicy_rule_count", "antrea_agent_local_pod_count", "antrea_agent_networkpolicy_count", - "antrea_agent_ovs_total_flow_count", "antrea_agent_ovs_flow_count", + "antrea_agent_ovs_flow_ops_count", + "antrea_agent_ovs_flow_ops_error_count", + "antrea_agent_ovs_flow_ops_latency_milliseconds_bucket", + "antrea_agent_ovs_flow_ops_latency_milliseconds_count", + "antrea_agent_ovs_flow_ops_latency_milliseconds_sum", + "antrea_agent_ovs_total_flow_count", "antrea_agent_runtime_info", }