From df5e6fdeeedda4b9a38b3359a7cab4f2e94d745a Mon Sep 17 00:00:00 2001 From: Yuki Tsuboi Date: Sat, 4 Jul 2020 00:52:46 +0900 Subject: [PATCH] Add support for OVS flow operations metrics on node (#866) - Number of OVS flow operations, partitioned by operation (add, modify and delete) - Number of OVS flow operation errors, partitioned by operation (add, modify and delete) - Latency of OVS flow operations in milliseconds, partitioned by operation (add, modify and delete) Signed-off-by: Yuki Tsuboi --- pkg/agent/metrics/prometheus.go | 45 +++++++++++++++++ pkg/agent/openflow/pipeline.go | 86 ++++++++++++++++++++++++++++++--- test/e2e/prometheus_test.go | 5 +- 3 files changed, 128 insertions(+), 8 deletions(-) diff --git a/pkg/agent/metrics/prometheus.go b/pkg/agent/metrics/prometheus.go index f8bf801d492..b3b12853f4e 100644 --- a/pkg/agent/metrics/prometheus.go +++ b/pkg/agent/metrics/prometheus.go @@ -67,6 +67,33 @@ var ( Help: "Flow count for each OVS flow table. The TableID is used as a label.", StabilityLevel: metrics.STABLE, }, []string{"table_id"}) + + OVSFlowOpsCount = metrics.NewCounterVec( + &metrics.CounterOpts{ + Name: "antrea_agent_ovs_flow_ops_count", + Help: "Number of OVS flow operations, partitioned by operations(add, modify and delete).", + StabilityLevel: metrics.ALPHA, + }, + []string{"operation"}, + ) + + OVSFlowOpsErrorCount = metrics.NewCounterVec( + &metrics.CounterOpts{ + Name: "antrea_agent_ovs_flow_ops_error_count", + Help: "Number of OVS flow operation errors, partitioned by operations(add, modify and delete).", + StabilityLevel: metrics.ALPHA, + }, + []string{"operation"}, + ) + + OVSFlowOpsLatency = metrics.NewHistogramVec( + &metrics.HistogramOpts{ + Name: "antrea_agent_ovs_flow_ops_latency_milliseconds", + Help: "Latency of OVS flow operations, partitioned by operations(add, modify and delete).", + StabilityLevel: metrics.ALPHA, + }, + []string{"operation"}, + ) ) func InitializePrometheusMetrics() { @@ -112,4 +139,22 @@ func 
InitializePrometheusMetrics() { if err := legacyregistry.Register(OVSFlowCount); err != nil { klog.Error("Failed to register antrea_agent_ovs_flow_count with Prometheus") } + + if err := legacyregistry.Register(OVSFlowOpsCount); err != nil { + klog.Error("Failed to register antrea_agent_ovs_flow_ops_count with Prometheus") + } + if err := legacyregistry.Register(OVSFlowOpsErrorCount); err != nil { + klog.Error("Failed to register antrea_agent_ovs_flow_ops_error_count with Prometheus") + } + if err := legacyregistry.Register(OVSFlowOpsLatency); err != nil { + klog.Error("Failed to register antrea_agent_ovs_flow_ops_latency_milliseconds with Prometheus") + } + // Pre-create the add, modify and delete label values for the OVS flow operation + // metrics, since a labeled metric is not exposed until it is first observed. + opsArray := [3]string{"add", "modify", "delete"} + for _, ops := range opsArray { + OVSFlowOpsCount.WithLabelValues(ops) + OVSFlowOpsErrorCount.WithLabelValues(ops) + OVSFlowOpsLatency.WithLabelValues(ops) + } } diff --git a/pkg/agent/openflow/pipeline.go b/pkg/agent/openflow/pipeline.go index b54ef5fafdd..d4356491085 100644 --- a/pkg/agent/openflow/pipeline.go +++ b/pkg/agent/openflow/pipeline.go @@ -21,10 +21,12 @@ import ( "strconv" "strings" "sync" + "time" "k8s.io/client-go/tools/cache" "github.com/vmware-tanzu/antrea/pkg/agent/config" + "github.com/vmware-tanzu/antrea/pkg/agent/metrics" "github.com/vmware-tanzu/antrea/pkg/agent/openflow/cookie" "github.com/vmware-tanzu/antrea/pkg/agent/types" "github.com/vmware-tanzu/antrea/pkg/features" @@ -257,31 +259,101 @@ func (c *client) GetTunnelVirtualMAC() net.HardwareAddr { } func (c *client) Add(flow binding.Flow) error { - return c.bridge.AddFlowsInBundle([]binding.Flow{flow}, nil, nil) + startTime := time.Now() + defer func() { + d := time.Since(startTime) + metrics.OVSFlowOpsLatency.WithLabelValues("add").Observe(float64(d.Milliseconds())) + }() + if err := c.bridge.AddFlowsInBundle([]binding.Flow{flow}, nil, nil); err != 
nil { + metrics.OVSFlowOpsErrorCount.WithLabelValues("add").Inc() + return err + } + metrics.OVSFlowOpsCount.WithLabelValues("add").Inc() + return nil } func (c *client) Modify(flow binding.Flow) error { - return c.bridge.AddFlowsInBundle(nil, []binding.Flow{flow}, nil) + startTime := time.Now() + defer func() { + d := time.Since(startTime) + metrics.OVSFlowOpsLatency.WithLabelValues("modify").Observe(float64(d.Milliseconds())) + }() + if err := c.bridge.AddFlowsInBundle(nil, []binding.Flow{flow}, nil); err != nil { + metrics.OVSFlowOpsErrorCount.WithLabelValues("modify").Inc() + return err + } + metrics.OVSFlowOpsCount.WithLabelValues("modify").Inc() + return nil } func (c *client) Delete(flow binding.Flow) error { - return c.bridge.AddFlowsInBundle(nil, nil, []binding.Flow{flow}) + startTime := time.Now() + defer func() { + d := time.Since(startTime) + metrics.OVSFlowOpsLatency.WithLabelValues("delete").Observe(float64(d.Milliseconds())) + }() + if err := c.bridge.AddFlowsInBundle(nil, nil, []binding.Flow{flow}); err != nil { + metrics.OVSFlowOpsErrorCount.WithLabelValues("delete").Inc() + return err + } + metrics.OVSFlowOpsCount.WithLabelValues("delete").Inc() + return nil } func (c *client) AddAll(flows []binding.Flow) error { - return c.bridge.AddFlowsInBundle(flows, nil, nil) + startTime := time.Now() + defer func() { + d := time.Since(startTime) + metrics.OVSFlowOpsLatency.WithLabelValues("add").Observe(float64(d.Milliseconds())) + }() + if err := c.bridge.AddFlowsInBundle(flows, nil, nil); err != nil { + metrics.OVSFlowOpsErrorCount.WithLabelValues("add").Inc() + return err + } + metrics.OVSFlowOpsCount.WithLabelValues("add").Inc() + return nil } func (c *client) DeleteAll(flows []binding.Flow) error { - return c.bridge.AddFlowsInBundle(nil, nil, flows) + startTime := time.Now() + defer func() { + d := time.Since(startTime) + metrics.OVSFlowOpsLatency.WithLabelValues("delete").Observe(float64(d.Milliseconds())) + }() + if err := 
c.bridge.AddFlowsInBundle(nil, nil, flows); err != nil { + metrics.OVSFlowOpsErrorCount.WithLabelValues("delete").Inc() + return err + } + metrics.OVSFlowOpsCount.WithLabelValues("delete").Inc() + return nil } func (c *client) AddOFEntries(ofEntries []binding.OFEntry) error { - return c.bridge.AddOFEntriesInBundle(ofEntries, nil, nil) + startTime := time.Now() + defer func() { + d := time.Since(startTime) + metrics.OVSFlowOpsLatency.WithLabelValues("add").Observe(float64(d.Milliseconds())) + }() + if err := c.bridge.AddOFEntriesInBundle(ofEntries, nil, nil); err != nil { + metrics.OVSFlowOpsErrorCount.WithLabelValues("add").Inc() + return err + } + metrics.OVSFlowOpsCount.WithLabelValues("add").Inc() + return nil } func (c *client) DeleteOFEntries(ofEntries []binding.OFEntry) error { - return c.bridge.AddOFEntriesInBundle(nil, nil, ofEntries) + startTime := time.Now() + defer func() { + d := time.Since(startTime) + metrics.OVSFlowOpsLatency.WithLabelValues("delete").Observe(float64(d.Milliseconds())) + }() + if err := c.bridge.AddOFEntriesInBundle(nil, nil, ofEntries); err != nil { + metrics.OVSFlowOpsErrorCount.WithLabelValues("delete").Inc() + return err + } + metrics.OVSFlowOpsCount.WithLabelValues("delete").Inc() + return nil } // defaultFlows generates the default flows of all tables. diff --git a/test/e2e/prometheus_test.go b/test/e2e/prometheus_test.go index 06b3fa346b0..b2d6749adc4 100644 --- a/test/e2e/prometheus_test.go +++ b/test/e2e/prometheus_test.go @@ -39,8 +39,11 @@ var antreaAgentMetrics = []string{ "antrea_agent_ingress_networkpolicy_rule_count", "antrea_agent_local_pod_count", "antrea_agent_networkpolicy_count", - "antrea_agent_ovs_total_flow_count", "antrea_agent_ovs_flow_count", + "antrea_agent_ovs_flow_ops_count", + "antrea_agent_ovs_flow_ops_error_count", + "antrea_agent_ovs_flow_ops_latency_milliseconds", + "antrea_agent_ovs_total_flow_count", "antrea_agent_runtime_info", }