Skip to content

Commit

Permalink
Add support for OVS flow operations metrics on node (antrea-io#866)
Browse files Browse the repository at this point in the history
- Number of OVS flow operations, partitioned by operations(add, modify and delete)
- Number of OVS flow operation errors, partitioned by operations(add, modify and delete)
- The latency of OVS flow operations, partitioned by operations(add, modify and delete)

Signed-off-by: Yuki Tsuboi <ytsuboi@vmware.com>
  • Loading branch information
Yuki Tsuboi committed Jul 22, 2020
1 parent 11792b6 commit 4708fed
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 11 deletions.
50 changes: 48 additions & 2 deletions pkg/agent/metrics/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,33 @@ var (
Help: "Flow count for each OVS flow table. The TableID is used as a label.",
StabilityLevel: metrics.STABLE,
}, []string{"table_id"})

OVSFlowOpsCount = metrics.NewCounterVec(
&metrics.CounterOpts{
Name: "antrea_agent_ovs_flow_ops_count",
Help: "Number of OVS flow operations, partitioned by operation type (add, modify and delete).",
StabilityLevel: metrics.ALPHA,
},
[]string{"operation"},
)

OVSFlowOpsErrorCount = metrics.NewCounterVec(
&metrics.CounterOpts{
Name: "antrea_agent_ovs_flow_ops_error_count",
Help: "Number of OVS flow operation errors, partitioned by operation type (add, modify and delete).",
StabilityLevel: metrics.ALPHA,
},
[]string{"operation"},
)

OVSFlowOpsLatency = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Name: "antrea_agent_ovs_flow_ops_latency_milliseconds",
Help: "The latency of OVS flow operations, partitioned by operation type (add, modify and delete).",
StabilityLevel: metrics.ALPHA,
},
[]string{"operation"},
)
)

func InitializePrometheusMetrics() {
Expand All @@ -78,8 +105,9 @@ func InitializePrometheusMetrics() {
}

gaugeHost := metrics.NewGauge(&metrics.GaugeOpts{
Name: "antrea_agent_runtime_info",
Help: "Antrea agent runtime info , defined as labels. The value of the gauge is always set to 1.",
Name: "antrea_agent_runtime_info",
Help: "Antrea agent runtime info , defined as labels. The value of the gauge is always set to 1.",

ConstLabels: metrics.Labels{"k8s_nodename": nodeName, "k8s_podname": env.GetPodName()},
StabilityLevel: metrics.STABLE,
})
Expand Down Expand Up @@ -122,4 +150,22 @@ func InitializeOVSMetrics() {
if err := legacyregistry.Register(OVSFlowCount); err != nil {
klog.Error("Failed to register antrea_agent_ovs_flow_count with Prometheus")
}

if err := legacyregistry.Register(OVSFlowOpsCount); err != nil {
klog.Error("Failed to register antrea_agent_ovs_flow_ops_count with Prometheus")
}
if err := legacyregistry.Register(OVSFlowOpsErrorCount); err != nil {
klog.Error("Failed to register antrea_agent_ovs_flow_ops_error_count with Prometheus")
}
if err := legacyregistry.Register(OVSFlowOpsLatency); err != nil {
klog.Error("Failed to register antrea_agent_ovs_flow_ops_latency_milliseconds with Prometheus")
}
// Initialize OpenFlow operations metrics with label add, modify and delete
// since those metrics won't come out until observation.
opsArray := [3]string{"add", "modify", "delete"}
for _, ops := range opsArray {
OVSFlowOpsCount.WithLabelValues(ops)
OVSFlowOpsErrorCount.WithLabelValues(ops)
OVSFlowOpsLatency.WithLabelValues(ops)
}
}
86 changes: 79 additions & 7 deletions pkg/agent/openflow/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ import (
"strconv"
"strings"
"sync"
"time"

"k8s.io/client-go/tools/cache"

"github.com/vmware-tanzu/antrea/pkg/agent/config"
"github.com/vmware-tanzu/antrea/pkg/agent/metrics"
"github.com/vmware-tanzu/antrea/pkg/agent/openflow/cookie"
"github.com/vmware-tanzu/antrea/pkg/agent/types"
"github.com/vmware-tanzu/antrea/pkg/features"
Expand Down Expand Up @@ -259,31 +261,101 @@ func (c *client) GetTunnelVirtualMAC() net.HardwareAddr {
}

func (c *client) Add(flow binding.Flow) error {
return c.bridge.AddFlowsInBundle([]binding.Flow{flow}, nil, nil)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("add").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddFlowsInBundle([]binding.Flow{flow}, nil, nil); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("add").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("add").Inc()
return nil
}

func (c *client) Modify(flow binding.Flow) error {
return c.bridge.AddFlowsInBundle(nil, []binding.Flow{flow}, nil)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("modify").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddFlowsInBundle(nil, []binding.Flow{flow}, nil); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("modify").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("modify").Inc()
return nil
}

func (c *client) Delete(flow binding.Flow) error {
return c.bridge.AddFlowsInBundle(nil, nil, []binding.Flow{flow})
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("delete").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddFlowsInBundle(nil, nil, []binding.Flow{flow}); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("delete").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("delete").Inc()
return nil
}

func (c *client) AddAll(flows []binding.Flow) error {
return c.bridge.AddFlowsInBundle(flows, nil, nil)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("add").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddFlowsInBundle(flows, nil, nil); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("add").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("add").Inc()
return nil
}

func (c *client) DeleteAll(flows []binding.Flow) error {
return c.bridge.AddFlowsInBundle(nil, nil, flows)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("delete").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddFlowsInBundle(nil, nil, flows); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("delete").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("delete").Inc()
return nil
}

func (c *client) AddOFEntries(ofEntries []binding.OFEntry) error {
return c.bridge.AddOFEntriesInBundle(ofEntries, nil, nil)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("add").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddOFEntriesInBundle(ofEntries, nil, nil); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("add").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("add").Inc()
return nil
}

func (c *client) DeleteOFEntries(ofEntries []binding.OFEntry) error {
return c.bridge.AddOFEntriesInBundle(nil, nil, ofEntries)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("delete").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddOFEntriesInBundle(nil, nil, ofEntries); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("delete").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("delete").Inc()
return nil
}

// defaultFlows generates the default flows of all tables.
Expand Down
16 changes: 14 additions & 2 deletions test/e2e/prometheus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,11 @@ var antreaAgentMetrics = []string{
"antrea_agent_ingress_networkpolicy_rule_count",
"antrea_agent_local_pod_count",
"antrea_agent_networkpolicy_count",
"antrea_agent_ovs_total_flow_count",
"antrea_agent_ovs_flow_count",
"antrea_agent_ovs_flow_ops_count",
"antrea_agent_ovs_flow_ops_error_count",
"antrea_agent_ovs_flow_ops_latency_milliseconds",
"antrea_agent_ovs_total_flow_count",
"antrea_agent_runtime_info",
}

Expand Down Expand Up @@ -276,7 +279,12 @@ func testMetricsFromPrometheusServer(t *testing.T, data *TestData, prometheusJob
// Create a map of all the metrics which were found on the server
testMap := make(map[string]bool)
for _, metric := range output.Data {
testMap[metric["__name__"]] = true
name := metric["__name__"]
switch {
case isBucket(name):
name = name[:len(name)-7]
}
testMap[name] = true
}

// Validate that all the required metrics exist in the server's output
Expand Down Expand Up @@ -315,3 +323,7 @@ func TestAgentMetricsOnPrometheusServer(t *testing.T) {

testMetricsFromPrometheusServer(t, data, "antrea-agents", antreaAgentMetrics)
}

func isBucket(name string) bool {
return len(name) > 7 && name[len(name)-7:] == "_bucket"
}

0 comments on commit 4708fed

Please sign in to comment.