Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for OVS flow operations metrics on node #866

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 48 additions & 2 deletions pkg/agent/metrics/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,33 @@ var (
Help: "Flow count for each OVS flow table. The TableID is used as a label.",
StabilityLevel: metrics.STABLE,
}, []string{"table_id"})

OVSFlowOpsCount = metrics.NewCounterVec(
&metrics.CounterOpts{
Name: "antrea_agent_ovs_flow_ops_count",
Help: "Number of OVS flow operations, partitioned by operation type (add, modify and delete).",
StabilityLevel: metrics.ALPHA,
},
[]string{"operation"},
)

OVSFlowOpsErrorCount = metrics.NewCounterVec(
antoninbas marked this conversation as resolved.
Show resolved Hide resolved
&metrics.CounterOpts{
Name: "antrea_agent_ovs_flow_ops_error_count",
Help: "Number of OVS flow operation errors, partitioned by operation type (add, modify and delete).",
StabilityLevel: metrics.ALPHA,
},
[]string{"operation"},
)

OVSFlowOpsLatency = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Name: "antrea_agent_ovs_flow_ops_latency_milliseconds",
Help: "The latency of OVS flow operations, partitioned by operation type (add, modify and delete).",
StabilityLevel: metrics.ALPHA,
},
[]string{"operation"},
)
)

func InitializePrometheusMetrics() {
Expand All @@ -78,8 +105,9 @@ func InitializePrometheusMetrics() {
}

gaugeHost := metrics.NewGauge(&metrics.GaugeOpts{
Name: "antrea_agent_runtime_info",
Help: "Antrea agent runtime info , defined as labels. The value of the gauge is always set to 1.",
Name: "antrea_agent_runtime_info",
Help: "Antrea agent runtime info , defined as labels. The value of the gauge is always set to 1.",

ConstLabels: metrics.Labels{"k8s_nodename": nodeName, "k8s_podname": env.GetPodName()},
StabilityLevel: metrics.STABLE,
})
Expand Down Expand Up @@ -122,4 +150,22 @@ func InitializeOVSMetrics() {
if err := legacyregistry.Register(OVSFlowCount); err != nil {
klog.Error("Failed to register antrea_agent_ovs_flow_count with Prometheus")
}

if err := legacyregistry.Register(OVSFlowOpsCount); err != nil {
klog.Error("Failed to register antrea_agent_ovs_flow_ops_count with Prometheus")
}
if err := legacyregistry.Register(OVSFlowOpsErrorCount); err != nil {
klog.Error("Failed to register antrea_agent_ovs_flow_ops_error_count with Prometheus")
}
if err := legacyregistry.Register(OVSFlowOpsLatency); err != nil {
klog.Error("Failed to register antrea_agent_ovs_flow_ops_latency_milliseconds with Prometheus")
}
// Initialize OpenFlow operations metrics with label add, modify and delete
// since those metrics won't come out until observation.
opsArray := [3]string{"add", "modify", "delete"}
for _, ops := range opsArray {
OVSFlowOpsCount.WithLabelValues(ops)
OVSFlowOpsErrorCount.WithLabelValues(ops)
OVSFlowOpsLatency.WithLabelValues(ops)
}
}
86 changes: 79 additions & 7 deletions pkg/agent/openflow/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ import (
"strconv"
"strings"
"sync"
"time"

"k8s.io/client-go/tools/cache"

"github.com/vmware-tanzu/antrea/pkg/agent/config"
"github.com/vmware-tanzu/antrea/pkg/agent/metrics"
"github.com/vmware-tanzu/antrea/pkg/agent/openflow/cookie"
"github.com/vmware-tanzu/antrea/pkg/agent/types"
"github.com/vmware-tanzu/antrea/pkg/features"
Expand Down Expand Up @@ -259,31 +261,101 @@ func (c *client) GetTunnelVirtualMAC() net.HardwareAddr {
}

func (c *client) Add(flow binding.Flow) error {
return c.bridge.AddFlowsInBundle([]binding.Flow{flow}, nil, nil)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("add").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddFlowsInBundle([]binding.Flow{flow}, nil, nil); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("add").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("add").Inc()
return nil
}

func (c *client) Modify(flow binding.Flow) error {
return c.bridge.AddFlowsInBundle(nil, []binding.Flow{flow}, nil)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("modify").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddFlowsInBundle(nil, []binding.Flow{flow}, nil); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("modify").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("modify").Inc()
return nil
}

func (c *client) Delete(flow binding.Flow) error {
return c.bridge.AddFlowsInBundle(nil, nil, []binding.Flow{flow})
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("delete").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddFlowsInBundle(nil, nil, []binding.Flow{flow}); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("delete").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("delete").Inc()
return nil
}

func (c *client) AddAll(flows []binding.Flow) error {
return c.bridge.AddFlowsInBundle(flows, nil, nil)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("add").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddFlowsInBundle(flows, nil, nil); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("add").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("add").Inc()
return nil
}

func (c *client) DeleteAll(flows []binding.Flow) error {
return c.bridge.AddFlowsInBundle(nil, nil, flows)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("delete").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddFlowsInBundle(nil, nil, flows); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("delete").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("delete").Inc()
return nil
}

func (c *client) AddOFEntries(ofEntries []binding.OFEntry) error {
return c.bridge.AddOFEntriesInBundle(ofEntries, nil, nil)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("add").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddOFEntriesInBundle(ofEntries, nil, nil); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("add").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("add").Inc()
return nil
}

func (c *client) DeleteOFEntries(ofEntries []binding.OFEntry) error {
return c.bridge.AddOFEntriesInBundle(nil, nil, ofEntries)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("delete").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddOFEntriesInBundle(nil, nil, ofEntries); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("delete").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("delete").Inc()
return nil
}

// defaultFlows generates the default flows of all tables.
Expand Down
8 changes: 6 additions & 2 deletions test/e2e/prometheus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,11 @@ var antreaAgentMetrics = []string{
"antrea_agent_ingress_networkpolicy_rule_count",
"antrea_agent_local_pod_count",
"antrea_agent_networkpolicy_count",
"antrea_agent_ovs_total_flow_count",
"antrea_agent_ovs_flow_count",
"antrea_agent_ovs_flow_ops_count",
"antrea_agent_ovs_flow_ops_error_count",
"antrea_agent_ovs_flow_ops_latency_milliseconds",
"antrea_agent_ovs_total_flow_count",
"antrea_agent_runtime_info",
}

Expand Down Expand Up @@ -276,7 +279,8 @@ func testMetricsFromPrometheusServer(t *testing.T, data *TestData, prometheusJob
// Create a map of all the metrics which were found on the server
testMap := make(map[string]bool)
for _, metric := range output.Data {
testMap[metric["__name__"]] = true
name := strings.TrimSuffix(metric["__name__"], "_bucket")
testMap[name] = true
}

// Validate that all the required metrics exist in the server's output
Expand Down