Skip to content

Commit

Permalink
Add support for OVS flow operations metrics on node (antrea-io#866)
Browse files Browse the repository at this point in the history
- Number of OVS flow operations, partitioned by operations(add, modify and delete)
- Number of OVS flow operation errors, partitioned by operations(add, modify and delete)
- The latency of OVS flow operations, partitioned by operations(add, modify and delete)
- Use prometheus v2.19.2 image to use API of querying target metadata in the e2e test

Signed-off-by: Yuki Tsuboi <ytsuboi@vmware.com>
  • Loading branch information
Yuki Tsuboi committed Jul 20, 2020
1 parent a858c9a commit 4fbe993
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 14 deletions.
2 changes: 1 addition & 1 deletion build/yamls/antrea-prometheus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ spec:
spec:
containers:
- name: prometheus
image: prom/prometheus:v2.2.1
image: prom/prometheus:v2.19.2
args:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus/"
Expand Down
50 changes: 48 additions & 2 deletions pkg/agent/metrics/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,33 @@ var (
Help: "Flow count for each OVS flow table. The TableID is used as a label.",
StabilityLevel: metrics.STABLE,
}, []string{"table_id"})

OVSFlowOpsCount = metrics.NewCounterVec(
&metrics.CounterOpts{
Name: "antrea_agent_ovs_flow_ops_count",
Help: "Number of OVS flow operations, partitioned by operation type (add, modify and delete).",
StabilityLevel: metrics.ALPHA,
},
[]string{"operation"},
)

OVSFlowOpsErrorCount = metrics.NewCounterVec(
&metrics.CounterOpts{
Name: "antrea_agent_ovs_flow_ops_error_count",
Help: "Number of OVS flow operation errors, partitioned by operation type (add, modify and delete).",
StabilityLevel: metrics.ALPHA,
},
[]string{"operation"},
)

OVSFlowOpsLatency = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Name: "antrea_agent_ovs_flow_ops_latency_milliseconds",
Help: "The latency of OVS flow operations, partitioned by operation type (add, modify and delete).",
StabilityLevel: metrics.ALPHA,
},
[]string{"operation"},
)
)

func InitializePrometheusMetrics() {
Expand All @@ -78,8 +105,9 @@ func InitializePrometheusMetrics() {
}

gaugeHost := metrics.NewGauge(&metrics.GaugeOpts{
Name: "antrea_agent_runtime_info",
Help: "Antrea agent runtime info , defined as labels. The value of the gauge is always set to 1.",
Name: "antrea_agent_runtime_info",
Help: "Antrea agent runtime info , defined as labels. The value of the gauge is always set to 1.",

ConstLabels: metrics.Labels{"k8s_nodename": nodeName, "k8s_podname": env.GetPodName()},
StabilityLevel: metrics.STABLE,
})
Expand Down Expand Up @@ -122,4 +150,22 @@ func InitializeOVSMetrics() {
if err := legacyregistry.Register(OVSFlowCount); err != nil {
klog.Error("Failed to register antrea_agent_ovs_flow_count with Prometheus")
}

if err := legacyregistry.Register(OVSFlowOpsCount); err != nil {
klog.Error("Failed to register antrea_agent_ovs_flow_ops_count with Prometheus")
}
if err := legacyregistry.Register(OVSFlowOpsErrorCount); err != nil {
klog.Error("Failed to register antrea_agent_ovs_flow_ops_error_count with Prometheus")
}
if err := legacyregistry.Register(OVSFlowOpsLatency); err != nil {
klog.Error("Failed to register antrea_agent_ovs_flow_ops_latency_milliseconds with Prometheus")
}
// Initialize OpenFlow operations metrics with label add, modify and delete
// since those metrics won't come out until observation.
opsArray := [3]string{"add", "modify", "delete"}
for _, ops := range opsArray {
OVSFlowOpsCount.WithLabelValues(ops)
OVSFlowOpsErrorCount.WithLabelValues(ops)
OVSFlowOpsLatency.WithLabelValues(ops)
}
}
86 changes: 79 additions & 7 deletions pkg/agent/openflow/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ import (
"strconv"
"strings"
"sync"
"time"

"k8s.io/client-go/tools/cache"

"github.com/vmware-tanzu/antrea/pkg/agent/config"
"github.com/vmware-tanzu/antrea/pkg/agent/metrics"
"github.com/vmware-tanzu/antrea/pkg/agent/openflow/cookie"
"github.com/vmware-tanzu/antrea/pkg/agent/types"
"github.com/vmware-tanzu/antrea/pkg/features"
Expand Down Expand Up @@ -257,31 +259,101 @@ func (c *client) GetTunnelVirtualMAC() net.HardwareAddr {
}

func (c *client) Add(flow binding.Flow) error {
return c.bridge.AddFlowsInBundle([]binding.Flow{flow}, nil, nil)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("add").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddFlowsInBundle([]binding.Flow{flow}, nil, nil); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("add").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("add").Inc()
return nil
}

func (c *client) Modify(flow binding.Flow) error {
return c.bridge.AddFlowsInBundle(nil, []binding.Flow{flow}, nil)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("modify").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddFlowsInBundle(nil, []binding.Flow{flow}, nil); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("modify").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("modify").Inc()
return nil
}

func (c *client) Delete(flow binding.Flow) error {
return c.bridge.AddFlowsInBundle(nil, nil, []binding.Flow{flow})
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("delete").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddFlowsInBundle(nil, nil, []binding.Flow{flow}); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("delete").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("delete").Inc()
return nil
}

func (c *client) AddAll(flows []binding.Flow) error {
return c.bridge.AddFlowsInBundle(flows, nil, nil)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("add").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddFlowsInBundle(flows, nil, nil); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("add").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("add").Inc()
return nil
}

func (c *client) DeleteAll(flows []binding.Flow) error {
return c.bridge.AddFlowsInBundle(nil, nil, flows)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("delete").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddFlowsInBundle(nil, nil, flows); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("delete").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("delete").Inc()
return nil
}

func (c *client) AddOFEntries(ofEntries []binding.OFEntry) error {
return c.bridge.AddOFEntriesInBundle(ofEntries, nil, nil)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("add").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddOFEntriesInBundle(ofEntries, nil, nil); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("add").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("add").Inc()
return nil
}

func (c *client) DeleteOFEntries(ofEntries []binding.OFEntry) error {
return c.bridge.AddOFEntriesInBundle(nil, nil, ofEntries)
startTime := time.Now()
defer func() {
d := time.Since(startTime)
metrics.OVSFlowOpsLatency.WithLabelValues("delete").Observe(float64(d.Milliseconds()))
}()
if err := c.bridge.AddOFEntriesInBundle(nil, nil, ofEntries); err != nil {
metrics.OVSFlowOpsErrorCount.WithLabelValues("delete").Inc()
return err
}
metrics.OVSFlowOpsCount.WithLabelValues("delete").Inc()
return nil
}

// defaultFlows generates the default flows of all tables.
Expand Down
11 changes: 7 additions & 4 deletions test/e2e/prometheus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,11 @@ var antreaAgentMetrics = []string{
"antrea_agent_ingress_networkpolicy_rule_count",
"antrea_agent_local_pod_count",
"antrea_agent_networkpolicy_count",
"antrea_agent_ovs_total_flow_count",
"antrea_agent_ovs_flow_count",
"antrea_agent_ovs_flow_ops_count",
"antrea_agent_ovs_flow_ops_error_count",
"antrea_agent_ovs_flow_ops_latency_milliseconds",
"antrea_agent_ovs_total_flow_count",
"antrea_agent_runtime_info",
}

Expand Down Expand Up @@ -251,8 +254,8 @@ func testMetricsFromPrometheusServer(t *testing.T, data *TestData, prometheusJob
hostIP, nodePort := getPrometheusEndpoint(t, data)

// Build the Prometheus query URL
path := url.PathEscape("match[]={job=\"" + prometheusJob + "\"}")
queryUrl := fmt.Sprintf("http://%s:%d/api/v1/series?%s", hostIP, nodePort, path)
path := url.PathEscape("match_target={job=\"" + prometheusJob + "\"}")
queryUrl := fmt.Sprintf("http://%s:%d/api/v1/targets/metadata?%s", hostIP, nodePort, path)

client := &http.Client{}
resp, err := client.Get(queryUrl)
Expand All @@ -276,7 +279,7 @@ func testMetricsFromPrometheusServer(t *testing.T, data *TestData, prometheusJob
// Create a map of all the metrics which were found on the server
testMap := make(map[string]bool)
for _, metric := range output.Data {
testMap[metric["__name__"]] = true
testMap[metric["metric"]] = true
}

// Validate that all the required metrics exist in the server's output
Expand Down

0 comments on commit 4fbe993

Please sign in to comment.