From 40b2de93787140c52c8baee36860a6eb833e03d5 Mon Sep 17 00:00:00 2001
From: davilu <45977130+davilu@users.noreply.github.com>
Date: Wed, 9 Dec 2020 17:01:07 -0800
Subject: [PATCH] [8403008] Add client connection related edgeHub metrics
(#4060)
* Added metrics for the current number of connected clients and for individual clients connecting to/disconnecting from IoT Edge
* Updated doc/BuiltInMetrics.md
* Modified the failed-connection metric description to align with the doc, and added the disconnect metric to the ignore list in the e2e test
---
doc/BuiltInMetrics.md | 40 ++++++++++---------
.../Authenticator.cs | 5 ++-
.../ConnectionManager.cs | 21 ++++++++--
.../DeviceConnectionMetrics.cs | 33 +++++++++++++++
.../ConnectionManagerTest.cs | 1 +
.../src/tests/ValidateDocumentedMetrics.cs | 3 +-
6 files changed, 78 insertions(+), 25 deletions(-)
create mode 100644 edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/DeviceConnectionMetrics.cs
diff --git a/doc/BuiltInMetrics.md b/doc/BuiltInMetrics.md
index 011245c880b..7b44a32da27 100644
--- a/doc/BuiltInMetrics.md
+++ b/doc/BuiltInMetrics.md
@@ -21,25 +21,27 @@ instance_number | A Guid representing the current runtime. On restart, all metri
### EdgeHub
| Name | Dimensions | Description | Type |
|-------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|
-| `edgehub_gettwin_total` | `source` (Operation source)
`id` (Module ID) | Total number of GetTwin calls | counter |
-| `edgehub_messages_received_total` | `route_output` (Output that sent the message)
`id` (Module ID) | Total number of messages received from clients | counter |
-| `edgehub_messages_sent_total` | `from` (Message source)
`to` (Message destination)
`from_route_output` (Output that sent the message)
`to_route_input` (Message destination input [empty when "to" is $upstream])
`priority` (message priority to destination) | Total number of messages sent to clients or upstream | counter |
-| `edgehub_reported_properties_total` | `target`(Update target)
`id` (Module ID) | Total reported property updates calls | counter |
-| `edgehub_message_size_bytes` | `id` (Module ID)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 message size from clients. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
-| `edgehub_gettwin_duration_seconds` | `source` (Operation source)
`id` (Module ID)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken for get twin operations. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
+| `edgehub_gettwin_total` | `source` (Operation source)
`id` (Module ID) | Total number of GetTwin calls | counter |
+| `edgehub_messages_received_total` | `route_output` (Output that sent the message)
`id` (Module ID) | Total number of messages received from clients | counter |
+| `edgehub_messages_sent_total` | `from` (Message source)
`to` (Message destination)
`from_route_output` (Output that sent the message)
`to_route_input` (Message destination input [empty when "to" is $upstream])
`priority` (message priority to destination) | Total number of messages sent to clients or upstream | counter |
+| `edgehub_reported_properties_total` | `target`(Update target)
`id` (Module ID) | Total reported property updates calls | counter |
+| `edgehub_message_size_bytes` | `id` (Module ID)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 message size from clients. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
+| `edgehub_gettwin_duration_seconds` | `source` (Operation source)
`id` (Module ID)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken for get twin operations. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
| `edgehub_message_send_duration_seconds` | `from` (Message source)
`to` (Message destination)
`from_route_output` (Output that sent the message)
`to_route_input` (Message destination input [empty when "to" is $upstream])
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to send a message. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
-| `edgehub_message_process_duration_seconds` | `from` (Message source)
`to` (Message destination)
`priority` (Message priority)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to process a message from the queue. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
-| `edgehub_reported_properties_update_duration_seconds` | `target` (Operation target)
`id` (Module ID)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to update reported properties. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
-| `edgehub_direct_method_duration_seconds` | `from` (Caller)
`to` (Reciever)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to resolve a direct message. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
-| `edgehub_direct_methods_total` | `from` (Message source)
`to` (Message destination) | Total number of direct messages sent | counter |
-| `edgehub_queue_length` | `endpoint` (Message source)
`priority` (queue priority) | Current length of edgeHub's queue for a given priority | gauge |
-| `edgehub_messages_dropped_total` | `reason` (no_route, ttl_expiry)
`from` (Message source)
`from_route_output` (Output that sent the message)
| Total number of messages removed because of reason | counter |
-| `edgehub_messages_unack_total` | `reason` (storage_failure)
`from` (Message source)
`from_route_output` (Output that sent the message)
| Total number of messages unack because storage failure | counter |
-| `edgehub_offline_count_total` | `id` (Module ID)
| Total number of times edgeHub went offline | counter |
-| `edgehub_offline_duration_seconds` | `id` (Module ID)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time edge hub was offline. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
-| `edgehub_operation_retry_total` | `id` (Module ID)
`operation` (Operation name) | Total number of times edgeHub operations were retried | counter |
-| `edgehub_client_connect_failed_total` | `id` (Module ID)
`reason` (not authenticated)
| Total number of times clients failed to connect to edgeHub | counter |
-
+| `edgehub_message_process_duration_seconds` | `from` (Message source)
`to` (Message destination)
`priority` (Message priority)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to process a message from the queue. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
+| `edgehub_reported_properties_update_duration_seconds` | `target` (Operation target)
`id` (Module ID)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to update reported properties. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
+| `edgehub_direct_method_duration_seconds` | `from` (Caller)
`to` (Receiver)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to resolve a direct message. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
+| `edgehub_direct_methods_total` | `from` (Message source)
`to` (Message destination) | Total number of direct messages sent | counter |
+| `edgehub_queue_length` | `endpoint` (Message source)
`priority` (queue priority) | Current length of edgeHub's queue for a given priority | gauge |
+| `edgehub_messages_dropped_total` | `reason` (no_route, ttl_expiry)
`from` (Message source)
`from_route_output` (Output that sent the message)
| Total number of messages removed because of reason | counter |
+| `edgehub_messages_unack_total` | `reason` (storage_failure)
`from` (Message source)
`from_route_output` (Output that sent the message)
| Total number of messages unack because storage failure | counter |
+| `edgehub_offline_count_total` | `id` (Module ID)
| Total number of times edgeHub went offline | counter |
+| `edgehub_offline_duration_seconds` | `id` (Module ID)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time edge hub was offline. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
+| `edgehub_operation_retry_total` | `id` (Module ID)
`operation` (Operation name) | Total number of times edgeHub operations were retried | counter |
+| `edgehub_client_connect_failed_total` | `id` (Device/Module ID)
`reason` (not authenticated)
| Total number of times each client failed to connect to edgeHub | counter |
+| `edgehub_connected_clients` | | Current number of clients connected to edgeHub | gauge |
+| `edgehub_client_connect_success_total` | `id` (Device/Module ID)
| Total number of times each client successfully connected to edgeHub | counter |
+| `edgehub_client_disconnect_total` | `id` (Device/Module ID)
| Total number of times each client disconnected from edgeHub | counter |
### EdgeAgent
@@ -84,4 +86,4 @@ For mapping to host, the port will need to be exposed from Edge Hub's `createOpt
}
}
-```
\ No newline at end of file
+```
diff --git a/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/Authenticator.cs b/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/Authenticator.cs
index 3ecee1d2088..fa623a121a2 100644
--- a/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/Authenticator.cs
+++ b/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/Authenticator.cs
@@ -57,8 +57,9 @@ async Task AuthenticateAsync(IClientCredentials clientCredentials, bool re
{
await this.credentialsCache.Add(clientCredentials);
}
- else
+ else if (!reAuthenticating)
{
+ // only report authentication failure on initial authentication
Metrics.AddAuthenticationFailure(clientCredentials.Identity.Id);
}
@@ -101,7 +102,7 @@ static class Metrics
{
static readonly IMetricsCounter AuthCounter = Util.Metrics.Metrics.Instance.CreateCounter(
"client_connect_failed",
- "Client connection failure",
+ "Total number of times each client failed to connect to edgeHub",
new List { "id", "reason", MetricsConstants.MsTelemetry });
public static void AddAuthenticationFailure(string id) => AuthCounter.Increment(1, new[] { id, "not_authenticated", bool.TrueString });
diff --git a/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/ConnectionManager.cs b/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/ConnectionManager.cs
index 9142629b006..7cb8eb3cb85 100644
--- a/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/ConnectionManager.cs
+++ b/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/ConnectionManager.cs
@@ -72,6 +72,7 @@ public async Task AddDeviceConnection(IIdentity identity, IDeviceProxy devicePro
await currentDeviceConnection
.Filter(dc => dc.IsActive)
.ForEachAsync(dc => dc.CloseAsync(new MultipleConnectionsException($"Multiple connections detected for device {identity.Id}")));
+ this.OnDeviceConnected(identity);
this.DeviceConnected?.Invoke(this, identity);
}
@@ -181,9 +182,10 @@ static Try GetCloudProxyFromCloudConnection(Try c
async Task RemoveDeviceConnection(ConnectedDevice device, bool removeCloudConnection)
{
- Events.RemovingDeviceConnection(device.Identity.Id, removeCloudConnection);
+ var id = device.Identity.Id;
+ Events.RemovingDeviceConnection(id, removeCloudConnection);
await device.DeviceConnection.Filter(dp => dp.IsActive)
- .ForEachAsync(dp => dp.CloseAsync(new EdgeHubConnectionException($"Connection closed for device {device.Identity.Id}.")));
+ .ForEachAsync(dp => dp.CloseAsync(new EdgeHubConnectionException($"Connection closed for device {id}.")));
if (removeCloudConnection)
{
@@ -191,7 +193,8 @@ await device.CloudConnection.Filter(cp => cp.IsActive)
.ForEachAsync(cp => cp.CloseAsync());
}
- Events.RemoveDeviceConnection(device.Identity.Id);
+ Events.RemoveDeviceConnection(id);
+ this.OnDeviceDisconnected(device.Identity);
this.DeviceDisconnected?.Invoke(this, device.Identity);
}
@@ -574,5 +577,22 @@ public static void SetConnectedClientCountGauge(ConnectionManager connectionMana
Util.Metrics.MetricsV0.SetGauge(ConnectedClientGaugeOptions, connectedClients);
}
}
+
+        // Records a successful client connection and refreshes the connected-clients gauge.
+        // The counter is labelled with identity.Id, matching the "id" label passed to the client_connect_failed metric.
+        void OnDeviceConnected(IIdentity identity)
+        {
+            DeviceConnectionMetrics.OnDeviceConnected(identity.Id);
+            // NOTE(review): the "- 1" presumably excludes edgeHub's own connection from the count - confirm.
+            DeviceConnectionMetrics.UpdateConnectedClients(this.GetConnectedClients().Count() - 1);
+        }
+
+        // Records a client disconnection and refreshes the connected-clients gauge.
+        void OnDeviceDisconnected(IIdentity identity)
+        {
+            DeviceConnectionMetrics.OnDeviceDisconnected(identity.Id);
+            // NOTE(review): the "- 1" presumably excludes edgeHub's own connection from the count - confirm.
+            DeviceConnectionMetrics.UpdateConnectedClients(this.GetConnectedClients().Count() - 1);
+        }
}
}
diff --git a/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/DeviceConnectionMetrics.cs b/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/DeviceConnectionMetrics.cs
new file mode 100644
index 00000000000..f50c057afcd
--- /dev/null
+++ b/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/DeviceConnectionMetrics.cs
@@ -0,0 +1,41 @@
+// Copyright (c) Microsoft. All rights reserved.
+namespace Microsoft.Azure.Devices.Edge.Hub.Core
+{
+    using System;
+    using System.Collections.Generic;
+    using Microsoft.Azure.Devices.Edge.Util.Metrics;
+    using EdgeMetrics = Util.Metrics.Metrics;
+
+    // Metrics describing client connections to edgeHub: a gauge holding the current
+    // number of connected clients, plus per-client connect and disconnect counters.
+    public static class DeviceConnectionMetrics
+    {
+        // Declared before the gauge: static field initializers run in textual order and the gauge uses this list.
+        static readonly List<string> EmptyStringList = new List<string>();
+
+        public static readonly IMetricsGauge ConnectedClientsGauge = EdgeMetrics.Instance.CreateGauge(
+            "connected_clients",
+            "Current number of clients connected to edgeHub",
+            EmptyStringList);
+
+        public static readonly IMetricsCounter ClientsConnectCounter = EdgeMetrics.Instance.CreateCounter(
+            "client_connect_success",
+            "Total number of times each client successfully connected to edgeHub",
+            new List<string>() { "id" });
+
+        // NOTE: "Discconnect" is misspelled, but the field is public, so the name is kept for compatibility.
+        public static readonly IMetricsCounter ClientsDiscconnectCounter = EdgeMetrics.Instance.CreateCounter(
+            "client_disconnect",
+            "Total number of times each client disconnected from edgeHub",
+            new List<string>() { "id" });
+
+        // Sets the connected-clients gauge to the supplied value; the gauge has no labels, so no label values are passed.
+        public static void UpdateConnectedClients(int connectedClients) => ConnectedClientsGauge.Set(connectedClients, Array.Empty<string>());
+
+        // Increments the connect counter by 1, with deviceId as the value of the "id" label.
+        public static void OnDeviceConnected(string deviceId) => ClientsConnectCounter.Increment(1, new string[] { deviceId });
+
+        // Increments the disconnect counter by 1, with deviceId as the value of the "id" label.
+        public static void OnDeviceDisconnected(string deviceId) => ClientsDiscconnectCounter.Increment(1, new string[] { deviceId });
+    }
+}
diff --git a/edge-hub/test/Microsoft.Azure.Devices.Edge.Hub.Core.Test/ConnectionManagerTest.cs b/edge-hub/test/Microsoft.Azure.Devices.Edge.Hub.Core.Test/ConnectionManagerTest.cs
index 5017d82ff1a..d4b09b513ff 100644
--- a/edge-hub/test/Microsoft.Azure.Devices.Edge.Hub.Core.Test/ConnectionManagerTest.cs
+++ b/edge-hub/test/Microsoft.Azure.Devices.Edge.Hub.Core.Test/ConnectionManagerTest.cs
@@ -462,6 +462,7 @@ public async Task CloudProxyCallbackTest2()
.ReturnsAsync(Try.Success(cloudConnection as ICloudConnection));
var deviceProxy = new Mock(MockBehavior.Strict);
+ deviceProxy.Setup(dp => dp.IsActive).Returns(true);
var credentialsCache = new Mock(MockBehavior.Strict);
credentialsCache.Setup(c => c.Get(deviceIdentity)).ReturnsAsync(Option.Some((IClientCredentials)updatedDeviceCredentials));
diff --git a/test/modules/MetricsValidator/src/tests/ValidateDocumentedMetrics.cs b/test/modules/MetricsValidator/src/tests/ValidateDocumentedMetrics.cs
index 948a227a20b..3695f225ac4 100644
--- a/test/modules/MetricsValidator/src/tests/ValidateDocumentedMetrics.cs
+++ b/test/modules/MetricsValidator/src/tests/ValidateDocumentedMetrics.cs
@@ -80,7 +80,8 @@ protected override async Task Test(CancellationToken cancellationToken)
"edgehub_messages_dropped_total",
"edgehub_messages_unack_total",
"edgehub_offline_count_total",
- "edgehub_operation_retry_total"
+ "edgehub_operation_retry_total",
+ "edgehub_client_disconnect_total"
};
foreach (string skippingMetric in skippingMetrics)