diff --git a/doc/BuiltInMetrics.md b/doc/BuiltInMetrics.md index 011245c880b..7b44a32da27 100644 --- a/doc/BuiltInMetrics.md +++ b/doc/BuiltInMetrics.md @@ -21,25 +21,27 @@ instance_number | A Guid representing the current runtime. On restart, all metri ### EdgeHub | Name | Dimensions | Description | Type | |-------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------| -| `edgehub_gettwin_total` | `source` (Operation source)
`id` (Module ID) | Total number of GetTwin calls | counter | -| `edgehub_messages_received_total` | `route_output` (Output that sent the message)
`id` (Module ID) | Total number of messages received from clients | counter | -| `edgehub_messages_sent_total` | `from` (Message source)
`to` (Message destination)
`from_route_output` (Output that sent the message)
`to_route_input` (Message destination input [empty when "to" is $upstream])
`priority` (message priority to destination) | Total number of messages sent to clients or upstream | counter | -| `edgehub_reported_properties_total` | `target`(Update target)
`id` (Module ID) | Total reported property updates calls | counter | -| `edgehub_message_size_bytes` | `id` (Module ID)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 message size from clients. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary | -| `edgehub_gettwin_duration_seconds` | `source` (Operation source)
`id` (Module ID)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken for get twin operations. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary | +| `edgehub_gettwin_total` | `source` (Operation source)
`id` (Module ID) | Total number of GetTwin calls | counter | +| `edgehub_messages_received_total` | `route_output` (Output that sent the message)
`id` (Module ID) | Total number of messages received from clients | counter | +| `edgehub_messages_sent_total` | `from` (Message source)
`to` (Message destination)
`from_route_output` (Output that sent the message)
`to_route_input` (Message destination input [empty when "to" is $upstream])
`priority` (message priority to destination) | Total number of messages sent to clients or upstream | counter | +| `edgehub_reported_properties_total` | `target`(Update target)
`id` (Module ID) | Total number of reported property update calls | counter | +| `edgehub_message_size_bytes` | `id` (Module ID)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 message size from clients. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary | +| `edgehub_gettwin_duration_seconds` | `source` (Operation source)
`id` (Module ID)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken for get twin operations. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary | | `edgehub_message_send_duration_seconds` | `from` (Message source)
`to` (Message destination)
`from_route_output` (Output that sent the message)
`to_route_input` (Message destination input [empty when "to" is $upstream])
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to send a message. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary | -| `edgehub_message_process_duration_seconds` | `from` (Message source)
`to` (Message destination)
`priority` (Message priority)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to process a message from the queue. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary | -| `edgehub_reported_properties_update_duration_seconds` | `target` (Operation target)
`id` (Module ID)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to update reported properties. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary | -| `edgehub_direct_method_duration_seconds` | `from` (Caller)
`to` (Reciever)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to resolve a direct message. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary | -| `edgehub_direct_methods_total` | `from` (Message source)
`to` (Message destination) | Total number of direct messages sent | counter | -| `edgehub_queue_length` | `endpoint` (Message source)
`priority` (queue priority) | Current length of edgeHub's queue for a given priority | gauge | -| `edgehub_messages_dropped_total` | `reason` (no_route, ttl_expiry)
`from` (Message source)
`from_route_output` (Output that sent the message)
| Total number of messages removed because of reason | counter | -| `edgehub_messages_unack_total` | `reason` (storage_failure)
`from` (Message source)
`from_route_output` (Output that sent the message)
| Total number of messages unack because storage failure | counter | -| `edgehub_offline_count_total` | `id` (Module ID)
| Total number of times edgeHub went offline | counter | -| `edgehub_offline_duration_seconds` | `id` (Module ID)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time edge hub was offline. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary | -| `edgehub_operation_retry_total` | `id` (Module ID)
`operation` (Operation name) | Total number of times edgeHub operations were retried | counter | -| `edgehub_client_connect_failed_total` | `id` (Module ID)
`reason` (not authenticated)
| Total number of times clients failed to connect to edgeHub | counter | - +| `edgehub_message_process_duration_seconds` | `from` (Message source)
`to` (Message destination)
`priority` (Message priority)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to process a message from the queue. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary | +| `edgehub_reported_properties_update_duration_seconds` | `target` (Operation target)
`id` (Module ID)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to update reported properties. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary | +| `edgehub_direct_method_duration_seconds` | `from` (Caller)
`to` (Receiver)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to resolve a direct message. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary | +| `edgehub_direct_methods_total` | `from` (Message source)
`to` (Message destination) | Total number of direct messages sent | counter | +| `edgehub_queue_length` | `endpoint` (Message source)
`priority` (queue priority) | Current length of edgeHub's queue for a given priority | gauge | +| `edgehub_messages_dropped_total` | `reason` (no_route, ttl_expiry)
`from` (Message source)
`from_route_output` (Output that sent the message)
| Total number of messages removed because of reason | counter | +| `edgehub_messages_unack_total` | `reason` (storage_failure)
`from` (Message source)
`from_route_output` (Output that sent the message)
| Total number of messages left unacknowledged because of a storage failure | counter | +| `edgehub_offline_count_total` | `id` (Module ID)
| Total number of times edgeHub went offline | counter | +| `edgehub_offline_duration_seconds` | `id` (Module ID)
`quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time edge hub was offline. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary | +| `edgehub_operation_retry_total` | `id` (Module ID)
`operation` (Operation name) | Total number of times edgeHub operations were retried | counter | +| `edgehub_client_connect_failed_total` | `id` (Device/Module ID)
`reason` (not_authenticated)
| Total number of times each client failed to connect to edgeHub | counter | +| `edgehub_connected_clients` | | Current number of clients connected to edgeHub | gauge | +| `edgehub_client_connect_success_total` | `id` (Device/Module ID)
| Total number of times each client successfully connected to edgeHub | counter | +| `edgehub_client_disconnect_total` | `id` (Device/Module ID)
| Total number of times each client disconnected from edgeHub | counter | ### EdgeAgent @@ -84,4 +86,4 @@ For mapping to host, the port will need to be exposed from Edge Hub's `createOpt } } -``` \ No newline at end of file +``` diff --git a/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/Authenticator.cs b/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/Authenticator.cs index 3ecee1d2088..fa623a121a2 100644 --- a/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/Authenticator.cs +++ b/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/Authenticator.cs @@ -57,8 +57,9 @@ async Task AuthenticateAsync(IClientCredentials clientCredentials, bool re { await this.credentialsCache.Add(clientCredentials); } - else + else if (!reAuthenticating) { + // only report authentication failure on initial authentication Metrics.AddAuthenticationFailure(clientCredentials.Identity.Id); } @@ -101,7 +102,7 @@ static class Metrics { static readonly IMetricsCounter AuthCounter = Util.Metrics.Metrics.Instance.CreateCounter( "client_connect_failed", - "Client connection failure", + "Total number of times each client failed to connect to edgeHub", new List { "id", "reason", MetricsConstants.MsTelemetry }); public static void AddAuthenticationFailure(string id) => AuthCounter.Increment(1, new[] { id, "not_authenticated", bool.TrueString }); diff --git a/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/ConnectionManager.cs b/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/ConnectionManager.cs index 9142629b006..7cb8eb3cb85 100644 --- a/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/ConnectionManager.cs +++ b/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/ConnectionManager.cs @@ -72,6 +72,7 @@ public async Task AddDeviceConnection(IIdentity identity, IDeviceProxy devicePro await currentDeviceConnection .Filter(dc => dc.IsActive) .ForEachAsync(dc => dc.CloseAsync(new MultipleConnectionsException($"Multiple connections detected for device {identity.Id}"))); + 
this.OnDeviceConnected(identity); this.DeviceConnected?.Invoke(this, identity); } @@ -181,9 +182,10 @@ static Try GetCloudProxyFromCloudConnection(Try c async Task RemoveDeviceConnection(ConnectedDevice device, bool removeCloudConnection) { - Events.RemovingDeviceConnection(device.Identity.Id, removeCloudConnection); + var id = device.Identity.Id; + Events.RemovingDeviceConnection(id, removeCloudConnection); await device.DeviceConnection.Filter(dp => dp.IsActive) - .ForEachAsync(dp => dp.CloseAsync(new EdgeHubConnectionException($"Connection closed for device {device.Identity.Id}."))); + .ForEachAsync(dp => dp.CloseAsync(new EdgeHubConnectionException($"Connection closed for device {id}."))); if (removeCloudConnection) { @@ -191,7 +193,8 @@ await device.CloudConnection.Filter(cp => cp.IsActive) .ForEachAsync(cp => cp.CloseAsync()); } - Events.RemoveDeviceConnection(device.Identity.Id); + Events.RemoveDeviceConnection(id); + this.OnDeviceDisconnected(device.Identity); this.DeviceDisconnected?.Invoke(this, device.Identity); } @@ -574,5 +577,17 @@ public static void SetConnectedClientCountGauge(ConnectionManager connectionMana Util.Metrics.MetricsV0.SetGauge(ConnectedClientGaugeOptions, connectedClients); } } + + void OnDeviceConnected(IIdentity identity) + { + DeviceConnectionMetrics.OnDeviceConnected(identity.ToString()); + DeviceConnectionMetrics.UpdateConnectedClients(this.GetConnectedClients().Count() - 1); + } + + void OnDeviceDisconnected(IIdentity identity) + { + DeviceConnectionMetrics.OnDeviceDisconnected(identity.ToString()); + DeviceConnectionMetrics.UpdateConnectedClients(this.GetConnectedClients().Count() - 1); + } } } diff --git a/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/DeviceConnectionMetrics.cs b/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/DeviceConnectionMetrics.cs new file mode 100644 index 00000000000..f50c057afcd --- /dev/null +++ b/edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/DeviceConnectionMetrics.cs @@ -0,0 +1,33 @@ +// 
Copyright (c) Microsoft. All rights reserved. +namespace Microsoft.Azure.Devices.Edge.Hub.Core +{ + using System; + using System.Collections.Generic; + using Microsoft.Azure.Devices.Edge.Util.Metrics; + using EdgeMetrics = Util.Metrics.Metrics; + + public static class DeviceConnectionMetrics + { + static readonly List EmptyStringList = new List(); + public static readonly IMetricsGauge ConnectedClientsGauge = EdgeMetrics.Instance.CreateGauge( + "connected_clients", + "Current number of clients connected to edgeHub", + EmptyStringList); + + public static readonly IMetricsCounter ClientsConnectCounter = EdgeMetrics.Instance.CreateCounter( + "client_connect_success", + "Total number of times each client successfully connected to edgeHub", + new List() { "id" }); + + public static readonly IMetricsCounter ClientsDiscconnectCounter = EdgeMetrics.Instance.CreateCounter( + "client_disconnect", + "Total number of times each client disconnected from edgeHub", + new List() { "id" }); + + public static void UpdateConnectedClients(int connectedClients) => ConnectedClientsGauge.Set(connectedClients, Array.Empty()); + + public static void OnDeviceConnected(string deviceId) => ClientsConnectCounter.Increment(1, new string[] { deviceId }); + + public static void OnDeviceDisconnected(string deviceId) => ClientsDiscconnectCounter.Increment(1, new string[] { deviceId }); + } +} diff --git a/edge-hub/test/Microsoft.Azure.Devices.Edge.Hub.Core.Test/ConnectionManagerTest.cs b/edge-hub/test/Microsoft.Azure.Devices.Edge.Hub.Core.Test/ConnectionManagerTest.cs index 5017d82ff1a..d4b09b513ff 100644 --- a/edge-hub/test/Microsoft.Azure.Devices.Edge.Hub.Core.Test/ConnectionManagerTest.cs +++ b/edge-hub/test/Microsoft.Azure.Devices.Edge.Hub.Core.Test/ConnectionManagerTest.cs @@ -462,6 +462,7 @@ public async Task CloudProxyCallbackTest2() .ReturnsAsync(Try.Success(cloudConnection as ICloudConnection)); var deviceProxy = new Mock(MockBehavior.Strict); + deviceProxy.Setup(dp => 
dp.IsActive).Returns(true); var credentialsCache = new Mock(MockBehavior.Strict); credentialsCache.Setup(c => c.Get(deviceIdentity)).ReturnsAsync(Option.Some((IClientCredentials)updatedDeviceCredentials)); diff --git a/test/modules/MetricsValidator/src/tests/ValidateDocumentedMetrics.cs b/test/modules/MetricsValidator/src/tests/ValidateDocumentedMetrics.cs index 948a227a20b..3695f225ac4 100644 --- a/test/modules/MetricsValidator/src/tests/ValidateDocumentedMetrics.cs +++ b/test/modules/MetricsValidator/src/tests/ValidateDocumentedMetrics.cs @@ -80,7 +80,8 @@ protected override async Task Test(CancellationToken cancellationToken) "edgehub_messages_dropped_total", "edgehub_messages_unack_total", "edgehub_offline_count_total", - "edgehub_operation_retry_total" + "edgehub_operation_retry_total", + "edgehub_client_disconnect_total" }; foreach (string skippingMetric in skippingMetrics)