Skip to content

Commit

Permalink
[8403008] Add client connection related edgeHub metrics to release1.2 (
Browse files Browse the repository at this point in the history
…#4134)

* [8403008] Add client connection related edgeHub metrics (#4060)

* added metrics for current number of connected clients, individual device connect/disconnected to/from iot edge
* updated doc/BuiltInMetrics.md
* modified failed connection metrics description to align with doc and added ignore disconnect in e2e test

* [8982026] fix metrics e2e test (#4125)

* added metrics for current number of connected clients, individual device connect/disconnected to/from iot edge

* fixed metrics description

* fixed UT ConnectionManagerTest by adding missing mock which impacted by metrics change

* Update edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/DeviceConnectionMetrics.cs

Co-authored-by: Venkat Yalla <veyalla@microsoft.com>

* Update edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/DeviceConnectionMetrics.cs

Co-authored-by: Venkat Yalla <veyalla@microsoft.com>

* Update edge-hub/src/Microsoft.Azure.Devices.Edge.Hub.Core/DeviceConnectionMetrics.cs

Co-authored-by: Venkat Yalla <veyalla@microsoft.com>

* Update doc/BuiltInMetrics.md

Co-authored-by: Venkat Yalla <veyalla@microsoft.com>

* Update doc/BuiltInMetrics.md

Co-authored-by: Venkat Yalla <veyalla@microsoft.com>

* Update doc/BuiltInMetrics.md

Co-authored-by: Venkat Yalla <veyalla@microsoft.com>

* Update doc/BuiltInMetrics.md

Co-authored-by: Venkat Yalla <veyalla@microsoft.com>

* modified to use Array.Empty

* modified failed connection metrics description to align with doc and added ignore disconnect in e2e test

* fixed typo in md file

Co-authored-by: Venkat Yalla <veyalla@microsoft.com>

* moved new cs file to modified location

Co-authored-by: Venkat Yalla <veyalla@microsoft.com>
  • Loading branch information
davilu and veyalla authored Dec 18, 2020
1 parent ee0b87f commit c7da97b
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 25 deletions.
40 changes: 21 additions & 19 deletions doc/BuiltInMetrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,27 @@ instance_number | A Guid representing the current runtime. On restart, all metri
### EdgeHub
| Name | Dimensions | Description | Type |
|-------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|
| `edgehub_gettwin_total` | `source` (Operation source)<br> `id` (Module ID) | Total number of GetTwin calls | counter |
| `edgehub_messages_received_total` | `route_output` (Output that sent the message)<br> `id` (Module ID) | Total number of messages received from clients | counter |
| `edgehub_messages_sent_total` | `from` (Message source)<br> `to` (Message destination)<br>`from_route_output` (Output that sent the message)<br> `to_route_input` (Message destination input [empty when "to" is $upstream])<br> `priority` (message priority to destination) | Total number of messages sent to clients or upstream | counter |
| `edgehub_reported_properties_total` | `target`(Update target)<br> `id` (Module ID) | Total reported property updates calls | counter |
| `edgehub_message_size_bytes` | `id` (Module ID)<br> `quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 message size from clients. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
| `edgehub_gettwin_duration_seconds` | `source` (Operation source)<br> `id` (Module ID)<br> `quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken for get twin operations. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
| `edgehub_gettwin_total` | `source` (Operation source)<br> `id` (Module ID) | Total number of GetTwin calls | counter |
| `edgehub_messages_received_total` | `route_output` (Output that sent the message)<br> `id` (Module ID) | Total number of messages received from clients | counter |
| `edgehub_messages_sent_total` | `from` (Message source)<br> `to` (Message destination)<br>`from_route_output` (Output that sent the message)<br> `to_route_input` (Message destination input [empty when "to" is $upstream])<br> `priority` (message priority to destination) | Total number of messages sent to clients or upstream | counter |
| `edgehub_reported_properties_total` | `target`(Update target)<br> `id` (Module ID) | Total reported property updates calls | counter |
| `edgehub_message_size_bytes` | `id` (Module ID)<br> `quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 message size from clients. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
| `edgehub_gettwin_duration_seconds` | `source` (Operation source)<br> `id` (Module ID)<br> `quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken for get twin operations. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
| `edgehub_message_send_duration_seconds` | `from` (Message source)<br> `to` (Message destination)<br>`from_route_output` (Output that sent the message)<br> `to_route_input` (Message destination input [empty when "to" is $upstream])<br> `quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to send a message. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
| `edgehub_message_process_duration_seconds` | `from` (Message source)<br> `to` (Message destination)<br> `priority` (Message priority) <br> `quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to process a message from the queue. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
| `edgehub_reported_properties_update_duration_seconds` | `target` (Operation target)<br> `id` (Module ID)<br> `quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to update reported properties. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
| `edgehub_direct_method_duration_seconds` | `from` (Caller)<br> `to` (Reciever)<br> `quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to resolve a direct message. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
| `edgehub_direct_methods_total` | `from` (Message source)<br> `to` (Message destination) | Total number of direct messages sent | counter |
| `edgehub_queue_length` | `endpoint` (Message source)<br> `priority` (queue priority) | Current length of edgeHub's queue for a given priority | gauge |
| `edgehub_messages_dropped_total` | `reason` (no_route, ttl_expiry)<br> `from` (Message source)<br> `from_route_output` (Output that sent the message)<br> | Total number of messages removed because of reason | counter |
| `edgehub_messages_unack_total` | `reason` (storage_failure)<br> `from` (Message source)<br> `from_route_output` (Output that sent the message)<br> | Total number of messages unack because storage failure | counter |
| `edgehub_offline_count_total` | `id` (Module ID)<br> | Total number of times edgeHub went offline | counter |
| `edgehub_offline_duration_seconds` | `id` (Module ID)<br> `quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time edge hub was offline. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
| `edgehub_operation_retry_total` | `id` (Module ID)<br>`operation` (Operation name) | Total number of times edgeHub operations were retried | counter |
| `edgehub_client_connect_failed_total` | `id` (Module ID)<br> `reason` (not authenticated)<br> | Total number of times clients failed to connect to edgeHub | counter |

| `edgehub_message_process_duration_seconds` | `from` (Message source)<br> `to` (Message destination)<br> `priority` (Message priority) <br> `quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to process a message from the queue. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
| `edgehub_reported_properties_update_duration_seconds` | `target` (Operation target)<br> `id` (Module ID)<br> `quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to update reported properties. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
| `edgehub_direct_method_duration_seconds` | `from` (Caller)<br> `to` (Receiver)<br> `quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time taken to resolve a direct message. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
| `edgehub_direct_methods_total` | `from` (Message source)<br> `to` (Message destination) | Total number of direct messages sent | counter |
| `edgehub_queue_length` | `endpoint` (Message source)<br> `priority` (queue priority) | Current length of edgeHub's queue for a given priority | gauge |
| `edgehub_messages_dropped_total` | `reason` (no_route, ttl_expiry)<br> `from` (Message source)<br> `from_route_output` (Output that sent the message)<br> | Total number of messages removed because of reason | counter |
| `edgehub_messages_unack_total` | `reason` (storage_failure)<br> `from` (Message source)<br> `from_route_output` (Output that sent the message)<br> | Total number of messages left unacknowledged because of a storage failure | counter |
| `edgehub_offline_count_total` | `id` (Module ID)<br> | Total number of times edgeHub went offline | counter |
| `edgehub_offline_duration_seconds` | `id` (Module ID)<br> `quantile`(Percentile [50, 90, 95, 99, 99.9, 99.99]) | P50, P90, P95, P99, P99.9 and P99.99 time edge hub was offline. Values may be reported as `NaN` if no new measurements are reported for a certain period of time (currently 10 minutes). As this is `summary` type, corresponding `_count` and `_sum` counters will be emitted. | summary |
| `edgehub_operation_retry_total` | `id` (Module ID)<br>`operation` (Operation name) | Total number of times edgeHub operations were retried | counter |
| `edgehub_client_connect_failed_total` | `id` (Device/Module ID)<br> `reason` (not_authenticated)<br> | Total number of times each client failed to connect to edgeHub | counter |
| `edgehub_connected_clients` | | Current number of clients connected to edgeHub | gauge |
| `edgehub_client_connect_success_total` | `id` (Device/Module ID)<br> | Total number of times each client successfully connected to edgeHub | counter |
| `edgehub_client_disconnect_total` | `id` (Device/Module ID)<br> | Total number of times each client disconnected from edgeHub | counter |


### EdgeAgent
Expand Down Expand Up @@ -84,4 +86,4 @@ For mapping to host, the port will need to be exposed from Edge Hub's `createOpt
}
<Other options, if any>
}
```
```
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,9 @@ async Task<bool> AuthenticateAsync(IClientCredentials clientCredentials, bool re
Events.CredentialsCacheFailure(ex);
}
}
else
else if (!reAuthenticating)
{
// only report authentication failure on initial authentication
Metrics.AddAuthenticationFailure(clientCredentials.Identity.Id);
}

Expand Down Expand Up @@ -122,7 +123,7 @@ static class Metrics
{
static readonly IMetricsCounter AuthCounter = Util.Metrics.Metrics.Instance.CreateCounter(
"client_connect_failed",
"Client connection failure",
"Total number of times each client failed to connect to edgeHub",
new List<string> { "id", "reason", MetricsConstants.MsTelemetry });

public static void AddAuthenticationFailure(string id) => AuthCounter.Increment(1, new[] { id, "not_authenticated", bool.TrueString });
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ public async Task AddDeviceConnection(IIdentity identity, IDeviceProxy devicePro
await currentDeviceConnection
.Filter(dc => dc.IsActive)
.ForEachAsync(dc => dc.CloseAsync(new MultipleConnectionsException($"Multiple connections detected for device {identity.Id}")));
this.OnDeviceConnected(identity);
this.DeviceConnected?.Invoke(this, identity);
}

Expand Down Expand Up @@ -211,17 +212,19 @@ static Try<ICloudProxy> GetCloudProxyFromCloudConnection(Try<ICloudConnection> c

async Task RemoveDeviceConnection(ConnectedDevice device, bool removeCloudConnection)
{
Events.RemovingDeviceConnection(device.Identity.Id, removeCloudConnection);
var id = device.Identity.Id;
Events.RemovingDeviceConnection(id, removeCloudConnection);
await device.DeviceConnection.Filter(dp => dp.IsActive)
.ForEachAsync(dp => dp.CloseAsync(new EdgeHubConnectionException($"Connection closed for device {device.Identity.Id}.")));
.ForEachAsync(dp => dp.CloseAsync(new EdgeHubConnectionException($"Connection closed for device {id}.")));

if (removeCloudConnection)
{
await device.CloudConnection.Filter(cp => cp.IsActive)
.ForEachAsync(cp => cp.CloseAsync());
}

Events.RemoveDeviceConnection(device.Identity.Id);
Events.RemoveDeviceConnection(id);
this.OnDeviceDisconnected(device.Identity);
this.DeviceDisconnected?.Invoke(this, device.Identity);
}

Expand Down Expand Up @@ -602,5 +605,17 @@ public static void SetConnectedClientCountGauge(ConnectionManager connectionMana
Util.Metrics.MetricsV0.SetGauge(ConnectedClientGaugeOptions, connectedClients);
}
}

// Reports a connect event for the given identity to DeviceConnectionMetrics,
// then pushes a freshly computed connected-client count to the same metrics helper.
void OnDeviceConnected(IIdentity identity)
{
    // The metric is keyed by the identity's string representation.
    var clientLabel = identity.ToString();
    DeviceConnectionMetrics.OnDeviceConnected(clientLabel);

    // NOTE(review): the "- 1" presumably excludes edgeHub's own identity from the
    // reported count - confirm against GetConnectedClients().
    var reportedCount = this.GetConnectedClients().Count() - 1;
    DeviceConnectionMetrics.UpdateConnectedClients(reportedCount);
}

// Reports a disconnect event for the given identity to DeviceConnectionMetrics,
// then pushes a freshly computed connected-client count to the same metrics helper.
void OnDeviceDisconnected(IIdentity identity)
{
    // The metric is keyed by the identity's string representation.
    var clientLabel = identity.ToString();
    DeviceConnectionMetrics.OnDeviceDisconnected(clientLabel);

    // NOTE(review): the "- 1" presumably excludes edgeHub's own identity from the
    // reported count - confirm against GetConnectedClients().
    var reportedCount = this.GetConnectedClients().Count() - 1;
    DeviceConnectionMetrics.UpdateConnectedClients(reportedCount);
}
}
}
Loading

0 comments on commit c7da97b

Please sign in to comment.