From 3955e8c044bf47f1aceedae17cd9761d3e18fd02 Mon Sep 17 00:00:00 2001 From: Jan Ciesla Date: Wed, 17 Jun 2020 16:13:36 +0200 Subject: [PATCH 1/5] Add metrics definitions based on sparrow events --- lib/mongoose_push/metrics/telemetry_metrics.ex | 11 ++++++++++- test/integration/prometheus_endpoint_test.exs | 4 ++-- test/unit/mongoose_push_telemetry_metrics_test.exs | 2 +- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/lib/mongoose_push/metrics/telemetry_metrics.ex b/lib/mongoose_push/metrics/telemetry_metrics.ex index d15edae6..1102cdc6 100644 --- a/lib/mongoose_push/metrics/telemetry_metrics.ex +++ b/lib/mongoose_push/metrics/telemetry_metrics.ex @@ -37,7 +37,16 @@ defmodule MongoosePush.Metrics.TelemetryMetrics do ), Telemetry.Metrics.counter("mongoose_push.apns.state.get_default_topic.count", description: "Counts the number of APNS default topic reads from the ETS cache" - ) + ), + + # sparrow events + Telemetry.Metrics.distribution( + "sparrow.h2_worker.handle.duration.microsecond", + event_name: [:sparrow, :h2_worker, :handle], + measurement: :time, + buckets: [10_000, 25_000, 50_000, 100_000, 200_000, 500_000, 1000_000], + description: "A histogram showing time it takes for h2_worker to handle request." + ), ] end end diff --git a/test/integration/prometheus_endpoint_test.exs b/test/integration/prometheus_endpoint_test.exs index e6a2abc2..c8b1efdb 100644 --- a/test/integration/prometheus_endpoint_test.exs +++ b/test/integration/prometheus_endpoint_test.exs @@ -33,9 +33,9 @@ defmodule MongoosePush.API.PrometheusEndpointTest do # 3. regex on the payload to make sure this is prometheus output fcm_regex = - ~r/mongoose_push_notification_send_time_count{error_category=\"\",error_reason=\"\",service=\"fcm\",status=\"success\"} (?[\d]+)/ + ~r/mongoose_push_notification_send_time_microsecond_count{error_category=\"\",error_reason=\"\",service=\"fcm\",status=\"success\"} (?[\d]+)/ fcm_match = Regex.named_captures(fcm_regex, metrics) - assert 0 != fcm_match + assert nil != fcm_match end end diff --git a/test/unit/mongoose_push_telemetry_metrics_test.exs b/test/unit/mongoose_push_telemetry_metrics_test.exs index e82831c4..fcdd9270 100644 --- a/test/unit/mongoose_push_telemetry_metrics_test.exs +++ b/test/unit/mongoose_push_telemetry_metrics_test.exs @@ -127,7 +127,7 @@ defmodule MongoosePushTelemetryMetricsTest do # 3. regex on the payload to make sure this is prometheus output fcm_regex = - ~r/mongoose_push_notification_send_time_count{error_category=\"\",error_reason=\"\",service=\"fcm\",status=\"success\"} (?[\d]+)/ + ~r/mongoose_push_notification_send_time_microsecond_count{error_category=\"\",error_reason=\"\",service=\"fcm\",status=\"success\"} (?[\d]+)/ fcm_match = Regex.named_captures(fcm_regex, metrics.resp_body) assert 0 != fcm_match From ac1417e6292ac7a0199a9e5c3483444229bd0268 Mon Sep 17 00:00:00 2001 From: Jan Ciesla Date: Wed, 17 Jun 2020 14:07:18 +0200 Subject: [PATCH 2/5] Add periodic metrics based on telemetry poller --- lib/mongoose_push/application.ex | 4 +- .../metrics/telemetry_metrics.ex | 41 +++++++++++++++++++ mix.exs | 1 + mix.lock | 1 + .../mongoose_push_telemetry_metrics_test.exs | 23 +++++++++++ 5 files changed, 69 insertions(+), 1 deletion(-) diff --git a/lib/mongoose_push/application.ex b/lib/mongoose_push/application.ex index ac2e4f2f..72bf701a 100644 --- a/lib/mongoose_push/application.ex +++ b/lib/mongoose_push/application.ex @@ -38,7 +38,9 @@ defmodule MongoosePush.Application do # The MongoosePush.Metrics.TelemetryMetrics child is started first to capture possible events # when services start children = - [MongoosePush.Metrics.TelemetryMetrics] ++ service_children() ++ [MongoosePushWeb.Endpoint] + [MongoosePush.Metrics.TelemetryMetrics] ++ + MongoosePush.Metrics.TelemetryMetrics.pooler() ++ + service_children() ++ [MongoosePushWeb.Endpoint] # See http://elixir-lang.org/docs/stable/elixir/Supervisor.html # for other strategies and supported options diff --git a/lib/mongoose_push/metrics/telemetry_metrics.ex b/lib/mongoose_push/metrics/telemetry_metrics.ex index 1102cdc6..f25a7a57 100644 --- a/lib/mongoose_push/metrics/telemetry_metrics.ex +++ b/lib/mongoose_push/metrics/telemetry_metrics.ex @@ -7,6 +7,10 @@ defmodule MongoosePush.Metrics.TelemetryMetrics do TelemetryMetricsPrometheus.Core.child_spec(metrics: metrics()) end + def pooler do + [{:telemetry_poller, measurements: periodic_measurements(), period: 5_000}] + end + def metrics do [ # Summary is not yet supported in TelemetryMetricsPrometheus @@ -47,6 +51,43 @@ defmodule MongoosePush.Metrics.TelemetryMetrics do buckets: [10_000, 25_000, 50_000, 100_000, 200_000, 500_000, 1000_000], description: "A histogram showing time it takes for h2_worker to handle request." ), + Telemetry.Metrics.last_value( + "sparrow.pools_warden.workers.count", + event_name: [:sparrow, :pools_warden, :workers], + measurement: :count, + tags: [:pool], + description: "Total count of workers handled by worker_pool." + ), + Telemetry.Metrics.last_value( + "sparrow.pools_warden.pools.count", + event_name: [:sparrow, :pools_warden, :pools], + measurement: :count, + description: "Total count of the connection pools." + ) ] end + + def periodic_measurements do + [ + {MongoosePush.Metrics.TelemetryMetrics, :running_pools, []} + ] + end + + def running_pools do + stats = :wpool.stats() + + Enum.map(stats, fn stat -> + :telemetry.execute( + [:sparrow, :pools_warden, :workers], + %{count: length(stat[:workers])}, + %{pool: stat[:pool]} + ) + end) + + :telemetry.execute( + [:sparrow, :pools_warden, :pools], + %{count: length(stats)}, + %{} + ) + end end diff --git a/mix.exs b/mix.exs index 5acb9931..3cd253d5 100644 --- a/mix.exs +++ b/mix.exs @@ -61,6 +61,7 @@ defmodule MongoosePush.Mixfile do {:telemetry, "~>0.4.1"}, {:telemetry_metrics, "~> 0.5"}, {:telemetry_metrics_prometheus_core, "~> 0.4"}, + {:telemetry_poller, "~> 0.4"}, {:logfmt, "~>3.3"}, {:stream_data, "~> 0.5", only: :test} ] diff --git a/mix.lock b/mix.lock index 8fa28d31..6e19a744 100644 --- a/mix.lock +++ b/mix.lock @@ -55,6 +55,7 @@ "telemetry_metrics": {:hex, :telemetry_metrics, "0.5.0", "1b796e74add83abf844e808564275dfb342bcc930b04c7577ab780e262b0d998", [:mix], [{:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "31225e6ce7a37a421a0a96ec55244386aec1c190b22578bd245188a4a33298fd"}, "telemetry_metrics_prometheus_core": {:hex, :telemetry_metrics_prometheus_core, "0.4.0", "0860e53746f4554cf453a5217a3d2648a6d3a074ae01a21869a3963c54b1d5bc", [:mix], [{:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}, {:telemetry_metrics, "~> 0.5", [hex: :telemetry_metrics, repo: "hexpm", optional: false]}], "hexpm", "912e4c4421477bfb930a19a8de5b2eb967c2700880698c6d80706b8bc32532bf"}, "toml": {:hex, :toml, "0.6.2", "38f445df384a17e5d382befe30e3489112a48d3ba4c459e543f748c2f25dd4d1", [:mix], [], "hexpm", "d013e45126d74c0c26a38d31f5e8e9b83ea19fc752470feb9a86071ca5a672fa"}, + "telemetry_poller": {:hex, :telemetry_poller, "0.5.0", "4770888ef85599ead39c7f51d6b4b62306e602d96c69b2625d54dea3d9a5204b", [:rebar3], [{:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "69e4e8e65b0ae077c9e14cd5f42c7cc486de0e07ac6e3409e6f0e52699a7872c"}, "unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm", "1d1848c40487cdb0b30e8ed975e34e025860c02e419cb615d255849f3427439d"}, "uuid": {:hex, :uuid, "1.1.8", "e22fc04499de0de3ed1116b770c7737779f226ceefa0badb3592e64d5cfb4eb9", [:mix], [], "hexpm", "c790593b4c3b601f5dc2378baae7efaf5b3d73c4c6456ba85759905be792f2ac"}, "worker_pool": {:hex, :worker_pool, "4.0.1", "8cdebce7e09ecb4f1eb4bbf78aa99248064ac357077668c011ac600599973723", [:rebar3], [], "hexpm", "b7e12a0d942d32ee135a199631d71d893c6bb24daf4c51b4af4952c411462d82"}, diff --git a/test/unit/mongoose_push_telemetry_metrics_test.exs b/test/unit/mongoose_push_telemetry_metrics_test.exs index fcdd9270..21cb91af 100644 --- a/test/unit/mongoose_push_telemetry_metrics_test.exs +++ b/test/unit/mongoose_push_telemetry_metrics_test.exs @@ -133,6 +133,29 @@ defmodule MongoosePushTelemetryMetricsTest do assert 0 != fcm_match end + test "sparrow periodic metrics" do + :telemetry.execute( + [:sparrow, :pools_warden, :workers], + %{count: 5}, + %{pool: :periodic_pool} + ) + + :telemetry.execute( + [:sparrow, :pools_warden, :pools], + %{count: 3}, + %{} + ) + + metrics = TelemetryMetricsPrometheus.Core.scrape() + workers_regex = ~r/sparrow_pools_warden_workers_count{pool=\"periodic_pool\"} 5/ + workers_match = Regex.match?(workers_regex, metrics) + pools_regex = ~r/sparrow_pools_warden_pools_count [\d]+/ + pools_match = Regex.match?(pools_regex, metrics) + + assert true == workers_match + assert true == pools_match + end + defp do_push(push_result, service, repeat_no) do MongoosePush.Service.Mock |> expect(:push, repeat_no, fn _, _, _, _ -> push_result end) From 647891e7a738b56615ea7fac70df03149b5e8ed4 Mon Sep 17 00:00:00 2001 From: Jan Ciesla Date: Wed, 17 Jun 2020 16:07:29 +0200 Subject: [PATCH 3/5] Documentation update with new metrics --- README.md | 21 ++++++++++++++----- .../metrics/telemetry_metrics.ex | 4 +++- .../mongoose_push_telemetry_metrics_test.exs | 2 +- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index e44952ad..89fa091a 100644 --- a/README.md +++ b/README.md @@ -519,17 +519,26 @@ Where: * `REASON` is an arbitrary error reason term (in case of `status="error"`) or an empty string (when `status="success"`) * `LE` defines the `upper inclusive bound` (`less than or equal`) values for buckets, currently `1000`, `10_000`, `25_000`, `50_000`, `100_000`, `250_000`, `500_000`, `1000_000` or `+Inf` +This histogram metric shows the distribution of times needed to: +1. Select a worker (this may include waiting time when all workers are busy). +2. Send a request. +3. Get a response from push notifications provider. + +###### HTTP/2 requests + +`sparrow_h_worker_handle_duration_microsecond_bucket{le=${LE}}` +`sparrow_h_worker_handle_duration_microsecond_sum{le=${LE}}` +`sparrow_h_worker_handle_duration_microsecond_count{le=${LE}}` + +Where: +* `LE` defines the `upper inclusive bound` (`less than or equal`) values for buckets, currently `1000`, `10_000`, `25_000`, `50_000`, `100_000`, `250_000`, `500_000`, `1000_000` or `+Inf` + > **NOTE** > > A bucket of value 250_000 will keep the count of measurements that are less than or equal to 250_000. > A measurement of value 51_836 will be added to all the buckets where the upper bound is greater than 51_836. > In this case these are buckets `100_000`, `250_000`, `500_000`, `1000_000` and `+Inf` -This histogram metric shows the distribution of times needed to: -1. Select a worker (this may include waiting time when all workers are busy). -2. Send a request. -3. Get a response from push notifications provider. - ##### Counters * `mongoose_push_supervisor_init_count{service=${SERVICE}}` - Counts the number of push notification service supervisor starts. @@ -538,6 +547,8 @@ This histogram metric shows the distribution of times needed to: * `mongoose_push_apns_state_init_count` - Counts the number of APNS state initialisations. * `mongoose_push_apns_state_terminate_count` - Counts the number of APNS state terminations. * `mongoose_push_apns_state_get_default_topic_count` - Counts the number of default topic reads from cache. +* `sparrow_pools_warden_pools_count` - Counts the number of worker pools. +* `sparrow_pools_warden_workers_count{pool=${POOL}}` - Counts the number of workers operated by a given worker `POOL`. #### How to quickly see all metrics diff --git a/lib/mongoose_push/metrics/telemetry_metrics.ex b/lib/mongoose_push/metrics/telemetry_metrics.ex index f25a7a57..91f0f757 100644 --- a/lib/mongoose_push/metrics/telemetry_metrics.ex +++ b/lib/mongoose_push/metrics/telemetry_metrics.ex @@ -48,7 +48,9 @@ defmodule MongoosePush.Metrics.TelemetryMetrics do "sparrow.h2_worker.handle.duration.microsecond", event_name: [:sparrow, :h2_worker, :handle], measurement: :time, - buckets: [10_000, 25_000, 50_000, 100_000, 200_000, 500_000, 1000_000], + reporter_options: [ + buckets: [10_000, 25_000, 50_000, 100_000, 200_000, 500_000, 1000_000] + ], description: "A histogram showing time it takes for h2_worker to handle request." ), Telemetry.Metrics.last_value( diff --git a/test/unit/mongoose_push_telemetry_metrics_test.exs b/test/unit/mongoose_push_telemetry_metrics_test.exs index 21cb91af..4eb6517f 100644 --- a/test/unit/mongoose_push_telemetry_metrics_test.exs +++ b/test/unit/mongoose_push_telemetry_metrics_test.exs @@ -130,7 +130,7 @@ defmodule MongoosePushTelemetryMetricsTest do ~r/mongoose_push_notification_send_time_microsecond_count{error_category=\"\",error_reason=\"\",service=\"fcm\",status=\"success\"} (?[\d]+)/ fcm_match = Regex.named_captures(fcm_regex, metrics.resp_body) - assert 0 != fcm_match + assert nil != fcm_match end test "sparrow periodic metrics" do From c8f29e40ea427ad3f4d8ba5119b8f25b3777b50c Mon Sep 17 00:00:00 2001 From: Jan Ciesla Date: Mon, 22 Jun 2020 19:45:19 +0200 Subject: [PATCH 4/5] apply PR review comments --- lib/mongoose_push/metrics/telemetry_metrics.ex | 2 +- mix.exs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/mongoose_push/metrics/telemetry_metrics.ex b/lib/mongoose_push/metrics/telemetry_metrics.ex index 91f0f757..1010a2a8 100644 --- a/lib/mongoose_push/metrics/telemetry_metrics.ex +++ b/lib/mongoose_push/metrics/telemetry_metrics.ex @@ -8,7 +8,7 @@ defmodule MongoosePush.Metrics.TelemetryMetrics do end def pooler do - [{:telemetry_poller, measurements: periodic_measurements(), period: 5_000}] + [{:telemetry_poller, measurements: periodic_measurements(), period: 30_000}] end def metrics do diff --git a/mix.exs b/mix.exs index 3cd253d5..25e03e41 100644 --- a/mix.exs +++ b/mix.exs @@ -61,7 +61,7 @@ defmodule MongoosePush.Mixfile do {:telemetry, "~>0.4.1"}, {:telemetry_metrics, "~> 0.5"}, {:telemetry_metrics_prometheus_core, "~> 0.4"}, - {:telemetry_poller, "~> 0.4"}, + {:telemetry_poller, "~> 0.5"}, {:logfmt, "~>3.3"}, {:stream_data, "~> 0.5", only: :test} ] From f864311c6a6c9a3e5f78a344e8d88efdc278d799 Mon Sep 17 00:00:00 2001 From: Jan Ciesla Date: Mon, 22 Jun 2020 19:45:52 +0200 Subject: [PATCH 5/5] add sparrow h2_worker metrics --- README.md | 11 ++++++++ .../metrics/telemetry_metrics.ex | 28 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/README.md b/README.md index 89fa091a..5d9733c5 100644 --- a/README.md +++ b/README.md @@ -533,6 +533,10 @@ This histogram metric shows the distribution of times needed to: Where: * `LE` defines the `upper inclusive bound` (`less than or equal`) values for buckets, currently `1000`, `10_000`, `25_000`, `50_000`, `100_000`, `250_000`, `500_000`, `1000_000` or `+Inf` +This histogram metric shows the distribution of times needed to handle and send a request. This includes: +1. Open a new stream within an already established channel. +2. Send a request. + > **NOTE** > > A bucket of value 250_000 will keep the count of measurements that are less than or equal to 250_000. @@ -549,6 +553,13 @@ Where: * `mongoose_push_apns_state_get_default_topic_count` - Counts the number of default topic reads from cache. * `sparrow_pools_warden_pools_count` - Counts the number of worker pools. * `sparrow_pools_warden_workers_count{pool=${POOL}}` - Counts the number of workers operated by a given worker `POOL`. +* `sparrow_h_worker_init_count` - Counts the number of h2_worker starts. +* `sparrow_h_worker_terminate_count` - Counts the number of h2_worker terminations. +* `sparrow_h_worker_conn_success_count` - Counts the number of successful h2_worker connections. +* `sparrow_h_worker_conn_fail_count` - Counts the number of failed h2_worker connections. +* `sparrow_h_worker_conn_lost_count` - Counts the number of lost h2_worker connections. +* `sparrow_h_worker_request_success_count` - Counts the number of successful h2_worker requests. +* `sparrow_h_worker_request_error_count` - Counts the number of failed h2_worker requests. #### How to quickly see all metrics diff --git a/lib/mongoose_push/metrics/telemetry_metrics.ex b/lib/mongoose_push/metrics/telemetry_metrics.ex index 1010a2a8..f47638ed 100644 --- a/lib/mongoose_push/metrics/telemetry_metrics.ex +++ b/lib/mongoose_push/metrics/telemetry_metrics.ex @@ -53,6 +53,34 @@ defmodule MongoosePush.Metrics.TelemetryMetrics do ], description: "A histogram showing time it takes for h2_worker to handle request." ), + Telemetry.Metrics.counter("sparrow.h2_worker.init.count", + event_name: [:sparrow, :h2_worker, :init], + description: "Counts the number of h2_worker starts." + ), + Telemetry.Metrics.counter("sparrow.h2_worker.terminate.count", + event_name: [:sparrow, :h2_worker, :terminate], + description: "Counts the number of h2_worker terminations." + ), + Telemetry.Metrics.counter("sparrow.h2_worker.conn_success.count", + event_name: [:sparrow, :h2_worker, :conn_success], + description: "Counts the number of successful h2_worker connections." + ), + Telemetry.Metrics.counter("sparrow.h2_worker.conn_fail.count", + event_name: [:sparrow, :h2_worker, :conn_fail], + description: "Counts the number of failed h2_worker connections." + ), + Telemetry.Metrics.counter("sparrow.h2_worker.conn_lost.count", + event_name: [:sparrow, :h2_worker, :conn_lost], + description: "Counts the number of lost h2_worker connections." + ), + Telemetry.Metrics.counter("sparrow.h2_worker.request_success.count", + event_name: [:sparrow, :h2_worker, :request_success], + description: "Counts the number of successful h2_worker requests." + ), + Telemetry.Metrics.counter("sparrow.h2_worker.request_error.count", + event_name: [:sparrow, :h2_worker, :request_error], + description: "Counts the number of failed h2_worker requests." + ), Telemetry.Metrics.last_value( "sparrow.pools_warden.workers.count", event_name: [:sparrow, :pools_warden, :workers],