diff --git a/README.md b/README.md
index e44952ad..5d9733c5 100644
--- a/README.md
+++ b/README.md
@@ -519,17 +519,30 @@ Where:
 * `REASON` is an arbitrary error reason term (in case of `status="error"`) or an empty string (when `status="success"`)
 * `LE` defines the `upper inclusive bound` (`less than or equal`) values for buckets, currently `1000`, `10_000`, `25_000`, `50_000`, `100_000`, `250_000`, `500_000`, `1000_000` or `+Inf`
 
+This histogram metric shows the distribution of times needed to:
+1. Select a worker (this may include waiting time when all workers are busy).
+2. Send a request.
+3. Get a response from push notifications provider.
+
+###### HTTP/2 requests
+
+`sparrow_h2_worker_handle_duration_microsecond_bucket{le=${LE}}`
+`sparrow_h2_worker_handle_duration_microsecond_sum`
+`sparrow_h2_worker_handle_duration_microsecond_count`
+
+Where:
+* `LE` defines the `upper inclusive bound` (`less than or equal`) values for buckets, currently `10_000`, `25_000`, `50_000`, `100_000`, `200_000`, `500_000`, `1000_000` or `+Inf`
+
+This histogram metric shows the distribution of times needed to handle and send a request. This includes:
+1. Opening a new stream within an already established channel.
+2. Sending the request.
+
 > **NOTE**
 >
 > A bucket of value 250_000 will keep the count of measurements that are less than or equal to 250_000.
 > A measurement of value 51_836 will be added to all the buckets where the upper bound is greater than 51_836.
 > In this case these are buckets `100_000`, `250_000`, `500_000`, `1000_000` and `+Inf`
 
-This histogram metric shows the distribution of times needed to:
-1. Select a worker (this may include waiting time when all workers are busy).
-2. Send a request.
-3. Get a response from push notifications provider.
-
 ##### Counters
 
 * `mongoose_push_supervisor_init_count{service=${SERVICE}}` - Counts the number of push notification service supervisor starts.
@@ -538,6 +551,15 @@ This histogram metric shows the distribution of times needed to:
 * `mongoose_push_apns_state_init_count` - Counts the number of APNS state initialisations.
 * `mongoose_push_apns_state_terminate_count` - Counts the number of APNS state terminations.
 * `mongoose_push_apns_state_get_default_topic_count` - Counts the number of default topic reads from cache.
+* `sparrow_pools_warden_pools_count` - Counts the number of worker pools.
+* `sparrow_pools_warden_workers_count{pool=${POOL}}` - Counts the number of workers operated by a given worker `POOL`.
+* `sparrow_h2_worker_init_count` - Counts the number of h2_worker starts.
+* `sparrow_h2_worker_terminate_count` - Counts the number of h2_worker terminations.
+* `sparrow_h2_worker_conn_success_count` - Counts the number of successful h2_worker connections.
+* `sparrow_h2_worker_conn_fail_count` - Counts the number of failed h2_worker connections.
+* `sparrow_h2_worker_conn_lost_count` - Counts the number of lost h2_worker connections.
+* `sparrow_h2_worker_request_success_count` - Counts the number of successful h2_worker requests.
+* `sparrow_h2_worker_request_error_count` - Counts the number of failed h2_worker requests.
 
 #### How to quickly see all metrics
 
diff --git a/lib/mongoose_push/application.ex b/lib/mongoose_push/application.ex
index ac2e4f2f..72bf701a 100644
--- a/lib/mongoose_push/application.ex
+++ b/lib/mongoose_push/application.ex
@@ -38,7 +38,9 @@ defmodule MongoosePush.Application do
     # The MongoosePush.Metrics.TelemetryMetrics child is started first to capture possible events
     # when services start
     children =
-      [MongoosePush.Metrics.TelemetryMetrics] ++ service_children() ++ [MongoosePushWeb.Endpoint]
+      [MongoosePush.Metrics.TelemetryMetrics] ++
+        MongoosePush.Metrics.TelemetryMetrics.pooler() ++
+        service_children() ++ [MongoosePushWeb.Endpoint]
 
     # See http://elixir-lang.org/docs/stable/elixir/Supervisor.html
     # for other strategies and supported options
diff --git a/lib/mongoose_push/metrics/telemetry_metrics.ex b/lib/mongoose_push/metrics/telemetry_metrics.ex
index d15edae6..f47638ed 100644
--- a/lib/mongoose_push/metrics/telemetry_metrics.ex
+++ b/lib/mongoose_push/metrics/telemetry_metrics.ex
@@ -7,6 +7,10 @@ defmodule MongoosePush.Metrics.TelemetryMetrics do
     TelemetryMetricsPrometheus.Core.child_spec(metrics: metrics())
   end
 
+  def pooler do
+    [{:telemetry_poller, measurements: periodic_measurements(), period: 30_000}]
+  end
+
   def metrics do
     [
       # Summary is not yet supported in TelemetryMetricsPrometheus
@@ -37,7 +41,83 @@ defmodule MongoosePush.Metrics.TelemetryMetrics do
       ),
       Telemetry.Metrics.counter("mongoose_push.apns.state.get_default_topic.count",
        description: "Counts the number of APNS default topic reads from the ETS cache"
+      ),
+
+      # sparrow events
+      Telemetry.Metrics.distribution(
+        "sparrow.h2_worker.handle.duration.microsecond",
+        event_name: [:sparrow, :h2_worker, :handle],
+        measurement: :time,
+        reporter_options: [
+          buckets: [10_000, 25_000, 50_000, 100_000, 200_000, 500_000, 1000_000]
+        ],
+        description: "A histogram showing the time it takes for h2_worker to handle a request."
+      ),
+      Telemetry.Metrics.counter("sparrow.h2_worker.init.count",
+        event_name: [:sparrow, :h2_worker, :init],
+        description: "Counts the number of h2_worker starts."
+      ),
+      Telemetry.Metrics.counter("sparrow.h2_worker.terminate.count",
+        event_name: [:sparrow, :h2_worker, :terminate],
+        description: "Counts the number of h2_worker terminations."
+      ),
+      Telemetry.Metrics.counter("sparrow.h2_worker.conn_success.count",
+        event_name: [:sparrow, :h2_worker, :conn_success],
+        description: "Counts the number of successful h2_worker connections."
+      ),
+      Telemetry.Metrics.counter("sparrow.h2_worker.conn_fail.count",
+        event_name: [:sparrow, :h2_worker, :conn_fail],
+        description: "Counts the number of failed h2_worker connections."
+      ),
+      Telemetry.Metrics.counter("sparrow.h2_worker.conn_lost.count",
+        event_name: [:sparrow, :h2_worker, :conn_lost],
+        description: "Counts the number of lost h2_worker connections."
+      ),
+      Telemetry.Metrics.counter("sparrow.h2_worker.request_success.count",
+        event_name: [:sparrow, :h2_worker, :request_success],
+        description: "Counts the number of successful h2_worker requests."
+      ),
+      Telemetry.Metrics.counter("sparrow.h2_worker.request_error.count",
+        event_name: [:sparrow, :h2_worker, :request_error],
+        description: "Counts the number of failed h2_worker requests."
+      ),
+      Telemetry.Metrics.last_value(
+        "sparrow.pools_warden.workers.count",
+        event_name: [:sparrow, :pools_warden, :workers],
+        measurement: :count,
+        tags: [:pool],
+        description: "Total count of workers handled by worker_pool."
+      ),
+      Telemetry.Metrics.last_value(
+        "sparrow.pools_warden.pools.count",
+        event_name: [:sparrow, :pools_warden, :pools],
+        measurement: :count,
+        description: "Total count of the connection pools."
       )
     ]
   end
+
+  def periodic_measurements do
+    [
+      {MongoosePush.Metrics.TelemetryMetrics, :running_pools, []}
+    ]
+  end
+
+  def running_pools do
+    stats = :wpool.stats()
+
+    Enum.map(stats, fn stat ->
+      :telemetry.execute(
+        [:sparrow, :pools_warden, :workers],
+        %{count: length(stat[:workers])},
+        %{pool: stat[:pool]}
+      )
+    end)
+
+    :telemetry.execute(
+      [:sparrow, :pools_warden, :pools],
+      %{count: length(stats)},
+      %{}
+    )
+  end
 end
diff --git a/mix.exs b/mix.exs
index 5acb9931..25e03e41 100644
--- a/mix.exs
+++ b/mix.exs
@@ -61,6 +61,7 @@ defmodule MongoosePush.Mixfile do
       {:telemetry, "~>0.4.1"},
       {:telemetry_metrics, "~> 0.5"},
       {:telemetry_metrics_prometheus_core, "~> 0.4"},
+      {:telemetry_poller, "~> 0.5"},
       {:logfmt, "~>3.3"},
       {:stream_data, "~> 0.5", only: :test}
     ]
diff --git a/mix.lock b/mix.lock
index 8fa28d31..6e19a744 100644
--- a/mix.lock
+++ b/mix.lock
@@ -55,6 +55,7 @@
   "telemetry_metrics": {:hex, :telemetry_metrics, "0.5.0", "1b796e74add83abf844e808564275dfb342bcc930b04c7577ab780e262b0d998", [:mix], [{:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "31225e6ce7a37a421a0a96ec55244386aec1c190b22578bd245188a4a33298fd"},
   "telemetry_metrics_prometheus_core": {:hex, :telemetry_metrics_prometheus_core, "0.4.0", "0860e53746f4554cf453a5217a3d2648a6d3a074ae01a21869a3963c54b1d5bc", [:mix], [{:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}, {:telemetry_metrics, "~> 0.5", [hex: :telemetry_metrics, repo: "hexpm", optional: false]}], "hexpm", "912e4c4421477bfb930a19a8de5b2eb967c2700880698c6d80706b8bc32532bf"},
   "toml": {:hex, :toml, "0.6.2", "38f445df384a17e5d382befe30e3489112a48d3ba4c459e543f748c2f25dd4d1", [:mix], [], "hexpm", "d013e45126d74c0c26a38d31f5e8e9b83ea19fc752470feb9a86071ca5a672fa"},
+  "telemetry_poller": {:hex, :telemetry_poller, "0.5.0", "4770888ef85599ead39c7f51d6b4b62306e602d96c69b2625d54dea3d9a5204b", [:rebar3], [{:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "69e4e8e65b0ae077c9e14cd5f42c7cc486de0e07ac6e3409e6f0e52699a7872c"},
   "unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm", "1d1848c40487cdb0b30e8ed975e34e025860c02e419cb615d255849f3427439d"},
   "uuid": {:hex, :uuid, "1.1.8", "e22fc04499de0de3ed1116b770c7737779f226ceefa0badb3592e64d5cfb4eb9", [:mix], [], "hexpm", "c790593b4c3b601f5dc2378baae7efaf5b3d73c4c6456ba85759905be792f2ac"},
   "worker_pool": {:hex, :worker_pool, "4.0.1", "8cdebce7e09ecb4f1eb4bbf78aa99248064ac357077668c011ac600599973723", [:rebar3], [], "hexpm", "b7e12a0d942d32ee135a199631d71d893c6bb24daf4c51b4af4952c411462d82"},
diff --git a/test/integration/prometheus_endpoint_test.exs b/test/integration/prometheus_endpoint_test.exs
index e6a2abc2..c8b1efdb 100644
--- a/test/integration/prometheus_endpoint_test.exs
+++ b/test/integration/prometheus_endpoint_test.exs
@@ -33,9 +33,9 @@ defmodule MongoosePush.API.PrometheusEndpointTest do
     # 3. regex on the payload to make sure this is prometheus output
     fcm_regex =
-      ~r/mongoose_push_notification_send_time_count{error_category=\"\",error_reason=\"\",service=\"fcm\",status=\"success\"} (?<count>[\d]+)/
+      ~r/mongoose_push_notification_send_time_microsecond_count{error_category=\"\",error_reason=\"\",service=\"fcm\",status=\"success\"} (?<count>[\d]+)/
 
     fcm_match = Regex.named_captures(fcm_regex, metrics)
-    assert 0 != fcm_match
+    assert nil != fcm_match
   end
 end
diff --git a/test/unit/mongoose_push_telemetry_metrics_test.exs b/test/unit/mongoose_push_telemetry_metrics_test.exs
index e82831c4..4eb6517f 100644
--- a/test/unit/mongoose_push_telemetry_metrics_test.exs
+++ b/test/unit/mongoose_push_telemetry_metrics_test.exs
@@ -127,10 +127,33 @@ defmodule MongoosePushTelemetryMetricsTest do
     # 3. regex on the payload to make sure this is prometheus output
     fcm_regex =
-      ~r/mongoose_push_notification_send_time_count{error_category=\"\",error_reason=\"\",service=\"fcm\",status=\"success\"} (?<count>[\d]+)/
+      ~r/mongoose_push_notification_send_time_microsecond_count{error_category=\"\",error_reason=\"\",service=\"fcm\",status=\"success\"} (?<count>[\d]+)/
 
     fcm_match = Regex.named_captures(fcm_regex, metrics.resp_body)
-    assert 0 != fcm_match
+    assert nil != fcm_match
+  end
+
+  test "sparrow periodic metrics" do
+    :telemetry.execute(
+      [:sparrow, :pools_warden, :workers],
+      %{count: 5},
+      %{pool: :periodic_pool}
+    )
+
+    :telemetry.execute(
+      [:sparrow, :pools_warden, :pools],
+      %{count: 3},
+      %{}
+    )
+
+    metrics = TelemetryMetricsPrometheus.Core.scrape()
+    workers_regex = ~r/sparrow_pools_warden_workers_count{pool=\"periodic_pool\"} 5/
+    workers_match = Regex.match?(workers_regex, metrics)
+    pools_regex = ~r/sparrow_pools_warden_pools_count [\d]+/
+    pools_match = Regex.match?(pools_regex, metrics)
+
+    assert true == workers_match
+    assert true == pools_match
   end
 
   defp do_push(push_result, service, repeat_no) do
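
Reviewer note (not part of the diff): a quick way to sanity-check the new sparrow_pools_warden_* gauges locally is to trigger the periodic measurement by hand instead of waiting for the 30-second telemetry_poller period. The sketch below assumes MongoosePush is running under `iex -S mix` (so the TelemetryMetricsPrometheus.Core reporter from this change is started); the `:example_pool` tag is only an illustrative value, not a pool that exists in the code.

    # Run inside `iex -S mix`.

    # Invoke the new periodic measurement directly
    # (normally called by :telemetry_poller every 30 seconds).
    MongoosePush.Metrics.TelemetryMetrics.running_pools()

    # Alternatively, emit a sample event by hand, as the new unit test does.
    :telemetry.execute(
      [:sparrow, :pools_warden, :workers],
      %{count: 5},
      # :example_pool is an arbitrary tag used only for this illustration
      %{pool: :example_pool}
    )

    # The gauges should now appear in the Prometheus scrape output.
    TelemetryMetricsPrometheus.Core.scrape()
    |> String.split("\n")
    |> Enum.filter(&String.contains?(&1, "sparrow_pools_warden"))
    |> Enum.each(&IO.puts/1)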