Metrics based on telemetry events generated by sparrow #175

Merged
merged 5 commits on Jun 24, 2020
32 changes: 27 additions & 5 deletions README.md
@@ -519,17 +519,30 @@ Where:
* `REASON` is an arbitrary error reason term (in case of `status="error"`) or an empty string (when `status="success"`)
* `LE` defines the `upper inclusive bound` (`less than or equal`) values for buckets, currently `1000`, `10_000`, `25_000`, `50_000`, `100_000`, `250_000`, `500_000`, `1000_000` or `+Inf`

This histogram metric shows the distribution of times needed to:
1. Select a worker (this may include waiting time when all workers are busy).
2. Send a request.
3. Get a response from the push notifications provider.
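
For reference, histograms like this one are declared with `Telemetry.Metrics.distribution/2`. The sketch below follows the same pattern this PR uses for the sparrow metrics; the event name, measurement key and tags are illustrative assumptions, not the exact definition used by MongoosePush:

```elixir
# Illustrative sketch only; not the exact definition used by MongoosePush.
# A distribution metric turns telemetry events into Prometheus histogram buckets.
Telemetry.Metrics.distribution(
  "mongoose_push.notification.send_time.microsecond",
  event_name: [:mongoose_push, :notification, :send_time],
  measurement: :time,
  tags: [:service, :status, :error_category, :error_reason],
  reporter_options: [
    buckets: [1000, 10_000, 25_000, 50_000, 100_000, 250_000, 500_000, 1_000_000]
  ],
  description: "Distribution of times needed to select a worker, send a request and get a response."
)
```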

###### HTTP/2 requests
Contributor:

Does this metric show time spent only on sending the request and waiting for the response?

Contributor Author:

This metric shows only the time it takes to handle and send a request. It only opens the stream and sends the request; it does not measure response time. In general, when this timer is small, it is possible that we are getting errors when opening the stream.

Contributor:

Thanks for checking this. I think it'd be good to add this info to the doc.

Contributor Author:

Ok, will add that info.

Contributor Author:

@michalwski, this part has already been added to the doc.

Contributor:

Hmm, I wonder how you came to the conclusion that it measures only the time it takes to handle and send the request. If I'm correct, this metric measures the time of this function in sparrow: https://github.com/esl/sparrow/blob/master/lib/sparrow/h2_worker.ex#L352-L430

Inside the function, we wait for the response and emit additional events depending on the response.

Also, I think the name of the metric starts with sparrow_h2_worker instead of sparrow_h_worker.

Contributor:

All right, we discussed this offline with @janciesla8818. I was misled by the post_result var name and the case checking whether this is a successful response or not. In fact, this var tells us whether the request was sent successfully or not. The response to the request arrives asynchronously (HTTP/2). Thanks to that we can send many requests over the same h2_worker and connection.

When it comes to the metric name, the Prometheus.Core lib removes all numbers from the metric name.
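
For illustration, the renaming described above can be reproduced with a simple transformation. This is only a sketch of the observed effect (digits dropped, dots turned into underscores), not the library's actual implementation:

```elixir
# Sketch of the observed renaming, not Prometheus.Core's real code:
# digits are stripped and dots become underscores, so "h2_worker" exports as "h_worker".
"sparrow.h2_worker.handle.duration.microsecond"
|> String.replace(~r/\d/, "")
|> String.replace(".", "_")
#=> "sparrow_h_worker_handle_duration_microsecond"
```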


`sparrow_h_worker_handle_duration_microsecond_bucket{le=${LE}}`
`sparrow_h_worker_handle_duration_microsecond_sum`
`sparrow_h_worker_handle_duration_microsecond_count`

Where:
* `LE` defines the `upper inclusive bound` (`less than or equal`) values for buckets, currently `1000`, `10_000`, `25_000`, `50_000`, `100_000`, `250_000`, `500_000`, `1000_000` or `+Inf`

This histogram metric shows the distribution of times needed to handle and send a request. This includes:
1. Open a new stream within an already established channel.
2. Send a request.

> **NOTE**
>
> A bucket of value 250_000 will keep the count of measurements that are less than or equal to 250_000.
> A measurement of value 51_836 will be added to all the buckets where the upper bound is greater than 51_836.
> In this case these are buckets `100_000`, `250_000`, `500_000`, `1000_000` and `+Inf`
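
As a worked example of the cumulative bucket semantics described in the note above, the snippet below (illustrative only, not part of the exporter) shows which buckets a single measurement of 51_836 falls into:

```elixir
buckets = [1000, 10_000, 25_000, 50_000, 100_000, 250_000, 500_000, 1_000_000, :infinity]
measurement = 51_836

# A bucket with upper bound `le` counts every measurement <= le, so one observation
# increments every bucket from the first bound it fits under up to +Inf.
Enum.filter(buckets, fn
  :infinity -> true
  le -> measurement <= le
end)
#=> [100_000, 250_000, 500_000, 1_000_000, :infinity]
```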

This histogram metric shows the distribution of times needed to:
1. Select a worker (this may include waiting time when all workers are busy).
2. Send a request.
3. Get a response from the push notifications provider.

##### Counters

* `mongoose_push_supervisor_init_count{service=${SERVICE}}` - Counts the number of push notification service supervisor starts.
@@ -538,6 +551,15 @@ This histogram metric shows the distribution of times needed to:
* `mongoose_push_apns_state_init_count` - Counts the number of APNS state initialisations.
* `mongoose_push_apns_state_terminate_count` - Counts the number of APNS state terminations.
* `mongoose_push_apns_state_get_default_topic_count` - Counts the number of default topic reads from cache.
* `sparrow_pools_warden_pools_count` - Counts the number of worker pools.
* `sparrow_pools_warden_workers_count{pool=${POOL}}` - Counts the number of workers operated by a given worker `POOL`.
* `sparrow_h_worker_init_count` - Counts the number of h2_worker starts.
* `sparrow_h_worker_terminate_count` - Counts the number of h2_worker terminations.
* `sparrow_h_worker_conn_success_count` - Counts the number of successful h2_worker connections.
* `sparrow_h_worker_conn_fail_count` - Counts the number of failed h2_worker connections.
* `sparrow_h_worker_conn_lost_count` - Counts the number of lost h2_worker connections.
* `sparrow_h_worker_request_success_count` - Counts the number of successful h2_worker requests.
* `sparrow_h_worker_request_error_count` - Counts the number of failed h2_worker requests.
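
These counters are driven by plain telemetry events, so they can be exercised (for example in tests) by emitting the corresponding event, as the test suite in this PR does for the pool metrics. A minimal sketch; the measurement and metadata shapes below are assumptions:

```elixir
# Illustrative only: each emitted event increments the matching Prometheus counter by one.
:telemetry.execute([:sparrow, :h2_worker, :request_success], %{count: 1}, %{})
:telemetry.execute([:sparrow, :h2_worker, :conn_fail], %{count: 1}, %{})
```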

#### How to quickly see all metrics

4 changes: 3 additions & 1 deletion lib/mongoose_push/application.ex
@@ -38,7 +38,9 @@ defmodule MongoosePush.Application do
# The MongoosePush.Metrics.TelemetryMetrics child is started first to capture possible events
# when services start
children =
[MongoosePush.Metrics.TelemetryMetrics] ++ service_children() ++ [MongoosePushWeb.Endpoint]
[MongoosePush.Metrics.TelemetryMetrics] ++
MongoosePush.Metrics.TelemetryMetrics.pooler() ++
service_children() ++ [MongoosePushWeb.Endpoint]

# See http://elixir-lang.org/docs/stable/elixir/Supervisor.html
# for other strategies and supported options
80 changes: 80 additions & 0 deletions lib/mongoose_push/metrics/telemetry_metrics.ex
@@ -7,6 +7,10 @@ defmodule MongoosePush.Metrics.TelemetryMetrics do
TelemetryMetricsPrometheus.Core.child_spec(metrics: metrics())
end

def pooler do
[{:telemetry_poller, measurements: periodic_measurements(), period: 30_000}]
end

def metrics do
[
# Summary is not yet supported in TelemetryMetricsPrometheus
@@ -37,7 +41,83 @@ defmodule MongoosePush.Metrics.TelemetryMetrics do
),
Telemetry.Metrics.counter("mongoose_push.apns.state.get_default_topic.count",
description: "Counts the number of APNS default topic reads from the ETS cache"
),

# sparrow events
Telemetry.Metrics.distribution(
"sparrow.h2_worker.handle.duration.microsecond",
event_name: [:sparrow, :h2_worker, :handle],
measurement: :time,
reporter_options: [
buckets: [10_000, 25_000, 50_000, 100_000, 200_000, 500_000, 1000_000]
],
description: "A histogram showing time it takes for h2_worker to handle request."
),
Telemetry.Metrics.counter("sparrow.h2_worker.init.count",
event_name: [:sparrow, :h2_worker, :init],
description: "Counts the number of h2_worker starts."
),
Telemetry.Metrics.counter("sparrow.h2_worker.terminate.count",
event_name: [:sparrow, :h2_worker, :terminate],
description: "Counts the number of h2_worker terminations."
),
Telemetry.Metrics.counter("sparrow.h2_worker.conn_success.count",
event_name: [:sparrow, :h2_worker, :conn_success],
description: "Counts the number of successful h2_worker connections."
),
Telemetry.Metrics.counter("sparrow.h2_worker.conn_fail.count",
event_name: [:sparrow, :h2_worker, :conn_fail],
description: "Counts the number of failed h2_worker connections."
),
Telemetry.Metrics.counter("sparrow.h2_worker.conn_lost.count",
event_name: [:sparrow, :h2_worker, :conn_lost],
description: "Counts the number of lost h2_worker connections."
),
Telemetry.Metrics.counter("sparrow.h2_worker.request_success.count",
event_name: [:sparrow, :h2_worker, :request_success],
description: "Counts the number of successful h2_worker requests."
),
Telemetry.Metrics.counter("sparrow.h2_worker.request_error.count",
event_name: [:sparrow, :h2_worker, :request_error],
description: "Counts the number of failed h2_worker requests."
),
Telemetry.Metrics.last_value(
"sparrow.pools_warden.workers.count",
event_name: [:sparrow, :pools_warden, :workers],
measurement: :count,
tags: [:pool],
description: "Total count of workers handled by worker_pool."
),
Telemetry.Metrics.last_value(
"sparrow.pools_warden.pools.count",
event_name: [:sparrow, :pools_warden, :pools],
measurement: :count,
description: "Total count of the connection pools."
)
]
end

def periodic_measurements do
[
{MongoosePush.Metrics.TelemetryMetrics, :running_pools, []}
]
end

def running_pools do
stats = :wpool.stats()

Enum.map(stats, fn stat ->
:telemetry.execute(
[:sparrow, :pools_warden, :workers],
%{count: length(stat[:workers])},
%{pool: stat[:pool]}
)
end)

:telemetry.execute(
[:sparrow, :pools_warden, :pools],
%{count: length(stats)},
%{}
)
end
end
1 change: 1 addition & 0 deletions mix.exs
@@ -61,6 +61,7 @@ defmodule MongoosePush.Mixfile do
{:telemetry, "~>0.4.1"},
{:telemetry_metrics, "~> 0.5"},
{:telemetry_metrics_prometheus_core, "~> 0.4"},
{:telemetry_poller, "~> 0.5"},
{:logfmt, "~>3.3"},
{:stream_data, "~> 0.5", only: :test}
]
1 change: 1 addition & 0 deletions mix.lock
@@ -55,6 +55,7 @@
"telemetry_metrics": {:hex, :telemetry_metrics, "0.5.0", "1b796e74add83abf844e808564275dfb342bcc930b04c7577ab780e262b0d998", [:mix], [{:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "31225e6ce7a37a421a0a96ec55244386aec1c190b22578bd245188a4a33298fd"},
"telemetry_metrics_prometheus_core": {:hex, :telemetry_metrics_prometheus_core, "0.4.0", "0860e53746f4554cf453a5217a3d2648a6d3a074ae01a21869a3963c54b1d5bc", [:mix], [{:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}, {:telemetry_metrics, "~> 0.5", [hex: :telemetry_metrics, repo: "hexpm", optional: false]}], "hexpm", "912e4c4421477bfb930a19a8de5b2eb967c2700880698c6d80706b8bc32532bf"},
"toml": {:hex, :toml, "0.6.2", "38f445df384a17e5d382befe30e3489112a48d3ba4c459e543f748c2f25dd4d1", [:mix], [], "hexpm", "d013e45126d74c0c26a38d31f5e8e9b83ea19fc752470feb9a86071ca5a672fa"},
"telemetry_poller": {:hex, :telemetry_poller, "0.5.0", "4770888ef85599ead39c7f51d6b4b62306e602d96c69b2625d54dea3d9a5204b", [:rebar3], [{:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "69e4e8e65b0ae077c9e14cd5f42c7cc486de0e07ac6e3409e6f0e52699a7872c"},
"unicode_util_compat": {:hex, :unicode_util_compat, "0.4.1", "d869e4c68901dd9531385bb0c8c40444ebf624e60b6962d95952775cac5e90cd", [:rebar3], [], "hexpm", "1d1848c40487cdb0b30e8ed975e34e025860c02e419cb615d255849f3427439d"},
"uuid": {:hex, :uuid, "1.1.8", "e22fc04499de0de3ed1116b770c7737779f226ceefa0badb3592e64d5cfb4eb9", [:mix], [], "hexpm", "c790593b4c3b601f5dc2378baae7efaf5b3d73c4c6456ba85759905be792f2ac"},
"worker_pool": {:hex, :worker_pool, "4.0.1", "8cdebce7e09ecb4f1eb4bbf78aa99248064ac357077668c011ac600599973723", [:rebar3], [], "hexpm", "b7e12a0d942d32ee135a199631d71d893c6bb24daf4c51b4af4952c411462d82"},
4 changes: 2 additions & 2 deletions test/integration/prometheus_endpoint_test.exs
@@ -33,9 +33,9 @@ defmodule MongoosePush.API.PrometheusEndpointTest do

# 3. regex on the payload to make sure this is prometheus output
fcm_regex =
~r/mongoose_push_notification_send_time_count{error_category=\"\",error_reason=\"\",service=\"fcm\",status=\"success\"} (?<count>[\d]+)/
~r/mongoose_push_notification_send_time_microsecond_count{error_category=\"\",error_reason=\"\",service=\"fcm\",status=\"success\"} (?<count>[\d]+)/

fcm_match = Regex.named_captures(fcm_regex, metrics)
assert 0 != fcm_match
assert nil != fcm_match
end
end
27 changes: 25 additions & 2 deletions test/unit/mongoose_push_telemetry_metrics_test.exs
@@ -127,10 +127,33 @@ defmodule MongoosePushTelemetryMetricsTest do

# 3. regex on the payload to make sure this is prometheus output
fcm_regex =
~r/mongoose_push_notification_send_time_count{error_category=\"\",error_reason=\"\",service=\"fcm\",status=\"success\"} (?<count>[\d]+)/
~r/mongoose_push_notification_send_time_microsecond_count{error_category=\"\",error_reason=\"\",service=\"fcm\",status=\"success\"} (?<count>[\d]+)/

fcm_match = Regex.named_captures(fcm_regex, metrics.resp_body)
assert 0 != fcm_match
assert nil != fcm_match
end

test "sparrow periodic metrics" do
:telemetry.execute(
[:sparrow, :pools_warden, :workers],
%{count: 5},
%{pool: :periodic_pool}
)

:telemetry.execute(
[:sparrow, :pools_warden, :pools],
%{count: 3},
%{}
)

metrics = TelemetryMetricsPrometheus.Core.scrape()
workers_regex = ~r/sparrow_pools_warden_workers_count{pool=\"periodic_pool\"} 5/
workers_match = Regex.match?(workers_regex, metrics)
pools_regex = ~r/sparrow_pools_warden_pools_count [\d]+/
pools_match = Regex.match?(pools_regex, metrics)

assert true == workers_match
assert true == pools_match
end

defp do_push(push_result, service, repeat_no) do