From 9a8af989e26dbb6c947b5503c384a28ab7b85505 Mon Sep 17 00:00:00 2001 From: Michal Piotrowski Date: Wed, 10 Jun 2020 10:10:01 +0200 Subject: [PATCH 1/8] fix link to docker compose file in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cfe2f9e7..2c750d06 100644 --- a/README.md +++ b/README.md @@ -253,7 +253,7 @@ Development release is by default configured to connect to local APNS / FCM mock in `config/dev.exs` file. For now, let's just start those mocks so that we can use default dev configuration: ```bash -docker-compose -f test/docker/docker-compose.unit.yml up -d +docker-compose -f test/docker/docker-compose.mocks.yml up -d ``` After this step you may try to run the service via: From 361cbe07d4e28cccfb46b4a11b61e68035b681b5 Mon Sep 17 00:00:00 2001 From: Michal Piotrowski Date: Wed, 10 Jun 2020 13:28:53 +0200 Subject: [PATCH 2/8] Prometheus example configuration Assuming that MongoosePush was started with the following command: MIX_ENV=integration mix do test.env.up You can start Prometheus configured to monitor MongoosePush in container executing the command below: docker-compose -f test/docker/docker-compose.prometheus.yml up -d --- test/docker/docker-compose.mpush.yml | 2 +- test/docker/docker-compose.prometheus.yml | 12 ++++++++++++ test/docker/prometheus.yml | 9 +++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 test/docker/docker-compose.prometheus.yml create mode 100644 test/docker/prometheus.yml diff --git a/test/docker/docker-compose.mpush.yml b/test/docker/docker-compose.mpush.yml index b95d43bc..00ce2d5c 100644 --- a/test/docker/docker-compose.mpush.yml +++ b/test/docker/docker-compose.mpush.yml @@ -1,5 +1,5 @@ # This file needs to be used along with `docker-compose.mocks.yml`: -# docker-compose -f test/docker/docker-compose.mocks.yml -f test/docker/docker-compose.mpush.yml ... 
+# PRIV=priv docker-compose -f test/docker/docker-compose.mocks.yml -f test/docker/docker-compose.mpush.yml ... version: '3' services: diff --git a/test/docker/docker-compose.prometheus.yml b/test/docker/docker-compose.prometheus.yml new file mode 100644 index 00000000..0ae17db2 --- /dev/null +++ b/test/docker/docker-compose.prometheus.yml @@ -0,0 +1,12 @@ +# This file needs to be used along with `docker-compose.mocks.yml` and `docker-compose.mpush.yml`: +# PRIV=priv docker-compose -f test/docker/docker-compose.mocks.yml -f test/docker/docker-compose.mpush.yml -f test/docker/docker-compose.prometheus.yml ... +version: '3' + +services: + prometheus: + image: prom/prometheus + container_name: mongoose-push-prometheus + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml diff --git a/test/docker/prometheus.yml b/test/docker/prometheus.yml new file mode 100644 index 00000000..4f0ebaa6 --- /dev/null +++ b/test/docker/prometheus.yml @@ -0,0 +1,9 @@ +scrape_configs: + - job_name: 'mongoose-push' + scheme: 'https' #MongoosePush exposes encrypted endpoint - HTTPS + tls_config: #The default certs used by MongoosePush are self-signed + insecure_skip_verify: true #For checking purposes we can ignore certs verification + static_configs: + - targets: ['mongoose-push:8443'] + labels: + group: 'production' From e775072dea964f88bc606a09f967084c8d9c56b1 Mon Sep 17 00:00:00 2001 From: Michal Piotrowski Date: Tue, 16 Jun 2020 10:06:40 +0200 Subject: [PATCH 3/8] Update metrics doc - tips for configuring Prometheus --- README.md | 50 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 2c750d06..79af6400 100644 --- a/README.md +++ b/README.md @@ -491,26 +491,40 @@ If you specify both **alert** and **data**, target device will receive both noti * **500** `{"reason" : reason}` - the server internal error occured, specified by **reason**. 
-### I use MongoosePush docker, where do I find `sys.config`? +### Metrics -If you use dockerized MongoosePush, you need to do the following: -* Start MongoosePush docker, let's assume its name is `mongoose_push` -* Run: `docker cp mongoose_push:/opt/app/var/sys.config sys.config` on you docker host (this will get the current `sys.config` to your `${CWD}`) -* Modify the `sys.config` as you see fit (for metrics, see above) -* Stop MongoosePush docker container and restart it with the modified `sys.config` as volume in `/opt/app/sys.config` (yes, this is not the path we used to copy this file from, this is an override) +MongoosePush 2.1 provides metrics in the Prometheus format on the `/metrics` endpoint. +This is a breaking change compared to previous releases. +Existing dashboards will need to be updated. +#### Available metrics -### Available metrics +#### How to quickly see all metrics + +```bash +curl -k https://127.0.0.1:8443/metrics +``` + +The above command assumes that MongoosePush runs on `localhost` and listens on port `8443`. +Please, mind the `HTTPS` protocol, metrics are hosted on the same port as other API. 
+ +#### Prometheus configuration + +When configuring Prometheus, it's important to: +* set the `scheme` to `https` since MongoosePush exposes the `/metrics` path on an encrypted (HTTPS) endpoint +* set the `insecure_skip_verify` to `true` if the default self-signed certificates are used + +```yaml +scrape_configs: + - job_name: 'mongoose-push' + scheme: 'https' #MongoosePush exposes encrypted endpoint - HTTPS + tls_config: #The default certs used by MongoosePush are self-signed + insecure_skip_verify: true #For checking purposes we can ignore certs verification + static_configs: + - targets: ['mongoose-push:8443'] + labels: + group: 'production' + +``` -The following metrics are available: -* `mongoose_push_apns_state_get_default_topic_count` -* `mongoose_push_notification_send_time_bucket{error_category=${CATEGORY},error_reason=${REASON},service=${SERVICE},status=${STATUS},le=${LENGTH}}` -* `mongoose_push_notification_send_time_sum{error_category=${CATEGORY},error_reason=${REASON},service=${SERVICE},status=${STATUS}}` -* `mongoose_push_notification_send_time_count{error_category=${CATEGORY},error_reason=${REASON},service=${SERVICE},status="${STATUS}}` -Where: -* **CATEGORY** is an arbitrary error category term or empty string -* **REASON** is an arbitrary error reason term or empty string -* **SERVICE** is either `fcm` or `apns` -* **STATUS** is either `success` or `error` -* **LENGTH** is either `100` or `250` or `500` or `1000` or `+Inf` From 1f3ce76a7364f8ba1fc4e927bd9065b319242ddb Mon Sep 17 00:00:00 2001 From: Michal Piotrowski Date: Tue, 16 Jun 2020 12:48:37 +0200 Subject: [PATCH 4/8] Document the notification sent time histogram metric --- README.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/README.md b/README.md index 79af6400..13a4129e 100644 --- a/README.md +++ b/README.md @@ -499,6 +499,32 @@ Existing dashboards will need to be updated. 
#### Available metrics +##### Histograms + +For more details about the histogram metric type please go to https://prometheus.io/docs/concepts/metric_types/#histogram + +###### Notification sent time + +`mongoose_push_notification_send_time_microsecond_bucket{error_category=${CATEGORY},error_reason=${REASON},service=${SERVICE},status=${STATUS},le=${LE}}` +`mongoose_push_notification_send_time_microsecond_sum{error_category=${CATEGORY},error_reason=${REASON},service=${SERVICE},status=${STATUS}}` +`mongoose_push_notification_send_time_microsecond_count{error_category=${CATEGORY},error_reason=${REASON},service=${SERVICE},status=${STATUS}}` + +Where: +* `STATUS` is `"success"` for the successful notifications or `"error"` in all other cases +* `SERVICE` is either `"apns"` or `"fcm"` +* `CATEGORY` is an arbitrary error category term (in case of `status="error"`) or an empty string (when `status="success"`) +* `REASON` is an arbitrary error reason term (in case of `status="error"`) or an empty string (when `status="success"`) +* `LE` defines the `less or equal` values for buckets, currently `1000`, `10_000`, `25_000`, `50_000`, `100_000`, `250_000`, `500_000`, `1000_000` or `+Inf` + +> **NOTE** +> A mesurment of value 50_000 will be added to all buckets which are less or equal to 50_000. +> In this case these are buckets `50_000`, `100_000`, `250_000`, `500_000`, `1000_000` and `+Inf` + +This histogram metric shows the distribution of times needed to: +1. Select a worker (this may include waiting time when all workers are busy). +2. Send a request. +3. Get a response from push notifications provider. 
+ #### How to quickly see all metrics ```bash From c57d94a7f9a7694789a0ba07e62aeddb192f13c1 Mon Sep 17 00:00:00 2001 From: Michal Piotrowski Date: Tue, 16 Jun 2020 13:17:19 +0200 Subject: [PATCH 5/8] Start the MongoosePush.Metrics.TelemetryMetrics before others This is to capture some of the events emitted when APNS or FCM services start --- lib/mongoose_push/application.ex | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/mongoose_push/application.ex b/lib/mongoose_push/application.ex index 991e283c..197bae97 100644 --- a/lib/mongoose_push/application.ex +++ b/lib/mongoose_push/application.ex @@ -33,8 +33,10 @@ defmodule MongoosePush.Application do _ = check_runtime_configuration_status() # Define workers and child supervisors to be supervised + # The MongoosePush.Metrics.TelemetryMetrics child is started first to capture possible events + # when services start children = - service_children() ++ [MongoosePushWeb.Endpoint, MongoosePush.Metrics.TelemetryMetrics] + [MongoosePush.Metrics.TelemetryMetrics] ++ service_children() ++ [MongoosePushWeb.Endpoint] # See http://elixir-lang.org/docs/stable/elixir/Supervisor.html # for other strategies and supported options From 777b018d55d237180ce78b854fae795e138b1be6 Mon Sep 17 00:00:00 2001 From: Michal Piotrowski Date: Tue, 16 Jun 2020 13:35:16 +0200 Subject: [PATCH 6/8] Update doc with counter metrics --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 13a4129e..3e5c7686 100644 --- a/README.md +++ b/README.md @@ -497,6 +497,9 @@ MongoosePush 2.1 provides metrics in the Prometheus format on the `/metrics` end This is a breaking change compared to previous releases. Existing dashboards will need to be updated. +It is important to know that metrics are created inside MongoosePush only when a certain event happens. +This may mean that a freshly started MongoosePush node will not have all the possible metrics available yet. 
+ #### Available metrics ##### Histograms @@ -525,6 +528,15 @@ This histogram metric shows the distribution of times needed to: 2. Send a request. 3. Get a response from push notifications provider. +##### Counters + +* `mongoose_push_supervisor_init_count{service=${SERVICE}}` - Counts number of push notification service supervisor starts. + The `SERVICE` variable can take `"apns"` or `"fcm"` as a value. + This metrics is update when MongoosePush starts or later when the underlying supervision tree is terminated and the error is propagate to the main application supervisor. +* `mongoose_push_apns_state_init_count` - Counts number of APNS state initialisations. +* `mongoose_push_apns_state_terminate_count` - Counts number of APNS state terminations. +* `mongoose_push_apns_state_get_default_topic_count` - Counts number of default topic reads from cache. + #### How to quickly see all metrics ```bash From 03f66065f3abcdbe03887684b16f308a46241197 Mon Sep 17 00:00:00 2001 From: Michal Piotrowski Date: Tue, 16 Jun 2020 13:35:44 +0200 Subject: [PATCH 7/8] Add short metrics description in the code --- .../metrics/telemetry_metrics.ex | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/lib/mongoose_push/metrics/telemetry_metrics.ex b/lib/mongoose_push/metrics/telemetry_metrics.ex index 752fffbc..feab757b 100644 --- a/lib/mongoose_push/metrics/telemetry_metrics.ex +++ b/lib/mongoose_push/metrics/telemetry_metrics.ex @@ -15,17 +15,28 @@ defmodule MongoosePush.Metrics.TelemetryMetrics do event_name: [:mongoose_push, :notification, :send], measurement: :time, buckets: [1000, 10_000, 25_000, 50_000, 100_000, 250_000, 500_000, 1000_000], - tags: [:status, :service, :error_category, :error_reason] + tags: [:status, :service, :error_category, :error_reason], + description: + "A histogram showing push notification sent times. 
Includes worker selection (with possible waiting if all are busy)" ), # measurement is ignored in Counter metric - Telemetry.Metrics.counter("mongoose_push.supervisor.init.count", tags: [:service]), - Telemetry.Metrics.counter("mongoose_push.apns.state.init.count"), + Telemetry.Metrics.counter("mongoose_push.supervisor.init.count", + tags: [:service], + description: "Counts number of push notification service supervisor starts" + ), + Telemetry.Metrics.counter("mongoose_push.apns.state.init.count", + description: "Counts number of APNS state initialisations" + ), Telemetry.Metrics.counter("mongoose_push.apns.state.terminate.count", tags: [:error_reason], - tag_values: fn metadata -> %{metadata | error_reason: metadata.reason} end + tag_values: fn metadata -> %{metadata | error_reason: metadata.reason} end, + description: "Counts number of APNS state terminations" ), - Telemetry.Metrics.counter("mongoose_push.apns.state.get_default_topic.count") + Telemetry.Metrics.counter( + "mongoose_push.apns.state.get_default_topic.count", + description: "Counts number of APNS default topic reads from the ETS cache" + ) ] end end From 7e7e451bff5a33f0acbb134e7dcc2a6040cf88e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Piotrowski?= Date: Tue, 16 Jun 2020 17:16:51 +0200 Subject: [PATCH 8/8] Apply suggestions from code review Co-authored-by: Nelson Vides --- README.md | 22 +++++++++---------- .../metrics/telemetry_metrics.ex | 13 +++++------ 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 3e5c7686..e44952ad 100644 --- a/README.md +++ b/README.md @@ -517,11 +517,13 @@ Where: * `SERVICE` is either `"apns"` or `"fcm"` * `CATEGORY` is an arbitrary error category term (in case of `status="error"`) or an empty string (when `status="success"`) * `REASON` is an arbitrary error reason term (in case of `status="error"`) or an empty string (when `status="success"`) -* `LE` defines the `less or equal` values for buckets, currently `1000`, 
`10_000`, `25_000`, `50_000`, `100_000`, `250_000`, `500_000`, `1000_000` or `+Inf` +* `LE` defines the `upper inclusive bound` (`less than or equal`) values for buckets, currently `1000`, `10_000`, `25_000`, `50_000`, `100_000`, `250_000`, `500_000`, `1000_000` or `+Inf` > **NOTE** -> A mesurment of value 50_000 will be added to all buckets which are less or equal to 50_000. -> In this case these are buckets `50_000`, `100_000`, `250_000`, `500_000`, `1000_000` and `+Inf` +> +> A bucket of value 250_000 will keep the count of measurements that are less than or equal to 250_000. +> A measurement of value 51_836 will be added to all the buckets where the upper bound is greater than 51_836. +> In this case these are buckets `100_000`, `250_000`, `500_000`, `1000_000` and `+Inf` This histogram metric shows the distribution of times needed to: 1. Select a worker (this may include waiting time when all workers are busy). @@ -530,12 +532,12 @@ This histogram metric shows the distribution of times needed to: ##### Counters -* `mongoose_push_supervisor_init_count{service=${SERVICE}}` - Counts number of push notification service supervisor starts. +* `mongoose_push_supervisor_init_count{service=${SERVICE}}` - Counts the number of push notification service supervisor starts. The `SERVICE` variable can take `"apns"` or `"fcm"` as a value. - This metrics is update when MongoosePush starts or later when the underlying supervision tree is terminated and the error is propagate to the main application supervisor. -* `mongoose_push_apns_state_init_count` - Counts number of APNS state initialisations. -* `mongoose_push_apns_state_terminate_count` - Counts number of APNS state terminations. -* `mongoose_push_apns_state_get_default_topic_count` - Counts number of default topic reads from cache. + This metric is updated when MongoosePush starts, and later on when the underlying supervision tree is terminated and the error is propagated to the main application supervisor. 
+* `mongoose_push_apns_state_init_count` - Counts the number of APNS state initialisations. +* `mongoose_push_apns_state_terminate_count` - Counts the number of APNS state terminations. +* `mongoose_push_apns_state_get_default_topic_count` - Counts the number of default topic reads from cache. #### How to quickly see all metrics @@ -544,7 +546,7 @@ curl -k https://127.0.0.1:8443/metrics ``` The above command assumes that MongoosePush runs on `localhost` and listens on port `8443`. -Please, mind the `HTTPS` protocol, metrics are hosted on the same port as other API. +Please mind the `HTTPS` protocol: metrics are hosted on the same port as all the other API endpoints. #### Prometheus configuration @@ -564,5 +566,3 @@ scrape_configs: group: 'production' ``` - - diff --git a/lib/mongoose_push/metrics/telemetry_metrics.ex b/lib/mongoose_push/metrics/telemetry_metrics.ex index feab757b..f11c22e1 100644 --- a/lib/mongoose_push/metrics/telemetry_metrics.ex +++ b/lib/mongoose_push/metrics/telemetry_metrics.ex @@ -17,25 +17,24 @@ defmodule MongoosePush.Metrics.TelemetryMetrics do buckets: [1000, 10_000, 25_000, 50_000, 100_000, 250_000, 500_000, 1000_000], tags: [:status, :service, :error_category, :error_reason], description: - "A histogram showing push notification sent times. 
Includes worker selection (with possible waiting if all are busy)" ), # measurement is ignored in Counter metric Telemetry.Metrics.counter("mongoose_push.supervisor.init.count", tags: [:service], - description: "Counts number of push notification service supervisor starts" + description: "Counts the number of push notification service supervisor starts" ), Telemetry.Metrics.counter("mongoose_push.apns.state.init.count", - description: "Counts number of APNS state initialisations" + description: "Counts the number of APNS state initialisations" ), Telemetry.Metrics.counter("mongoose_push.apns.state.terminate.count", tags: [:error_reason], tag_values: fn metadata -> %{metadata | error_reason: metadata.reason} end, - description: "Counts number of APNS state terminations" + description: "Counts the number of APNS state terminations" ), - Telemetry.Metrics.counter( - "mongoose_push.apns.state.get_default_topic.count", - description: "Counts number of APNS default topic reads from the ETS cache" + Telemetry.Metrics.counter("mongoose_push.apns.state.get_default_topic.count", + description: "Counts the number of APNS default topic reads from the ETS cache" ) ] end