diff --git a/.gitmodules b/.gitmodules index 3955ade5ce..abba677814 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,12 +1,6 @@ -[submodule "modules/metrics"] - path = modules/metrics - url = https://github.com/tarantool/metrics.git [submodule "modules/luatest"] path = modules/luatest url = https://github.com/tarantool/luatest -[submodule "modules/grafana-dashboard"] - path = modules/grafana-dashboard - url = https://github.com/tarantool/grafana-dashboard [submodule "modules/tntcxx"] path = modules/tntcxx url = https://github.com/tarantool/tntcxx.git diff --git a/build_submodules.sh b/build_submodules.sh index e19e7258a0..908af47df7 100755 --- a/build_submodules.sh +++ b/build_submodules.sh @@ -11,18 +11,6 @@ po_dest="${project_root}/locale/ru/LC_MESSAGES" # Copy Building Tarantool Docs guide cp README.rst doc/contributing/docs/_includes/README.rst - -# Monitoring -monitoring_root="${project_root}/modules/metrics/doc/monitoring" -monitoring_dest="${project_root}/doc/book" -monitoring_grafana_root="${project_root}/modules/grafana-dashboard/doc/monitoring" - -# Copy monitoring docs to the right destination -mkdir -p "${monitoring_dest}" -cp -rfv "${monitoring_root}" "${monitoring_dest}/" -cp -rfv "${monitoring_grafana_root}" "${monitoring_dest}/" - - # Luatest luatest_root="${project_root}/modules/luatest" luatest_dest="${project_root}/doc/reference/reference_rock/luatest" diff --git a/doc/book/admin/index.rst b/doc/book/admin/index.rst index 0396a88f28..cbdac9af55 100644 --- a/doc/book/admin/index.rst +++ b/doc/book/admin/index.rst @@ -41,4 +41,4 @@ This chapter includes the following sections: os_notes bug_reports troubleshoot - ../monitoring/index + monitoring diff --git a/doc/book/admin/monitoring.rst b/doc/book/admin/monitoring.rst new file mode 100644 index 0000000000..24a696f3fc --- /dev/null +++ b/doc/book/admin/monitoring.rst @@ -0,0 +1,17 @@ +.. _monitoring: + +Monitoring +========== + +Monitoring is the process of capturing runtime information about the instances of a Tarantool cluster using metrics. +Metrics can indicate various characteristics, such as memory usage, the number of records in spaces, replication status, and so on. +Typically, metrics are monitored in real time, allowing for the identification of current issues or the prediction of potential ones. + +.. toctree:: + :maxdepth: 1 + :numbered: 0 + + monitoring/getting_started + monitoring/grafana_dashboard + monitoring/alerting + monitoring/metrics_reference diff --git a/doc/book/admin/monitoring/alerting.rst b/doc/book/admin/monitoring/alerting.rst new file mode 100644 index 0000000000..796f1d5692 --- /dev/null +++ b/doc/book/admin/monitoring/alerting.rst @@ -0,0 +1,420 @@ +.. _monitoring-alerting-page: + +=============================================================================== +Alerting +=============================================================================== + +You can set up alerts on metrics to get a notification when something went +wrong. We will use `Prometheus alert rules `_ +as an example here. You can get full ``alerts.yml`` file at +`tarantool/grafana-dashboard GitHub repo `_. + +.. 
_monitoring-alerting-tarantool:
+
+-------------------------------------------------------------------------------
+Tarantool metrics
+-------------------------------------------------------------------------------
+
+You can use internal Tarantool metrics to monitor detailed RAM consumption,
+replication state, and database engine status, as well as to track business
+logic issues (like HTTP 4xx and 5xx responses or a low request rate) and
+external module statistics (like ``CRUD`` errors). Evaluation timeouts, severity
+levels, and thresholds (especially the ones for business logic) are placed here
+for the sake of example: you may want to increase or decrease them for your
+application. Also, don't forget to set sane rate time ranges based on your
+Prometheus configuration.
+
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+Lua memory
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+Monitoring the ``tnt_info_memory_lua`` metric helps prevent memory overflow and detect bad Lua code practices.
+
+.. NOTE::
+
+    The Lua memory is limited to 2 GB per instance if Tarantool doesn't have the GC64 mode enabled for LuaJIT.
+
+.. code-block:: yaml
+
+    - alert: HighLuaMemoryWarning
+      expr: tnt_info_memory_lua >= (512 * 1024 * 1024)
+      for: 1m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') Lua runtime warning"
+        description: "'{{ $labels.alias }}' instance of job '{{ $labels.job }}' uses too much Lua memory
+          and may hit threshold soon."
+
+    - alert: HighLuaMemoryAlert
+      expr: tnt_info_memory_lua >= (1024 * 1024 * 1024)
+      for: 1m
+      labels:
+        severity: page
+      annotations:
+        summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') Lua runtime alert"
+        description: "'{{ $labels.alias }}' instance of job '{{ $labels.job }}' uses too much Lua memory
+          and likely to hit threshold soon."
+
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+Memtx arena memory
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+By monitoring :ref:`slab allocation statistics `, you can see
+how much free RAM remains to store memtx tuples and indexes on an
+instance. If Tarantool hits the limit, the instance becomes unavailable
+for write operations, so this alert can help you see when it's time to increase
+your ``memtx_memory`` limit or to add a new storage to a vshard cluster.
+
+.. code-block:: yaml
+
+    - alert: LowMemtxArenaRemainingWarning
+      expr: (tnt_slab_quota_used_ratio >= 80) and (tnt_slab_arena_used_ratio >= 80)
+      for: 1m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') low arena memory remaining"
+        description: "Low arena memory (tuples and indexes) remaining for '{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
+          Consider increasing memtx_memory or number of storages in case of sharded data."
+
+    - alert: LowMemtxArenaRemaining
+      expr: (tnt_slab_quota_used_ratio >= 90) and (tnt_slab_arena_used_ratio >= 90)
+      for: 1m
+      labels:
+        severity: page
+      annotations:
+        summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') low arena memory remaining"
+        description: "Low arena memory (tuples and indexes) remaining for '{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
+          You are likely to hit limit soon.
+          It is strongly recommended to increase memtx_memory or number of storages in case of sharded data."
+ + - alert: LowMemtxItemsRemainingWarning + expr: (tnt_slab_quota_used_ratio >= 80) and (tnt_slab_items_used_ratio >= 80) + for: 1m + labels: + severity: warning + annotations: + summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') low items memory remaining" + description: "Low items memory (tuples) remaining for '{{ $labels.alias }}' instance of job '{{ $labels.job }}'. + Consider increasing memtx_memory or number of storages in case of sharded data." + + - alert: LowMemtxItemsRemaining + expr: (tnt_slab_quota_used_ratio >= 90) and (tnt_slab_items_used_ratio >= 90) + for: 1m + labels: + severity: page + annotations: + summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') low items memory remaining" + description: "Low items memory (tuples) remaining for '{{ $labels.alias }}' instance of job '{{ $labels.job }}'. + You are likely to hit limit soon. + It is strongly recommended to increase memtx_memory or number of storages in case of sharded data." + +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +Vinyl engine status +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +You can monitor :ref:`vinyl regulator ` +performance to track possible scheduler or disk issues. + +.. code-block:: yaml + + - alert: LowVinylRegulatorRateLimit + expr: tnt_vinyl_regulator_rate_limit < 100000 + for: 1m + labels: + severity: warning + annotations: + summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') have low vinyl regulator rate limit" + description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' have low vinyl engine regulator rate limit. + This indicates issues with the disk or the scheduler." + + +:ref:`Vinyl transactions ` errors are likely +to lead to user requests errors. + +.. code-block:: yaml + + - alert: HighVinylTxConflictRate + expr: rate(tnt_vinyl_tx_conflict[5m]) / rate(tnt_vinyl_tx_commit[5m]) > 0.05 + for: 1m + labels: + severity: critical + annotations: + summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') have high vinyl tx conflict rate" + description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' have + high vinyl transactions conflict rate. It indicates that vinyl is not healthy." + +:ref:`Vinyl scheduler ` failed tasks +are a good signal of disk issues and may be the reason of increasing RAM +consumption. + +.. code-block:: yaml + + - alert: HighVinylSchedulerFailedTasksRate + expr: rate(tnt_vinyl_scheduler_tasks{status="failed"}[5m]) > 0.1 + for: 1m + labels: + severity: critical + annotations: + summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') have high vinyl scheduler failed tasks rate" + description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' have + high vinyl scheduler failed tasks rate." + + +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +Replication state +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +If ``tnt_replication_status`` is equal to ``0``, instance :ref:`replication ` +status is not equal to ``"follows"``: replication is either not ready yet or +has been stopped due to some reason. + +.. 
code-block:: yaml
+
+    - alert: ReplicationNotRunning
+      expr: tnt_replication_status == 0
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') {{ $labels.stream }} (id {{ $labels.id }})
+          replication is not running"
+        description: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') {{ $labels.stream }} (id {{ $labels.id }})
+          replication is not running."
+
+Even if async replication is in the ``"follows"`` state, it can be considered
+malfunctioning if the lag is too high. It may also affect Tarantool garbage
+collector work, see :ref:`box.info.gc() `.
+
+.. code-block:: yaml
+
+    - alert: HighReplicationLag
+      expr: tnt_replication_lag > 1
+      for: 1m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') have high replication lag (id {{ $labels.id }})"
+        description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' have high replication lag
+          (id {{ $labels.id }}), check up your network and cluster state."
+
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+Event loop
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+High :ref:`fiber ` event loop time leads to bad application
+performance, timeouts, and various warnings. The reason could be a high number
+of working fibers or fibers that spend too much time without any yields or
+sleeps.
+
+.. code-block:: yaml
+
+    - alert: HighEVLoopTime
+      expr: tnt_ev_loop_time > 0.1
+      for: 1m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') event loop has high cycle duration"
+        description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' event loop has high cycle duration.
+          Some high loaded fiber has too little yields. It may be the reason of 'Too long WAL write' warnings."
+
+
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+Configuration status
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+:ref:`Configuration status ` displays the
+Tarantool 3 configuration apply state. Additional metrics display the count
+of apply warnings and errors.
+
+.. code-block:: yaml
+
+    - alert: ConfigWarningAlerts
+      expr: tnt_config_alerts{level="warn"} > 0
+      for: 1m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has configuration 'warn' alerts"
+        description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has configuration 'warn' alerts.
+          Please, check config:info() for detailed info."
+
+    - alert: ConfigErrorAlerts
+      expr: tnt_config_alerts{level="error"} > 0
+      for: 1m
+      labels:
+        severity: page
+      annotations:
+        summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has configuration 'error' alerts"
+        description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has configuration 'error' alerts.
+          Latest configuration has not been applied.
+          Please, check config:info() for detailed info."
+
+    - alert: ConfigStatusNotReady
+      expr: tnt_config_status{status="ready"} == 0
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') configuration is not ready"
+        description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' configuration is not ready.
+          Please, check config:info() for detailed info."
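+
+When any of these configuration alerts fire, you can query the affected instance
+directly for details, as the alert descriptions suggest. The sketch below is an
+illustrative example only: it assumes access to the instance console (for example,
+via ``tt connect``), and the exact structure of the returned tables may differ
+between Tarantool 3.x versions.
+
+.. code-block:: lua
+
+    local config = require('config')
+    local yaml = require('yaml')
+
+    -- Overall apply status: 'ready', 'check_warnings', 'check_errors', and so on
+    print(config:info().status)
+
+    -- Current configuration warnings and errors, if any
+    print(yaml.encode(config:info().alerts))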
+
+
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+HTTP server statistics
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+:ref:`metrics ` allows you to monitor `tarantool/http `_
+handles, see :ref:`"Collecting HTTP request latency statistics" `.
+Here we use a ``summary`` collector with the default name and 0.99 quantile
+computation.
+
+Too many responses with error codes are usually a sign of API issues or
+application malfunction.
+
+.. code-block:: yaml
+
+    - alert: HighInstanceHTTPClientErrorRate
+      expr: sum by (job, instance, method, path, alias) (rate(http_server_request_latency_count{ job="tarantool", status=~"^4\\d{2}$" }[5m])) > 10
+      for: 1m
+      labels:
+        severity: page
+      annotations:
+        summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') high rate of client error responses"
+        description: "Too many {{ $labels.method }} requests to {{ $labels.path }} path
+          on '{{ $labels.alias }}' instance of job '{{ $labels.job }}' get client error (4xx) responses."
+
+    - alert: HighHTTPClientErrorRate
+      expr: sum by (job, method, path) (rate(http_server_request_latency_count{ job="tarantool", status=~"^4\\d{2}$" }[5m])) > 20
+      for: 1m
+      labels:
+        severity: page
+      annotations:
+        summary: "Job '{{ $labels.job }}' high rate of client error responses"
+        description: "Too many {{ $labels.method }} requests to {{ $labels.path }} path
+          on instances of job '{{ $labels.job }}' get client error (4xx) responses."
+
+    - alert: HighHTTPServerErrorRate
+      expr: sum by (job, instance, method, path, alias) (rate(http_server_request_latency_count{ job="tarantool", status=~"^5\\d{2}$" }[5m])) > 0
+      for: 1m
+      labels:
+        severity: page
+      annotations:
+        summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') server error responses"
+        description: "Some {{ $labels.method }} requests to {{ $labels.path }} path
+          on '{{ $labels.alias }}' instance of job '{{ $labels.job }}' get server error (5xx) responses."
+
+High response latency is a sign of insufficient performance. It may indicate
+application malfunction, or it may mean that you need to add more routers to
+your cluster.
+
+.. code-block:: yaml
+
+    - alert: HighHTTPLatency
+      expr: http_server_request_latency{ job="tarantool", quantile="0.99" } > 0.1
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') high HTTP latency"
+        description: "Some {{ $labels.method }} requests to {{ $labels.path }} path with {{ $labels.status }} response status
+          on '{{ $labels.alias }}' instance of job '{{ $labels.job }}' are processed too long."
+
+Receiving too few requests when you expect them may indicate a balancer,
+external client, or network malfunction.
+
+.. code-block:: yaml
+
+    - alert: LowRouterHTTPRequestRate
+      expr: sum by (job, instance, alias) (rate(http_server_request_latency_count{ job="tarantool", alias=~"^.*router.*$" }[5m])) < 10
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Router '{{ $labels.alias }}' ('{{ $labels.job }}') low activity"
+        description: "Router '{{ $labels.alias }}' instance of job '{{ $labels.job }}' gets too little requests.
+          Please, check up your balancer middleware."
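+
+The alerts above assume that HTTP request latency is already collected under the
+default collector name. A rough sketch of enabling such collection with the
+``metrics`` HTTP middleware is shown below; the address, port, route, and handler
+are placeholders, and the exact API is described in the linked section on
+collecting HTTP request latency statistics.
+
+.. code-block:: lua
+
+    local metrics = require('metrics')
+    local http_middleware = metrics.http_middleware
+
+    -- Use a summary collector with the default name
+    -- (http_server_request_latency) and default quantiles
+    http_middleware.configure_default_collector('summary')
+
+    -- Wrap a route handler so that its latency and response status are observed
+    local httpd = require('http.server').new('0.0.0.0', 8081)
+    httpd:route({ path = '/hello', method = 'GET' }, http_middleware.v1(function(req)
+        return { status = 200, body = 'Hello' }
+    end))
+    httpd:start()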
+
+
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+CRUD module statistics
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+If your application uses the `CRUD `_ module,
+monitoring module statistics can help you track internal errors caused by
+invalid processing of input and internal parameters.
+
+.. code-block:: yaml
+
+    - alert: HighCRUDErrorRate
+      expr: rate(tnt_crud_stats_count{ job="tarantool", status="error" }[5m]) > 0.1
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') too many CRUD {{ $labels.operation }} errors."
+        description: "Too many {{ $labels.operation }} CRUD requests for '{{ $labels.name }}' space on
+          '{{ $labels.alias }}' instance of job '{{ $labels.job }}' get module error responses."
+
+Statistics can also be used to monitor request performance. High request latency
+leads to high latency of client responses and may be caused by network
+or disk issues. Read requests with bad conditions (with respect to space indexes
+and the sharding schema) may lead to full scans or map-reduce operations and can
+also be the reason for high latency.
+
+.. code-block:: yaml
+
+    - alert: HighCRUDLatency
+      expr: tnt_crud_stats{ job="tarantool", quantile="0.99" } > 0.1
+      for: 1m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') too high CRUD {{ $labels.operation }} latency."
+        description: "Some {{ $labels.operation }} {{ $labels.status }} CRUD requests for '{{ $labels.name }}' space on
+          '{{ $labels.alias }}' instance of job '{{ $labels.job }}' are processed too long."
+
+You can also directly monitor the map-reduce and scan rates.
+
+.. code-block:: yaml
+
+    - alert: HighCRUDMapReduceRate
+      expr: rate(tnt_crud_map_reduces{ job="tarantool" }[5m]) > 0.1
+      for: 1m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') too many CRUD {{ $labels.operation }} map reduces."
+        description: "There are too many {{ $labels.operation }} CRUD map reduce requests for '{{ $labels.name }}' space on
+          '{{ $labels.alias }}' instance of job '{{ $labels.job }}'.
+          Check your request conditions or consider changing sharding schema."
+
+
+.. _monitoring-alerting-server:
+
+-------------------------------------------------------------------------------
+Server-side monitoring
+-------------------------------------------------------------------------------
+
+If there are no Tarantool metrics at all, you may miss critical conditions.
+Prometheus provides the ``up`` metric to monitor the health of its targets.
+
+.. code-block:: yaml
+
+    - alert: InstanceDown
+      expr: up == 0
+      for: 1m
+      labels:
+        severity: page
+      annotations:
+        summary: "Instance '{{ $labels.instance }}' ('{{ $labels.job }}') down"
+        description: "'{{ $labels.instance }}' of job '{{ $labels.job }}' has been down for more than a minute."
+
+Do not forget to monitor your server's CPU, disk, and RAM with your favorite
+server-side tools. For example, under high CPU consumption, a Tarantool
+instance may stop sending metrics, so such breakdowns can be tracked only from
+the outside.
diff --git a/doc/book/admin/monitoring/getting_started.rst b/doc/book/admin/monitoring/getting_started.rst
new file mode 100644
index 0000000000..e36b1dbe3f
--- /dev/null
+++ b/doc/book/admin/monitoring/getting_started.rst
@@ -0,0 +1,111 @@
+..
_monitoring-getting_started: + +Getting started with monitoring +=============================== + +Example on GitHub: `sharded_cluster_crud_metrics `_ + +Tarantool allows you to configure and expose its :ref:`metrics ` using a :ref:`YAML configuration `. +You can also use the built-in :ref:`metrics ` module to create and collect custom metrics. + + + + +.. _monitoring_configuring_metrics: + +Configuring metrics +------------------- + +To configure metrics, use the :ref:`metrics ` section in a cluster configuration. +The configuration below enables all metrics excluding :ref:`vinyl `-specific ones: + +.. literalinclude:: /code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/config.yaml + :start-at: metrics: + :end-at: instance_name + :language: yaml + :dedent: + +The ``metrics.labels`` option accepts the predefined :ref:`{{ instance_name }} ` variable. +This adds an instance name as a :ref:`label ` to every observation. + +Third-party Lua modules, like `crud `_ or `expirationd `_, offer their own metrics. +You can enable these metrics by :ref:`configuring the corresponding role `. +The example below shows how to enable statistics on called operations by providing the ``roles.crud-router`` role's configuration: + +.. literalinclude:: /code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/config.yaml + :language: yaml + :start-after: routers: + :end-at: stats_quantiles + :dedent: + +``expirationd`` metrics can be enabled as follows: + +.. code-block:: yaml + + expirationd: + cfg: + metrics: true + + + +.. _monitoring_exposing_metrics: + +Exposing metrics +---------------- + +To expose metrics in different formats, you can use a third-party `metrics-export-role `__ role. +In the following example, the metrics of ``storage-a-001`` are provided on two endpoints: + +- ``/metrics/prometheus``: exposes metrics in the Prometheus format. +- ``/metrics/json``: exposes metrics in the JSON format. + +.. literalinclude:: /code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/config.yaml + :start-at: storage-a-001: + :end-at: format: json + :language: yaml + :dedent: + +Example on GitHub: `sharded_cluster_crud_metrics `_ + +.. NOTE:: + + The ``metrics`` module provides a set of plugins that can be used to collect and expose metrics in different formats. Learn more in :ref:`metrics-api_reference_collecting_using_plugins`. + + + +.. _monitoring_create_metrics: + +Creating custom metrics +----------------------- + +The ``metrics`` module allows you to create and collect custom metrics. +The example below shows how to collect the number of data operations performed on the specified space by increasing a ``counter`` value inside the :ref:`on_replace() ` trigger function: + +.. literalinclude:: /code_snippets/snippets/config/instances.enabled/metrics_collect_custom/examples/collect_custom_replace_count.lua + :start-after: -- Collect a custom metric + :end-before: -- End + :language: lua + :dedent: + +Learn more in :ref:`metrics-api_reference_custom_metrics`. + + + +.. _monitoring_collecting_metrics: + +Collecting metrics +------------------ + +When metrics are configured and exposed, you can use the desired third-party tool to collect them. +Below is the example of a Prometheus scrape configuration that collects metrics of multiple Tarantool instances: + +.. 
literalinclude:: /code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/prometheus/prometheus.yml + :language: yaml + :dedent: + +For more information on collecting and visualizing metrics, refer to :ref:`monitoring-grafana_dashboard-page`. + +.. NOTE:: + + |tcm_full_name| allows you to view metrics of connected clusters in real time. + Learn more in :ref:`tcm_cluster_metrics`. diff --git a/doc/book/admin/monitoring/grafana_dashboard.rst b/doc/book/admin/monitoring/grafana_dashboard.rst new file mode 100644 index 0000000000..fb596fbd88 --- /dev/null +++ b/doc/book/admin/monitoring/grafana_dashboard.rst @@ -0,0 +1,171 @@ +.. _monitoring-grafana_dashboard-page: + +Grafana dashboard +================= + +After :ref:`enabling and configuring metrics `, you can visualise them using Tarantool Grafana dashboards. +These dashboards are available as part of +`Grafana official & community-built dashboards `_: + +.. container:: table + + .. list-table:: + :widths: 50 50 + :header-rows: 0 + + * - Tarantool 3 + - `Prometheus `_, `InfluxDB `_ + + * - Tarantool Cartridge and Tarantool 1.10—2.x + - `Prometheus `_, `InfluxDB `_ + + * - Tarantool Data Grid 2 + - `Prometheus `_, `InfluxDB `_ + +The Tarantool Grafana dashboard is a ready for import template with basic memory, +space operations, and HTTP load panels, based on default `metrics `_ +package functionality. + +.. image:: images/Prometheus_dashboard_1.png + :width: 30% + +.. image:: images/Prometheus_dashboard_2.png + :width: 30% + +.. image:: images/Prometheus_dashboard_3.png + :width: 30% + +.. _monitoring-grafana_dashboard-monitoring_stack: + + +Prepare a monitoring stack +-------------------------- + +Since there are Prometheus and InfluxDB data source Grafana dashboards, +you can use one of the following: + +- `Telegraf `_ + as a server agent for collecting metrics, `InfluxDB `_ + as a time series database for storing metrics, and `Grafana `_ + as a visualization platform. +- `Prometheus `_ as both a server agent for collecting metrics + and a time series database for storing metrics, and `Grafana `_ + as a visualization platform. + +For issues related to setting up Prometheus, Telegraf, InfluxDB, or Grafana instances, refer to the corresponding project's documentation. + +.. _monitoring-grafana_dashboard-collect_metrics: + +Collect metrics with server agents +---------------------------------- + +.. _monitoring-grafana_dashboard-collect_metrics_prometheus: + +Prometheus +~~~~~~~~~~ + +To collect metrics for Prometheus, first set up metrics output with ``prometheus`` format. +You can use the :ref:`roles.metrics-export ` configuration or set up the :ref:`Prometheus plugin ` manually. +To start collecting metrics, `add a job `_ +to Prometheus configuration with each Tarantool instance URI as a target and +metrics path as it was configured on Tarantool instances: + +.. literalinclude:: /code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/prometheus/prometheus.yml + :language: yaml + :dedent: + +.. _monitoring-grafana_dashboard-collect_metrics_influxdb: + +InfluxDB +~~~~~~~~ + +To collect metrics for InfluxDB, use the Telegraf agent. +First off, configure Tarantool metrics output in ``json`` format +with :ref:`roles.metrics-export ` configuration or corresponding :ref:`JSON plugin `. +To start collecting metrics, add `http input `_ +to Telegraf configuration including each Tarantool instance metrics URL: + +.. 
code-block:: toml + + [[inputs.http]] + urls = [ + "http://example_project:8081/metrics/json", + "http://example_project:8082/metrics/json", + "http://example_project:8083/metrics/json", + "http://example_project:8084/metrics/json", + "http://example_project:8085/metrics/json" + ] + timeout = "30s" + tag_keys = [ + "metric_name", + "label_pairs_alias", + "label_pairs_quantile", + "label_pairs_path", + "label_pairs_method", + "label_pairs_status", + "label_pairs_operation", + "label_pairs_level", + "label_pairs_id", + "label_pairs_engine", + "label_pairs_name", + "label_pairs_index_name", + "label_pairs_delta", + "label_pairs_stream", + "label_pairs_thread", + "label_pairs_kind" + ] + insecure_skip_verify = true + interval = "10s" + data_format = "json" + name_prefix = "tarantool_" + fieldpass = ["value"] + +Be sure to include each label key as ``label_pairs_`` to extract it +with the plugin. +For example, if you use :code:`{ state = 'ready' }` labels somewhere in metric collectors, add ``label_pairs_state`` tag key. + + + +.. _monitoring-grafana_dashboard-import: + +Import the dashboard +-------------------- + +Open Grafana import menu. + +.. image:: images/grafana_import.png + :align: left + +To import a specific dashboard, choose one of the following options: + +- paste the dashboard id (``21474`` for Prometheus dashboard, ``21484`` for InfluxDB dashboard) +- paste a link to the dashboard (https://grafana.com/grafana/dashboards/21474 for Prometheus dashboard, https://grafana.com/grafana/dashboards/21484 for InfluxDB dashboard) +- paste the dashboard JSON file contents +- upload the dashboard JSON file + +Set dashboard name, folder and uid (if needed). + +.. image:: images/grafana_import_setup.png + :align: left + +You can choose the data source and data source variables after import. + +.. image:: images/grafana_variables_setup.png + :align: left + +.. _monitoring-grafana_dashboard-troubleshooting: + +Troubleshooting +--------------- + +- If there are no data on the graphs, make sure that you picked datasource and job/measurement correctly. + +- If there are no data on the graphs, make sure that you have ``info`` group of Tarantool metrics + (in particular, ``tnt_info_uptime``). + +- If some Prometheus graphs show no data because of ``parse error: missing unit character in duration``, + ensure that you use Grafana 7.2 or newer. + +- If some Prometheus graphs display ``parse error: bad duration syntax "1m0"`` or similar error, you need + to update your Prometheus version. See + `grafana/grafana#44542 `_ for more details. 
diff --git a/doc/book/admin/monitoring/images/Prometheus_dashboard_1.png b/doc/book/admin/monitoring/images/Prometheus_dashboard_1.png new file mode 100644 index 0000000000..3ea5fed7ce Binary files /dev/null and b/doc/book/admin/monitoring/images/Prometheus_dashboard_1.png differ diff --git a/doc/book/admin/monitoring/images/Prometheus_dashboard_2.png b/doc/book/admin/monitoring/images/Prometheus_dashboard_2.png new file mode 100644 index 0000000000..5cf04310ce Binary files /dev/null and b/doc/book/admin/monitoring/images/Prometheus_dashboard_2.png differ diff --git a/doc/book/admin/monitoring/images/Prometheus_dashboard_3.png b/doc/book/admin/monitoring/images/Prometheus_dashboard_3.png new file mode 100644 index 0000000000..c03c0e7f03 Binary files /dev/null and b/doc/book/admin/monitoring/images/Prometheus_dashboard_3.png differ diff --git a/doc/book/admin/monitoring/images/grafana_import.png b/doc/book/admin/monitoring/images/grafana_import.png new file mode 100644 index 0000000000..1260367d93 Binary files /dev/null and b/doc/book/admin/monitoring/images/grafana_import.png differ diff --git a/doc/book/admin/monitoring/images/grafana_import_setup.png b/doc/book/admin/monitoring/images/grafana_import_setup.png new file mode 100644 index 0000000000..23400abb14 Binary files /dev/null and b/doc/book/admin/monitoring/images/grafana_import_setup.png differ diff --git a/doc/book/admin/monitoring/images/grafana_variables_setup.png b/doc/book/admin/monitoring/images/grafana_variables_setup.png new file mode 100644 index 0000000000..4737b4ee69 Binary files /dev/null and b/doc/book/admin/monitoring/images/grafana_variables_setup.png differ diff --git a/doc/book/admin/monitoring/metrics_reference.rst b/doc/book/admin/monitoring/metrics_reference.rst new file mode 100644 index 0000000000..3a1db70d8f --- /dev/null +++ b/doc/book/admin/monitoring/metrics_reference.rst @@ -0,0 +1,986 @@ +.. _metrics-reference: + +Metrics reference +================= + +This page provides a detailed description of metrics from the ``metrics`` module. + +General metrics +--------------- + +General instance information: + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_cfg_current_time`` + - Instance system time in the Unix timestamp format + * - ``tnt_info_uptime`` + - Time in seconds since the instance has started + * - ``tnt_read_only`` + - Indicates if the instance is in read-only mode (``1`` if true, ``0`` if false) + +.. _metrics-reference-memory_general: + +Memory general +-------------- + +The following metrics provide a picture of memory usage by the Tarantool process. + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_info_memory_cache`` + - Number of bytes in the cache used to store + tuples with the vinyl storage engine. + * - ``tnt_info_memory_data`` + - Number of bytes used to store user data (tuples) + with the memtx engine and with level 0 of the vinyl engine, + without regard for memory fragmentation. + * - ``tnt_info_memory_index`` + - Number of bytes used for indexing user data. + Includes memtx and vinyl memory tree extents, + the vinyl page index, and the vinyl bloom filters. + * - ``tnt_info_memory_lua`` + - Number of bytes used for the Lua runtime. + Monitoring this metric can prevent memory overflow. + * - ``tnt_info_memory_net`` + - Number of bytes used for network input/output buffers. + * - ``tnt_info_memory_tx`` + - Number of bytes in use by active transactions. 
+ For the vinyl storage engine, + this is the total size of all allocated objects + (struct ``txv``, struct ``vy_tx``, struct ``vy_read_interval``) + and tuples pinned for those objects. + +.. _metrics-reference-memory_allocation: + +Memory allocation +----------------- + +Provides a memory usage report for the slab allocator. +The slab allocator is the main allocator used to store tuples. +The following metrics help monitor the total memory usage and memory fragmentation. +To learn more about use cases, refer to the +:ref:`box.slab submodule documentation `. + +Available memory, bytes: + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_slab_quota_size`` + - Amount of memory available to store tuples and indexes. + Is equal to ``memtx_memory``. + * - ``tnt_slab_arena_size`` + - Total memory available to store both tuples and indexes. + Includes allocated but currently free slabs. + * - ``tnt_slab_items_size`` + - Total amount of memory available to store only tuples and not indexes. + Includes allocated but currently free slabs. + +Memory usage, bytes: + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_slab_quota_used`` + - The amount of memory that is already reserved by the slab allocator. + * - ``tnt_slab_arena_used`` + - The effective memory used to store both tuples and indexes. + Disregards allocated but currently free slabs. + * - ``tnt_slab_items_used`` + - The effective memory used to store only tuples and not indexes. + Disregards allocated but currently free slabs. + +Memory utilization, %: + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_slab_quota_used_ratio`` + - ``tnt_slab_quota_used / tnt_slab_quota_size`` + * - ``tnt_slab_arena_used_ratio`` + - ``tnt_slab_arena_used / tnt_slab_arena_size`` + * - ``tnt_slab_items_used_ratio`` + - ``tnt_slab_items_used / tnt_slab_items_size`` + +.. _metrics-reference-spaces: + +Spaces +------ + +The following metrics provide specific information +about each individual space in a Tarantool instance. + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_space_len`` + - Number of records in the space. + This metric always has 2 labels: ``{name="test", engine="memtx"}``, + where ``name`` is the name of the space and + ``engine`` is the engine of the space. + * - ``tnt_space_bsize`` + - Total number of bytes in all tuples. + This metric always has 2 labels: ``{name="test", engine="memtx"}``, + where ``name`` is the name of the space + and ``engine`` is the engine of the space. + * - ``tnt_space_index_bsize`` + - Total number of bytes taken by the index. + This metric always has 2 labels: ``{name="test", index_name="pk"}``, + where ``name`` is the name of the space and + ``index_name`` is the name of the index. + * - ``tnt_space_total_bsize`` + - Total size of tuples and all indexes in the space. + This metric always has 2 labels: ``{name="test", engine="memtx"}``, + where ``name`` is the name of the space and + ``engine`` is the engine of the space. + * - ``tnt_vinyl_tuples`` + - Total tuple count for vinyl. + This metric always has 2 labels: ``{name="test", engine="vinyl"}``, + where ``name`` is the name of the space and + ``engine`` is the engine of the space. For vinyl this metric is disabled + by default and can be enabled only with global variable setup: + ``rawset(_G, 'include_vinyl_count', true)``. + +.. 
_metrics-reference-network: + +Network +------- + +Network activity stats. +These metrics can be used to monitor network load, usage peaks, and traffic drops. + +Sent bytes: + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_net_sent_total`` + - Bytes sent from the instance over the network since the instance's start time + +Received bytes: + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_net_received_total`` + - Bytes received by the instance since start time + +Connections: + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_net_connections_total`` + - Number of incoming network connections since the instance's start time + * - ``tnt_net_connections_current`` + - Number of active network connections + +Requests: + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_net_requests_total`` + - Number of network requests the instance has handled since its start time + * - ``tnt_net_requests_current`` + - Number of pending network requests + +Requests in progress: + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_net_requests_in_progress_total`` + - Total count of requests processed by tx thread + * - ``tnt_net_requests_in_progress_current`` + - Count of requests currently being processed in the tx thread + +Requests placed in queues of streams: + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_net_requests_in_stream_total`` + - Total count of requests, which was placed in queues of streams + for all time + * - ``tnt_net_requests_in_stream_current`` + - Count of requests currently waiting in queues of streams + +Since Tarantool 2.10 in each network metric has the label ``thread``, showing per-thread network statistics. + +.. _metrics-reference-fibers: + +Fibers +------ + +Provides the statistics for :ref:`fibers `. +If your application creates a lot of fibers, +you can use the metrics below to monitor fiber count and memory usage. + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_fiber_amount`` + - Number of fibers + * - ``tnt_fiber_csw`` + - Overall number of fiber context switches + * - ``tnt_fiber_memalloc`` + - Amount of memory reserved for fibers + * - ``tnt_fiber_memused`` + - Amount of memory used by fibers + +.. _metrics-reference-operations: + +Operations +---------- + +You can collect iproto requests an instance has processed +and aggregate them by request type. +This may help you find out what operations your clients perform most often. + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_stats_op_total`` + - Total number of calls since server start + +To distinguish between request types, this metric has the ``operation`` label. +For example, it can look as follows: ``{operation="select"}``. +For the possible request types, check the table below. + +.. container:: table + + .. 
list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``auth`` + - Authentication requests + * - ``call`` + - Requests to execute stored procedures + * - ``delete`` + - Delete calls + * - ``error`` + - Requests resulted in an error + * - ``eval`` + - Calls to evaluate Lua code + * - ``execute`` + - Execute SQL calls + * - ``insert`` + - Insert calls + * - ``prepare`` + - SQL prepare calls + * - ``replace`` + - Replace calls + * - ``select`` + - Select calls + * - ``update`` + - Update calls + * - ``upsert`` + - Upsert calls + +.. _metrics-reference-replication: + +Replication +----------- + +Provides the current replication status. +Learn more about :ref:`replication in Tarantool `. + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_info_lsn`` + - LSN of the instance. + * - ``tnt_info_vclock`` + - LSN number in vclock. + This metric always has the label ``{id="id"}``, + where ``id`` is the instance's number in the replica set. + * - ``tnt_replication_lsn`` + - LSN of the tarantool instance. + This metric always has labels ``{id="id", type="type"}``, where + ``id`` is the instance's number in the replica set, + ``type`` is ``master`` or ``replica``. + * - ``tnt_replication_lag`` + - Replication lag value in seconds. + This metric always has labels ``{id="id", stream="stream"}``, + where ``id`` is the instance's number in the replica set, + ``stream`` is ``downstream`` or ``upstream``. + * - ``tnt_replication_status`` + - This metrics equals 1 when replication status is "follow" and 0 otherwise. + This metric always has labels ``{id="id", stream="stream"}``, + where ``id`` is the instance's number in the replica set, + ``stream`` is ``downstream`` or ``upstream``. + +.. _metrics-reference-runtime: + +Runtime +------- + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_runtime_lua`` + - Lua garbage collector size in bytes + * - ``tnt_runtime_used`` + - Number of bytes used for the Lua runtime + * - ``tnt_runtime_tuple`` + - Number of bytes used for the tuples (except tuples owned by memtx and vinyl) + + + +.. _metrics-reference-luajit: + +LuaJIT metrics +-------------- + +LuaJIT metrics provide an insight into the work of the Lua garbage collector. +These metrics are available in Tarantool 2.6 and later. + +General JIT metrics: + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``lj_jit_snap_restore_total`` + - Overall number of snap restores + * - ``lj_jit_trace_num`` + - Number of JIT traces + * - ``lj_jit_trace_abort_total`` + - Overall number of abort traces + * - ``lj_jit_mcode_size`` + - Total size of allocated machine code areas + +JIT strings: + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``lj_strhash_hit_total`` + - Number of strings being interned + * - ``lj_strhash_miss_total`` + - Total number of string allocations + +GC steps: + +.. container:: table + + .. 
list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``lj_gc_steps_atomic_total`` + - Count of incremental GC steps (atomic state) + * - ``lj_gc_steps_sweepstring_total`` + - Count of incremental GC steps (sweepstring state) + * - ``lj_gc_steps_finalize_total`` + - Count of incremental GC steps (finalize state) + * - ``lj_gc_steps_sweep_total`` + - Count of incremental GC steps (sweep state) + * - ``lj_gc_steps_propagate_total`` + - Count of incremental GC steps (propagate state) + * - ``lj_gc_steps_pause_total`` + - Count of incremental GC steps (pause state) + +Allocations: + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``lj_gc_strnum`` + - Number of allocated ``string`` objects + * - ``lj_gc_tabnum`` + - Number of allocated ``table`` objects + * - ``lj_gc_cdatanum`` + - Number of allocated ``cdata`` objects + * - ``lj_gc_udatanum`` + - Number of allocated ``udata`` objects + * - ``lj_gc_freed_total`` + - Total amount of freed memory + * - ``lj_gc_memory`` + - Current allocated Lua memory + * - ``lj_gc_allocated_total`` + - Total amount of allocated memory + +.. _metrics-reference-psutils: +.. _metrics-api_reference-cpu_usage_metrics: + +CPU metrics +----------- + +The following metrics provide CPU usage statistics. +They are only available on Linux. + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_cpu_number`` + - Total number of processors configured by the operating system + * - ``tnt_cpu_time`` + - Host CPU time + * - ``tnt_cpu_thread`` + - Tarantool thread CPU time. + This metric always has the labels + ``{kind="user", thread_name="tarantool", thread_pid="pid", file_name="init.lua"}``, + where: + + * ``kind`` can be either ``user`` or ``system`` + * ``thread_name`` is ``tarantool``, ``wal``, ``iproto``, or ``coio`` + * ``file_name`` is the entrypoint file name, for example, ``init.lua``. + +There are also two cross-platform metrics, which can be obtained with a ``getrusage()`` call. + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_cpu_user_time`` + - Tarantool CPU user time + * - ``tnt_cpu_system_time`` + - Tarantool CPU system time + +.. _metrics-reference-vinyl: + +Vinyl +----- + +Vinyl metrics provide :ref:`vinyl engine ` statistics. + +.. _metrics-reference-vinyl-disk: + +Disk +~~~~ + +The disk metrics are used to monitor overall data size on disk. + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_vinyl_disk_data_size`` + - Amount of data in bytes stored in the ``.run`` files + located in :ref:`vinyl_dir ` + * - ``tnt_vinyl_disk_index_size`` + - Amount of data in bytes stored in the ``.index`` files + located in :ref:`vinyl_dir ` + +.. _metrics-reference-vinyl_regulator: + +Regulator +~~~~~~~~~ + +The vinyl regulator decides when to commence disk IO actions. +It groups activities in batches so that they are more consistent and +efficient. + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_vinyl_regulator_dump_bandwidth`` + - Estimated average dumping rate, bytes per second. + The rate value is initially 10485760 (10 megabytes per second). + It is recalculated depending on the the actual rate. + Only significant dumps that are larger than 1 MB are used for estimating. + * - ``tnt_vinyl_regulator_write_rate`` + - Actual average rate of performing write operations, bytes per second. + The rate is calculated as a 5-second moving average. 
+ If the metric value is gradually going down, + this can indicate disk issues. + * - ``tnt_vinyl_regulator_rate_limit`` + - Write rate limit, bytes per second. + The regulator imposes the limit on transactions + based on the observed dump/compaction performance. + If the metric value is down to approximately ``10^5``, + this indicates issues with the disk + or the :ref:`scheduler `. + * - ``tnt_vinyl_regulator_dump_watermark`` + - Maximum amount of memory in bytes used + for in-memory storing of a vinyl LSM tree. + When this maximum is accessed, a dump must occur. + For details, see :ref:`engines-algorithm_filling_lsm`. + The value is slightly smaller + than the amount of memory allocated for vinyl trees, + reflected in the :ref:`vinyl_memory ` parameter. + * - ``tnt_vinyl_regulator_blocked_writers`` + - The number of fibers that are blocked waiting + for Vinyl level0 memory quota. + +.. _metrics-reference-transactional-activity: + +Transactional activity +~~~~~~~~~~~~~~~~~~~~~~ + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_vinyl_tx_commit`` + - Counter of commits (successful transaction ends) + Includes implicit commits: for example, any insert operation causes a + commit unless it is within a + :doc:`/reference/reference_lua/box_txn_management/begin`\ --\ :doc:`/reference/reference_lua/box_txn_management/commit` + block. + * - ``tnt_vinyl_tx_rollback`` + - Сounter of rollbacks (unsuccessful transaction ends). + This is not merely a count of explicit + :doc:`/reference/reference_lua/box_txn_management/rollback` + requests -- it includes requests that ended with errors. + * - ``tnt_vinyl_tx_conflict`` + - Counter of conflicts that caused transactions to roll back. + The ratio ``tnt_vinyl_tx_conflict / tnt_vinyl_tx_commit`` + above 5% indicates that vinyl is not healthy. + At that moment, you'll probably see a lot of other problems with vinyl. + * - ``tnt_vinyl_tx_read_views`` + - Current number of read views -- that is, transactions + that entered the read-only state to avoid conflict temporarily. + Usually the value is ``0``. + If it stays non-zero for a long time, it is indicative of a memory leak. + + +.. _metrics-reference-memory: + +Memory +~~~~~~ + +The following metrics show state memory areas used by vinyl for caches and write buffers. + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_vinyl_memory_tuple_cache`` + - Amount of memory in bytes currently used to store tuples (data) + * - ``tnt_vinyl_memory_level0`` + - "Level 0" (L0) memory area, bytes. + L0 is the area that vinyl can use for in-memory storage of an LSM tree. + By monitoring this metric, you can see when L0 is getting close to its + maximum (``tnt_vinyl_regulator_dump_watermark``), + at which time a dump will occur. + You can expect L0 = 0 immediately after the dump operation is completed. + * - ``tnt_vinyl_memory_page_index`` + - Amount of memory in bytes currently used to store indexes. + If the metric value is close to :ref:`vinyl_memory `, + this indicates that :ref:`vinyl_page_size ` + was chosen incorrectly. + * - ``tnt_vinyl_memory_bloom_filter`` + - Amount of memory in bytes used by + :ref:`bloom filters `. + * - ``tnt_vinyl_memory_tuple`` + - Total size of memory in bytes occupied by Vinyl tuples. + It includes cached tuples and tuples pinned by the Lua world. + +.. _metrics-reference-vinyl_scheduler: + +Scheduler +~~~~~~~~~ + +The vinyl scheduler invokes the :ref:`regulator ` and +updates the related variables. 
This happens once per second. + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_vinyl_scheduler_tasks`` + - Number of scheduler dump/compaction tasks. + The metric always has label ``{status = }``, + where ```` can be one of the following: + + * ``inprogress`` for currently running tasks + * ``completed`` for successfully completed tasks + * ``failed`` for tasks aborted due to errors. + + * - ``tnt_vinyl_scheduler_dump_time`` + - Total time in seconds spent by all worker threads performing dumps. + * - ``tnt_vinyl_scheduler_dump_total`` + - Counter of dumps completed. + +.. _metrics-reference-memory_event_loop: + +Event loop metrics +------------------ + +Event loop tx thread information: + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_ev_loop_time`` + - Event loop time (ms) + * - ``tnt_ev_loop_prolog_time`` + - Event loop prolog time (ms) + * - ``tnt_ev_loop_epilog_time`` + - Event loop epilog time (ms) + + +.. _metrics-reference-synchro: + +Synchro +------- + +Shows the current state of a synchronous replication. + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_synchro_queue_owner`` + - Instance ID of the current synchronous replication master. + + * - ``tnt_synchro_queue_term`` + - Current queue term. + + * - ``tnt_synchro_queue_len`` + - How many transactions are collecting confirmations now. + + * - ``tnt_synchro_queue_busy`` + - Whether the queue is processing any system entry (CONFIRM/ROLLBACK/PROMOTE/DEMOTE). + +.. _metrics-reference-election: + +Election +-------- + +Shows the current state of a replica set node in regards to leader election. + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_election_state`` + - Election state (mode) of the node. + When election is enabled, the node is writable only in the leader state. + Possible values: + + * 0 (``follower``): all the non-leader nodes are called followers + * 1 (``candidate``): the nodes that start a new election round are called candidates. + * 2 (``leader``): the node that collected a quorum of votes becomes the leader + + * - ``tnt_election_vote`` + - ID of a node the current node votes for. + If the value is 0, it means the node hasn’t voted in the current term yet. + + * - ``tnt_election_leader`` + - Leader node ID in the current term. + If the value is 0, it means the node doesn’t know which node is the leader in the current term. + + * - ``tnt_election_term`` + - Current election term. + + * - ``tnt_election_leader_idle`` + - Time in seconds since the last interaction with the known leader. + +Memtx +----- + +Memtx mvcc memory statistics. +Transaction manager consists of two parts: + +- the transactions themselves (TXN section) +- MVCC + +.. _metrics-reference-memtx_txn: + +TXN +~~~ + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_memtx_tnx_statements`` are the transaction statements. + - For example, the user started a transaction and made an action in it `space:replace{0, 1}`. + Under the hood, this operation will turn into ``statement`` for the current transaction. + This metric always has the label ``{kind="..."}``, + which has the following possible values: + + * ``total``: the number of bytes that are allocated for the statements of all current transactions. + * ``average``: average bytes used by transactions for statements + (`txn.statements.total` bytes / number of open transactions). 
+ * ``max``: the maximum number of bytes used by one the current transaction for statements. + + * - ``tnt_memtx_tnx_user`` + - In Tarantool C API there is a function `box_txn_alloc()`. + By using this function user can allocate memory for the current transaction. + This metric always has the label ``{kind="..."}``, + which has the following possible values: + + * ``total``: memory allocated by the `box_txn_alloc()` function on all current transactions. + * ``average``: transaction average (total allocated bytes / number of all current transactions). + * ``max``: the maximum number of bytes allocated by `box_txn_alloc()` function per transaction. + + * - ``tnt_memtx_tnx_system`` + - There are internals: logs, savepoints. + This metric always has the label ``{kind="..."}``, + which has the following possible values: + + * ``total``: memory allocated by internals on all current transactions. + * ``average``: average allocated memory by internals (total memory / number of all current transactions). + * ``max``: the maximum number of bytes allocated by internals per transaction. + +.. _metrics-reference-memtx_mvcc: + +MVCC +~~~~ + +``mvcc`` is responsible for the isolation of transactions. +It detects conflicts and makes sure that tuples that are no longer in the space, but read by some transaction +(or can be read) have not been deleted. + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_memtx_mvcc_trackers`` + - Trackers that keep track of transaction reads. + This metric always has the label ``{kind="..."}``, + which has the following possible values: + + * ``total``: trackers of all current transactions are allocated in total (in bytes). + * ``average``: average for all current transactions (total memory bytes / number of transactions). + * ``max``: maximum trackers allocated per transaction (in bytes). + + * - ``tnt_memtx_mvcc_conflicts`` + - Allocated in case of transaction conflicts. + This metric always has the label ``{kind="..."}``, + which has the following possible values: + + * ``total``: bytes allocated for conflicts in total. + * ``average``: average for all current transactions (total memory bytes / number of transactions). + * ``max``: maximum bytes allocated for conflicts per transaction. + + +.. _metrics-reference-tuples: + +~~~~~~ +Tuples +~~~~~~ + +Saved tuples are divided into 3 categories: ``used``, ``read_view``, ``tracking``. + +Each category has two metrics: + +- ``retained`` tuples - they are no longer in the index, but MVCC does not allow them to be removed. +- ``stories`` - MVCC is based on the story mechanism, almost every tuple has a story. +This is a separate metric because even the tuples that are in the index can have a story. +So ``stories`` and ``retained`` need to be measured separately. + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_memtx_mvcc_tuples_used_stories`` + - Tuples that are used by active read-write transactions. + This metric always has the label ``{kind="..."}``, + which has the following possible values: + + * ``count``: number of ``used`` tuples / number of stories. + * ``total``: amount of bytes used by stories ``used`` tuples. + + * - ``tnt_memtx_mvcc_tuples_used_retained`` + - Tuples that are used by active read-write transactions. + But they are no longer in the index, but MVCC does not allow them to be removed. 
+ This metric always has the label ``{kind="..."}``, + which has the following possible values: + + * ``count``: number of retained ``used`` tuples / number of stories. + * ``total``: amount of bytes used by retained ``used`` tuples. + + * - ``tnt_memtx_mvcc_tuples_read_view_stories`` + - Tuples that are not used by active read-write transactions, + but are used by read-only transactions (i.e. in read view). + This metric always has the label ``{kind="..."}``, + which has the following possible values: + + * ``count``: number of ``read_view`` tuples / number of stories. + * ``total``: amount of bytes used by stories ``read_view`` tuples. + + * - ``tnt_memtx_mvcc_tuples_read_view_retained`` + - Tuples that are not used by active read-write transactions, + but are used by read-only transactions (i.e. in read view). + This tuples are no longer in the index, but MVCC does not allow them to be removed. + This metric always has the label ``{kind="..."}``, + which has the following possible values: + + * ``count``: number of retained ``read_view`` tuples / number of stories. + * ``total``: amount of bytes used by retained ``read_view`` tuples. + + * - ``tnt_memtx_mvcc_tuples_tracking_stories`` + - Tuples that are not directly used by any transactions, but are used by MVCC to track reads. + This metric always has the label ``{kind="..."}``, + which has the following possible values: + + * ``count``: number of ``tracking`` tuples / number of tracking stories. + * ``total``: amount of bytes used by stories ``tracking`` tuples. + + * - ``tnt_memtx_mvcc_tuples_tracking_retained`` + - Tuples that are not directly used by any transactions, but are used by MVCC to track reads. + This tuples are no longer in the index, but MVCC does not allow them to be removed. + This metric always has the label ``{kind="..."}``, + which has the following possible values: + + * ``count``: number of retained ``tracking`` tuples / number of stories. + * ``total``: amount of bytes used by retained ``tracking`` tuples. + + +.. _metrics-reference-read-view: + +~~~~~~~~~~~~~~~~~~~~ +Read view statistics +~~~~~~~~~~~~~~~~~~~~ + + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_memtx_tuples_data_total`` + - Total amount of memory (in bytes) allocated for data tuples. + This includes ``tnt_memtx_tuples_data_read_view`` and + ``tnt_memtx_tuples_data_garbage`` metric values plus tuples that + are actually stored in memtx spaces. + + * - ``tnt_memtx_tuples_data_read_view`` + - Memory (in bytes) held for read views. + + * - ``tnt_memtx_tuples_data_garbage`` + - Memory (in bytes) that is unused and scheduled to be freed + (freed lazily on memory allocation). + + * - ``tnt_memtx_index_total`` + - Total amount of memory (in bytes) allocated for indexing data. + This includes ``tnt_memtx_index_read_view`` metric value + plus memory used for indexing tuples + that are actually stored in memtx spaces. + + * - ``tnt_memtx_index_read_view`` + - Memory (in bytes) held for read views. + + +.. _metrics-reference-tarantool-config: + +Tarantool configuration +----------------------- + +**Since:** :doc:`3.0.0 `. + +.. container:: table + + .. list-table:: + :widths: 25 75 + :header-rows: 0 + + * - ``tnt_config_alerts`` + - Count of current instance :ref:`configuration apply alerts `. + ``{level="warn"}`` label covers warnings and + ``{level="error"}`` covers errors. + + * - ``tnt_config_status`` + - The status of current instance :ref:`configuration apply `. 
+ ``status`` label contains possible status name. + Current status has metric value ``1``, inactive statuses have metric value ``0``. + + .. code-block:: none + + # HELP tnt_config_status Tarantool 3 configuration status + # TYPE tnt_config_status gauge + tnt_config_status{status="reload_in_progress",alias="router-001-a"} 0 + tnt_config_status{status="uninitialized",alias="router-001-a"} 0 + tnt_config_status{status="check_warnings",alias="router-001-a"} 0 + tnt_config_status{status="ready",alias="router-001-a"} 1 + tnt_config_status{status="check_errors",alias="router-001-a"} 0 + tnt_config_status{status="startup_in_progress",alias="router-001-a"} 0 + + For example, this set of metrics means that current configuration + for ``router-001-a`` status is ``ready``. \ No newline at end of file diff --git a/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/README.md b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/README.md new file mode 100644 index 0000000000..fef01516ca --- /dev/null +++ b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/README.md @@ -0,0 +1,11 @@ +# Collecting custom metrics + +A sample application showing how to collect custom [metrics](https://www.tarantool.io/doc/latest/book/monitoring/). + +## Running + +Start the application by executing the following command in the [config](../../../config) directory: + +```shell +$ tt start metrics_collect_custom +``` diff --git a/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/config.yaml b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/config.yaml new file mode 100644 index 0000000000..54bf57667e --- /dev/null +++ b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/config.yaml @@ -0,0 +1,20 @@ +metrics: + include: [ all ] + exclude: [ vinyl ] + labels: + alias: '{{ instance_name }}' +roles: +- examples.collect_custom_replace_count +- examples.collect_custom_waste_size +app: + file: 'load_data.lua' + +groups: + group001: + replicasets: + replicaset001: + instances: + instance001: + iproto: + listen: + - uri: '127.0.0.1:3301' diff --git a/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/examples/collect_custom_replace_count.lua b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/examples/collect_custom_replace_count.lua new file mode 100644 index 0000000000..a15d81a4bf --- /dev/null +++ b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/examples/collect_custom_replace_count.lua @@ -0,0 +1,22 @@ +local function apply() + -- Collect a custom metric at an arbitrary moment in time -- + local metrics = require('metrics') + local bands_replace_count = metrics.counter('bands_replace_count', 'The number of data operations') + local trigger = require('trigger') + trigger.set( + 'box.space.bands.on_replace', + 'update_bands_replace_count_metric', + function(_, _, _, request_type) + bands_replace_count:inc(1, { request_type = request_type }) + end + ) + -- End -- +end + +return { + validate = function() + end, + apply = apply, + stop = function() + end, +} diff --git a/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/examples/collect_custom_waste_size.lua b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/examples/collect_custom_waste_size.lua new file mode 100644 index 0000000000..520975b468 --- /dev/null +++ 
b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/examples/collect_custom_waste_size.lua @@ -0,0 +1,17 @@ +local function apply() + -- Collect a custom metric when the data collected by metrics is requested -- + local metrics = require('metrics') + local bands_waste_size = metrics.gauge('bands_waste_size', 'The size of memory wasted due to internal fragmentation') + metrics.register_callback(function() + bands_waste_size:set(box.space.bands:stat()['tuple']['memtx']['waste_size']) + end) + -- End -- +end + +return { + validate = function() + end, + apply = apply, + stop = function() + end, +} diff --git a/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/instances.yml b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/instances.yml new file mode 100644 index 0000000000..aa60c2fc42 --- /dev/null +++ b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/instances.yml @@ -0,0 +1 @@ +instance001: diff --git a/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/load_data.lua b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/load_data.lua new file mode 100644 index 0000000000..cc721d61a3 --- /dev/null +++ b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_custom/load_data.lua @@ -0,0 +1,25 @@ +function create_space() + box.schema.space.create('bands') + box.space.bands:format({ + { name = 'id', type = 'unsigned' }, + { name = 'band_name', type = 'string' }, + { name = 'year', type = 'unsigned' } + }) + box.space.bands:create_index('primary', { parts = { 'id' } }) +end + +function load_data() + box.space.bands:insert { 1, 'Roxette', 1986 } + box.space.bands:insert { 2, 'Scorpions', 1965 } + box.space.bands:insert { 3, 'Ace of Base', 1987 } + box.space.bands:insert { 4, 'The Beatles', 1960 } + box.space.bands:insert { 5, 'Pink Floyd', 1965 } + box.space.bands:insert { 6, 'The Rolling Stones', 1962 } + box.space.bands:insert { 7, 'The Doors', 1965 } + box.space.bands:insert { 8, 'Nirvana', 1987 } + box.space.bands:insert { 9, 'Led Zeppelin', 1968 } + box.space.bands:insert { 10, 'Queen', 1970 } +end + +create_space() +load_data() diff --git a/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_http/README.md b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_http/README.md new file mode 100644 index 0000000000..95fbea43a3 --- /dev/null +++ b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_http/README.md @@ -0,0 +1,17 @@ +# Collecting HTTP metrics + +A sample application showing how to enable and configure [metrics](https://www.tarantool.io/doc/latest/book/monitoring/) in your application. 
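Once the application is running (see the steps below), you can hit the instrumented route to produce latency observations. A minimal smoke test sketch, assuming the address and route defined in `collect_http_metrics.lua` (`127.0.0.1:8080`, `/metrics/hello`); the built-in `http.client` module is used here purely for illustration:

```lua
-- Issue a request to the instrumented route so that the middleware
-- records one more latency observation for it.
local client = require('http.client').new()
local response = client:get('http://127.0.0.1:8080/metrics/hello')
print(response.status, response.body)
```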
+ +## Running + +Before starting the application, install the `http` module by executing the `tt rocks install` command in the [config](../../../config) directory: + +```shell +$ tt rocks install http +``` + +Then, start the application: + +```shell +$ tt start metrics_collect_http +``` diff --git a/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_http/collect_http_metrics.lua b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_http/collect_http_metrics.lua new file mode 100644 index 0000000000..2e57785196 --- /dev/null +++ b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_http/collect_http_metrics.lua @@ -0,0 +1,34 @@ +local httpd + +local function apply() + if httpd then + httpd:stop() + end + + -- Collect HTTP metrics for the '/metrics/hello' route -- + httpd = require('http.server').new('127.0.0.1', 8080) + local metrics = require('metrics') + metrics.http_middleware.configure_default_collector('summary') + httpd:route({ + method = 'GET', + path = '/metrics/hello' + }, metrics.http_middleware.v1( + function() + return { status = 200, + headers = { ['content-type'] = 'text/plain' }, + body = 'Hello from http_middleware!' } + end)) + + httpd:start() +end + +local function stop() + httpd:stop() +end + +return { + validate = function() + end, + apply = apply, + stop = stop, +} diff --git a/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_http/config.yaml b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_http/config.yaml new file mode 100644 index 0000000000..2697252750 --- /dev/null +++ b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_http/config.yaml @@ -0,0 +1,17 @@ +metrics: + include: [ all ] + exclude: [ vinyl ] + labels: + alias: '{{ instance_name }}' +roles: +- collect_http_metrics + +groups: + group001: + replicasets: + replicaset001: + instances: + instance001: + iproto: + listen: + - uri: '127.0.0.1:3301' diff --git a/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_http/instances.yml b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_http/instances.yml new file mode 100644 index 0000000000..aa60c2fc42 --- /dev/null +++ b/doc/code_snippets/snippets/config/instances.enabled/metrics_collect_http/instances.yml @@ -0,0 +1 @@ +instance001: diff --git a/doc/code_snippets/snippets/config/instances.enabled/metrics_plugins/README.md b/doc/code_snippets/snippets/config/instances.enabled/metrics_plugins/README.md new file mode 100644 index 0000000000..11c9d7705d --- /dev/null +++ b/doc/code_snippets/snippets/config/instances.enabled/metrics_plugins/README.md @@ -0,0 +1,29 @@ +# Exposing metrics using plugins + +A sample application showing how to use [metrics](https://www.tarantool.io/doc/latest/book/monitoring/) plugins for exposing metrics. 
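Besides the HTTP endpoints listed below, the plugin API can also be called directly from an instance console. A minimal sketch that reuses the same `metrics.plugins.json` call as `expose_json_metrics.lua`:

```lua
-- Export all collected metrics as a JSON string and print it.
local json_plugin = require('metrics.plugins.json')
print(json_plugin.export())
```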
+ +## Running + +Before starting the application, install the `http` module by executing the `tt rocks install` command in the [config](../../../config) directory: + +```shell +$ tt rocks install http +``` + +Then, start the application: + +```shell +$ tt start metrics_plugins +``` + +To get Prometheus metrics, make the following request: + +```console +$ curl -X GET --location "http://127.0.0.1:8080/metrics/prometheus" +``` + +To get metrics in the JSON format, make the following request: + +```console +$ curl -X GET --location "http://127.0.0.1:8081/metrics/json" +``` diff --git a/doc/code_snippets/snippets/config/instances.enabled/metrics_plugins/config.yaml b/doc/code_snippets/snippets/config/instances.enabled/metrics_plugins/config.yaml new file mode 100644 index 0000000000..fde2bf723f --- /dev/null +++ b/doc/code_snippets/snippets/config/instances.enabled/metrics_plugins/config.yaml @@ -0,0 +1,18 @@ +metrics: + include: [ all ] + exclude: [ vinyl ] + labels: + alias: '{{ instance_name }}' +roles: +- examples.expose_prometheus_metrics +- examples.expose_json_metrics + +groups: + group001: + replicasets: + replicaset001: + instances: + instance001: + iproto: + listen: + - uri: '127.0.0.1:3301' diff --git a/doc/code_snippets/snippets/config/instances.enabled/metrics_plugins/examples/expose_json_metrics.lua b/doc/code_snippets/snippets/config/instances.enabled/metrics_plugins/examples/expose_json_metrics.lua new file mode 100644 index 0000000000..411f32014f --- /dev/null +++ b/doc/code_snippets/snippets/config/instances.enabled/metrics_plugins/examples/expose_json_metrics.lua @@ -0,0 +1,32 @@ +local httpd + +local function apply() + if httpd then + httpd:stop() + end + + -- Expose JSON metrics -- + httpd = require('http.server').new('127.0.0.1', 8081) + httpd:route({ + method = 'GET', + path = '/metrics/json' + }, function() + local json_plugin = require('metrics.plugins.json') + local json_metrics = json_plugin.export() + return { status = 200, + headers = { ['content-type'] = 'application/json' }, + body = json_metrics } + end) + httpd:start() +end + +local function stop() + httpd:stop() +end + +return { + validate = function() + end, + apply = apply, + stop = stop, +} diff --git a/doc/code_snippets/snippets/config/instances.enabled/metrics_plugins/examples/expose_prometheus_metrics.lua b/doc/code_snippets/snippets/config/instances.enabled/metrics_plugins/examples/expose_prometheus_metrics.lua new file mode 100644 index 0000000000..c812df8598 --- /dev/null +++ b/doc/code_snippets/snippets/config/instances.enabled/metrics_plugins/examples/expose_prometheus_metrics.lua @@ -0,0 +1,30 @@ +local httpd + +local function apply() + if httpd then + httpd:stop() + end + + -- Expose Prometheus metrics -- + httpd = require('http.server').new('127.0.0.1', 8080) + httpd:route({ + method = 'GET', + path = '/metrics/prometheus' + }, function() + local prometheus_plugin = require('metrics.plugins.prometheus') + local prometheus_metrics = prometheus_plugin.collect_http() + return prometheus_metrics + end) + httpd:start() +end + +local function stop() + httpd:stop() +end + +return { + validate = function() + end, + apply = apply, + stop = stop, +} diff --git a/doc/code_snippets/snippets/config/instances.enabled/metrics_plugins/instances.yml b/doc/code_snippets/snippets/config/instances.enabled/metrics_plugins/instances.yml new file mode 100644 index 0000000000..aa60c2fc42 --- /dev/null +++ b/doc/code_snippets/snippets/config/instances.enabled/metrics_plugins/instances.yml @@ -0,0 +1 @@ +instance001: diff 
--git a/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud/config.yaml b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud/config.yaml
index 4c505e1a29..dbccf140cb 100644
--- a/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud/config.yaml
+++ b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud/config.yaml
@@ -59,14 +59,6 @@ groups:
 client: '127.0.0.1:3305'
 routers:
 roles: [ roles.crud-router ]
- roles_cfg:
- roles.crud-router:
- stats: true
- stats_driver: metrics
- stats_quantiles: false
- stats_quantile_tolerated_error: 0.001
- stats_quantile_age_buckets_count: 5
- stats_quantile_max_age_time: 180
 app:
 module: router
 sharding:
diff --git a/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/README.md b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/README.md
new file mode 100644
index 0000000000..1d3419d44f
--- /dev/null
+++ b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/README.md
@@ -0,0 +1,44 @@
+# Sharded cluster: Exposing metrics
+
+A sample application showing how to enable and expose [metrics](https://www.tarantool.io/doc/latest/book/monitoring/) through HTTP.
+
+## Running
+
+Before starting the application, install dependencies defined in the `*.rockspec` file:
+
+```console
+$ tt build sharded_cluster_crud_metrics
+```
+
+Then, start the application:
+
+```console
+$ tt start sharded_cluster_crud_metrics
+```
+
+To get Prometheus metrics, make the following request:
+
+```console
+$ curl -X GET --location "http://127.0.0.1:8081/metrics/prometheus"
+```
+
+To get metrics in the JSON format, make the following request:
+
+```console
+$ curl -X GET --location "http://127.0.0.1:8081/metrics/json"
+```
+
+
+## Running the Prometheus server
+
+To monitor the metrics of a running sample, you need to install Prometheus either locally or using Docker.
+To install and run Prometheus using Docker, follow the steps below:
+
+1. Open the [sharded_cluster_crud_metrics](../../../sharding/instances.enabled/sharded_cluster_crud_metrics) directory in the terminal.
+2. Replace `127.0.0.1` with `host.docker.internal` in the `prometheus/prometheus.yml` file.
+3. Run the server:
+   ```Bash
+   docker compose up
+   ```
+4. Open the [http://localhost:9090/graph](http://localhost:9090/graph) page to access the Prometheus expression browser.
+5. Enter the desired Tarantool metric, for example, `tnt_info_uptime` or `tnt_info_memory_data`, to see monitoring results.
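In addition to the Prometheus and JSON endpoints above, you can inspect the collected observations directly from an instance console. A minimal sketch using the `metrics` module API described later in this changeset (run it on any instance of the sample, for example after connecting with `tt connect`):

```lua
-- Refresh callback-based metrics and print every observation
-- currently known to the metrics registry.
local metrics = require('metrics')
for _, obs in ipairs(metrics.collect({ invoke_callbacks = true })) do
    print(obs.metric_name, obs.value)
end
```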
diff --git a/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/config.yaml b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/config.yaml new file mode 100644 index 0000000000..4e208542f2 --- /dev/null +++ b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/config.yaml @@ -0,0 +1,134 @@ +credentials: + users: + replicator: + password: 'topsecret' + roles: [ replication ] + storage: + password: 'secret' + roles: [ sharding ] + +iproto: + advertise: + peer: + login: replicator + sharding: + login: storage + +sharding: + bucket_count: 1000 + +metrics: + include: [ all ] + exclude: [ vinyl ] + labels: + alias: '{{ instance_name }}' + +groups: + storages: + roles: + - roles.crud-storage + - roles.metrics-export + app: + module: storage + sharding: + roles: [ storage ] + replication: + failover: manual + replicasets: + storage-a: + leader: storage-a-001 + instances: + storage-a-001: + roles_cfg: + roles.metrics-export: + http: + - listen: '127.0.0.1:8082' + endpoints: + - path: /metrics/prometheus/ + format: prometheus + - path: /metrics/json + format: json + iproto: + listen: + - uri: '127.0.0.1:3302' + advertise: + client: '127.0.0.1:3302' + storage-a-002: + roles_cfg: + roles.metrics-export: + http: + - listen: '127.0.0.1:8083' + endpoints: + - path: /metrics/prometheus/ + format: prometheus + - path: /metrics/json + format: json + iproto: + listen: + - uri: '127.0.0.1:3303' + advertise: + client: '127.0.0.1:3303' + storage-b: + leader: storage-b-001 + instances: + storage-b-001: + roles_cfg: + roles.metrics-export: + http: + - listen: '127.0.0.1:8084' + endpoints: + - path: /metrics/prometheus/ + format: prometheus + - path: /metrics/json + format: json + iproto: + listen: + - uri: '127.0.0.1:3304' + advertise: + client: '127.0.0.1:3304' + storage-b-002: + roles_cfg: + roles.metrics-export: + http: + - listen: '127.0.0.1:8085' + endpoints: + - path: /metrics/prometheus/ + format: prometheus + - path: /metrics/json + format: json + iproto: + listen: + - uri: '127.0.0.1:3305' + advertise: + client: '127.0.0.1:3305' + routers: + roles: + - roles.crud-router + - roles.metrics-export + roles_cfg: + roles.crud-router: + stats: true + stats_driver: metrics + stats_quantiles: true + app: + module: router + sharding: + roles: [ router ] + replicasets: + router-a: + instances: + router-a-001: + roles_cfg: + roles.metrics-export: + http: + - listen: '127.0.0.1:8081' + endpoints: + - path: /metrics/prometheus/ + format: prometheus + - path: /metrics/json + format: json + iproto: + listen: + - uri: '127.0.0.1:3301' + advertise: + client: '127.0.0.1:3301' diff --git a/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/docker-compose.yml b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/docker-compose.yml new file mode 100644 index 0000000000..fb0d5d974f --- /dev/null +++ b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/docker-compose.yml @@ -0,0 +1,27 @@ +services: + prometheus: + image: prom/prometheus + container_name: prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + ports: + - 9090:9090 + volumes: + - ./prometheus:/etc/prometheus + networks: + - monitoring_network + grafana: + image: grafana/grafana + container_name: grafana + ports: + - 3000:3000 + restart: unless-stopped + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=grafana + networks: + - 
monitoring_network +networks: + monitoring_network: + name: monitoring_network + driver: bridge diff --git a/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/instances.yaml b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/instances.yaml new file mode 100644 index 0000000000..96d4e2111f --- /dev/null +++ b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/instances.yaml @@ -0,0 +1,5 @@ +storage-a-001: +storage-a-002: +storage-b-001: +storage-b-002: +router-a-001: diff --git a/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/prometheus/prometheus.yml b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/prometheus/prometheus.yml new file mode 100644 index 0000000000..4fc09ed358 --- /dev/null +++ b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/prometheus/prometheus.yml @@ -0,0 +1,14 @@ +global: + scrape_interval: 5s + evaluation_interval: 5s + +scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - 127.0.0.1:8081 + - 127.0.0.1:8082 + - 127.0.0.1:8083 + - 127.0.0.1:8084 + - 127.0.0.1:8085 + metrics_path: "/metrics/prometheus" diff --git a/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/router.lua b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/router.lua new file mode 100644 index 0000000000..61ccb2c40b --- /dev/null +++ b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/router.lua @@ -0,0 +1 @@ +local vshard = require('vshard') diff --git a/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/sharded_cluster_crud_metrics-scm-1.rockspec b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/sharded_cluster_crud_metrics-scm-1.rockspec new file mode 100644 index 0000000000..1245faba68 --- /dev/null +++ b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/sharded_cluster_crud_metrics-scm-1.rockspec @@ -0,0 +1,14 @@ +package = 'sharded_cluster_crud_metrics' +version = 'scm-1' +source = { + url = '/dev/null', +} + +dependencies = { + 'vshard == 0.1.27', + 'crud == 1.5.2', + 'metrics-export-role == 0.1.0-1', +} +build = { + type = 'none'; +} diff --git a/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/storage.lua b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/storage.lua new file mode 100644 index 0000000000..f692015c03 --- /dev/null +++ b/doc/code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/storage.lua @@ -0,0 +1,17 @@ +box.watch('box.status', function() + if box.info.ro then + return + end + + box.schema.create_space('bands', { + format = { + { name = 'id', type = 'unsigned' }, + { name = 'bucket_id', type = 'unsigned' }, + { name = 'band_name', type = 'string' }, + { name = 'year', type = 'unsigned' } + }, + if_not_exists = true + }) + box.space.bands:create_index('id', { parts = { 'id' }, if_not_exists = true }) + box.space.bands:create_index('bucket_id', { parts = { 'bucket_id' }, unique = false, if_not_exists = true }) +end) diff --git a/doc/concepts/configuration.rst b/doc/concepts/configuration.rst index 40b578d022..0fbb6c9d30 100644 --- a/doc/concepts/configuration.rst +++ b/doc/concepts/configuration.rst @@ -227,13 +227,13 @@ In this option, the role name is the key and 
the role configuration is the value The example below shows how to enable statistics on called operations by providing the ``roles.crud-router`` role's configuration: -.. literalinclude:: /code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud/config.yaml +.. literalinclude:: /code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/config.yaml :language: yaml - :start-at: roles.crud-router - :end-at: stats_quantile_max_age_time + :start-after: routers: + :end-at: stats_quantiles :dedent: -Example on GitHub: `sharded_cluster_crud `_ +Example on GitHub: `sharded_cluster_crud_metrics `_ diff --git a/doc/how-to/vshard_quick.rst b/doc/how-to/vshard_quick.rst index 8b1265a1e7..cb2d3dcb46 100644 --- a/doc/how-to/vshard_quick.rst +++ b/doc/how-to/vshard_quick.rst @@ -206,7 +206,6 @@ Here is a schematic view of the cluster topology: The main group-level options here are: * ``roles``: This option enables the ``roles.crud-router`` :ref:`role ` provided by the CRUD module for a router instance. - * ``roles_cfg``: This section enables and configures statistics on called operations for a router with the enabled ``roles.crud-router`` role. * ``app``: The ``app.module`` option specifies that code specific to a router should be loaded from the ``router`` module. This is explained below in the :ref:`vshard-quick-start-router-code` section. * ``sharding``: The :ref:`sharding.roles ` option specifies that an instance inside this group acts as a router. * ``replicasets``: This section configures a replica set with one router instance. @@ -444,18 +443,6 @@ Writing and selecting data - null ... -4. To get statistics on called operations, pass the space name to ``crud.stats()``: - - .. code-block:: tarantoolsession - - sharded_cluster_crud:router-a-001> crud.stats('bands') - --- - - get: - ok: - latency: 0.00069199999961711 - count: 1 - time: 0.00069199999961711 - latency_average: 0.00069199999961711 diff --git a/doc/reference/configuration/configuration_reference.rst b/doc/reference/configuration/configuration_reference.rst index 8f84112aff..a733b12cf3 100644 --- a/doc/reference/configuration/configuration_reference.rst +++ b/doc/reference/configuration/configuration_reference.rst @@ -3039,6 +3039,73 @@ The ``memtx`` section is used to configure parameters related to the :ref:`memtx | Default: box.NULL | Environment variable: TT_MEMTX_SORT_THREADS + + + +.. _configuration_reference_metrics: + +metrics +------- + +The ``metrics`` section defines configuration parameters for :ref:`metrics `. + +.. NOTE:: + + ``metrics`` can be defined in any :ref:`scope `. + +- :ref:`metrics.exclude ` +- :ref:`metrics.include ` +- :ref:`metrics.labels ` + + +.. _configuration_reference_metrics_exclude: + +.. confval:: metrics.exclude + + An array containing the metrics to turn off. + The array can contain the same values as the ``exclude`` configuration parameter passed to :ref:`metrics.cfg() `. + + **Example** + + .. literalinclude:: /code_snippets/snippets/sharding/instances.enabled/sharded_cluster_crud_metrics/config.yaml + :start-at: metrics: + :end-at: instance_name + :language: yaml + :dedent: + + | + | Type: array + | Default: ``[]`` + | Environment variable: TT_METRICS_EXCLUDE + + +.. _configuration_reference_metrics_include: + +.. confval:: metrics.include + + An array containing the metrics to turn on. + The array can contain the same values as the ``include`` configuration parameter passed to :ref:`metrics.cfg() `. 
+ + | + | Type: array + | Default: ``[ all ]`` + | Environment variable: TT_METRICS_INCLUDE + + +.. _configuration_reference_metrics_labels: + +.. confval:: metrics.labels + + Global :ref:`labels ` to be added to every observation. + + | + | Type: map + | Default: ``{ alias = names.instance_name }`` + | Environment variable: TT_METRICS_LABELS + + + + .. _configuration_reference_process: process diff --git a/doc/reference/reference_lua/index.rst b/doc/reference/reference_lua/index.rst index 72e7204179..896bb2e506 100644 --- a/doc/reference/reference_lua/index.rst +++ b/doc/reference/reference_lua/index.rst @@ -44,6 +44,7 @@ This reference covers Tarantool's built-in Lua modules. key_def log merger + metrics msgpack net_box osmodule diff --git a/doc/reference/reference_lua/metrics.rst b/doc/reference/reference_lua/metrics.rst new file mode 100644 index 0000000000..6f603718c4 --- /dev/null +++ b/doc/reference/reference_lua/metrics.rst @@ -0,0 +1,1104 @@ +.. _metrics-api_reference: + +Module metrics +============== + +**Since:** `2.11.1 `__ + +The ``metrics`` module provides the ability to collect and expose :ref:`Tarantool metrics `. + +.. NOTE:: + + If you use a Tarantool version below `2.11.1 `__, + it is necessary to install the latest version of `metrics `__ first. + For Tarantool 2.11.1 and above, you can also use the external ``metrics`` module. + In this case, the external ``metrics`` module takes priority over the built-in one. + + +.. _metrics-api_reference_overview: + +Overview +-------- + +.. _metrics-api_reference-collectors: + +Collectors +~~~~~~~~~~ + +Tarantool provides the following metric collectors: + +.. contents:: + :local: + :depth: 1 + +A collector is a representation of one or more observations that change over time. + + +.. _metrics-api_reference-counter: + +counter +******* + +A counter is a cumulative metric that denotes a single monotonically increasing counter. Its value might only +increase or be reset to zero on restart. For example, you can use the counter to represent the number of requests +served, tasks completed, or errors. + +The design is based on the `Prometheus counter `__. + + +.. _metrics-api_reference-gauge: + +gauge +***** + +A gauge is a metric that denotes a single numerical value that can arbitrarily increase and decrease. + +The gauge type is typically used for measured values like temperature or current memory usage. +It could also be used for values that can increase or decrease, such as the number of concurrent requests. + +The design is based on the `Prometheus gauge `__. + + + + +.. _metrics-api_reference-histogram: + +histogram +********* + +A histogram metric is used to collect and analyze +statistical data about the distribution of values within the application. +Unlike metrics that track the average value or quantity of events, a histogram provides detailed visibility into the distribution of values and can uncover hidden dependencies. + +The design is based on the `Prometheus histogram `__. + + + +.. _metrics-api_reference-summary: + +summary +******* + +A summary metric is used to collect statistical data +about the distribution of values within the application. + +Each summary provides several measurements: + +* total count of measurements +* sum of measured values +* values at specific quantiles + +Similar to histograms, the summary also operates with value ranges. However, unlike histograms, +it uses quantiles (defined by a number between 0 and 1) for this purpose. In this case, +it is not required to define fixed boundaries. 
For summary type, the ranges depend +on the measured values and the number of measurements. + +The design is based on the `Prometheus summary `__. + + + +.. _metrics-api_reference-labels: + +Labels +~~~~~~ + +A label is a piece of metainfo that you associate with a metric in the key-value format. +For details, see `labels in Prometheus `_ and `tags in Graphite `_. + +Labels are used to differentiate between the characteristics of a thing being +measured. For example, in a metric associated with the total number of HTTP +requests, you can represent methods and statuses as label pairs: + +.. code-block:: lua + + http_requests_total_counter:inc(1, { method = 'POST', status = '200' }) + +The example above allows extracting the following time series: + +#. The total number of requests over time with ``method = "POST"`` (and any status). +#. The total number of requests over time with ``status = 500`` (and any method). + + + +.. _metrics-api_reference_configuring: + +Configuring metrics +------------------- + +To configure metrics, use :ref:`metrics.cfg() `. +This function can be used to turn on or off the specified metrics or to configure labels applied to all collectors. +Moreover, you can use the following shortcut functions to set-up metrics or labels: + +- :ref:`metrics.enable_default_metrics() ` +- :ref:`metrics.set_global_labels() ` + +.. NOTE:: + + Starting from version 3.0, metrics can be configured using a :ref:`configuration file ` in the :ref:`metrics ` section. + + + +.. _metrics-api_reference_custom_metrics: + +Custom metrics +-------------- + +.. _metrics-api_reference_create_custom_metrics: + +Creating custom metrics +~~~~~~~~~~~~~~~~~~~~~~~ + +To create a custom metric, follow the steps below: + +1. **Create a metric** + + To create a new metric, you need to call a function corresponding to the desired :ref:`collector type `. For example, call :ref:`metrics.counter() ` or :ref:`metrics.gauge() ` to create a new counter or gauge, respectively. + In the example below, a new counter is created: + + .. literalinclude:: /code_snippets/snippets/config/instances.enabled/metrics_collect_custom/examples/collect_custom_replace_count.lua + :start-at: local metrics + :end-at: local bands_replace_count + :language: lua + :dedent: + + This counter is intended to collect the number of data operations performed on the specified space. + + In the next example, a gauge is created: + + .. literalinclude:: /code_snippets/snippets/config/instances.enabled/metrics_collect_custom/examples/collect_custom_waste_size.lua + :start-at: local metrics + :end-at: local bands_waste_size + :language: lua + :dedent: + +2. **Observe a value** + + You can observe a value in two ways: + + - At the appropriate place, for example, in an API request handler or :ref:`trigger `. + In this example below, the counter value is increased any time a data operation is performed on the ``bands`` space. + To increase a counter value, :ref:`counter_obj:inc() ` is called. + + .. literalinclude:: /code_snippets/snippets/config/instances.enabled/metrics_collect_custom/examples/collect_custom_replace_count.lua + :start-after: -- Collect a custom metric + :end-before: -- End + :language: lua + :dedent: + + - At the time of requesting the data collected by metrics. + In this case, you need to collect the required metric inside :ref:`metrics.register_callback() `. + The example below shows how to use a gauge collector to measure the size of memory wasted due to internal fragmentation: + + .. 
literalinclude:: /code_snippets/snippets/config/instances.enabled/metrics_collect_custom/examples/collect_custom_waste_size.lua
+ :start-after: -- Collect a custom metric
+ :end-before: -- End
+ :language: lua
+ :dedent:
+
+ To set a gauge value, :ref:`gauge_obj:set() ` is called.
+
+You can find the full example on GitHub: `metrics_collect_custom `_.
+
+
+
+
+.. _monitoring-getting_started-warning:
+
+Possible limitations
+~~~~~~~~~~~~~~~~~~~~
+
+The module allows you to add your own metrics, but there are some subtleties when working with specific tools.
+
+When adding your custom metric, it's important to ensure that the number of label value combinations is kept to a minimum.
+Otherwise, a combinatorial explosion may happen in the time series database that stores the metric values.
+Examples of data labels:
+
+* `Labels `__ in Prometheus
+* `Tags `__ in InfluxDB
+
+For example, if your company uses InfluxDB for metric collection, you can potentially disrupt the entire
+monitoring setup, both for your application and for all other systems within the company. As a result,
+monitoring data is likely to be lost.
+
+Example:
+
+.. code-block:: lua
+
+ local some_metric = metrics.counter('some', 'Some metric')
+
+ -- THIS IS POSSIBLE
+ local function on_value_update(instance_alias)
+ some_metric:inc(1, { alias = instance_alias })
+ end
+
+ -- THIS IS NOT ALLOWED
+ local function on_value_update(customer_id)
+ some_metric:inc(1, { customer_id = customer_id })
+ end
+
+In the example, there are two versions of the function ``on_value_update``. The first version labels
+the data with the cluster instance's alias. Since there is a relatively small number of nodes, using
+aliases as label values is feasible. The second version uses a record identifier as a label value. If there are many
+records, avoid such labels, because they cause the combinatorial explosion described above.
+
+The same principle applies to URLs. Using the entire URL with parameters is not recommended.
+Use a URL template or the name of the command instead.
+
+In essence, when designing custom metrics and selecting labels or tags, it's crucial to opt for a minimal
+set of values that can uniquely identify the data without introducing unnecessary complexity or potential
+conflicts with existing metrics and systems.
+
+
+
+.. _metrics-api_reference-collecting_http_statistics:
+
+Collecting HTTP metrics
+-----------------------
+
+The ``metrics`` module provides middleware for monitoring HTTP latency statistics for endpoints that are created using the `http `_ module.
+The latency collector observes both latency information and the number of invocations.
+The metrics collected by HTTP middleware are separated by a set of :ref:`labels `:
+
+* a route (``path``)
+* a method (``method``)
+* an HTTP status code (``status``)
+
+For each route that you want to track, you must specify the middleware explicitly.
+The example below shows how to collect statistics for requests made to the ``/metrics/hello`` endpoint.
+
+.. literalinclude:: /code_snippets/snippets/config/instances.enabled/metrics_collect_http/collect_http_metrics.lua
+ :start-after: Collect HTTP metrics
+ :end-at: httpd:start()
+ :language: lua
+ :dedent:
+
+.. NOTE::
+
+ The middleware does not cover 404 errors.
+
+
+.. _metrics-plugins-available:
+..
_metrics-api_reference_collecting_using_plugins: + +Collecting metrics using plugins +-------------------------------- + +The ``metrics`` module provides a set of plugins that let you collect metrics through a unified interface: + +- :ref:`metrics-prometheus-api_reference` +- :ref:`metrics-json-api_reference` +- :ref:`metrics-graphite-api_reference` + + +For example, you can obtain an HTTP response object containing metrics in the Prometheus format by calling the ``metrics.plugins.prometheus.collect_http()`` function: + +.. literalinclude:: /code_snippets/snippets/config/instances.enabled/metrics_plugins/examples/expose_prometheus_metrics.lua + :start-at: local prometheus_plugin + :end-at: local prometheus_metrics + :language: lua + :dedent: + +To expose the collected metrics, you can use the `http `_ module: + +.. literalinclude:: /code_snippets/snippets/config/instances.enabled/metrics_plugins/examples/expose_prometheus_metrics.lua + :start-after: Expose Prometheus metrics + :end-at: httpd:start() + :language: lua + :dedent: + +Example on GitHub: `metrics_plugins `_ + + + + +.. _metrics-plugins-plugin-specific_api: +.. _metrics-plugins-custom: + +Creating custom plugins +~~~~~~~~~~~~~~~~~~~~~~~ + +Use the following API to create custom plugins: + +- :ref:`metrics.invoke_callbacks() ` +- :ref:`metrics.collectors() ` +- :ref:`collector_object ` + +To create a plugin, you need to include the following in your main export function: + +.. code-block:: lua + + -- Invoke all callbacks registered via `metrics.register_callback()` + metrics.invoke_callbacks() + + -- Loop over collectors + for _, c in pairs(metrics.collectors()) do + ... + + -- Loop over instant observations in the collector + for _, obs in pairs(c:collect()) do + -- Export observation `obs` + ... + end + end + +See the source code of built-in plugins in the `metrics GitHub repository `_. + + + + + +.. _metrics-module-api-reference: + +API Reference +------------- + +.. container:: table + + .. rst-class:: left-align-column-1 + .. rst-class:: left-align-column-2 + + .. 
list-table:: + :widths: 50 50 + + * - **metrics API** + - + + * - :ref:`metrics.cfg() ` + - Entrypoint to setup the module + + * - :ref:`metrics.collect() ` + - Collect observations from each collector + + * - :ref:`metrics.collectors() ` + - List all collectors in the registry + + * - :ref:`metrics.counter() ` + - Register a new counter + + * - :ref:`metrics.enable_default_metrics() ` + - Same as ``metrics.cfg{ include = include, exclude = exclude }`` + + * - :ref:`metrics.gauge() ` + - Register a new gauge + + * - :ref:`metrics.histogram() ` + - Register a new histogram + + * - :ref:`metrics.invoke_callbacks() ` + - Invoke all registered callbacks + + * - :ref:`metrics.register_callback() ` + - Register a function named ``callback`` + + * - :ref:`metrics.set_global_labels() ` + - Same as ``metrics.cfg{ labels = label_pairs }`` + + * - :ref:`metrics.summary() ` + - Register a new summary + + * - :ref:`metrics.unregister_callback() ` + - Unregister a function named ``callback`` + + * - **metrics.http_middleware API** + - + + * - :ref:`metrics.http_middleware.build_default_collector() ` + - Register and return a collector for the middleware + + * - :ref:`metrics.http_middleware.configure_default_collector() ` + - Register a collector for the middleware and set it as default + + * - :ref:`metrics.http_middleware.get_default_collector() ` + - Get the default collector + + * - :ref:`metrics.http_middleware.set_default_collector() ` + - Set the default collector + + * - :ref:`metrics.http_middleware.v1() ` + - Latency measuring wrap-up + + * - **Related objects** + - + + * - :ref:`collector_object ` + - A collector object + + * - :ref:`counter_obj ` + - A counter object + + * - :ref:`gauge_obj ` + - A gauge object + + * - :ref:`histogram_obj ` + - A histogram object + + * - :ref:`registry ` + - A metrics registry + + * - :ref:`summary_obj ` + - A summary object + + + +.. _metrics-api_reference-functions: + +metrics API +~~~~~~~~~~~ + +.. module:: metrics + +.. _metrics_cfg: + +.. function:: cfg([config]) + + Entrypoint to setup the module. + + :param table config: module configuration options: + + * ``cfg.include`` (string/table, default ``all``): ``all`` to enable all + supported default metrics, ``none`` to disable all default metrics, + table with names of the default metrics to enable a specific set of metrics. + * ``cfg.exclude`` (table, default ``{}``): a table containing the names of + the default metrics that you want to disable. Has higher priority + than ``cfg.include``. + * ``cfg.labels`` (table, default ``{}``): a table containing label names as + string keys, label values as values. See also: :ref:`metrics-api_reference-labels`. + + You can work with ``metrics.cfg`` as a table to read values, but you must call + ``metrics.cfg{}`` as a function to update them. + + Supported default metric names (for ``cfg.include`` and ``cfg.exclude`` tables): + + * ``all`` (metasection including all metrics) + * ``network`` + * ``operations`` + * ``system`` + * ``replicas`` + * ``info`` + * ``slab`` + * ``runtime`` + * ``memory`` + * ``spaces`` + * ``fibers`` + * ``cpu`` + * ``vinyl`` + * ``memtx`` + * ``luajit`` + * ``clock`` + * ``event_loop`` + * ``config`` + + See :ref:`metrics reference ` for details. + All metric collectors from the collection have ``metainfo.default = true``. + + ``cfg.labels`` are the global labels to be added to every observation. + + Global labels are applied only to metric collection. They have no effect + on how observations are stored. 
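For example, a minimal configuration sketch (the label value below is illustrative):

.. code-block:: lua

   local metrics = require('metrics')

   -- Enable all default metrics except vinyl and attach a global label
   -- to every collected observation.
   metrics.cfg{
       include = 'all',
       exclude = { 'vinyl' },
       labels = { alias = 'instance001' },
   }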
+ + Global labels can be changed on the fly. + + ``label_pairs`` from observation objects have priority over global labels. + If you pass ``label_pairs`` to an observation method with the same key as + some global label, the method argument value will be used. + + Note that both label names and values in ``label_pairs`` are treated as strings. + + + + +.. _metrics_collect: + +.. function:: collect([opts]) + + Collect observations from each collector. + + :param table opts: table of collect options: + + * ``invoke_callbacks`` -- if ``true``, :ref:`invoke_callbacks() ` is triggered before actual collect. + * ``default_only`` -- if ``true``, observations contain only default metrics (``metainfo.default = true``). + + + +.. _metrics_collectors: + +.. function:: collectors() + + List all collectors in the registry. Designed to be used in exporters. + + :return: A list of created collectors (see :ref:`collector_object `). + + See also: :ref:`metrics-plugins-custom` + + + +.. _metrics_counter: + +.. function:: counter(name [, help, metainfo]) + + Register a new counter. + + :param string name: collector name. Must be unique. + :param string help: collector description. + :param table metainfo: collector metainfo. + :return: A counter object (see :ref:`counter_obj `). + :rtype: counter_obj + + See also: :ref:`metrics-api_reference_create_custom_metrics` + + + + +.. _metrics_enable_default_metrics: + +.. function:: enable_default_metrics([include, exclude]) + + Same as ``metrics.cfg{include=include, exclude=exclude}``, but ``include={}`` is + treated as ``include='all'`` for backward compatibility. + + + +.. _metrics_gauge: + +.. function:: gauge(name [, help, metainfo]) + + Register a new gauge. + + :param string name: collector name. Must be unique. + :param string help: collector description. + :param table metainfo: collector metainfo. + + :return: A gauge object (see :ref:`gauge_obj `). + + :rtype: gauge_obj + + See also: :ref:`metrics-api_reference_create_custom_metrics` + + + +.. _metrics_histogram: + +.. function:: histogram(name [, help, buckets, metainfo]) + + Register a new histogram. + + :param string name: collector name. Must be unique. + :param string help: collector description. + :param table buckets: histogram buckets (an array of sorted positive numbers). + The infinity bucket (``INF``) is appended automatically. + Default: ``{.005, .01, .025, .05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0, 7.5, 10.0, INF}``. + :param table metainfo: collector metainfo. + + :return: A histogram object (see :ref:`histogram_obj `). + + :rtype: histogram_obj + + See also: :ref:`metrics-api_reference_create_custom_metrics` + + .. note:: + + A histogram is basically a set of collectors: + + * ``name .. "_sum"`` -- a counter holding the sum of added observations. + * ``name .. "_count"`` -- a counter holding the number of added observations. + * ``name .. "_bucket"`` -- a counter holding all bucket sizes under the label + ``le`` (less or equal). To access a specific bucket -- ``x`` (where ``x`` is a number), + specify the value ``x`` for the label ``le``. + + + + + + +.. _metrics_invoke_callbacks: + +.. function:: invoke_callbacks() + + Invoke all registered callbacks. Has to be called before each :ref:`collect() `. + You can also use ``collect{invoke_callbacks = true}`` instead. + If you're using one of the default exporters, + ``invoke_callbacks()`` will be called by the exporter. + + See also: :ref:`metrics-plugins-custom` + + +.. _metrics_register_callback: + +.. 
function:: register_callback(callback) + + Register a function named ``callback``, which will be called right before metric + collection on plugin export. + + :param function callback: a function that takes no parameters. + + This method is most often used for gauge metrics updates. + + **Example:** + + .. literalinclude:: /code_snippets/snippets/config/instances.enabled/metrics_collect_custom/examples/collect_custom_waste_size.lua + :start-after: -- Collect a custom metric + :end-before: -- End + :language: lua + :dedent: + + See also: :ref:`metrics-api_reference_custom_metrics` + + + + + +.. _metrics_set_global_labels: + +.. function:: set_global_labels(label_pairs) + + Same as ``metrics.cfg{ labels = label_pairs }``. + Learn more in :ref:`metrics.cfg() `. + + + + + +.. _metrics_summary: + +.. function:: summary(name [, help, objectives, params, metainfo]) + + Register a new summary. Quantile computation is based on the + `"Effective computation of biased quantiles over data streams" `_ + algorithm. + + :param string name: collector name. Must be unique. + :param string help: collector description. + :param table objectives: a list of "targeted" φ-quantiles in the ``{quantile = error, ... }`` form. + Example: ``{[0.5]=0.01, [0.9]=0.01, [0.99]=0.01}``. + The targeted φ-quantile is specified in the form of a φ-quantile and the tolerated + error. For example, ``{[0.5] = 0.1}`` means that the median (= 50th + percentile) is to be returned with a 10-percent error. Note that + percentiles and quantiles are the same concept, except that percentiles are + expressed as percentages. The φ-quantile must be in the interval ``[0, 1]``. + A lower tolerated error for a φ-quantile results in higher memory and CPU + usage during summary calculation. + + :param table params: table of the summary parameters used to configuring the sliding + time window. This window consists of several buckets to store observations. + New observations are added to each bucket. After a time period, the head bucket + (from which observations are collected) is reset, and the next bucket becomes the + new head. This way, each bucket stores observations for + ``max_age_time * age_buckets_count`` seconds before it is reset. + ``max_age_time`` sets the duration of each bucket's lifetime -- that is, how + many seconds the observations are kept before they are discarded. + ``age_buckets_count`` sets the number of buckets in the sliding time window. + This variable determines the number of buckets used to exclude observations + older than ``max_age_time`` from the summary. The value is + a trade-off between resources (memory and CPU for maintaining the bucket) + and how smooth the time window moves. + Default value: ``{max_age_time = math.huge, age_buckets_count = 1}``. + + :param table metainfo: collector metainfo. + + :return: A summary object (see :ref:`summary_obj `). + + :rtype: summary_obj + + See also: :ref:`metrics-api_reference_create_custom_metrics` + + .. note:: + + A summary represents a set of collectors: + + * ``name .. "_sum"`` -- a counter holding the sum of added observations. + * ``name .. "_count"`` -- a counter holding the number of added observations. + * ``name`` holds all the quantiles under observation that find themselves + under the label ``quantile`` (less or equal). + To access bucket ``x`` (where ``x`` is a number), + specify the value ``x`` for the label ``quantile``. + + + +.. _metrics_unregister_callback: + +.. 
function:: unregister_callback(callback) + + Unregister a function named ``callback`` that is called right before metric + collection on plugin export. + + :param function callback: a function that takes no parameters. + + **Example:** + + .. code-block:: lua + + local cpu_callback = function() + local cpu_metrics = require('metrics.psutils.cpu') + cpu_metrics.update() + end + + metrics.register_callback(cpu_callback) + + -- after a while, we don't need that callback function anymore + + metrics.unregister_callback(cpu_callback) + + + +.. _metrics-http_middleware-api_reference-functions: + +metrics.http_middleware API +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. module:: metrics.http_middleware + +.. _metrics_http_middleware_build_default_collector: + +.. function:: build_default_collector(type_name, name [, help]) + + Register and return a collector for the middleware. + + :param string type_name: collector type: ``histogram`` or ``summary``. The default is ``histogram``. + :param string name: collector name. The default is ``http_server_request_latency``. + :param string help: collector description. The default is ``HTTP Server Request Latency``. + + :return: A collector object + + **Possible errors:** + + * A collector with the same type and name already exists in the registry. + + + +.. _metrics_http_middleware_configure_default_collector: + +.. function:: configure_default_collector(type_name, name, help) + + Register a collector for the middleware and set it as default. + + :param string type_name: collector type: ``histogram`` or ``summary``. The default is ``histogram``. + :param string name: collector name. The default is ``http_server_request_latency``. + :param string help: collector description. The default is ``HTTP Server Request Latency``. + + **Possible errors:** + + * A collector with the same type and name already exists in the registry. + + +.. _metrics_http_middleware_get_default_collector: + +.. function:: get_default_collector() + + Return the default collector. + If the default collector hasn't been set yet, register it + (with default :ref:`http_middleware.build_default_collector() ` parameters) + and set it as default. + + :return: A collector object + + +.. _metrics_http_middleware_set_default_collector: + +.. function:: set_default_collector(collector) + + Set the default collector. + + :param collector: middleware collector object + + + +.. _metrics_http_middleware_v1: + +.. function:: v1(handler, collector) + + Latency measuring wrap-up for the HTTP ver. ``1.x.x`` handler. Returns a wrapped handler. + + Learn more in :ref:`metrics-api_reference-collecting_http_statistics`. + + :param function handler: handler function. + :param collector: middleware collector object. + If not set, the default collector is used + (like in :ref:`http_middleware.get_default_collector() `). + + **Usage:** + + .. code-block:: lua + + httpd:route(route, http_middleware.v1(request_handler, collector)) + + See also: :ref:`metrics-api_reference-collecting_http_statistics` + + + + + +.. _metrics-module-api-reference-objects: + +Related objects +~~~~~~~~~~~~~~~ + +.. _metrics_collector_object: + +.. class:: collector_object + + A collector object. + + See also: :ref:`metrics-plugins-custom` + + .. method:: collect() + + Collect observations from this collector. + To collect observations from each collector, use :ref:`metrics.collectors() `. + + ``collector_object:collect()`` is equivalent to the following code: + + .. 
code-block:: lua + + for _, c in pairs(metrics.collectors()) do + for _, obs in ipairs(c:collect()) do + ... -- handle observation + end + end + + :return: A concatenation of ``observation`` objects across all created collectors. + + .. code-block:: lua + + { + label_pairs: table, -- `label_pairs` key-value table + timestamp: ctype, -- current system time (in microseconds) + value: number, -- current value + metric_name: string, -- collector + } + + :rtype: table + + + +.. _metrics_counter_obj: + +.. class:: counter_obj + + A counter object. + + .. _metrics-api_reference-counter_inc: + + .. method:: inc(num, label_pairs) + + Increment the observation for ``label_pairs``. + If ``label_pairs`` doesn't exist, the method creates it. + + See also: :ref:`metrics-api_reference-labels` + + :param number num: increment value. + :param table label_pairs: table containing label names as keys, + label values as values. Note that both + label names and values in ``label_pairs`` + are treated as strings. + + .. _metrics-api_reference-counter_collect: + + .. method:: collect() + + :return: Array of ``observation`` objects for a given counter. + + .. code-block:: lua + + { + label_pairs: table, -- `label_pairs` key-value table + timestamp: ctype, -- current system time (in microseconds) + value: number, -- current value + metric_name: string, -- collector + } + + :rtype: table + + .. _metrics-api_reference-counter_remove: + + .. method:: remove(label_pairs) + + Remove the observation for :ref:`label_pairs `. + + .. _metrics-api_reference-counter_reset: + + .. method:: reset(label_pairs) + + Set the observation for :ref:`label_pairs ` to 0. + + :param table label_pairs: table containing label names as keys, + label values as values. Note that both + label names and values in ``label_pairs`` + are treated as strings. + + + +.. _metrics_gauge_obj: + +.. class:: gauge_obj + + .. _metrics_gauge_obj_inc: + + .. method:: inc(num, label_pairs) + + Increment the observation for :ref:`label_pairs `. + If ``label_pairs`` doesn't exist, the method creates it. + + .. _metrics_gauge_obj_dec: + + .. method:: dec(num, label_pairs) + + Decrement the observation for :ref:`label_pairs `. + + .. _metrics_gauge_obj_set: + + .. method:: set(num, label_pairs) + + Set the observation for :ref:`label_pairs ` to ``num``. + + .. _metrics_gauge_obj_collect: + + .. method:: collect() + + Get an array of ``observation`` objects for a given gauge. + For the description of ``observation``, see + :ref:`counter_obj:collect() `. + + .. _metrics_gauge_obj_remove: + + .. method:: remove(label_pairs) + + Remove the observation for :ref:`label_pairs `. + + + +.. _metrics_histogram_obj: + +.. class:: histogram_obj + + .. _metrics_histogram_obj_observe: + + .. method:: observe(num, label_pairs) + + Record a new value in a histogram. + This increments all bucket sizes under the labels ``le`` >= ``num`` + and the labels that match ``label_pairs``. + + :param number num: value to put in the histogram. + :param table label_pairs: table containing label names as keys, + label values as values. + All internal counters that have these labels specified + observe new counter values. + Note that both label names and values in ``label_pairs`` + are treated as strings. + See also: :ref:`metrics-api_reference-labels`. + + .. _metrics_histogram_obj_collect: + + .. method:: collect() + + Return a concatenation of ``counter_obj:collect()`` across all internal + counters of ``histogram_obj``. 
For the description of ``observation``, + see :ref:`counter_obj:collect() `. + + .. _metrics_histogram_obj_remove: + + .. method:: remove(label_pairs) + + Works like the ``remove()`` function + of a :ref:`counter `. + + + + +.. _metrics_registry: + +.. class:: registry + + .. _metrics_registry_unregister: + + .. method:: unregister(collector) + + Remove a collector from the registry. + + :param collector_obj collector: the collector to be removed. + + **Example:** + + .. code-block:: lua + + local collector = metrics.gauge('some-gauge') + + -- after a while, we don't need it anymore + + metrics.registry:unregister(collector) + + .. _metrics_registry_find: + + .. method:: find(kind, name) + + Find a collector in the registry. + + :param string kind: collector kind (``counter``, ``gauge``, ``histogram``, or ``summary``). + :param string name: collector name. + + :return: A collector object or ``nil``. + + :rtype: collector_obj + + **Example:** + + .. code-block:: lua + + local collector = metrics.gauge('some-gauge') + + collector = metrics.registry:find('gauge', 'some-gauge') + + + +.. _metrics_summary_obj: + +.. class:: summary_obj + + .. _metrics_summary_obj_observe: + + .. method:: observe(num, label_pairs) + + Record a new value in a summary. + + :param number num: value to put in the data stream. + :param table label_pairs: a table containing label names as keys, + label values as values. + All internal counters that have these labels specified + observe new counter values. + You can't add the ``"quantile"`` label to a summary. + It is added automatically. + If ``max_age_time`` and ``age_buckets_count`` are set, + the observed value is added to each bucket. + Note that both label names and values in ``label_pairs`` + are treated as strings. + See also: :ref:`metrics-api_reference-labels`. + + .. _metrics_summary_obj_collect: + + .. method:: collect() + + Return a concatenation of ``counter_obj:collect()`` across all internal + counters of ``summary_obj``. For the description of ``observation``, + see :ref:`counter_obj:collect() `. + If ``max_age_time`` and ``age_buckets_count`` are set, quantile observations + are collected only from the head bucket in the sliding time window, + not from every bucket. If no observations were recorded, + the method will return ``NaN`` in the values. + + .. _metrics_summary_obj_remove: + + .. method:: remove(label_pairs) + + Works like the ``remove()`` function + of a :ref:`counter `. + + + + + +.. toctree:: + :hidden: + + metrics/prometheus + metrics/graphite + metrics/json diff --git a/doc/reference/reference_lua/metrics/graphite.rst b/doc/reference/reference_lua/metrics/graphite.rst new file mode 100644 index 0000000000..44d1e49d3e --- /dev/null +++ b/doc/reference/reference_lua/metrics/graphite.rst @@ -0,0 +1,31 @@ +.. _metrics-graphite-api_reference: + +metrics.plugins.graphite +======================== + +.. module:: metrics.plugins.graphite + +.. function:: init(options) + + Send all metrics to a remote Graphite server. + Exported metric names are formatted as follows: ``.``. + + :param table options: possible options: + + * ``prefix`` (string): metrics prefix (``'tarantool'`` by default) + * ``host`` (string): Graphite server host (``'127.0.0.1'`` by default) + * ``port`` (number): Graphite server port (``2003`` by default) + * ``send_interval`` (number): metrics collection interval in seconds + (``2`` by default) + +**Example** + +.. 
code-block:: lua + + local graphite_plugin = require('metrics.plugins.graphite') + graphite_plugin.init { + prefix = 'tarantool', + host = '127.0.0.1', + port = 2003, + send_interval = 1, + } diff --git a/doc/reference/reference_lua/metrics/json.rst b/doc/reference/reference_lua/metrics/json.rst new file mode 100644 index 0000000000..1f9a9c534b --- /dev/null +++ b/doc/reference/reference_lua/metrics/json.rst @@ -0,0 +1,33 @@ +.. _metrics-json-api_reference: + +metrics.plugins.json +==================== + +.. module:: metrics.plugins.json + +.. function:: export() + + Export metrics in the JSON format. + + :return: a string containing metrics in the JSON format + + :rtype: string + + .. IMPORTANT:: + + The values can also be ``+-math.huge`` and ``math.huge * 0``. In such case: + + * ``math.huge`` is serialized to ``"inf"`` + * ``-math.huge`` is serialized to ``"-inf"`` + * ``math.huge * 0`` is serialized to ``"nan"``. + + +**Example** + +.. literalinclude:: /code_snippets/snippets/config/instances.enabled/metrics_plugins/examples/expose_json_metrics.lua + :start-at: local json_plugin + :end-at: local json_metrics + :language: lua + :dedent: + +Example on GitHub: `metrics_plugins `_ diff --git a/doc/reference/reference_lua/metrics/prometheus.rst b/doc/reference/reference_lua/metrics/prometheus.rst new file mode 100644 index 0000000000..d475a06779 --- /dev/null +++ b/doc/reference/reference_lua/metrics/prometheus.rst @@ -0,0 +1,28 @@ +.. _metrics-prometheus-api_reference: + +metrics.plugins.prometheus +========================== + +.. module:: metrics.plugins.prometheus + +.. function:: collect_http() + + Get an HTTP response object containing metrics in the Prometheus format. + + :return: a table containing the following fields: + + * ``status``: set to ``200`` + * ``headers``: response headers + * ``body``: metrics in the Prometheus format + + :rtype: table + +**Example** + +.. literalinclude:: /code_snippets/snippets/config/instances.enabled/metrics_plugins/examples/expose_prometheus_metrics.lua + :start-at: local prometheus_plugin + :end-at: local prometheus_metrics + :language: lua + :dedent: + +Example on GitHub: `metrics_plugins `_ diff --git a/doc/reference/reference_rock/index.rst b/doc/reference/reference_rock/index.rst index 3e19e710b3..4d8d9fec95 100644 --- a/doc/reference/reference_rock/index.rst +++ b/doc/reference/reference_rock/index.rst @@ -10,7 +10,6 @@ This reference covers third-party Lua modules for Tarantool. :maxdepth: 1 membership - Module metrics <../../book/monitoring/index> Module luatest vshard/index dbms diff --git a/modules/grafana-dashboard b/modules/grafana-dashboard deleted file mode 160000 index 30dd7fad88..0000000000 --- a/modules/grafana-dashboard +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 30dd7fad88b58e6494801c410305a6c7514bdfab diff --git a/modules/metrics b/modules/metrics deleted file mode 160000 index 7652824a1a..0000000000 --- a/modules/metrics +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7652824a1a08c40fb98cd755bf32c8378a1b4362 diff --git a/pull_submodules.py b/pull_submodules.py index 79a9f27ac4..3e2a761810 100755 --- a/pull_submodules.py +++ b/pull_submodules.py @@ -6,9 +6,7 @@ modules_dir = 'modules' modules = { - 'grafana-dashboard': 'INPUT_GRAFANA', 'luatest': 'INPUT_LUATEST', - 'metrics': 'INPUT_METRICS', 'tntcxx': 'INPUT_CPP_DRIVER', } workdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'modules')