From 06dec6a0d6d415d8077c77ffbce227d45911f8df Mon Sep 17 00:00:00 2001
From: Georgy Moiseev <georgy.moiseev@corp.mail.ru>
Date: Fri, 3 Dec 2021 23:57:28 +0300
Subject: [PATCH] Integrate CRUD statistics with metrics

If `metrics` [1] found, metrics collectors are used to store
statistics. It is required to use `>= 0.5.0`, while at least `0.9.0`
is recommended to support age buckets in summary. The metrics are part
of global registry and can be exported together (e.g. to Prometheus)
with default tools without any additional configuration. Disabling
stats destroys the collectors.

If `metrics` found, `latency` statistics are changed to 0.99 quantile
of request execution time (with aging).

Add CI matrix to run tests with `metrics` installed.

1. https://github.com/tarantool/metrics

Closes #224
---
 .github/workflows/test_on_push.yaml |   8 +
 CHANGELOG.md                        |   1 +
 README.md                           |  49 +++++-
 crud/stats/metrics_registry.lua     | 223 ++++++++++++++++++++++++++
 crud/stats/module.lua               |  12 +-
 test/integration/stats_test.lua     | 237 ++++++++++++++++++++++++++++
 6 files changed, 526 insertions(+), 4 deletions(-)
 create mode 100644 crud/stats/metrics_registry.lua

diff --git a/.github/workflows/test_on_push.yaml b/.github/workflows/test_on_push.yaml
index bcf6d348c..baec4ea5f 100644
--- a/.github/workflows/test_on_push.yaml
+++ b/.github/workflows/test_on_push.yaml
@@ -14,12 +14,16 @@ jobs:
         # We need 1.10.6 here to check that module works with
         # old Tarantool versions that don't have "tuple-keydef"/"tuple-merger" support.
         tarantool-version: ["1.10.6", "1.10", "2.2", "2.3", "2.4", "2.5", "2.6", "2.7"]
+        metrics-version: [""]
         remove-merger: [false]
         include:
+          - tarantool-version: "1.10"
+            metrics-version: "0.12.0"
           - tarantool-version: "2.7"
             remove-merger: true
           - tarantool-version: "2.8"
             coveralls: true
+            metrics-version: ["", "0.12.0"]
       fail-fast: false
     runs-on: [ubuntu-latest]
     steps:
@@ -47,6 +51,10 @@ jobs:
           tarantool --version
           ./deps.sh
 
+      - name: Install metrics
+        if: matrix.metrics-version != ''
+        run: tarantoolctl rocks install metrics ${{ matrix.metrics-version }}
+
       - name: Remove external merger if needed
         if: ${{ matrix.remove-merger }}
         run: rm .rocks/lib/tarantool/tuple/merger.so
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0a3d9325f..403ddf61c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ### Added
 * Statistics for CRUD operations on router (#224).
+* Integrate CRUD statistics with `metrics` (#224).
 
 ### Changed
 
diff --git a/README.md b/README.md
index 3e4781c97..196c578b2 100644
--- a/README.md
+++ b/README.md
@@ -604,6 +604,11 @@ crud.disable_stats()
 crud.enable_stats()
 ```
 
+If [`metrics`](https://github.com/tarantool/metrics) found,
+metrics collectors are used to store statistics.
+It is required to use `>= 0.5.0`, while at least `0.9.0`
+is recommended to support age buckets in summary.
+
 Enabling stats on non-router instances is meaningless.
 
 `crud.stats()` contains several sections: `insert` (for `insert` and `insert_object` calls),
@@ -628,9 +633,46 @@ crud.stats()['insert']
 Each section contains different collectors for success calls
 and error (both error throw and `nil, err`) returns. `count`
 is total requests count since instance start or stats restart.
-`latency` is average time of requests execution,
+`latency` is 0.99 quantile of request execution time,
+(if `metrics` not found, shows average instead).
 `time` is total time of requests execution.
 
+In `metrics` registry statistics are stored as `tnt_crud_stats` metrics
+with `operation` and `status` label_pairs.
+```
+metrics:collect()
+---
+- - label_pairs:
+      status: ok
+      operation: insert
+    value: 221411
+    metric_name: tnt_crud_stats_count
+  - label_pairs:
+      status: ok
+      operation: insert
+    value: 10.49834896344692
+    metric_name: tnt_crud_stats_sum
+  - label_pairs:
+      status: ok
+      operation: insert
+      quantile: 0.5
+    value: 0.000003523699706
+    metric_name: tnt_crud_stats
+  - label_pairs:
+      status: ok
+      operation: insert
+      quantile: 0.9
+    value: 0.000006997063523
+    metric_name: tnt_crud_stats
+  - label_pairs:
+      status: ok
+      operation: insert
+      quantile: 0.99
+    value: 0.00023606420935973
+    metric_name: tnt_crud_stats
+...
+```
+
 Additionally, `select` section contains `details` collectors.
 ```lua
 crud.stats()['select']['details']
@@ -644,7 +686,10 @@ crud.stats()['select']['details']
 (including those not executed successfully). `tuples_fetched`
 is a count of tuples fetched from storages during execution,
 `tuples_lookup` is a count of tuples looked up on storages
-while collecting response for call.
+while collecting response for call. In `metrics` registry they
+are stored as `tnt_crud_map_reduces`, `tnt_crud_tuples_fetched`
+and `tnt_crud_tuples_lookup` metrics with
+`{ operation = 'select' }` label_pairs.
 
 ## Cartridge roles
 
diff --git a/crud/stats/metrics_registry.lua b/crud/stats/metrics_registry.lua
new file mode 100644
index 000000000..81d1fe1da
--- /dev/null
+++ b/crud/stats/metrics_registry.lua
@@ -0,0 +1,223 @@
+local is_package, metrics = pcall(require, 'metrics')
+local label = require('crud.stats.label')
+local dev_checks = require('crud.common.dev_checks')
+local registry_common = require('crud.stats.registry_common')
+
+local registry = {}
+local _registry = {}
+
+local metric_name = {
+    -- Summary collector for all operations.
+    op = 'tnt_crud_stats',
+    -- `*_count` and `*_sum` are automatically created
+    -- by summary collector.
+    op_count = 'tnt_crud_stats_count',
+    op_sum = 'tnt_crud_stats_sum',
+
+    -- Counter collectors for select/pairs details.
+    tuples_fetched = 'tnt_crud_tuples_fetched',
+    tuples_lookup = 'tnt_crud_tuples_lookup',
+    map_reduces = 'tnt_crud_map_reduces',
+}
+
+local LATENCY_QUANTILE = 0.99
+
+local DEFAULT_QUANTILES = {
+    [0.5] = 1e-9,
+    [0.9] = 1e-9,
+    [LATENCY_QUANTILE] = 1e-9,
+}
+
+local DEFAULT_SUMMARY_PARAMS = {
+    max_age_time = 60,
+    age_buckets_count = 5,
+}
+
+--- Check if application supports metrics rock for registry
+--
+--  `metrics >= 0.5.0` is required to use summary,
+--  while at least `metrics >= 0.9.0` is required
+--  for age bucket support in quantiles.
+--
+-- @function is_supported
+--
+-- @treturn boolean Returns true if `metrics >= 0.5.0` found, false otherwise.
+--
+function registry.is_supported()
+    if is_package == false then
+        return false
+    end
+
+    -- Only metrics >= 0.5.0 supported.
+    if metrics.summary == nil then
+        return false
+    end
+
+    return true
+end
+
+
+--- Initialize collectors in global metrics registry
+--
+-- @function init
+--
+-- @treturn boolean Returns true.
+--
+function registry.init()
+    _registry[metric_name.op] = metrics.summary(
+        metric_name.op,
+        'CRUD router calls statistics',
+        DEFAULT_QUANTILES,
+        DEFAULT_SUMMARY_PARAMS)
+
+    _registry[metric_name.tuples_fetched] = metrics.counter(
+        metric_name.tuples_fetched,
+        'Tuples fetched from CRUD storages during select/pairs')
+
+    _registry[metric_name.tuples_lookup] = metrics.counter(
+        metric_name.tuples_lookup,
+        'Tuples looked up on CRUD storages while collecting response during select/pairs')
+
+    _registry[metric_name.map_reduces] = metrics.counter(
+        metric_name.map_reduces,
+        'Map reduces planned during CRUD select/pairs')
+
+    return true
+end
+
+--- Unregister collectors in global metrics registry
+--
+-- @function destroy
+--
+-- @treturn boolean Returns true.
+--
+function registry.destroy()
+    for _, c in pairs(_registry) do
+        metrics.registry:unregister(c)
+    end
+
+    _registry = {}
+    return true
+end
+
+--- Get copy of global metrics registry
+--
+-- @function get
+--
+-- @treturn table Returns copy of metrics registry.
+function registry.get()
+    local stats = {}
+
+    if next(_registry) == nil then
+        return stats
+    end
+
+    -- Fill empty collectors with zero values.
+    for _, op_label in pairs(label) do
+        stats[op_label] = registry_common.build_collector(op_label)
+    end
+
+    for _, obs in ipairs(_registry[metric_name.op]:collect()) do
+        local operation = obs.label_pairs.operation
+        local status = obs.label_pairs.status
+        if obs.metric_name == metric_name.op then
+            if obs.label_pairs.quantile == LATENCY_QUANTILE then
+                stats[operation][status].latency = obs.value
+            end
+        elseif obs.metric_name == metric_name.op_sum then
+            stats[operation][status].time = obs.value
+        elseif obs.metric_name == metric_name.op_count then
+            stats[operation][status].count = obs.value
+        end
+    end
+
+    local _, obs_tuples_fetched = next(_registry[metric_name.tuples_fetched]:collect())
+    if obs_tuples_fetched ~= nil then
+        stats[label.SELECT].details.tuples_fetched = obs_tuples_fetched.value
+    end
+
+    local _, obs_tuples_lookup = next(_registry[metric_name.tuples_lookup]:collect())
+    if obs_tuples_lookup ~= nil then
+        stats[label.SELECT].details.tuples_lookup = obs_tuples_lookup.value
+    end
+
+    local _, obs_map_reduces = next(_registry[metric_name.map_reduces]:collect())
+    if obs_map_reduces ~= nil then
+        stats[label.SELECT].details.map_reduces = obs_map_reduces.value
+    end
+
+    return stats
+end
+
+--- Increase requests count and update latency info
+--
+-- @function observe
+--
+-- @tparam string op_label
+--  Label of registry collectos.
+--  Use `require('crud.common.const').OP` to pick one.
+--
+-- @tparam boolean success
+--  true if no errors on execution, false otherwise.
+--
+-- @tparam number latency
+--  Time of call execution.
+--
+-- @treturn boolean Returns true.
+--
+function registry.observe(op_label, success, latency)
+    dev_checks('string', 'boolean', 'number')
+
+    local label_pairs = { operation = op_label }
+    if success == true then
+        label_pairs.status = 'ok'
+    else
+        label_pairs.status = 'error'
+    end
+
+    _registry[metric_name.op]:observe(latency, label_pairs)
+
+    return true
+end
+
+--- Increase statistics of storage select/pairs calls
+--
+-- @function observe_fetch
+--
+-- @tparam number tuples_fetched
+--  Count of tuples fetched during storage call.
+--
+-- @tparam number tuples_lookup
+--  Count of tuples looked up on storages while collecting response.
+--
+-- @treturn boolean Returns true.
+--
+function registry.observe_fetch(tuples_fetched, tuples_lookup)
+    dev_checks('number', 'number')
+
+    local label_pairs = { operation = label.SELECT }
+
+    _registry[metric_name.tuples_fetched]:inc(tuples_fetched, label_pairs)
+    _registry[metric_name.tuples_lookup]:inc(tuples_lookup, label_pairs)
+    return true
+end
+
+--- Increase statistics of planned map reduces during select/pairs
+--
+-- @function observe_map_reduces
+--
+-- @tparam number count
+--  Count of map reduces planned.
+--
+-- @treturn boolean Returns true.
+--
+function registry.observe_map_reduces(count)
+    dev_checks('number')
+
+    local label_pairs = { operation = label.SELECT }
+
+    _registry[metric_name.map_reduces]:inc(count, label_pairs)
+    return true
+end
+
+return registry
diff --git a/crud/stats/module.lua b/crud/stats/module.lua
index d13f94217..647380f57 100644
--- a/crud/stats/module.lua
+++ b/crud/stats/module.lua
@@ -2,11 +2,19 @@ local clock = require('clock')
 local dev_checks = require('crud.common.dev_checks')
 local utils = require('crud.common.utils')
 
-local stats_registry = require('crud.stats.local_registry')
-
 local stats = {}
 local _is_enabled = false
 
+local stats_registry
+local local_registry = require('crud.stats.local_registry')
+local metrics_registry = require('crud.stats.metrics_registry')
+
+if metrics_registry.is_supported() then
+    stats_registry = metrics_registry
+else
+    stats_registry = local_registry
+end
+
 --- Check if statistics module if enabled
 --
 -- @function is_enabled
diff --git a/test/integration/stats_test.lua b/test/integration/stats_test.lua
index f6c0ad4a2..4c1aaf9a3 100644
--- a/test/integration/stats_test.lua
+++ b/test/integration/stats_test.lua
@@ -456,3 +456,240 @@ for name, case in pairs(select_cases) do
             'Expected count of map reduces planned')
     end
 end
+
+local function check_metrics_registry_supported(router)
+    local is_metrics_supported = router:eval([[
+        return require('crud.stats.metrics_registry').is_supported()
+    ]])
+    t.skip_if(is_metrics_supported == false, 'Metrics registry is unsupported')
+end
+
+local function generate_stats(router)
+    -- Generate non-null stats for all operations.
+    for _, case in pairs(simple_operation_cases) do
+        if case.prepare ~= nil then
+            case.prepare(g)
+        end
+        local _, err = router:call(case.func, case.args)
+        if case.expect_error ~= true then
+            t.assert_equals(err, nil)
+        end
+    end
+
+    prepare_select_data(g)
+    for _, case in pairs(select_cases) do
+        local _, err = router:eval(case.eval, { case.conditions })
+        if case.expect_error ~= true then
+            t.assert_equals(err, nil)
+        end
+    end
+end
+
+-- https://github.com/tarantool/metrics/blob/fc5a67072340b12f983f09b7d383aca9e2f10cf1/test/utils.lua#L22-L31
+local function find_obs(metric_name, label_pairs, observations)
+    for _, obs in pairs(observations) do
+        local same_label_pairs = pcall(t.assert_equals, obs.label_pairs, label_pairs)
+        if obs.metric_name == metric_name and same_label_pairs then
+            return obs
+        end
+    end
+    t.assert_items_include(
+        observations,
+        { metric_name = metric_name, label_pairs = label_pairs },
+        'Missing observation')
+end
+
+-- https://github.com/tarantool/metrics/blob/fc5a67072340b12f983f09b7d383aca9e2f10cf1/test/utils.lua#L55-L63
+local function find_metric(metric_name, metrics_data)
+    local m = {}
+    for _, v in ipairs(metrics_data) do
+        if v.metric_name == metric_name then
+            table.insert(m, v)
+        end
+    end
+    return #m > 0 and m or nil
+end
+
+local function get_unique_label_values(metrics_data, label_key)
+    local label_values_map = {}
+    for _, v in ipairs(metrics_data) do
+        local label_pairs = v.label_pairs or {}
+        if label_pairs[label_key] ~= nil then
+            label_values_map[label_pairs[label_key]] = true
+        end
+    end
+
+    local label_values = {}
+    for k, _ in pairs(label_values_map) do
+        table.insert(label_values, k)
+    end
+
+    return label_values
+end
+
+g.before_test('test_stats_stored_in_global_metrics_registry', function(g)
+    local router = g.cluster:server('router').net_box
+    check_metrics_registry_supported(router)
+    generate_stats(router)
+end)
+
+g.test_stats_stored_in_global_metrics_registry = function(g)
+    local router = g.cluster:server('router').net_box
+    local metrics = router:eval("return require('metrics').collect()")
+
+    local stats = find_metric('tnt_crud_stats', metrics)
+    t.assert_type(stats, 'table', '`tnt_crud_stats` summary metrics found')
+
+    local stats_count = find_metric('tnt_crud_stats_count', metrics)
+    t.assert_type(stats_count, 'table', '`tnt_crud_stats` summary metrics found')
+
+    local stats_sum = find_metric('tnt_crud_stats_sum', metrics)
+    t.assert_type(stats_sum, 'table', '`tnt_crud_stats` summary  metrics found')
+
+
+    local expected_operations = { 'insert', 'get', 'replace', 'update',
+        'upsert', 'delete', 'select', 'truncate', 'len', 'borders' }
+
+    t.assert_items_equals(
+        get_unique_label_values(stats, 'operation'),
+        expected_operations,
+        'Metrics are labelled with operation')
+
+    t.assert_items_equals(
+        get_unique_label_values(stats_count, 'operation'),
+        expected_operations,
+        'Metrics are labelled with operation')
+
+    t.assert_items_equals(
+        get_unique_label_values(stats_sum, 'operation'),
+        expected_operations,
+        'Metrics are labelled with operation')
+
+
+    local expected_statuses = { 'ok', 'error' }
+
+    t.assert_items_equals(
+        get_unique_label_values(stats, 'status'),
+        expected_statuses,
+        'Metrics are labelled with status')
+
+    t.assert_items_equals(
+        get_unique_label_values(stats_count, 'status'),
+        expected_statuses,
+        'Metrics are labelled with operation')
+
+    t.assert_items_equals(
+        get_unique_label_values(stats_sum, 'status'),
+        expected_statuses,
+        'Metrics are labelled with operation')
+
+
+    local tuples_fetched = find_metric('tnt_crud_tuples_fetched', metrics)
+    t.assert_type(tuples_fetched, 'table', '`tnt_crud_tuples_fetched` metrics found')
+
+    t.assert_items_equals(
+        get_unique_label_values(tuples_fetched, 'operation'),
+        { 'select' },
+        'Metrics are labelled with operation')
+
+
+    local tuples_lookup = find_metric('tnt_crud_tuples_lookup', metrics)
+    t.assert_type(tuples_lookup, 'table', '`tnt_crud_tuples_lookup` metrics found')
+
+    t.assert_items_equals(
+        get_unique_label_values(tuples_lookup, 'operation'),
+        { 'select' },
+        'Metrics are labelled with operation')
+
+
+    local map_reduces = find_metric('tnt_crud_map_reduces', metrics)
+    t.assert_type(map_reduces, 'table', '`tnt_crud_map_reduces` metrics found')
+
+    t.assert_items_equals(
+        get_unique_label_values(map_reduces, 'operation'),
+        { 'select' },
+        'Metrics are labelled with operation')
+end
+
+
+g.before_test('test_metrics_updated_per_call', function(g)
+    local router = g.cluster:server('router').net_box
+    check_metrics_registry_supported(router)
+    generate_stats(router)
+end)
+
+g.test_metrics_updated_per_call = function(g)
+    local router = g.cluster:server('router').net_box
+
+    local metrics_before = router:eval("return require('metrics').collect()")
+    local count_before = find_obs('tnt_crud_stats_count',
+        { operation = 'select', status = 'ok' }, metrics_before)
+    local time_before = find_obs('tnt_crud_stats_sum',
+        { operation = 'select', status = 'ok' }, metrics_before)
+    local tuples_lookup_before = find_obs('tnt_crud_tuples_lookup',
+        { operation = 'select' }, metrics_before)
+    local tuples_fetched_before = find_obs('tnt_crud_tuples_fetched',
+        { operation = 'select' }, metrics_before)
+    local map_reduces_before = find_obs('tnt_crud_map_reduces',
+        { operation = 'select' }, metrics_before)
+
+    local case = select_cases['select_by_secondary_index']
+    local _, err = router:eval(case.eval, { case.conditions })
+    t.assert_equals(err, nil)
+
+    local metrics_after = router:eval("return require('metrics').collect()")
+    local count_after = find_obs('tnt_crud_stats_count',
+        { operation = 'select', status = 'ok' }, metrics_after)
+    local time_after = find_obs('tnt_crud_stats_sum',
+        { operation = 'select', status = 'ok' }, metrics_after)
+    local tuples_lookup_after = find_obs('tnt_crud_tuples_lookup',
+        { operation = 'select' }, metrics_after)
+    local tuples_fetched_after = find_obs('tnt_crud_tuples_fetched',
+        { operation = 'select' }, metrics_after)
+    local map_reduces_after = find_obs('tnt_crud_map_reduces',
+        { operation = 'select' }, metrics_after)
+
+    t.assert_equals(count_after.value - count_before.value, 1,
+        '`select` metrics count increased')
+    t.assert_ge(time_after.value - time_before.value, 0,
+        '`select` total time increased')
+    t.assert_ge(tuples_lookup_after.value - tuples_lookup_before.value, case.tuples_lookup,
+        '`select` tuples lookup expected change')
+    t.assert_ge(tuples_fetched_after.value - tuples_fetched_before.value, case.tuples_lookup,
+        '`select` tuples feched expected change')
+    t.assert_ge(map_reduces_after.value - map_reduces_before.value, case.tuples_lookup,
+        '`select` map reduces expected change')
+end
+
+g.before_test('test_metrics_collectors_destroyed_if_stats_disabled', function(g)
+    local router = g.cluster:server('router').net_box
+    check_metrics_registry_supported(router)
+    generate_stats(router)
+end)
+
+g.test_metrics_collectors_destroyed_if_stats_disabled = function(g)
+    local router = g.cluster:server('router').net_box
+
+    local resp = router:eval("return require('crud').disable_stats()")
+    t.assert_equals(resp, true, 'Stats disabled on router')
+
+    local metrics = router:eval("return require('metrics').collect()")
+
+    local stats = find_metric('tnt_crud_stats', metrics)
+    t.assert_equals(stats, nil, '`tnt_crud_stats` summary metrics not found')
+
+    local stats_count = find_metric('tnt_crud_stats_count', metrics)
+    t.assert_equals(stats_count, nil, '`tnt_crud_stats` summary metrics not found')
+
+    local stats_sum = find_metric('tnt_crud_stats_sum', metrics)
+    t.assert_equals(stats_sum, nil, '`tnt_crud_stats` summary metrics not found')
+
+    local tuples_fetched = find_metric('tnt_crud_tuples_fetched', metrics)
+    t.assert_equals(tuples_fetched, nil, '`tnt_crud_tuples_fetched` metrics not found')
+
+    local tuples_lookup = find_metric('tnt_crud_tuples_lookup', metrics)
+    t.assert_equals(tuples_lookup, nil, '`tnt_crud_tuples_lookup` metrics not found')
+
+    local map_reduces = find_metric('tnt_crud_map_reduces', metrics)
+    t.assert_equals(map_reduces, nil, '`tnt_crud_map_reduces` metrics not found')
+end