Skip to content

Commit

Permalink
Support metrics integration without quantiles
Browse files Browse the repository at this point in the history
Computing quantiles increases the performance overhead of statistics by
nearly 10%. One may want to use statistics with metrics but without
quantiles. This patch makes that possible.

Follows up #224
  • Loading branch information
DifferentialOrange committed Dec 24, 2021
1 parent 3dd1c6f commit 195ff75
Show file tree
Hide file tree
Showing 9 changed files with 212 additions and 96 deletions.
12 changes: 8 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -610,14 +610,17 @@ If [`metrics`](https://github.com/tarantool/metrics) `0.9.0` or greater
found, metrics collectors will be used by default to store statistics
instead of local collectors. You can manually choose driver if needed.
```lua
-- Use metrics collectors.
-- Use metrics collectors. (Default if metrics found).
crud.enable_stats({ driver = 'metrics' })

-- Use metrics collectors with 0.99 quantile.
crud.enable_stats({ driver = 'metrics', quantiles = true })

-- Use simple local collectors.
crud.enable_stats({ driver = 'local' })
```
Performance overhead is 3-5% in case of `local` driver and
10-20% in case of `metrics` driver.
Performance overhead is 3-7% in case of `local` driver and
5-10% in case of `metrics` driver, up to 20% for `metrics` with quantiles.

Format is as follows.
```
Expand Down Expand Up @@ -668,7 +671,8 @@ Each operation section contains of different collectors
for success calls and error (both error throw and `nil, err`)
returns. `count` is total requests count since instance start
or stats restart. `latency` is 0.99 quantile of request execution
time if `metrics` driver used, otherwise `latency` is total average.
time if `metrics` driver used and quantiles enabled,
otherwise `latency` is total average.
`time` is total time of requests execution.

In [`metrics`](https://www.tarantool.io/en/doc/latest/book/monitoring/)
Expand Down
43 changes: 28 additions & 15 deletions crud/stats/local_registry.lua
Original file line number Diff line number Diff line change
@@ -1,23 +1,36 @@
local errors = require('errors')

local dev_checks = require('crud.common.dev_checks')
local op_module = require('crud.stats.operation')
local registry_common = require('crud.stats.registry_common')
local stash = require('crud.stats.stash')

local registry = {}
local internal_registry = stash.get('local_registry')
local internal = stash.get('local_registry')
local StatsLocalError = errors.new_class('StatsLocalError', {capture_stack = false})

--- Initialize local metrics registry
--
-- Registries are not meant to be used explicitly
-- by users; init is not guaranteed to be idempotent.
--
-- @function init
-- @tparam table opts
--
-- @tfield boolean quantiles
-- Quantiles are not supported for local, only `false` is valid.
--
-- @treturn boolean Returns true.
--
function registry.init()
internal_registry.spaces = {}
internal_registry.space_not_found = 0
function registry.init(opts)
dev_checks({ quantiles = 'boolean' })

StatsLocalError:assert(opts.quantiles == false,
"Quantiles are not supported for 'local' statistics registry")

internal.registry = {}
internal.registry.spaces = {}
internal.registry.space_not_found = 0

return true
end
Expand All @@ -32,7 +45,7 @@ end
-- @treturn boolean Returns true.
--
function registry.destroy()
internal_registry = stash.reset('local_registry')
internal.registry = nil

return true
end
Expand All @@ -58,10 +71,10 @@ function registry.get(space_name)
dev_checks('?string')

if space_name ~= nil then
return table.deepcopy(internal_registry.spaces[space_name]) or {}
return table.deepcopy(internal.registry.spaces[space_name]) or {}
end

return table.deepcopy(internal_registry)
return table.deepcopy(internal.registry)
end

--- Check if space statistics are present in registry
Expand All @@ -76,7 +89,7 @@ end
function registry.is_unknown_space(space_name)
dev_checks('string')

return internal_registry.spaces[space_name] == nil
return internal.registry.spaces[space_name] == nil
end

--- Increase requests count and update latency info
Expand All @@ -101,8 +114,8 @@ end
function registry.observe(latency, space_name, op, status)
dev_checks('number', 'string', 'string', 'string')

registry_common.init_collectors_if_required(internal_registry.spaces, space_name, op)
local collectors = internal_registry.spaces[space_name][op][status]
registry_common.init_collectors_if_required(internal.registry.spaces, space_name, op)
local collectors = internal.registry.spaces[space_name][op][status]

collectors.count = collectors.count + 1
collectors.time = collectors.time + latency
Expand All @@ -118,7 +131,7 @@ end
-- @treturn boolean Returns true.
--
function registry.observe_space_not_found()
internal_registry.space_not_found = internal_registry.space_not_found + 1
internal.registry.space_not_found = internal.registry.space_not_found + 1

return true
end
Expand All @@ -142,8 +155,8 @@ function registry.observe_fetch(tuples_fetched, tuples_lookup, space_name)
dev_checks('number', 'number', 'string')

local op = op_module.SELECT
registry_common.init_collectors_if_required(internal_registry.spaces, space_name, op)
local collectors = internal_registry.spaces[space_name][op].details
registry_common.init_collectors_if_required(internal.registry.spaces, space_name, op)
local collectors = internal.registry.spaces[space_name][op].details

collectors.tuples_fetched = collectors.tuples_fetched + tuples_fetched
collectors.tuples_lookup = collectors.tuples_lookup + tuples_lookup
Expand All @@ -167,8 +180,8 @@ function registry.observe_map_reduces(count, space_name)
dev_checks('number', 'string')

local op = op_module.SELECT
registry_common.init_collectors_if_required(internal_registry.spaces, space_name, op)
local collectors = internal_registry.spaces[space_name][op].details
registry_common.init_collectors_if_required(internal.registry.spaces, space_name, op)
local collectors = internal.registry.spaces[space_name][op].details

collectors.map_reduces = collectors.map_reduces + count

Expand Down
116 changes: 91 additions & 25 deletions crud/stats/metrics_registry.lua
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ local stash = require('crud.stats.stash')

local registry = {}
-- Used to cache collectors.
local internal_registry = stash.get('metrics_registry')
local internal = stash.get('metrics_registry')

local metric_name = {
-- Summary collector for all operations.
Expand All @@ -32,10 +32,10 @@ local LATENCY_QUANTILE = 0.99

-- Increasing tolerance threshold affects performance.
local DEFAULT_QUANTILES = {
[LATENCY_QUANTILE] = 1e-3,
[LATENCY_QUANTILE] = 1e-2,
}

local DEFAULT_SUMMARY_PARAMS = {
local DEFAULT_AGE_PARAMS = {
age_buckets_count = 2,
max_age_time = 60,
}
Expand Down Expand Up @@ -73,28 +73,45 @@ end
--
-- @function init
--
-- @tparam table opts
--
-- @tfield boolean quantiles
-- If true, computes latency as 0.99 quantile with aging.
--
-- @treturn boolean Returns true.
--
function registry.init()
internal_registry[metric_name.stats] = metrics.summary(
function registry.init(opts)
dev_checks({ quantiles = 'boolean' })

internal.opts = table.deepcopy(opts)

local quantile_params = nil
local age_params = nil
if opts.quantiles == true then
quantile_params = DEFAULT_QUANTILES
age_params = DEFAULT_AGE_PARAMS
end

internal.registry = {}
internal.registry[metric_name.stats] = metrics.summary(
metric_name.stats,
'CRUD router calls statistics',
DEFAULT_QUANTILES,
DEFAULT_SUMMARY_PARAMS)
quantile_params,
age_params)

internal_registry[metric_name.space_not_found] = metrics.counter(
internal.registry[metric_name.space_not_found] = metrics.counter(
metric_name.space_not_found,
'Spaces not found during CRUD calls')

internal_registry[metric_name.details.tuples_fetched] = metrics.counter(
internal.registry[metric_name.details.tuples_fetched] = metrics.counter(
metric_name.details.tuples_fetched,
'Tuples fetched from CRUD storages during select/pairs')

internal_registry[metric_name.details.tuples_lookup] = metrics.counter(
internal.registry[metric_name.details.tuples_lookup] = metrics.counter(
metric_name.details.tuples_lookup,
'Tuples looked up on CRUD storages while collecting response during select/pairs')

internal_registry[metric_name.details.map_reduces] = metrics.counter(
internal.registry[metric_name.details.map_reduces] = metrics.counter(
metric_name.details.map_reduces,
'Map reduces planned during CRUD select/pairs')

Expand All @@ -112,14 +129,58 @@ end
-- @treturn boolean Returns true.
--
function registry.destroy()
for _, c in pairs(internal_registry) do
for _, c in pairs(internal.registry) do
metrics.registry:unregister(c)
end

internal_registry = stash.reset('metrics_registry')
internal.registry = nil
internal.opts = nil

return true
end

--- Fill in the `latency` field of a single observation
-- For a { time = ..., count = ... } observation the latency
-- is the overall average (`time / count`, or 0 while nothing
-- has been counted) and it is written into the observation
-- table in place.
--
-- @function compute_obs_latency
--
-- @tparam table obs
--  Object from registry_common
--  stats.spaces[name][op][status].
--  A table without `count` or `time` (something like
--  the `details` collector) is left untouched.
--
local function compute_obs_latency(obs)
    local count, time = obs.count, obs.time

    -- Only { time, count } observations carry a latency.
    if count ~= nil and time ~= nil then
        local average = 0
        if count ~= 0 then
            average = time / count
        end

        obs.latency = average
    end
end

--- Fill in the `latency` field of every observation
-- When quantiles are disabled, latency has to be
-- derived as the overall average from the `time`
-- and `count` values of each observation.
--
-- @function compute_latencies
--
-- @tparam table stats
--  Statistics table; every value found at
--  stats.spaces[name][op][key] is passed to
--  compute_obs_latency.
--
local function compute_latencies(stats)
    for _, by_operation in pairs(stats.spaces) do
        for _, by_status in pairs(by_operation) do
            for _, collectors in pairs(by_status) do
                compute_obs_latency(collectors)
            end
        end
    end
end

--- Get copy of global metrics registry
--
-- Registries are not meant to be used explicitly
Expand All @@ -145,7 +206,7 @@ function registry.get(space_name)
}

-- Fill operation basic statistics values.
for _, obs in ipairs(internal_registry[metric_name.stats]:collect()) do
for _, obs in ipairs(internal.registry[metric_name.stats]:collect()) do
local op = obs.label_pairs.operation
local status = obs.label_pairs.status
local name = obs.label_pairs.name
Expand All @@ -157,6 +218,7 @@ function registry.get(space_name)
registry_common.init_collectors_if_required(stats.spaces, name, op)
local space_stats = stats.spaces[name]

-- metric_name.stats is present only if quantiles are enabled.
if obs.metric_name == metric_name.stats then
if obs.label_pairs.quantile == LATENCY_QUANTILE then
space_stats[op][status].latency = obs.value
Expand All @@ -170,9 +232,13 @@ function registry.get(space_name)
:: stats_continue ::
end

if not internal.opts.quantiles then
compute_latencies(stats)
end

-- Fill select/pairs detail statistics values.
for stat_name, metric_name in pairs(metric_name.details) do
for _, obs in ipairs(internal_registry[metric_name]:collect()) do
for _, obs in ipairs(internal.registry[metric_name]:collect()) do
local name = obs.label_pairs.name
local op = obs.label_pairs.operation

Expand All @@ -191,7 +257,7 @@ function registry.get(space_name)
return stats.spaces[space_name] or {}
end

local _, obs = next(internal_registry[metric_name.space_not_found]:collect())
local _, obs = next(internal.registry[metric_name.space_not_found]:collect())
if obs ~= nil then
stats.space_not_found = obs.value
end
Expand All @@ -211,7 +277,7 @@ end
function registry.is_unknown_space(space_name)
dev_checks('string')

for _, obs in ipairs(internal_registry[metric_name.stats]:collect()) do
for _, obs in ipairs(internal.registry[metric_name.stats]:collect()) do
local name = obs.label_pairs.name

if name == space_name then
Expand All @@ -220,7 +286,7 @@ function registry.is_unknown_space(space_name)
end

for _, metric_name in pairs(metric_name.details) do
for _, obs in ipairs(internal_registry[metric_name]:collect()) do
for _, obs in ipairs(internal.registry[metric_name]:collect()) do
local name = obs.label_pairs.name

if name == space_name then
Expand Down Expand Up @@ -259,7 +325,7 @@ function registry.observe(latency, space_name, op, status)
-- Use `status` label to be consistent with `tnt_vinyl_*` and HTTP metrics labels.
local label_pairs = { operation = op, name = space_name, status = status }

internal_registry[metric_name.stats]:observe(latency, label_pairs)
internal.registry[metric_name.stats]:observe(latency, label_pairs)

return true
end
Expand All @@ -271,7 +337,7 @@ end
-- @treturn boolean Returns true.
--
function registry.observe_space_not_found()
internal_registry[metric_name.space_not_found]:inc(1)
internal.registry[metric_name.space_not_found]:inc(1)

return true
end
Expand All @@ -296,8 +362,8 @@ function registry.observe_fetch(tuples_fetched, tuples_lookup, space_name)

local label_pairs = { name = space_name, operation = op_module.SELECT }

internal_registry[metric_name.details.tuples_fetched]:inc(tuples_fetched, label_pairs)
internal_registry[metric_name.details.tuples_lookup]:inc(tuples_lookup, label_pairs)
internal.registry[metric_name.details.tuples_fetched]:inc(tuples_fetched, label_pairs)
internal.registry[metric_name.details.tuples_lookup]:inc(tuples_lookup, label_pairs)

return true
end
Expand All @@ -318,7 +384,7 @@ function registry.observe_map_reduces(count, space_name)
dev_checks('number', 'string')

local label_pairs = { name = space_name, operation = op_module.SELECT }
internal_registry[metric_name.details.map_reduces]:inc(count, label_pairs)
internal.registry[metric_name.details.map_reduces]:inc(count, label_pairs)

return true
end
Expand All @@ -334,14 +400,14 @@ local function workaround_role_reload()
end

-- Check if this registry was enabled before reload.
if next(internal_registry) == nil then
if internal.registry == nil then
return
end

-- Check if base collector is in metrics package registry.
-- If it's not, then registry has been cleaned up on role reload.
if metrics.registry:find('summary', metric_name.stats) == nil then
registry.init()
registry.init(internal.opts)
end
end

Expand Down
Loading

0 comments on commit 195ff75

Please sign in to comment.