Skip to content

stats: make quantile tolerated error configurable #281

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]

### Added
* Make metrics quantile collector tolerated error configurable (#281).

### Changed
* Change metrics quantile collector default tolerated error
from 1e-2 to 1e-3 (#281).

### Fixed
* Requests no more fail with "Sharding hash mismatch" error
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,13 @@ metrics:collect()
metric_name: tnt_crud_stats
...
```
If you see `-Inf` value in quantile metrics, try to decrease the tolerated error:
```lua
crud.cfg{stats_quantile_tolerated_error = 1e-4}
```
See [tarantool/metrics#189](https://github.com/tarantool/metrics/issues/189) for
details about the issue.


`select` section additionally contains `details` collectors.
```lua
Expand Down
28 changes: 25 additions & 3 deletions crud/cfg.lua
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ local function set_defaults_if_empty(cfg)
cfg.stats_quantiles = false
end

if cfg.stats_quantile_tolerated_error == nil then
cfg.stats_quantile_tolerated_error = stats.DEFAULT_QUANTILE_TOLERATED_ERROR
end

return cfg
end

Expand All @@ -33,7 +37,8 @@ local cfg = set_defaults_if_empty(stash.get(stash.name.cfg))
local function configure_stats(cfg, opts)
if (opts.stats == nil)
and (opts.stats_driver == nil)
and (opts.stats_quantiles == nil) then
and (opts.stats_quantiles == nil)
and (opts.stats_quantile_tolerated_error == nil) then
return
end

Expand All @@ -49,15 +54,24 @@ local function configure_stats(cfg, opts)
opts.stats_quantiles = cfg.stats_quantiles
end

if opts.stats_quantiles == nil then
opts.stats_quantile_tolerated_error = cfg.stats_quantile_tolerated_error
end

if opts.stats == true then
stats.enable{ driver = opts.stats_driver, quantiles = opts.stats_quantiles }
stats.enable{
driver = opts.stats_driver,
quantiles = opts.stats_quantiles,
quantile_tolerated_error = opts.stats_quantile_tolerated_error,
}
else
stats.disable()
end

rawset(cfg, 'stats', opts.stats)
rawset(cfg, 'stats_driver', opts.stats_driver)
rawset(cfg, 'stats_quantiles', opts.stats_quantiles)
rawset(cfg, 'stats_quantile_tolerated_error', opts.stats_quantile_tolerated_error)
end

--- Configure CRUD module.
Expand Down Expand Up @@ -86,13 +100,21 @@ end
-- Enable or disable statistics quantiles (only for metrics driver).
-- Quantiles computations increases performance overhead up to 10%.
--
-- @number[opt=1e-3] opts.stats_quantile_tolerated_error
-- See tarantool/metrics summary API for details:
-- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
-- If quantile value is -Inf, try to decrease quantile tolerance.
-- See https://github.com/tarantool/metrics/issues/189 for issue details.
-- Decreasing the value increases computational load.
--
-- @return Configuration table.
--
local function __call(self, opts)
checks('table', {
stats = '?boolean',
stats_driver = '?string',
stats_quantiles = '?boolean'
stats_quantiles = '?boolean',
stats_quantile_tolerated_error = '?number',
})

opts = table.deepcopy(opts) or {}
Expand Down
36 changes: 31 additions & 5 deletions crud/stats/init.lua
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,20 @@ end
-- computing requests latency as 0.99 quantile with aging.
-- Performance overhead for enabling is near 10%.
--
-- @number[opt=1e-3] opts.quantile_tolerated_error
-- See tarantool/metrics summary API for details:
-- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
-- If quantile value is -Inf, try to decrease quantile tolerance.
-- See https://github.com/tarantool/metrics/issues/189 for issue details.
--
-- @treturn boolean Returns `true`.
--
function stats.enable(opts)
checks({ driver = '?string', quantiles = '?boolean' })
checks({
driver = '?string',
quantiles = '?boolean',
quantile_tolerated_error = '?number',
})

StatsError:assert(
rawget(_G, 'crud') ~= nil,
Expand All @@ -108,19 +118,29 @@ function stats.enable(opts)
opts.quantiles = false
end

if opts.quantile_tolerated_error == nil then
opts.quantile_tolerated_error = stats.DEFAULT_QUANTILE_TOLERATED_ERROR
end

-- Do not reinit if called with same options.
if internal.driver == opts.driver
and internal.quantiles == opts.quantiles then
and internal.quantiles == opts.quantiles
and internal.quantile_tolerated_error == opts.quantile_tolerated_error then
return true
end

-- Disable old driver registry, if another one was requested.
stats.disable()

internal.driver = opts.driver
internal.quantiles = opts.quantiles

internal:get_registry().init({ quantiles = internal.quantiles })
internal:get_registry().init{
quantiles = opts.quantiles,
quantile_tolerated_error = opts.quantile_tolerated_error
}

internal.quantiles = opts.quantiles
internal.quantile_tolerated_error = opts.quantile_tolerated_error

return true
end
Expand All @@ -140,7 +160,10 @@ function stats.reset()
end

internal:get_registry().destroy()
internal:get_registry().init({ quantiles = internal.quantiles })
internal:get_registry().init{
quantiles = internal.quantiles,
quantile_tolerated_error = internal.quantile_tolerated_error
}

return true
end
Expand Down Expand Up @@ -469,4 +492,7 @@ stats.op = op_module
-- @tfield[opt] boolean quantiles Is quantiles computed.
stats.internal = internal

--- Default metrics quantile precision.
stats.DEFAULT_QUANTILE_TOLERATED_ERROR = 1e-3

return stats
8 changes: 7 additions & 1 deletion crud/stats/local_registry.lua
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,16 @@ local StatsLocalError = errors.new_class('StatsLocalError', {capture_stack = fal
-- @bool opts.quantiles
-- Quantiles is not supported for local, only `false` is valid.
--
-- @number opts.quantile_tolerated_error
-- Quantiles is not supported for local, so the value is ignored.
--
-- @treturn boolean Returns `true`.
--
function registry.init(opts)
dev_checks({ quantiles = 'boolean' })
dev_checks({
quantiles = 'boolean',
quantile_tolerated_error = 'number',
})

StatsLocalError:assert(opts.quantiles == false,
"Quantiles are not supported for 'local' statistics registry")
Expand Down
22 changes: 13 additions & 9 deletions crud/stats/metrics_registry.lua
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,6 @@ local metric_name = {

local LATENCY_QUANTILE = 0.99

-- Increasing tolerance threshold affects performance.
local DEFAULT_QUANTILES = {
[LATENCY_QUANTILE] = 1e-2,
}

local DEFAULT_AGE_PARAMS = {
age_buckets_count = 2,
max_age_time = 60,
Expand Down Expand Up @@ -86,17 +81,24 @@ end
-- @bool opts.quantiles
-- If `true`, computes latency as 0.99 quantile with aging.
--
-- @number[opt=1e-3] opts.quantile_tolerated_error
-- See metrics summary API for details:
-- https://www.tarantool.io/ru/doc/latest/book/monitoring/api_reference/#summary
-- If quantile value is -Inf, try to decrease quantile tolerance.
-- See https://github.com/tarantool/metrics/issues/189 for issue details.
--
-- @treturn boolean Returns `true`.
--
function registry.init(opts)
dev_checks({ quantiles = 'boolean' })

internal.opts = table.deepcopy(opts)
dev_checks({
quantiles = 'boolean',
quantile_tolerated_error = 'number',
})

local quantile_params = nil
local age_params = nil
if opts.quantiles == true then
quantile_params = DEFAULT_QUANTILES
quantile_params = {[LATENCY_QUANTILE] = opts.quantile_tolerated_error}
age_params = DEFAULT_AGE_PARAMS
end

Expand All @@ -119,6 +121,8 @@ function registry.init(opts)
metric_name.details.map_reduces,
'Map reduces planned during CRUD select/pairs')

internal.opts = table.deepcopy(opts)

return true
end

Expand Down
1 change: 1 addition & 0 deletions test/integration/cfg_test.lua
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ group.test_defaults = function(g)
stats = false,
stats_driver = stats.get_default_driver(),
stats_quantiles = false,
stats_quantile_tolerated_error = 1e-3,
})
end

Expand Down
34 changes: 34 additions & 0 deletions test/unit/stats_test.lua
Original file line number Diff line number Diff line change
Expand Up @@ -658,6 +658,40 @@ group_driver.test_default_quantiles = function(g)
end


group_driver.test_default_quantile_tolerated_error = function(g)
enable_stats(g)

local quantile_tolerated_error = g.router:eval(" return stats_module.internal.quantile_tolerated_error ")
t.assert_equals(quantile_tolerated_error, 1e-3)
end


group_driver.before_test(
'test_custom_quantile_tolerated_error',
function(g)
t.skip_if(g.is_metrics_supported == false, 'Metrics registry is unsupported')
end
)

group_driver.test_custom_quantile_tolerated_error = function(g)
g.router:call('crud.cfg', {{
stats = true,
stats_driver = 'metrics',
stats_quantiles = true,
stats_quantile_tolerated_error = 5e-4,
}})

local resp = g.router:eval([[
local metrics = require('metrics')

local summary = metrics.summary('tnt_crud_stats')
return summary.objectives
]])

t.assert_equals(resp, {[0.99] = 5e-4})
end


group_driver.before_test(
'test_stats_reenable_with_different_driver_reset_stats',
function(g)
Expand Down