From f1af9b43168b98e0e5bc676f3881a49984cd3182 Mon Sep 17 00:00:00 2001
From: Ying Mao
Date: Wed, 24 Jul 2024 16:00:11 -0400
Subject: [PATCH] [Response Ops][Task Manager] Resource based task scheduling (#187999)

Resolves https://github.com/elastic/kibana/issues/185043

## Summary

### Task types can define a `cost` associated with running them

- Optional definition that defaults to `Normal` cost (see the illustrative sketch at the end of this summary).

### New `xpack.task_manager.capacity` setting

- The previous `xpack.task_manager.max_workers` setting is deprecated, changed to optional, and a warning is logged if it is used.
- A new optional `xpack.task_manager.capacity` setting is added. It represents the number of normal cost tasks that can be run at one time.
- When `xpack.task_manager.max_workers` is defined and `xpack.task_manager.capacity` is not defined, a deprecation warning is logged and the max workers value is used as the capacity value.
- When `xpack.task_manager.capacity` is defined and `xpack.task_manager.max_workers` is not defined, the capacity value is used. For the `default` claiming strategy, this capacity value is also used as the `max_workers` value.
- When both values are set, a warning is logged and the value for `xpack.task_manager.capacity` is used.
- When neither value is set, the `DEFAULT_CAPACITY` value is used.

### Updates to `TaskPool` class

- Moves the logic that determines used and available capacity into dedicated capacity calculators so that we can switch between them based on claim strategy. For the `default` claim strategy, capacity is measured in units of workers. For the `mget` claim strategy, capacity is measured in units of task cost.

### Updates to `mget` task claimer

- Updated the `taskStore.fetch` call to take a new parameter that returns a slimmer task document excluding the task state and task params. This improves the I/O efficiency of returning up to 400 task docs in one query.
- Applies the capacity constraint to the candidate tasks.
- Bulk gets the full task documents for the tasks we have capacity for, in order to update them to `claiming` status. Uses `SavedObjectsClient.bulkGet`, which uses an `mget` under the hood.

### Updates the monitoring stats

- Emits the capacity config value, as well as the capacity translated into both workers and cost.
- Adds the total cost of running and overdue tasks to the health report.
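To make the new `cost` and `capacity` concepts concrete, here is a minimal, hypothetical sketch of registering a task type with a non-default cost. This is not code from this PR: the function name, task type id, and import paths are invented for illustration, and the exact shape of the optional `cost` field is assumed from the summary above; `registerTaskDefinitions`, `TaskCost.Normal`, and `TaskCost.ExtraLarge` do appear in the diff below.

```typescript
// Illustrative sketch only -- names and import paths are hypothetical.
import { TaskCost } from './task';
import type { TaskManagerSetupContract } from './plugin';

export function registerHeavyReportTask(taskManager: TaskManagerSetupContract) {
  taskManager.registerTaskDefinitions({
    // Hypothetical task type id
    'reporting:generate-heavy-report': {
      title: 'Generate heavy report',
      // New in this PR: an optional per-task-type cost. Omitting it falls back
      // to TaskCost.Normal. An ExtraLarge task consumes more of the configured
      // capacity, so fewer such tasks run concurrently.
      cost: TaskCost.ExtraLarge,
      createTaskRunner: () => ({
        async run() {
          // ... expensive work ...
          return { state: {} };
        },
      }),
    },
  });
}
```

Capacity itself would be configured as `xpack.task_manager.capacity: 10` in `kibana.yml`. Based on the integration-test log assertions below, the `default` claim strategy translates that into 10 workers, while the `mget` strategy translates it into a maximum allowed cost of 20, which implies that a normal cost task counts as 2 cost units.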
## Tasks for followup issues

- Update mget functional tests to include tasks with different costs.
  - https://github.com/elastic/kibana/issues/189111
- Update cost of indicator match rule to be Extra Large
  - https://github.com/elastic/kibana/issues/189112
- Set `xpack.task_manager.capacity` on ECH based on the node size
  - https://github.com/elastic/kibana/pull/189117

---------

Co-authored-by: Elastic Machine
Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
---
 .../plugins/task_manager/server/MONITORING.md | 6 +-
 .../task_manager/server/config.test.ts | 3 -
 x-pack/plugins/task_manager/server/config.ts | 16 +-
 .../server/ephemeral_task_lifecycle.test.ts | 29 +-
 .../server/ephemeral_task_lifecycle.ts | 6 +-
 x-pack/plugins/task_manager/server/index.ts | 7 +-
 .../managed_configuration.test.ts | 484 ++++--
 .../lib/calculate_health_status.test.ts | 9 +-
 .../lib/create_managed_configuration.test.ts | 284 +++-
 .../lib/create_managed_configuration.ts | 100 +-
 .../task_manager/server/lib/fill_pool.test.ts | 1 -
 .../server/lib/log_health_metrics.test.ts | 12 +-
 .../server/metrics/create_aggregator.test.ts | 1 -
 .../background_task_utilization_statistics.ts | 2 +-
 .../monitoring/capacity_estimation.test.ts | 40 +-
 .../server/monitoring/capacity_estimation.ts | 12 +-
 .../configuration_statistics.test.ts | 27 +-
 .../monitoring/configuration_statistics.ts | 30 +-
 .../ephemeral_task_statistics.test.ts | 10 +-
 .../monitoring/ephemeral_task_statistics.ts | 6 +-
 .../task_manager/server/monitoring/index.ts | 36 +-
 .../monitoring_stats_stream.test.ts | 53 +-
 .../monitoring/monitoring_stats_stream.ts | 43 +-
 ...s.test.ts => task_run_calculators.test.ts} | 2 +-
 ..._calcultors.ts => task_run_calculators.ts} | 0
 .../server/monitoring/task_run_statistics.ts | 2 +-
 .../monitoring/workload_statistics.test.ts | 499 +++----
 .../server/monitoring/workload_statistics.ts | 179 ++-
 .../task_manager/server/plugin.test.ts | 1 -
 x-pack/plugins/task_manager/server/plugin.ts | 22 +-
 .../polling/delay_on_claim_conflicts.test.ts | 26 +-
 .../polling/delay_on_claim_conflicts.ts | 12 +-
 .../server/polling_lifecycle.test.ts | 147 +-
 .../task_manager/server/polling_lifecycle.ts | 56 +-
 .../server/queries/task_claiming.test.ts | 4 +-
 .../server/queries/task_claiming.ts | 10 +-
 .../task_manager/server/routes/health.test.ts | 12 +-
 x-pack/plugins/task_manager/server/task.ts | 18 +-
 .../server/task_claimers/index.ts | 4 +-
 .../task_claimers/strategy_default.test.ts | 18 +-
 .../task_claimers/strategy_mget.test.ts | 1297 ++++++++++++++++-
 .../server/task_claimers/strategy_mget.ts | 115 +-
 .../task_manager/server/task_pool.test.ts | 471 ------
 .../server/task_pool/capacity.mock.ts | 21 +
 .../server/task_pool/cost_capacity.test.ts | 171 +++
 .../server/task_pool/cost_capacity.ts | 109 ++
 .../task_manager/server/task_pool/index.ts | 9 +
 .../server/{ => task_pool}/task_pool.mock.ts | 31 +-
 .../server/task_pool/task_pool.test.ts | 867 +++++++++++
 .../server/{ => task_pool}/task_pool.ts | 116 +-
 .../server/task_pool/test_utils.ts | 53 +
 .../task_manager/server/task_pool/types.ts | 31 +
 .../task_manager/server/task_pool/utils.ts | 16 +
 .../server/task_pool/worker_capacity.test.ts | 176 +++
 .../server/task_pool/worker_capacity.ts | 95 ++
 .../task_manager/server/task_store.test.ts | 146 +-
 .../plugins/task_manager/server/task_store.ts | 56 +-
 .../server/task_type_dictionary.test.ts | 45 +-
 .../server/task_type_dictionary.ts | 12 +-
 .../task_manager_usage_collector.test.ts | 12 +-
 .../test_suites/task_manager/health_route.ts | 7 +-
 .../test_suites/task_manager/health_route.ts | 7 +-
 62 files changed, 4579 insertions(+),
1513 deletions(-) rename x-pack/plugins/task_manager/server/monitoring/{task_run_calcultors.test.ts => task_run_calculators.test.ts} (98%) rename x-pack/plugins/task_manager/server/monitoring/{task_run_calcultors.ts => task_run_calculators.ts} (100%) delete mode 100644 x-pack/plugins/task_manager/server/task_pool.test.ts create mode 100644 x-pack/plugins/task_manager/server/task_pool/capacity.mock.ts create mode 100644 x-pack/plugins/task_manager/server/task_pool/cost_capacity.test.ts create mode 100644 x-pack/plugins/task_manager/server/task_pool/cost_capacity.ts create mode 100644 x-pack/plugins/task_manager/server/task_pool/index.ts rename x-pack/plugins/task_manager/server/{ => task_pool}/task_pool.mock.ts (58%) create mode 100644 x-pack/plugins/task_manager/server/task_pool/task_pool.test.ts rename x-pack/plugins/task_manager/server/{ => task_pool}/task_pool.ts (73%) create mode 100644 x-pack/plugins/task_manager/server/task_pool/test_utils.ts create mode 100644 x-pack/plugins/task_manager/server/task_pool/types.ts create mode 100644 x-pack/plugins/task_manager/server/task_pool/utils.ts create mode 100644 x-pack/plugins/task_manager/server/task_pool/worker_capacity.test.ts create mode 100644 x-pack/plugins/task_manager/server/task_pool/worker_capacity.ts diff --git a/x-pack/plugins/task_manager/server/MONITORING.md b/x-pack/plugins/task_manager/server/MONITORING.md index 02946b9b3e53f..c4e66ab92bad5 100644 --- a/x-pack/plugins/task_manager/server/MONITORING.md +++ b/x-pack/plugins/task_manager/server/MONITORING.md @@ -50,7 +50,7 @@ The root `timestamp` is the time in which the summary was exposed (either to the Follow this step-by-step guide to make sense of the stats: https://www.elastic.co/guide/en/kibana/master/task-manager-troubleshooting.html#task-manager-diagnosing-root-cause #### The Configuration Section -The `configuration` section summarizes Task Manager's current configuration, including dynamic configurations which change over time, such as `poll_interval` and `max_workers` which adjust in reaction to changing load on the system. +The `configuration` section summarizes Task Manager's current configuration, including dynamic configurations which change over time, such as `poll_interval` and `capacity` which adjust in reaction to changing load on the system. These are "Hot" stats which are updated whenever a change happens in the configuration. @@ -69,8 +69,8 @@ The `runtime` tracks Task Manager's performance as it runs, making note of task These include: - The time it takes a task to run (p50, p90, p95 & p99, using a configurable running average window, `50` by default) - The average _drift_ that tasks experience (p50, p90, p95 & p99, using the same configurable running average window as above). Drift tells us how long after a task's scheduled a task typically executes. - - The average _load_ (p50, p90, p95 & p99, using the same configurable running average window as above). Load tells us what percentage of workers is in use at the end of each polling cycle. - - The polling rate (the timestamp of the last time a polling cycle completed), the polling health stats (number of version clashes and mismatches) and the result [`No tasks | Filled task pool | Unexpectedly ran out of workers`] frequency the past 50 polling cycles (using the same window size as the one used for running averages) + - The average _load_ (p50, p90, p95 & p99, using the same configurable running average window as above). Load tells us what percentage of capacity is in use at the end of each polling cycle. 
+ - The polling rate (the timestamp of the last time a polling cycle completed), the polling health stats (number of version clashes and mismatches) and the result [`No tasks | Filled task pool | Unexpectedly ran out of capacity`] frequency the past 50 polling cycles (using the same window size as the one used for running averages) - The `Success | Retry | Failure ratio` by task type. This is different than the workload stats which tell you what's in the queue, but ca't keep track of retries and of non recurring tasks as they're wiped off the index when completed. These are "Hot" stats which are updated reactively as Tasks are executed and interacted with. diff --git a/x-pack/plugins/task_manager/server/config.test.ts b/x-pack/plugins/task_manager/server/config.test.ts index bb59a73a305d6..81e9e24ea4586 100644 --- a/x-pack/plugins/task_manager/server/config.test.ts +++ b/x-pack/plugins/task_manager/server/config.test.ts @@ -23,7 +23,6 @@ describe('config validation', () => { "warn_threshold": 5000, }, "max_attempts": 3, - "max_workers": 10, "metrics_reset_interval": 30000, "monitored_aggregated_stats_refresh_rate": 60000, "monitored_stats_health_verbose_log": Object { @@ -81,7 +80,6 @@ describe('config validation', () => { "warn_threshold": 5000, }, "max_attempts": 3, - "max_workers": 10, "metrics_reset_interval": 30000, "monitored_aggregated_stats_refresh_rate": 60000, "monitored_stats_health_verbose_log": Object { @@ -137,7 +135,6 @@ describe('config validation', () => { "warn_threshold": 5000, }, "max_attempts": 3, - "max_workers": 10, "metrics_reset_interval": 30000, "monitored_aggregated_stats_refresh_rate": 60000, "monitored_stats_health_verbose_log": Object { diff --git a/x-pack/plugins/task_manager/server/config.ts b/x-pack/plugins/task_manager/server/config.ts index eec63c5be489c..f0f4031a4c8ac 100644 --- a/x-pack/plugins/task_manager/server/config.ts +++ b/x-pack/plugins/task_manager/server/config.ts @@ -8,6 +8,9 @@ import { schema, TypeOf } from '@kbn/config-schema'; export const MAX_WORKERS_LIMIT = 100; +export const DEFAULT_CAPACITY = 10; +export const MAX_CAPACITY = 50; +export const MIN_CAPACITY = 5; export const DEFAULT_MAX_WORKERS = 10; export const DEFAULT_POLL_INTERVAL = 3000; export const DEFAULT_VERSION_CONFLICT_THRESHOLD = 80; @@ -64,6 +67,8 @@ const requestTimeoutsConfig = schema.object({ export const configSchema = schema.object( { allow_reading_invalid_state: schema.boolean({ defaultValue: true }), + /* The number of normal cost tasks that this Kibana instance will run simultaneously */ + capacity: schema.maybe(schema.number({ min: MIN_CAPACITY, max: MAX_CAPACITY })), ephemeral_tasks: schema.object({ enabled: schema.boolean({ defaultValue: false }), /* How many requests can Task Manager buffer before it rejects new requests. */ @@ -81,11 +86,12 @@ export const configSchema = schema.object( min: 1, }), /* The maximum number of tasks that this Kibana instance will run simultaneously. 
*/ - max_workers: schema.number({ - defaultValue: DEFAULT_MAX_WORKERS, - // disable the task manager rather than trying to specify it with 0 workers - min: 1, - }), + max_workers: schema.maybe( + schema.number({ + // disable the task manager rather than trying to specify it with 0 workers + min: 1, + }) + ), /* The interval at which monotonically increasing metrics counters will reset */ metrics_reset_interval: schema.number({ defaultValue: DEFAULT_METRICS_RESET_INTERVAL, diff --git a/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts b/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts index 19cfa2943502c..2a6f1bf8c33b8 100644 --- a/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts +++ b/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.test.ts @@ -18,7 +18,7 @@ import { v4 as uuidv4 } from 'uuid'; import { asTaskPollingCycleEvent, asTaskRunEvent, TaskPersistence } from './task_events'; import { TaskRunResult } from './task_running'; import { TaskPoolRunResult } from './task_pool'; -import { TaskPoolMock } from './task_pool.mock'; +import { TaskPoolMock } from './task_pool/task_pool.mock'; import { executionContextServiceMock } from '@kbn/core/server/mocks'; import { taskManagerMock } from './mocks'; @@ -45,7 +45,6 @@ describe('EphemeralTaskLifecycle', () => { definitions: new TaskTypeDictionary(taskManagerLogger), executionContext, config: { - max_workers: 10, max_attempts: 9, poll_interval: 6000000, version_conflict_threshold: 80, @@ -156,7 +155,7 @@ describe('EphemeralTaskLifecycle', () => { expect(ephemeralTaskLifecycle.attemptToRun(task)).toMatchObject(asOk(task)); poolCapacity.mockReturnValue({ - availableWorkers: 10, + availableCapacity: 10, }); lifecycleEvent$.next( @@ -179,7 +178,7 @@ describe('EphemeralTaskLifecycle', () => { expect(ephemeralTaskLifecycle.attemptToRun(task)).toMatchObject(asOk(task)); poolCapacity.mockReturnValue({ - availableWorkers: 10, + availableCapacity: 10, }); lifecycleEvent$.next( @@ -216,7 +215,7 @@ describe('EphemeralTaskLifecycle', () => { expect(ephemeralTaskLifecycle.attemptToRun(tasks[2])).toMatchObject(asOk(tasks[2])); poolCapacity.mockReturnValue({ - availableWorkers: 2, + availableCapacity: 2, }); lifecycleEvent$.next( @@ -256,9 +255,9 @@ describe('EphemeralTaskLifecycle', () => { // pool has capacity for both poolCapacity.mockReturnValue({ - availableWorkers: 10, + availableCapacity: 10, }); - pool.getOccupiedWorkersByType.mockReturnValue(0); + pool.getUsedCapacityByType.mockReturnValue(0); lifecycleEvent$.next( asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed })) @@ -296,10 +295,10 @@ describe('EphemeralTaskLifecycle', () => { // pool has capacity in general poolCapacity.mockReturnValue({ - availableWorkers: 2, + availableCapacity: 2, }); // but when we ask how many it has occupied by type - wee always have one worker already occupied by that type - pool.getOccupiedWorkersByType.mockReturnValue(1); + pool.getUsedCapacityByType.mockReturnValue(1); lifecycleEvent$.next( asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed })) @@ -308,7 +307,7 @@ describe('EphemeralTaskLifecycle', () => { expect(pool.run).toHaveBeenCalledTimes(0); // now we release the worker in the pool and cause another cycle in the epheemral queue - pool.getOccupiedWorkersByType.mockReturnValue(0); + pool.getUsedCapacityByType.mockReturnValue(0); lifecycleEvent$.next( asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed })) ); @@ -356,9 +355,9 @@ 
describe('EphemeralTaskLifecycle', () => { // pool has capacity for all poolCapacity.mockReturnValue({ - availableWorkers: 10, + availableCapacity: 10, }); - pool.getOccupiedWorkersByType.mockReturnValue(0); + pool.getUsedCapacityByType.mockReturnValue(0); lifecycleEvent$.next(asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed }))); @@ -389,19 +388,19 @@ describe('EphemeralTaskLifecycle', () => { expect(ephemeralTaskLifecycle.queuedTasks).toBe(3); poolCapacity.mockReturnValue({ - availableWorkers: 1, + availableCapacity: 1, }); lifecycleEvent$.next(asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed }))); expect(ephemeralTaskLifecycle.queuedTasks).toBe(2); poolCapacity.mockReturnValue({ - availableWorkers: 1, + availableCapacity: 1, }); lifecycleEvent$.next(asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed }))); expect(ephemeralTaskLifecycle.queuedTasks).toBe(1); poolCapacity.mockReturnValue({ - availableWorkers: 1, + availableCapacity: 1, }); lifecycleEvent$.next(asTaskPollingCycleEvent(asOk({ result: FillPoolResult.NoTasksClaimed }))); expect(ephemeralTaskLifecycle.queuedTasks).toBe(0); diff --git a/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.ts b/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.ts index 37cc166ece211..c7ee267b848e5 100644 --- a/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.ts +++ b/x-pack/plugins/task_manager/server/ephemeral_task_lifecycle.ts @@ -143,13 +143,13 @@ export class EphemeralTaskLifecycle { taskType && this.definitions.get(taskType)?.maxConcurrency ? Math.max( Math.min( - this.pool.availableWorkers, + this.pool.availableCapacity(), this.definitions.get(taskType)!.maxConcurrency! - - this.pool.getOccupiedWorkersByType(taskType) + this.pool.getUsedCapacityByType(taskType) ), 0 ) - : this.pool.availableWorkers; + : this.pool.availableCapacity(); private emitEvent = (event: TaskLifecycleEvent) => { this.events$.next(event); diff --git a/x-pack/plugins/task_manager/server/index.ts b/x-pack/plugins/task_manager/server/index.ts index 8d50c37adda0b..965df090911fd 100644 --- a/x-pack/plugins/task_manager/server/index.ts +++ b/x-pack/plugins/task_manager/server/index.ts @@ -55,9 +55,6 @@ export type { export const config: PluginConfigDescriptor = { schema: configSchema, - exposeToUsage: { - max_workers: true, - }, deprecations: ({ deprecate }) => { return [ deprecate('ephemeral_tasks.enabled', 'a future version', { @@ -68,6 +65,10 @@ export const config: PluginConfigDescriptor = { level: 'warning', message: `Configuring "xpack.task_manager.ephemeral_tasks.request_capacity" is deprecated and will be removed in a future version. Remove this setting to increase task execution resiliency.`, }), + deprecate('max_workers', 'a future version', { + level: 'warning', + message: `Configuring "xpack.task_manager.max_workers" is deprecated and will be removed in a future version. 
Remove this setting and use "xpack.task_manager.capacity" instead.`, + }), (settings, fromPath, addDeprecation) => { const taskManager = get(settings, fromPath); if (taskManager?.index) { diff --git a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts index c0939b5b31667..7ab60a94fe8e5 100644 --- a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts +++ b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts @@ -35,164 +35,362 @@ describe('managed configuration', () => { }, }; - beforeEach(async () => { - jest.resetAllMocks(); - clock = sinon.useFakeTimers(); - - const context = coreMock.createPluginInitializerContext({ - max_workers: 10, - max_attempts: 9, - poll_interval: 3000, - allow_reading_invalid_state: false, - version_conflict_threshold: 80, - monitored_aggregated_stats_refresh_rate: 60000, - monitored_stats_health_verbose_log: { - enabled: false, - level: 'debug' as const, - warn_delayed_task_start_in_seconds: 60, - }, - monitored_stats_required_freshness: 4000, - monitored_stats_running_average_window: 50, - request_capacity: 1000, - monitored_task_execution_thresholds: { - default: { - error_threshold: 90, - warn_threshold: 80, - }, - custom: {}, - }, - ephemeral_tasks: { - enabled: true, - request_capacity: 10, - }, - unsafe: { - exclude_task_types: [], - authenticate_background_task_utilization: true, - }, - event_loop_delay: { - monitor: true, - warn_threshold: 5000, - }, - worker_utilization_running_average_window: 5, - metrics_reset_interval: 3000, - claim_strategy: 'default', - request_timeouts: { - update_by_query: 1000, - }, + afterEach(() => clock.restore()); + + describe('managed poll interval', () => { + beforeEach(async () => { + jest.resetAllMocks(); + clock = sinon.useFakeTimers(); + + const context = coreMock.createPluginInitializerContext({ + capacity: 10, + max_attempts: 9, + poll_interval: 3000, + allow_reading_invalid_state: false, + version_conflict_threshold: 80, + monitored_aggregated_stats_refresh_rate: 60000, + monitored_stats_health_verbose_log: { + enabled: false, + level: 'debug' as const, + warn_delayed_task_start_in_seconds: 60, + }, + monitored_stats_required_freshness: 4000, + monitored_stats_running_average_window: 50, + request_capacity: 1000, + monitored_task_execution_thresholds: { + default: { + error_threshold: 90, + warn_threshold: 80, + }, + custom: {}, + }, + ephemeral_tasks: { + enabled: true, + request_capacity: 10, + }, + unsafe: { + exclude_task_types: [], + authenticate_background_task_utilization: true, + }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, + worker_utilization_running_average_window: 5, + metrics_reset_interval: 3000, + claim_strategy: 'default', + request_timeouts: { + update_by_query: 1000, + }, + }); + logger = context.logger.get('taskManager'); + + const taskManager = new TaskManagerPlugin(context); + ( + await taskManager.setup(coreMock.createSetup(), { usageCollection: undefined }) + ).registerTaskDefinitions({ + foo: { + title: 'Foo', + createTaskRunner: jest.fn(), + }, + }); + + const coreStart = coreMock.createStart(); + coreStart.elasticsearch = esStart; + esStart.client.asInternalUser.child.mockReturnValue( + esStart.client.asInternalUser as unknown as Client + ); + coreStart.savedObjects.createInternalRepository.mockReturnValue(savedObjectsClient); + taskManagerStart = await taskManager.start(coreStart); + + // 
force rxjs timers to fire when they are scheduled for setTimeout(0) as the + // sinon fake timers cause them to stall + clock.tick(0); }); - logger = context.logger.get('taskManager'); - - const taskManager = new TaskManagerPlugin(context); - ( - await taskManager.setup(coreMock.createSetup(), { usageCollection: undefined }) - ).registerTaskDefinitions({ - foo: { - title: 'Foo', - createTaskRunner: jest.fn(), - }, + + test('should increase poll interval when Elasticsearch returns 429 error', async () => { + savedObjectsClient.create.mockRejectedValueOnce( + SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b') + ); + + // Cause "too many requests" error to be thrown + await expect( + taskManagerStart.schedule({ + taskType: 'foo', + state: {}, + params: {}, + }) + ).rejects.toThrowErrorMatchingInlineSnapshot(`"Too Many Requests"`); + clock.tick(ADJUST_THROUGHPUT_INTERVAL); + + expect(logger.warn).toHaveBeenCalledWith( + 'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).' + ); + expect(logger.debug).toHaveBeenCalledWith( + 'Poll interval configuration changing from 3000 to 3600 after seeing 1 "too many request" and/or "execute [inline] script" error(s)' + ); + expect(logger.debug).toHaveBeenCalledWith('Task poller now using interval of 3600ms'); }); - const coreStart = coreMock.createStart(); - coreStart.elasticsearch = esStart; - esStart.client.asInternalUser.child.mockReturnValue( - esStart.client.asInternalUser as unknown as Client - ); - coreStart.savedObjects.createInternalRepository.mockReturnValue(savedObjectsClient); - taskManagerStart = await taskManager.start(coreStart); - - // force rxjs timers to fire when they are scheduled for setTimeout(0) as the - // sinon fake timers cause them to stall - clock.tick(0); - }); + test('should increase poll interval when Elasticsearch returns "cannot execute [inline] scripts" error', async () => { + const childEsClient = esStart.client.asInternalUser.child({}) as jest.Mocked; + childEsClient.search.mockImplementationOnce(async () => { + throw inlineScriptError; + }); - afterEach(() => clock.restore()); + await expect(taskManagerStart.fetch({})).rejects.toThrowErrorMatchingInlineSnapshot( + `"cannot execute [inline] scripts\\" error"` + ); - test('should lower max workers when Elasticsearch returns 429 error', async () => { - savedObjectsClient.create.mockRejectedValueOnce( - SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b') - ); - - // Cause "too many requests" error to be thrown - await expect( - taskManagerStart.schedule({ - taskType: 'foo', - state: {}, - params: {}, - }) - ).rejects.toThrowErrorMatchingInlineSnapshot(`"Too Many Requests"`); - clock.tick(ADJUST_THROUGHPUT_INTERVAL); - - expect(logger.warn).toHaveBeenCalledWith( - 'Max workers configuration is temporarily reduced after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).' 
- ); - expect(logger.debug).toHaveBeenCalledWith( - 'Max workers configuration changing from 10 to 8 after seeing 1 "too many request" and/or "execute [inline] script" error(s)' - ); - expect(logger.debug).toHaveBeenCalledWith('Task pool now using 10 as the max worker value'); - }); + clock.tick(ADJUST_THROUGHPUT_INTERVAL); - test('should increase poll interval when Elasticsearch returns 429 error', async () => { - savedObjectsClient.create.mockRejectedValueOnce( - SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b') - ); - - // Cause "too many requests" error to be thrown - await expect( - taskManagerStart.schedule({ - taskType: 'foo', - state: {}, - params: {}, - }) - ).rejects.toThrowErrorMatchingInlineSnapshot(`"Too Many Requests"`); - clock.tick(ADJUST_THROUGHPUT_INTERVAL); - - expect(logger.warn).toHaveBeenCalledWith( - 'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).' - ); - expect(logger.debug).toHaveBeenCalledWith( - 'Poll interval configuration changing from 3000 to 3600 after seeing 1 "too many request" and/or "execute [inline] script" error(s)' - ); - expect(logger.debug).toHaveBeenCalledWith('Task poller now using interval of 3600ms'); + expect(logger.warn).toHaveBeenCalledWith( + 'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).' + ); + expect(logger.debug).toHaveBeenCalledWith( + 'Poll interval configuration changing from 3000 to 3600 after seeing 1 "too many request" and/or "execute [inline] script" error(s)' + ); + expect(logger.debug).toHaveBeenCalledWith('Task poller now using interval of 3600ms'); + }); }); - test('should lower max workers when Elasticsearch returns "cannot execute [inline] scripts" error', async () => { - const childEsClient = esStart.client.asInternalUser.child({}) as jest.Mocked; - childEsClient.search.mockImplementationOnce(async () => { - throw inlineScriptError; + describe('managed capacity with default claim strategy', () => { + beforeEach(async () => { + jest.resetAllMocks(); + clock = sinon.useFakeTimers(); + + const context = coreMock.createPluginInitializerContext({ + capacity: 10, + max_attempts: 9, + poll_interval: 3000, + allow_reading_invalid_state: false, + version_conflict_threshold: 80, + monitored_aggregated_stats_refresh_rate: 60000, + monitored_stats_health_verbose_log: { + enabled: false, + level: 'debug' as const, + warn_delayed_task_start_in_seconds: 60, + }, + monitored_stats_required_freshness: 4000, + monitored_stats_running_average_window: 50, + request_capacity: 1000, + monitored_task_execution_thresholds: { + default: { + error_threshold: 90, + warn_threshold: 80, + }, + custom: {}, + }, + ephemeral_tasks: { + enabled: true, + request_capacity: 10, + }, + unsafe: { + exclude_task_types: [], + authenticate_background_task_utilization: true, + }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, + worker_utilization_running_average_window: 5, + metrics_reset_interval: 3000, + claim_strategy: 'default', + request_timeouts: { + update_by_query: 1000, + }, + }); + logger = context.logger.get('taskManager'); + + const taskManager = new TaskManagerPlugin(context); + ( + await taskManager.setup(coreMock.createSetup(), { usageCollection: undefined }) + ).registerTaskDefinitions({ + foo: { + title: 'Foo', + createTaskRunner: jest.fn(), + }, + }); + + const coreStart = coreMock.createStart(); + 
coreStart.elasticsearch = esStart; + esStart.client.asInternalUser.child.mockReturnValue( + esStart.client.asInternalUser as unknown as Client + ); + coreStart.savedObjects.createInternalRepository.mockReturnValue(savedObjectsClient); + taskManagerStart = await taskManager.start(coreStart); + + // force rxjs timers to fire when they are scheduled for setTimeout(0) as the + // sinon fake timers cause them to stall + clock.tick(0); }); - await expect(taskManagerStart.fetch({})).rejects.toThrowErrorMatchingInlineSnapshot( - `"cannot execute [inline] scripts\\" error"` - ); - clock.tick(ADJUST_THROUGHPUT_INTERVAL); - - expect(logger.warn).toHaveBeenCalledWith( - 'Max workers configuration is temporarily reduced after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).' - ); - expect(logger.debug).toHaveBeenCalledWith( - 'Max workers configuration changing from 10 to 8 after seeing 1 "too many request" and/or "execute [inline] script" error(s)' - ); - expect(logger.debug).toHaveBeenCalledWith('Task pool now using 10 as the max worker value'); + test('should lower capacity when Elasticsearch returns 429 error', async () => { + savedObjectsClient.create.mockRejectedValueOnce( + SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b') + ); + + // Cause "too many requests" error to be thrown + await expect( + taskManagerStart.schedule({ + taskType: 'foo', + state: {}, + params: {}, + }) + ).rejects.toThrowErrorMatchingInlineSnapshot(`"Too Many Requests"`); + clock.tick(ADJUST_THROUGHPUT_INTERVAL); + + expect(logger.warn).toHaveBeenCalledWith( + 'Capacity configuration is temporarily reduced after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).' + ); + expect(logger.debug).toHaveBeenCalledWith( + 'Capacity configuration changing from 10 to 8 after seeing 1 "too many request" and/or "execute [inline] script" error(s)' + ); + expect(logger.debug).toHaveBeenCalledWith( + 'Task pool now using 10 as the max worker value which is based on a capacity of 10' + ); + }); + + test('should lower capacity when Elasticsearch returns "cannot execute [inline] scripts" error', async () => { + const childEsClient = esStart.client.asInternalUser.child({}) as jest.Mocked; + childEsClient.search.mockImplementationOnce(async () => { + throw inlineScriptError; + }); + + await expect(taskManagerStart.fetch({})).rejects.toThrowErrorMatchingInlineSnapshot( + `"cannot execute [inline] scripts\\" error"` + ); + clock.tick(ADJUST_THROUGHPUT_INTERVAL); + + expect(logger.warn).toHaveBeenCalledWith( + 'Capacity configuration is temporarily reduced after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).' 
+ ); + expect(logger.debug).toHaveBeenCalledWith( + 'Capacity configuration changing from 10 to 8 after seeing 1 "too many request" and/or "execute [inline] script" error(s)' + ); + expect(logger.debug).toHaveBeenCalledWith( + 'Task pool now using 10 as the max worker value which is based on a capacity of 10' + ); + }); }); - test('should increase poll interval when Elasticsearch returns "cannot execute [inline] scripts" error', async () => { - const childEsClient = esStart.client.asInternalUser.child({}) as jest.Mocked; - childEsClient.search.mockImplementationOnce(async () => { - throw inlineScriptError; + describe('managed capacity with mget claim strategy', () => { + beforeEach(async () => { + jest.resetAllMocks(); + clock = sinon.useFakeTimers(); + + const context = coreMock.createPluginInitializerContext({ + capacity: 10, + max_attempts: 9, + poll_interval: 3000, + allow_reading_invalid_state: false, + version_conflict_threshold: 80, + monitored_aggregated_stats_refresh_rate: 60000, + monitored_stats_health_verbose_log: { + enabled: false, + level: 'debug' as const, + warn_delayed_task_start_in_seconds: 60, + }, + monitored_stats_required_freshness: 4000, + monitored_stats_running_average_window: 50, + request_capacity: 1000, + monitored_task_execution_thresholds: { + default: { + error_threshold: 90, + warn_threshold: 80, + }, + custom: {}, + }, + ephemeral_tasks: { + enabled: true, + request_capacity: 10, + }, + unsafe: { + exclude_task_types: [], + authenticate_background_task_utilization: true, + }, + event_loop_delay: { + monitor: true, + warn_threshold: 5000, + }, + worker_utilization_running_average_window: 5, + metrics_reset_interval: 3000, + claim_strategy: 'unsafe_mget', + request_timeouts: { + update_by_query: 1000, + }, + }); + logger = context.logger.get('taskManager'); + + const taskManager = new TaskManagerPlugin(context); + ( + await taskManager.setup(coreMock.createSetup(), { usageCollection: undefined }) + ).registerTaskDefinitions({ + foo: { + title: 'Foo', + createTaskRunner: jest.fn(), + }, + }); + + const coreStart = coreMock.createStart(); + coreStart.elasticsearch = esStart; + esStart.client.asInternalUser.child.mockReturnValue( + esStart.client.asInternalUser as unknown as Client + ); + coreStart.savedObjects.createInternalRepository.mockReturnValue(savedObjectsClient); + taskManagerStart = await taskManager.start(coreStart); + + // force rxjs timers to fire when they are scheduled for setTimeout(0) as the + // sinon fake timers cause them to stall + clock.tick(0); }); - await expect(taskManagerStart.fetch({})).rejects.toThrowErrorMatchingInlineSnapshot( - `"cannot execute [inline] scripts\\" error"` - ); + test('should lower capacity when Elasticsearch returns 429 error', async () => { + savedObjectsClient.create.mockRejectedValueOnce( + SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b') + ); - clock.tick(ADJUST_THROUGHPUT_INTERVAL); + // Cause "too many requests" error to be thrown + await expect( + taskManagerStart.schedule({ + taskType: 'foo', + state: {}, + params: {}, + }) + ).rejects.toThrowErrorMatchingInlineSnapshot(`"Too Many Requests"`); + clock.tick(ADJUST_THROUGHPUT_INTERVAL); + + expect(logger.warn).toHaveBeenCalledWith( + 'Capacity configuration is temporarily reduced after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).' 
+ ); + expect(logger.debug).toHaveBeenCalledWith( + 'Capacity configuration changing from 10 to 8 after seeing 1 "too many request" and/or "execute [inline] script" error(s)' + ); + expect(logger.debug).toHaveBeenCalledWith( + 'Task pool now using 20 as the max allowed cost which is based on a capacity of 10' + ); + }); - expect(logger.warn).toHaveBeenCalledWith( - 'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).' - ); - expect(logger.debug).toHaveBeenCalledWith( - 'Poll interval configuration changing from 3000 to 3600 after seeing 1 "too many request" and/or "execute [inline] script" error(s)' - ); - expect(logger.debug).toHaveBeenCalledWith('Task poller now using interval of 3600ms'); + test('should lower capacity when Elasticsearch returns "cannot execute [inline] scripts" error', async () => { + const childEsClient = esStart.client.asInternalUser.child({}) as jest.Mocked; + childEsClient.search.mockImplementationOnce(async () => { + throw inlineScriptError; + }); + + await expect(taskManagerStart.fetch({})).rejects.toThrowErrorMatchingInlineSnapshot( + `"cannot execute [inline] scripts\\" error"` + ); + clock.tick(ADJUST_THROUGHPUT_INTERVAL); + + expect(logger.warn).toHaveBeenCalledWith( + 'Capacity configuration is temporarily reduced after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).' + ); + expect(logger.debug).toHaveBeenCalledWith( + 'Capacity configuration changing from 10 to 8 after seeing 1 "too many request" and/or "execute [inline] script" error(s)' + ); + expect(logger.debug).toHaveBeenCalledWith( + 'Task pool now using 20 as the max allowed cost which is based on a capacity of 10' + ); + }); }); }); diff --git a/x-pack/plugins/task_manager/server/lib/calculate_health_status.test.ts b/x-pack/plugins/task_manager/server/lib/calculate_health_status.test.ts index fc2f34701e3c1..49c68459982ba 100644 --- a/x-pack/plugins/task_manager/server/lib/calculate_health_status.test.ts +++ b/x-pack/plugins/task_manager/server/lib/calculate_health_status.test.ts @@ -16,7 +16,6 @@ Date.now = jest.fn().mockReturnValue(new Date(now)); const logger = loggingSystemMock.create().get(); const config = { enabled: true, - max_workers: 10, index: 'foo', max_attempts: 9, poll_interval: 3000, @@ -73,6 +72,8 @@ const getStatsWithTimestamp = ({ configuration: { timestamp, value: { + capacity: { config: 10, as_cost: 20, as_workers: 10 }, + claim_strategy: 'default', request_capacity: 1000, monitored_aggregated_stats_refresh_rate: 5000, monitored_stats_running_average_window: 50, @@ -84,7 +85,6 @@ const getStatsWithTimestamp = ({ }, }, poll_interval: 3000, - max_workers: 10, }, status: HealthStatus.OK, }, @@ -213,24 +213,29 @@ const getStatsWithTimestamp = ({ timestamp, value: { count: 2, + cost: 4, task_types: { taskType1: { count: 1, + cost: 2, status: { idle: 1, }, }, taskType2: { count: 1, + cost: 2, status: { idle: 1, }, }, }, non_recurring: 2, + non_recurring_cost: 4, owner_ids: 0, schedule: [['5m', 2]], overdue: 0, + overdue_cost: 0, overdue_non_recurring: 0, estimated_schedule_density: [ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, diff --git a/x-pack/plugins/task_manager/server/lib/create_managed_configuration.test.ts b/x-pack/plugins/task_manager/server/lib/create_managed_configuration.test.ts index b1d6ce92c323a..e0762bf054133 100644 --- a/x-pack/plugins/task_manager/server/lib/create_managed_configuration.test.ts 
+++ b/x-pack/plugins/task_manager/server/lib/create_managed_configuration.test.ts @@ -13,6 +13,12 @@ import { ADJUST_THROUGHPUT_INTERVAL, } from './create_managed_configuration'; import { mockLogger } from '../test_utils'; +import { + CLAIM_STRATEGY_DEFAULT, + CLAIM_STRATEGY_MGET, + DEFAULT_CAPACITY, + TaskManagerConfig, +} from '../config'; describe('createManagedConfiguration()', () => { let clock: sinon.SinonFakeTimers; @@ -26,51 +32,140 @@ describe('createManagedConfiguration()', () => { afterEach(() => clock.restore()); test('returns observables with initialized values', async () => { - const maxWorkersSubscription = jest.fn(); + const capacitySubscription = jest.fn(); + const pollIntervalSubscription = jest.fn(); + const { capacityConfiguration$, pollIntervalConfiguration$ } = createManagedConfiguration({ + logger, + errors$: new Subject(), + config: { + capacity: 20, + poll_interval: 2, + } as TaskManagerConfig, + }); + capacityConfiguration$.subscribe(capacitySubscription); + pollIntervalConfiguration$.subscribe(pollIntervalSubscription); + expect(capacitySubscription).toHaveBeenCalledTimes(1); + expect(capacitySubscription).toHaveBeenNthCalledWith(1, 20); + expect(pollIntervalSubscription).toHaveBeenCalledTimes(1); + expect(pollIntervalSubscription).toHaveBeenNthCalledWith(1, 2); + }); + + test('uses max_workers config as capacity if only max workers is defined', async () => { + const capacitySubscription = jest.fn(); + const pollIntervalSubscription = jest.fn(); + const { capacityConfiguration$, pollIntervalConfiguration$ } = createManagedConfiguration({ + logger, + errors$: new Subject(), + config: { + max_workers: 10, + poll_interval: 2, + } as TaskManagerConfig, + }); + capacityConfiguration$.subscribe(capacitySubscription); + pollIntervalConfiguration$.subscribe(pollIntervalSubscription); + expect(capacitySubscription).toHaveBeenCalledTimes(1); + expect(capacitySubscription).toHaveBeenNthCalledWith(1, 10); + expect(pollIntervalSubscription).toHaveBeenCalledTimes(1); + expect(pollIntervalSubscription).toHaveBeenNthCalledWith(1, 2); + }); + + test('uses max_workers config as capacity but does not exceed MAX_CAPACITY', async () => { + const capacitySubscription = jest.fn(); const pollIntervalSubscription = jest.fn(); - const { maxWorkersConfiguration$, pollIntervalConfiguration$ } = createManagedConfiguration({ + const { capacityConfiguration$, pollIntervalConfiguration$ } = createManagedConfiguration({ logger, errors$: new Subject(), - startingMaxWorkers: 1, - startingPollInterval: 2, + config: { + max_workers: 1000, + poll_interval: 2, + } as TaskManagerConfig, }); - maxWorkersConfiguration$.subscribe(maxWorkersSubscription); + capacityConfiguration$.subscribe(capacitySubscription); pollIntervalConfiguration$.subscribe(pollIntervalSubscription); - expect(maxWorkersSubscription).toHaveBeenCalledTimes(1); - expect(maxWorkersSubscription).toHaveBeenNthCalledWith(1, 1); + expect(capacitySubscription).toHaveBeenCalledTimes(1); + expect(capacitySubscription).toHaveBeenNthCalledWith(1, 50); expect(pollIntervalSubscription).toHaveBeenCalledTimes(1); expect(pollIntervalSubscription).toHaveBeenNthCalledWith(1, 2); }); + test('uses DEFAULT_CAPACITY if neither capacity nor max_workers is defined', async () => { + const capacitySubscription = jest.fn(); + const pollIntervalSubscription = jest.fn(); + const { capacityConfiguration$, pollIntervalConfiguration$ } = createManagedConfiguration({ + logger, + errors$: new Subject(), + config: { + poll_interval: 2, + } as TaskManagerConfig, + }); 
+ capacityConfiguration$.subscribe(capacitySubscription); + pollIntervalConfiguration$.subscribe(pollIntervalSubscription); + expect(capacitySubscription).toHaveBeenCalledTimes(1); + expect(capacitySubscription).toHaveBeenNthCalledWith(1, DEFAULT_CAPACITY); + expect(pollIntervalSubscription).toHaveBeenCalledTimes(1); + expect(pollIntervalSubscription).toHaveBeenNthCalledWith(1, 2); + }); + + test('logs warning and uses capacity config if both capacity and max_workers is defined', async () => { + const capacitySubscription = jest.fn(); + const pollIntervalSubscription = jest.fn(); + const { capacityConfiguration$, pollIntervalConfiguration$ } = createManagedConfiguration({ + logger, + errors$: new Subject(), + config: { + capacity: 30, + max_workers: 10, + poll_interval: 2, + } as TaskManagerConfig, + }); + capacityConfiguration$.subscribe(capacitySubscription); + pollIntervalConfiguration$.subscribe(pollIntervalSubscription); + expect(capacitySubscription).toHaveBeenCalledTimes(1); + expect(capacitySubscription).toHaveBeenNthCalledWith(1, 30); + expect(pollIntervalSubscription).toHaveBeenCalledTimes(1); + expect(pollIntervalSubscription).toHaveBeenNthCalledWith(1, 2); + expect(logger.warn).toHaveBeenCalledWith( + `Both \"xpack.task_manager.capacity\" and \"xpack.task_manager.max_workers\" configs are set, max_workers will be ignored in favor of capacity and the setting should be removed.` + ); + }); + test(`skips errors that aren't about too many requests`, async () => { - const maxWorkersSubscription = jest.fn(); + const capacitySubscription = jest.fn(); const pollIntervalSubscription = jest.fn(); const errors$ = new Subject(); - const { maxWorkersConfiguration$, pollIntervalConfiguration$ } = createManagedConfiguration({ + const { capacityConfiguration$, pollIntervalConfiguration$ } = createManagedConfiguration({ errors$, logger, - startingMaxWorkers: 100, - startingPollInterval: 100, + config: { + capacity: 10, + poll_interval: 100, + } as TaskManagerConfig, }); - maxWorkersConfiguration$.subscribe(maxWorkersSubscription); + capacityConfiguration$.subscribe(capacitySubscription); pollIntervalConfiguration$.subscribe(pollIntervalSubscription); errors$.next(new Error('foo')); clock.tick(ADJUST_THROUGHPUT_INTERVAL); - expect(maxWorkersSubscription).toHaveBeenCalledTimes(1); + expect(capacitySubscription).toHaveBeenCalledTimes(1); expect(pollIntervalSubscription).toHaveBeenCalledTimes(1); }); - describe('maxWorker configuration', () => { - function setupScenario(startingMaxWorkers: number) { + describe('capacity configuration', () => { + function setupScenario( + startingCapacity: number, + claimStrategy: string = CLAIM_STRATEGY_DEFAULT + ) { const errors$ = new Subject(); const subscription = jest.fn(); - const { maxWorkersConfiguration$ } = createManagedConfiguration({ + const { capacityConfiguration$ } = createManagedConfiguration({ errors$, - startingMaxWorkers, logger, - startingPollInterval: 1, + config: { + capacity: startingCapacity, + poll_interval: 1, + claim_strategy: claimStrategy, + } as TaskManagerConfig, }); - maxWorkersConfiguration$.subscribe(subscription); + capacityConfiguration$.subscribe(subscription); return { subscription, errors$ }; } @@ -81,66 +176,103 @@ describe('createManagedConfiguration()', () => { afterEach(() => clock.restore()); - test('should decrease configuration at the next interval when an error is emitted', async () => { - const { subscription, errors$ } = setupScenario(100); - errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 
'b')); - clock.tick(ADJUST_THROUGHPUT_INTERVAL - 1); - expect(subscription).toHaveBeenCalledTimes(1); - clock.tick(1); - expect(subscription).toHaveBeenCalledTimes(2); - expect(subscription).toHaveBeenNthCalledWith(2, 80); - }); + describe('default claim strategy', () => { + test('should decrease configuration at the next interval when an error is emitted', async () => { + const { subscription, errors$ } = setupScenario(10); + errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); + clock.tick(ADJUST_THROUGHPUT_INTERVAL - 1); + expect(subscription).toHaveBeenCalledTimes(1); + expect(subscription).toHaveBeenNthCalledWith(1, 10); + clock.tick(1); + expect(subscription).toHaveBeenCalledTimes(2); + expect(subscription).toHaveBeenNthCalledWith(2, 8); + }); - test('should log a warning when the configuration changes from the starting value', async () => { - const { errors$ } = setupScenario(100); - errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); - clock.tick(ADJUST_THROUGHPUT_INTERVAL); - expect(logger.warn).toHaveBeenCalledWith( - 'Max workers configuration is temporarily reduced after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).' - ); - }); + test('should log a warning when the configuration changes from the starting value', async () => { + const { errors$ } = setupScenario(10); + errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); + clock.tick(ADJUST_THROUGHPUT_INTERVAL); + expect(logger.warn).toHaveBeenCalledWith( + 'Capacity configuration is temporarily reduced after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).' + ); + }); - test('should increase configuration back to normal incrementally after an error is emitted', async () => { - const { subscription, errors$ } = setupScenario(100); - errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); - clock.tick(ADJUST_THROUGHPUT_INTERVAL * 10); - expect(subscription).toHaveBeenNthCalledWith(2, 80); - expect(subscription).toHaveBeenNthCalledWith(3, 84); - // 88.2- > 89 from Math.ceil - expect(subscription).toHaveBeenNthCalledWith(4, 89); - expect(subscription).toHaveBeenNthCalledWith(5, 94); - expect(subscription).toHaveBeenNthCalledWith(6, 99); - // 103.95 -> 100 from Math.min with starting value - expect(subscription).toHaveBeenNthCalledWith(7, 100); - // No new calls due to value not changing and usage of distinctUntilChanged() - expect(subscription).toHaveBeenCalledTimes(7); + test('should increase configuration back to normal incrementally after an error is emitted', async () => { + const { subscription, errors$ } = setupScenario(10); + errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); + clock.tick(ADJUST_THROUGHPUT_INTERVAL * 10); + expect(subscription).toHaveBeenNthCalledWith(1, 10); + expect(subscription).toHaveBeenNthCalledWith(2, 8); + expect(subscription).toHaveBeenNthCalledWith(3, 9); + expect(subscription).toHaveBeenNthCalledWith(4, 10); + // No new calls due to value not changing and usage of distinctUntilChanged() + expect(subscription).toHaveBeenCalledTimes(4); + }); + + test('should keep reducing configuration when errors keep emitting until it reaches minimum', async () => { + const { subscription, errors$ } = setupScenario(10); + for (let i = 0; i < 20; i++) { + errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); + clock.tick(ADJUST_THROUGHPUT_INTERVAL); + } + 
expect(subscription).toHaveBeenNthCalledWith(1, 10); + expect(subscription).toHaveBeenNthCalledWith(2, 8); + expect(subscription).toHaveBeenNthCalledWith(3, 6); + expect(subscription).toHaveBeenNthCalledWith(4, 4); + expect(subscription).toHaveBeenNthCalledWith(5, 3); + expect(subscription).toHaveBeenNthCalledWith(6, 2); + expect(subscription).toHaveBeenNthCalledWith(7, 1); + // No new calls due to value not changing and usage of distinctUntilChanged() + expect(subscription).toHaveBeenCalledTimes(7); + }); }); - test('should keep reducing configuration when errors keep emitting', async () => { - const { subscription, errors$ } = setupScenario(100); - for (let i = 0; i < 20; i++) { + describe('mget claim strategy', () => { + test('should decrease configuration at the next interval when an error is emitted', async () => { + const { subscription, errors$ } = setupScenario(10, CLAIM_STRATEGY_MGET); + errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); + clock.tick(ADJUST_THROUGHPUT_INTERVAL - 1); + expect(subscription).toHaveBeenCalledTimes(1); + expect(subscription).toHaveBeenNthCalledWith(1, 10); + clock.tick(1); + expect(subscription).toHaveBeenCalledTimes(2); + expect(subscription).toHaveBeenNthCalledWith(2, 8); + }); + + test('should log a warning when the configuration changes from the starting value', async () => { + const { errors$ } = setupScenario(10, CLAIM_STRATEGY_MGET); errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); clock.tick(ADJUST_THROUGHPUT_INTERVAL); - } - expect(subscription).toHaveBeenNthCalledWith(2, 80); - expect(subscription).toHaveBeenNthCalledWith(3, 64); - // 51.2 -> 51 from Math.floor - expect(subscription).toHaveBeenNthCalledWith(4, 51); - expect(subscription).toHaveBeenNthCalledWith(5, 40); - expect(subscription).toHaveBeenNthCalledWith(6, 32); - expect(subscription).toHaveBeenNthCalledWith(7, 25); - expect(subscription).toHaveBeenNthCalledWith(8, 20); - expect(subscription).toHaveBeenNthCalledWith(9, 16); - expect(subscription).toHaveBeenNthCalledWith(10, 12); - expect(subscription).toHaveBeenNthCalledWith(11, 9); - expect(subscription).toHaveBeenNthCalledWith(12, 7); - expect(subscription).toHaveBeenNthCalledWith(13, 5); - expect(subscription).toHaveBeenNthCalledWith(14, 4); - expect(subscription).toHaveBeenNthCalledWith(15, 3); - expect(subscription).toHaveBeenNthCalledWith(16, 2); - expect(subscription).toHaveBeenNthCalledWith(17, 1); - // No new calls due to value not changing and usage of distinctUntilChanged() - expect(subscription).toHaveBeenCalledTimes(17); + expect(logger.warn).toHaveBeenCalledWith( + 'Capacity configuration is temporarily reduced after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).' 
+ ); + }); + + test('should increase configuration back to normal incrementally after an error is emitted', async () => { + const { subscription, errors$ } = setupScenario(10, CLAIM_STRATEGY_MGET); + errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); + clock.tick(ADJUST_THROUGHPUT_INTERVAL * 10); + expect(subscription).toHaveBeenNthCalledWith(1, 10); + expect(subscription).toHaveBeenNthCalledWith(2, 8); + expect(subscription).toHaveBeenNthCalledWith(3, 9); + expect(subscription).toHaveBeenNthCalledWith(4, 10); + // No new calls due to value not changing and usage of distinctUntilChanged() + expect(subscription).toHaveBeenCalledTimes(4); + }); + + test('should keep reducing configuration when errors keep emitting until it reaches minimum', async () => { + const { subscription, errors$ } = setupScenario(10, CLAIM_STRATEGY_MGET); + for (let i = 0; i < 20; i++) { + errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b')); + clock.tick(ADJUST_THROUGHPUT_INTERVAL); + } + expect(subscription).toHaveBeenNthCalledWith(1, 10); + expect(subscription).toHaveBeenNthCalledWith(2, 8); + expect(subscription).toHaveBeenNthCalledWith(3, 6); + expect(subscription).toHaveBeenNthCalledWith(4, 5); + // No new calls due to value not changing and usage of distinctUntilChanged() + expect(subscription).toHaveBeenCalledTimes(4); + }); }); }); @@ -151,8 +283,10 @@ describe('createManagedConfiguration()', () => { const { pollIntervalConfiguration$ } = createManagedConfiguration({ logger, errors$, - startingPollInterval, - startingMaxWorkers: 1, + config: { + poll_interval: startingPollInterval, + capacity: 20, + } as TaskManagerConfig, }); pollIntervalConfiguration$.subscribe(subscription); return { subscription, errors$ }; diff --git a/x-pack/plugins/task_manager/server/lib/create_managed_configuration.ts b/x-pack/plugins/task_manager/server/lib/create_managed_configuration.ts index 5c7b1a16a4308..9e350d9902804 100644 --- a/x-pack/plugins/task_manager/server/lib/create_managed_configuration.ts +++ b/x-pack/plugins/task_manager/server/lib/create_managed_configuration.ts @@ -10,17 +10,26 @@ import { filter, mergeScan, map, scan, distinctUntilChanged, startWith } from 'r import { SavedObjectsErrorHelpers } from '@kbn/core/server'; import { Logger } from '@kbn/core/server'; import { isEsCannotExecuteScriptError } from './identify_es_error'; +import { CLAIM_STRATEGY_MGET, DEFAULT_CAPACITY, MAX_CAPACITY, TaskManagerConfig } from '../config'; +import { TaskCost } from '../task'; const FLUSH_MARKER = Symbol('flush'); export const ADJUST_THROUGHPUT_INTERVAL = 10 * 1000; export const PREFERRED_MAX_POLL_INTERVAL = 60 * 1000; + +// Capacity is measured in number of normal cost tasks that can be run +// At a minimum, we need to be able to run a single task with the greatest cost +// so we should convert the greatest cost to normal cost +export const MIN_COST = TaskCost.ExtraLarge / TaskCost.Normal; + +// For default claim strategy export const MIN_WORKERS = 1; -// When errors occur, reduce maxWorkers by MAX_WORKERS_DECREASE_PERCENTAGE -// When errors no longer occur, start increasing maxWorkers by MAX_WORKERS_INCREASE_PERCENTAGE +// When errors occur, reduce capacity by CAPACITY_DECREASE_PERCENTAGE +// When errors no longer occur, start increasing capacity by CAPACITY_INCREASE_PERCENTAGE // until starting value is reached -const MAX_WORKERS_DECREASE_PERCENTAGE = 0.8; -const MAX_WORKERS_INCREASE_PERCENTAGE = 1.05; +const CAPACITY_DECREASE_PERCENTAGE = 0.8; +const 
CAPACITY_INCREASE_PERCENTAGE = 1.05; // When errors occur, increase pollInterval by POLL_INTERVAL_INCREASE_PERCENTAGE // When errors no longer occur, start decreasing pollInterval by POLL_INTERVAL_DECREASE_PERCENTAGE @@ -29,28 +38,30 @@ const POLL_INTERVAL_DECREASE_PERCENTAGE = 0.95; const POLL_INTERVAL_INCREASE_PERCENTAGE = 1.2; interface ManagedConfigurationOpts { + config: TaskManagerConfig; logger: Logger; - startingMaxWorkers: number; - startingPollInterval: number; errors$: Observable; } export interface ManagedConfiguration { - maxWorkersConfiguration$: Observable; + startingCapacity: number; + capacityConfiguration$: Observable; pollIntervalConfiguration$: Observable; } export function createManagedConfiguration({ + config, logger, - startingMaxWorkers, - startingPollInterval, errors$, }: ManagedConfigurationOpts): ManagedConfiguration { const errorCheck$ = countErrors(errors$, ADJUST_THROUGHPUT_INTERVAL); + const startingCapacity = calculateStartingCapacity(config, logger); + const startingPollInterval = config.poll_interval; return { - maxWorkersConfiguration$: errorCheck$.pipe( - createMaxWorkersScan(logger, startingMaxWorkers), - startWith(startingMaxWorkers), + startingCapacity, + capacityConfiguration$: errorCheck$.pipe( + createCapacityScan(config, logger, startingCapacity), + startWith(startingCapacity), distinctUntilChanged() ), pollIntervalConfiguration$: errorCheck$.pipe( @@ -61,37 +72,39 @@ export function createManagedConfiguration({ }; } -function createMaxWorkersScan(logger: Logger, startingMaxWorkers: number) { - return scan((previousMaxWorkers: number, errorCount: number) => { - let newMaxWorkers: number; +function createCapacityScan(config: TaskManagerConfig, logger: Logger, startingCapacity: number) { + return scan((previousCapacity: number, errorCount: number) => { + let newCapacity: number; if (errorCount > 0) { - // Decrease max workers by MAX_WORKERS_DECREASE_PERCENTAGE while making sure it doesn't go lower than 1. + const minCapacity = getMinCapacity(config); + // Decrease capacity by CAPACITY_DECREASE_PERCENTAGE while making sure it doesn't go lower than minCapacity. // Using Math.floor to make sure the number is different than previous while not being a decimal value. - newMaxWorkers = Math.max( - Math.floor(previousMaxWorkers * MAX_WORKERS_DECREASE_PERCENTAGE), - MIN_WORKERS + newCapacity = Math.max( + Math.floor(previousCapacity * CAPACITY_DECREASE_PERCENTAGE), + minCapacity ); } else { - // Increase max workers by MAX_WORKERS_INCREASE_PERCENTAGE while making sure it doesn't go + // Increase capacity by CAPACITY_INCREASE_PERCENTAGE while making sure it doesn't go // higher than the starting value. 
Using Math.ceil to make sure the number is different than // previous while not being a decimal value - newMaxWorkers = Math.min( - startingMaxWorkers, - Math.ceil(previousMaxWorkers * MAX_WORKERS_INCREASE_PERCENTAGE) + newCapacity = Math.min( + startingCapacity, + Math.ceil(previousCapacity * CAPACITY_INCREASE_PERCENTAGE) ); } - if (newMaxWorkers !== previousMaxWorkers) { + + if (newCapacity !== previousCapacity) { logger.debug( - `Max workers configuration changing from ${previousMaxWorkers} to ${newMaxWorkers} after seeing ${errorCount} "too many request" and/or "execute [inline] script" error(s)` + `Capacity configuration changing from ${previousCapacity} to ${newCapacity} after seeing ${errorCount} "too many request" and/or "execute [inline] script" error(s)` ); - if (previousMaxWorkers === startingMaxWorkers) { + if (previousCapacity === startingCapacity) { logger.warn( - `Max workers configuration is temporarily reduced after Elasticsearch returned ${errorCount} "too many request" and/or "execute [inline] script" error(s).` + `Capacity configuration is temporarily reduced after Elasticsearch returned ${errorCount} "too many request" and/or "execute [inline] script" error(s).` ); } } - return newMaxWorkers; - }, startingMaxWorkers); + return newCapacity; + }, startingCapacity); } function createPollIntervalScan(logger: Logger, startingPollInterval: number) { @@ -186,3 +199,32 @@ function resetErrorCount() { count: 0, }; } + +function getMinCapacity(config: TaskManagerConfig) { + switch (config.claim_strategy) { + case CLAIM_STRATEGY_MGET: + return MIN_COST; + + default: + return MIN_WORKERS; + } +} + +export function calculateStartingCapacity(config: TaskManagerConfig, logger: Logger): number { + if (config.capacity !== undefined && config.max_workers !== undefined) { + logger.warn( + `Both "xpack.task_manager.capacity" and "xpack.task_manager.max_workers" configs are set, max_workers will be ignored in favor of capacity and the setting should be removed.` + ); + } + + if (config.capacity) { + // Use capacity if explicitly set + return config.capacity!; + } else if (config.max_workers) { + // Otherwise use max_worker value as capacity, capped at MAX_CAPACITY + return Math.min(config.max_workers, MAX_CAPACITY); + } + + // Neither are set, use DEFAULT CAPACITY + return DEFAULT_CAPACITY; +} diff --git a/x-pack/plugins/task_manager/server/lib/fill_pool.test.ts b/x-pack/plugins/task_manager/server/lib/fill_pool.test.ts index 9fdb16fb5f677..d3533ac058314 100644 --- a/x-pack/plugins/task_manager/server/lib/fill_pool.test.ts +++ b/x-pack/plugins/task_manager/server/lib/fill_pool.test.ts @@ -30,7 +30,6 @@ describe('fillPool', () => { tasksUpdated: tasks?.length ?? 
0, tasksConflicted: 0, tasksClaimed: 0, - tasksRejected: 0, }, docs: tasks, }) diff --git a/x-pack/plugins/task_manager/server/lib/log_health_metrics.test.ts b/x-pack/plugins/task_manager/server/lib/log_health_metrics.test.ts index ea0793b60266b..a39568df5fdd2 100644 --- a/x-pack/plugins/task_manager/server/lib/log_health_metrics.test.ts +++ b/x-pack/plugins/task_manager/server/lib/log_health_metrics.test.ts @@ -435,7 +435,8 @@ function getMockMonitoredHealth(overrides = {}): MonitoredHealth { timestamp: new Date().toISOString(), status: HealthStatus.OK, value: { - max_workers: 10, + capacity: { config: 10, as_cost: 20, as_workers: 10 }, + claim_strategy: 'default', poll_interval: 3000, request_capacity: 1000, monitored_aggregated_stats_refresh_rate: 5000, @@ -454,16 +455,19 @@ function getMockMonitoredHealth(overrides = {}): MonitoredHealth { status: HealthStatus.OK, value: { count: 4, + cost: 8, task_types: { - actions_telemetry: { count: 2, status: { idle: 2 } }, - alerting_telemetry: { count: 1, status: { idle: 1 } }, - session_cleanup: { count: 1, status: { idle: 1 } }, + actions_telemetry: { count: 2, cost: 4, status: { idle: 2 } }, + alerting_telemetry: { count: 1, cost: 2, status: { idle: 1 } }, + session_cleanup: { count: 1, cost: 2, status: { idle: 1 } }, }, schedule: [], overdue: 0, + overdue_cost: 0, overdue_non_recurring: 0, estimatedScheduleDensity: [], non_recurring: 20, + non_recurring_cost: 40, owner_ids: 2, estimated_schedule_density: [], capacity_requirements: { diff --git a/x-pack/plugins/task_manager/server/metrics/create_aggregator.test.ts b/x-pack/plugins/task_manager/server/metrics/create_aggregator.test.ts index 309617a8e4cc3..b1cf9a90b6cb6 100644 --- a/x-pack/plugins/task_manager/server/metrics/create_aggregator.test.ts +++ b/x-pack/plugins/task_manager/server/metrics/create_aggregator.test.ts @@ -45,7 +45,6 @@ const config: TaskManagerConfig = { warn_threshold: 5000, }, max_attempts: 9, - max_workers: 10, metrics_reset_interval: 30000, monitored_aggregated_stats_refresh_rate: 5000, monitored_stats_health_verbose_log: { diff --git a/x-pack/plugins/task_manager/server/monitoring/background_task_utilization_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/background_task_utilization_statistics.ts index 837f29c83f108..5a9a9e07aadf7 100644 --- a/x-pack/plugins/task_manager/server/monitoring/background_task_utilization_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/background_task_utilization_statistics.ts @@ -21,7 +21,7 @@ import { } from '../task_events'; import { MonitoredStat } from './monitoring_stats_stream'; import { AggregatedStat, AggregatedStatProvider } from '../lib/runtime_statistics_aggregator'; -import { createRunningAveragedStat } from './task_run_calcultors'; +import { createRunningAveragedStat } from './task_run_calculators'; import { DEFAULT_WORKER_UTILIZATION_RUNNING_AVERAGE_WINDOW } from '../config'; export interface PublicBackgroundTaskUtilizationStat extends JsonObject { diff --git a/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.test.ts b/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.test.ts index 263f2e9987b7c..9791ac805e500 100644 --- a/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.test.ts @@ -21,7 +21,7 @@ describe('estimateCapacity', () => { estimateCapacity( logger, mockStats( - { max_workers: 10, poll_interval: 3000 }, + { capacity: { config: 10, as_cost: 20, as_workers: 
10 }, poll_interval: 3000 }, { owner_ids: 1, overdue_non_recurring: 0, @@ -77,7 +77,7 @@ describe('estimateCapacity', () => { estimateCapacity( logger, mockStats( - { max_workers: 10, poll_interval: 3000 }, + { capacity: { config: 10, as_cost: 20, as_workers: 10 }, poll_interval: 3000 }, { owner_ids: 1, overdue_non_recurring: 0, @@ -135,7 +135,7 @@ describe('estimateCapacity', () => { estimateCapacity( logger, mockStats( - { max_workers: 10, poll_interval: 3000 }, + { capacity: { config: 10, as_cost: 20, as_workers: 10 }, poll_interval: 3000 }, { owner_ids: 1, overdue_non_recurring: 0, @@ -172,7 +172,7 @@ describe('estimateCapacity', () => { estimateCapacity( logger, mockStats( - { max_workers: 10, poll_interval: 3000 }, + { capacity: { config: 10, as_cost: 20, as_workers: 10 }, poll_interval: 3000 }, { owner_ids: 1, overdue_non_recurring: 0, @@ -228,7 +228,7 @@ describe('estimateCapacity', () => { estimateCapacity( logger, mockStats( - { max_workers: 10, poll_interval: 3000 }, + { capacity: { config: 10, as_cost: 20, as_workers: 10 }, poll_interval: 3000 }, { // 0 active tasks at this moment in time, so no owners identifiable owner_ids: 0, @@ -285,7 +285,7 @@ describe('estimateCapacity', () => { estimateCapacity( logger, mockStats( - { max_workers: 10, poll_interval: 3000 }, + { capacity: { config: 10, as_cost: 20, as_workers: 10 }, poll_interval: 3000 }, { owner_ids: 3, overdue_non_recurring: 0, @@ -347,7 +347,7 @@ describe('estimateCapacity', () => { estimateCapacity( logger, mockStats( - { max_workers: 10, poll_interval: 3000 }, + { capacity: { config: 10, as_cost: 20, as_workers: 10 }, poll_interval: 3000 }, { owner_ids: provisionedKibanaInstances, overdue_non_recurring: 0, @@ -428,7 +428,7 @@ describe('estimateCapacity', () => { estimateCapacity( logger, mockStats( - { max_workers: 10, poll_interval: 3000 }, + { capacity: { config: 10, as_cost: 20, as_workers: 10 }, poll_interval: 3000 }, { owner_ids: provisionedKibanaInstances, overdue_non_recurring: 0, @@ -510,7 +510,7 @@ describe('estimateCapacity', () => { estimateCapacity( logger, mockStats( - { max_workers: 10, poll_interval: 3000 }, + { capacity: { config: 10, as_cost: 20, as_workers: 10 }, poll_interval: 3000 }, { owner_ids: 1, overdue_non_recurring: 0, @@ -578,7 +578,7 @@ describe('estimateCapacity', () => { estimateCapacity( logger, mockStats( - { max_workers: 10, poll_interval: 3000 }, + { capacity: { config: 10, as_cost: 20, as_workers: 10 }, poll_interval: 3000 }, { owner_ids: 1, overdue_non_recurring: 0, @@ -643,7 +643,7 @@ describe('estimateCapacity', () => { estimateCapacity( logger, mockStats( - { max_workers: 10, poll_interval: 3000 }, + { capacity: { config: 10, as_cost: 20, as_workers: 10 }, poll_interval: 3000 }, { owner_ids: 1, overdue_non_recurring: 0, @@ -708,7 +708,7 @@ describe('estimateCapacity', () => { estimateCapacity( logger, mockStats( - { max_workers: 10, poll_interval: 3000 }, + { capacity: { config: 10, as_cost: 20, as_workers: 10 }, poll_interval: 3000 }, { owner_ids: 1, overdue_non_recurring: 0, @@ -784,7 +784,7 @@ describe('estimateCapacity', () => { estimateCapacity( logger, mockStats( - { max_workers: 10, poll_interval: 3000 }, + { capacity: { config: 10, as_cost: 20, as_workers: 10 }, poll_interval: 3000 }, { owner_ids: 1, overdue_non_recurring: 0, @@ -862,7 +862,7 @@ describe('estimateCapacity', () => { estimateCapacity( logger, mockStats( - { max_workers: 10, poll_interval: 3000 }, + { capacity: { config: 10, as_cost: 20, as_workers: 10 }, poll_interval: 3000 }, { overdue: undefined, 
owner_ids: 1, @@ -949,7 +949,8 @@ function mockStats( status: HealthStatus.OK, timestamp: new Date().toISOString(), value: { - max_workers: 0, + capacity: { config: 10, as_cost: 20, as_workers: 10 }, + claim_strategy: 'default', poll_interval: 0, request_capacity: 1000, monitored_aggregated_stats_refresh_rate: 5000, @@ -969,16 +970,19 @@ function mockStats( timestamp: new Date().toISOString(), value: { count: 4, + cost: 8, task_types: { - actions_telemetry: { count: 2, status: { idle: 2 } }, - alerting_telemetry: { count: 1, status: { idle: 1 } }, - session_cleanup: { count: 1, status: { idle: 1 } }, + actions_telemetry: { count: 2, cost: 4, status: { idle: 2 } }, + alerting_telemetry: { count: 1, cost: 2, status: { idle: 1 } }, + session_cleanup: { count: 1, cost: 2, status: { idle: 1 } }, }, schedule: [], overdue: 0, + overdue_cost: 0, overdue_non_recurring: 0, estimated_schedule_density: [], non_recurring: 20, + non_recurring_cost: 40, owner_ids: 2, capacity_requirements: { per_minute: 150, diff --git a/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.ts b/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.ts index b12382f16e27b..d1c2f3591ea22 100644 --- a/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.ts +++ b/x-pack/plugins/task_manager/server/monitoring/capacity_estimation.ts @@ -10,7 +10,7 @@ import stats from 'stats-lite'; import { JsonObject } from '@kbn/utility-types'; import { Logger } from '@kbn/core/server'; import { RawMonitoringStats, RawMonitoredStat, HealthStatus } from './monitoring_stats_stream'; -import { AveragedStat } from './task_run_calcultors'; +import { AveragedStat } from './task_run_calculators'; import { TaskPersistenceTypes } from './task_run_statistics'; import { asErr, asOk, map, Result } from '../lib/result_type'; @@ -61,8 +61,10 @@ export function estimateCapacity( non_recurring: percentageOfExecutionsUsedByNonRecurringTasks, } = capacityStats.runtime.value.execution.persistence; const { overdue, capacity_requirements: capacityRequirements } = workload; - const { poll_interval: pollInterval, max_workers: maxWorkers } = - capacityStats.configuration.value; + const { + poll_interval: pollInterval, + capacity: { config: configuredCapacity }, + } = capacityStats.configuration.value; /** * On average, how many polling cycles does it take to execute a task? @@ -78,10 +80,10 @@ export function estimateCapacity( ); /** - * Given the current configuration how much task capacity do we have? + * Given the current configuration how much capacity do we have to run normal cost tasks? 
*/ const capacityPerMinutePerKibana = Math.round( - ((60 * 1000) / (averagePollIntervalsPerExecution * pollInterval)) * maxWorkers + ((60 * 1000) / (averagePollIntervalsPerExecution * pollInterval)) * configuredCapacity ); /** diff --git a/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.test.ts index 822356e2d6534..0b5387b66dece 100644 --- a/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.test.ts @@ -13,7 +13,6 @@ import { TaskManagerConfig } from '../config'; describe('Configuration Statistics Aggregator', () => { test('merges the static config with the merged configs', async () => { const configuration: TaskManagerConfig = { - max_workers: 10, max_attempts: 9, poll_interval: 6000000, allow_reading_invalid_state: false, @@ -55,7 +54,8 @@ describe('Configuration Statistics Aggregator', () => { }; const managedConfig = { - maxWorkersConfiguration$: new Subject(), + startingCapacity: 10, + capacityConfiguration$: new Subject(), pollIntervalConfiguration$: new Subject(), }; @@ -65,7 +65,12 @@ describe('Configuration Statistics Aggregator', () => { .pipe(take(3), bufferCount(3)) .subscribe(([initial, updatedWorkers, updatedInterval]) => { expect(initial.value).toEqual({ - max_workers: 10, + capacity: { + config: 10, + as_workers: 10, + as_cost: 20, + }, + claim_strategy: 'default', poll_interval: 6000000, request_capacity: 1000, monitored_aggregated_stats_refresh_rate: 5000, @@ -79,7 +84,12 @@ describe('Configuration Statistics Aggregator', () => { }, }); expect(updatedWorkers.value).toEqual({ - max_workers: 8, + capacity: { + config: 8, + as_workers: 8, + as_cost: 16, + }, + claim_strategy: 'default', poll_interval: 6000000, request_capacity: 1000, monitored_aggregated_stats_refresh_rate: 5000, @@ -93,7 +103,12 @@ describe('Configuration Statistics Aggregator', () => { }, }); expect(updatedInterval.value).toEqual({ - max_workers: 8, + capacity: { + config: 8, + as_workers: 8, + as_cost: 16, + }, + claim_strategy: 'default', poll_interval: 3000, request_capacity: 1000, monitored_aggregated_stats_refresh_rate: 5000, @@ -108,7 +123,7 @@ describe('Configuration Statistics Aggregator', () => { }); resolve(); }, reject); - managedConfig.maxWorkersConfiguration$.next(8); + managedConfig.capacityConfiguration$.next(8); managedConfig.pollIntervalConfiguration$.next(3000); } catch (error) { reject(error); diff --git a/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.ts index dc3221351a33e..c606b63694b0f 100644 --- a/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/configuration_statistics.ts @@ -8,9 +8,11 @@ import { combineLatest, of } from 'rxjs'; import { pick, merge } from 'lodash'; import { map, startWith } from 'rxjs'; +import { JsonObject } from '@kbn/utility-types'; import { AggregatedStatProvider } from '../lib/runtime_statistics_aggregator'; -import { TaskManagerConfig } from '../config'; +import { CLAIM_STRATEGY_DEFAULT, TaskManagerConfig } from '../config'; import { ManagedConfiguration } from '../lib/create_managed_configuration'; +import { getCapacityInCost, getCapacityInWorkers } from '../task_pool'; const CONFIG_FIELDS_TO_EXPOSE = [ 'request_capacity', @@ -19,10 +21,19 @@ const CONFIG_FIELDS_TO_EXPOSE = [ 
'monitored_task_execution_thresholds', ] as const; +interface CapacityConfig extends JsonObject { + capacity: { + config: number; + as_workers: number; + as_cost: number; + }; +} + export type ConfigStat = Pick< TaskManagerConfig, - 'max_workers' | 'poll_interval' | (typeof CONFIG_FIELDS_TO_EXPOSE)[number] ->; + 'poll_interval' | 'claim_strategy' | (typeof CONFIG_FIELDS_TO_EXPOSE)[number] +> & + CapacityConfig; export function createConfigurationAggregator( config: TaskManagerConfig, @@ -30,16 +41,21 @@ export function createConfigurationAggregator( ): AggregatedStatProvider { return combineLatest([ of(pick(config, ...CONFIG_FIELDS_TO_EXPOSE)), + of({ claim_strategy: config.claim_strategy ?? CLAIM_STRATEGY_DEFAULT }), managedConfig.pollIntervalConfiguration$.pipe( startWith(config.poll_interval), map>((pollInterval) => ({ poll_interval: pollInterval, })) ), - managedConfig.maxWorkersConfiguration$.pipe( - startWith(config.max_workers), - map>((maxWorkers) => ({ - max_workers: maxWorkers, + managedConfig.capacityConfiguration$.pipe( + startWith(managedConfig.startingCapacity), + map((capacity) => ({ + capacity: { + config: capacity, + as_workers: getCapacityInWorkers(capacity), + as_cost: getCapacityInCost(capacity), + }, })) ), ]).pipe( diff --git a/x-pack/plugins/task_manager/server/monitoring/ephemeral_task_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/ephemeral_task_statistics.test.ts index d7135837e052e..ac16070d7c131 100644 --- a/x-pack/plugins/task_manager/server/monitoring/ephemeral_task_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/ephemeral_task_statistics.test.ts @@ -176,11 +176,11 @@ describe('Ephemeral Task Statistics', () => { }); const runningAverageWindowSize = 5; - const maxWorkers = 10; + const capacity = 10; const ephemeralTaskAggregator = createEphemeralTaskAggregator( ephemeralTaskLifecycle, runningAverageWindowSize, - maxWorkers + capacity ); function expectWindowEqualsUpdate( @@ -229,7 +229,7 @@ describe('Ephemeral Task Statistics', () => { }); }); -test('returns the average load added per polling cycle cycle by ephemeral tasks when load exceeds max workers', async () => { +test('returns the average load added per polling cycle cycle by ephemeral tasks when load exceeds capacity', async () => { const tasksExecuted = [0, 5, 10, 20, 15, 10, 5, 0, 0, 0, 0, 0]; const expectedLoad = [0, 50, 100, 200, 150, 100, 50, 0, 0, 0, 0, 0]; @@ -241,11 +241,11 @@ test('returns the average load added per polling cycle cycle by ephemeral tasks }); const runningAverageWindowSize = 5; - const maxWorkers = 10; + const capacity = 10; const ephemeralTaskAggregator = createEphemeralTaskAggregator( ephemeralTaskLifecycle, runningAverageWindowSize, - maxWorkers + capacity ); function expectWindowEqualsUpdate( diff --git a/x-pack/plugins/task_manager/server/monitoring/ephemeral_task_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/ephemeral_task_statistics.ts index b77eae1080fbc..d02080a56a1aa 100644 --- a/x-pack/plugins/task_manager/server/monitoring/ephemeral_task_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/ephemeral_task_statistics.ts @@ -17,7 +17,7 @@ import { AveragedStat, calculateRunningAverage, createRunningAveragedStat, -} from './task_run_calcultors'; +} from './task_run_calculators'; import { HealthStatus } from './monitoring_stats_stream'; export interface EphemeralTaskStat extends JsonObject { @@ -35,7 +35,7 @@ export interface SummarizedEphemeralTaskStat extends JsonObject { export function 
createEphemeralTaskAggregator( ephemeralTaskLifecycle: EphemeralTaskLifecycle, runningAverageWindowSize: number, - maxWorkers: number + capacity: number ): AggregatedStatProvider { const ephemeralTaskRunEvents$ = ephemeralTaskLifecycle.events.pipe( filter((taskEvent: TaskLifecycleEvent) => isTaskRunEvent(taskEvent)) @@ -70,7 +70,7 @@ export function createEphemeralTaskAggregator( map(([tasksRanSincePreviousQueueSize, ephemeralQueueSize]) => ({ queuedTasks: ephemeralQueuedTasksQueue(ephemeralQueueSize), executionsPerCycle: ephemeralQueueExecutionsPerCycleQueue(tasksRanSincePreviousQueueSize), - load: ephemeralTaskLoadQueue(calculateWorkerLoad(maxWorkers, tasksRanSincePreviousQueueSize)), + load: ephemeralTaskLoadQueue(calculateWorkerLoad(capacity, tasksRanSincePreviousQueueSize)), })), startWith({ queuedTasks: [], diff --git a/x-pack/plugins/task_manager/server/monitoring/index.ts b/x-pack/plugins/task_manager/server/monitoring/index.ts index 9ee32e97d7758..5dc024b53de10 100644 --- a/x-pack/plugins/task_manager/server/monitoring/index.ts +++ b/x-pack/plugins/task_manager/server/monitoring/index.ts @@ -18,6 +18,7 @@ import { TaskPollingLifecycle } from '../polling_lifecycle'; import { ManagedConfiguration } from '../lib/create_managed_configuration'; import { EphemeralTaskLifecycle } from '../ephemeral_task_lifecycle'; import { AdHocTaskCounter } from '../lib/adhoc_task_counter'; +import { TaskTypeDictionary } from '../task_type_dictionary'; export type { MonitoringStats, RawMonitoringStats } from './monitoring_stats_stream'; export { @@ -27,27 +28,20 @@ export { createMonitoringStatsStream, } from './monitoring_stats_stream'; +export interface CreateMonitoringStatsOpts { + taskStore: TaskStore; + elasticsearchAndSOAvailability$: Observable; + config: TaskManagerConfig; + managedConfig: ManagedConfiguration; + logger: Logger; + adHocTaskCounter: AdHocTaskCounter; + taskDefinitions: TaskTypeDictionary; + taskPollingLifecycle?: TaskPollingLifecycle; + ephemeralTaskLifecycle?: EphemeralTaskLifecycle; +} + export function createMonitoringStats( - taskStore: TaskStore, - elasticsearchAndSOAvailability$: Observable, - config: TaskManagerConfig, - managedConfig: ManagedConfiguration, - logger: Logger, - adHocTaskCounter: AdHocTaskCounter, - taskPollingLifecycle?: TaskPollingLifecycle, - ephemeralTaskLifecycle?: EphemeralTaskLifecycle + opts: CreateMonitoringStatsOpts ): Observable { - return createMonitoringStatsStream( - createAggregators( - taskStore, - elasticsearchAndSOAvailability$, - config, - managedConfig, - logger, - adHocTaskCounter, - taskPollingLifecycle, - ephemeralTaskLifecycle - ), - config - ); + return createMonitoringStatsStream(createAggregators(opts)); } diff --git a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts index f4da53871ffa3..075b663e4ce83 100644 --- a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.test.ts @@ -5,7 +5,6 @@ * 2.0. 
*/ -import { TaskManagerConfig } from '../config'; import { of, Subject } from 'rxjs'; import { take, bufferCount } from 'rxjs'; import { createMonitoringStatsStream } from './monitoring_stats_stream'; @@ -17,51 +16,9 @@ beforeEach(() => { }); describe('createMonitoringStatsStream', () => { - const configuration: TaskManagerConfig = { - max_workers: 10, - max_attempts: 9, - poll_interval: 6000000, - allow_reading_invalid_state: false, - version_conflict_threshold: 80, - monitored_stats_required_freshness: 6000000, - request_capacity: 1000, - monitored_aggregated_stats_refresh_rate: 5000, - monitored_stats_health_verbose_log: { - enabled: false, - level: 'debug' as const, - warn_delayed_task_start_in_seconds: 60, - }, - monitored_stats_running_average_window: 50, - monitored_task_execution_thresholds: { - default: { - error_threshold: 90, - warn_threshold: 80, - }, - custom: {}, - }, - ephemeral_tasks: { - enabled: true, - request_capacity: 10, - }, - unsafe: { - exclude_task_types: [], - authenticate_background_task_utilization: true, - }, - event_loop_delay: { - monitor: true, - warn_threshold: 5000, - }, - worker_utilization_running_average_window: 5, - metrics_reset_interval: 3000, - claim_strategy: 'default', - request_timeouts: { - update_by_query: 1000, - }, - }; - it('returns the initial config used to configure Task Manager', async () => { return new Promise((resolve) => { - createMonitoringStatsStream(of(), configuration) + createMonitoringStatsStream(of()) .pipe(take(1)) .subscribe((firstValue) => { expect(firstValue.stats).toEqual({}); @@ -74,7 +31,7 @@ describe('createMonitoringStatsStream', () => { const aggregatedStats$ = new Subject(); return new Promise((resolve) => { - createMonitoringStatsStream(aggregatedStats$, configuration) + createMonitoringStatsStream(aggregatedStats$) .pipe(take(3), bufferCount(3)) .subscribe(([initialValue, secondValue, thirdValue]) => { expect(initialValue.stats).toMatchObject({ @@ -82,7 +39,7 @@ describe('createMonitoringStatsStream', () => { stats: { configuration: { value: { - max_workers: 10, + capacity: 10, poll_interval: 6000000, request_capacity: 1000, monitored_aggregated_stats_refresh_rate: 5000, @@ -115,7 +72,7 @@ describe('createMonitoringStatsStream', () => { configuration: { timestamp: expect.any(String), value: { - max_workers: 10, + capacity: 10, poll_interval: 6000000, request_capacity: 1000, monitored_aggregated_stats_refresh_rate: 5000, @@ -148,7 +105,7 @@ describe('createMonitoringStatsStream', () => { configuration: { timestamp: expect.any(String), value: { - max_workers: 10, + capacity: 10, poll_interval: 6000000, request_capacity: 1000, monitored_aggregated_stats_refresh_rate: 5000, diff --git a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts index 5ee6465dae0eb..e1bffb55d54fa 100644 --- a/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts +++ b/x-pack/plugins/task_manager/server/monitoring/monitoring_stats_stream.ts @@ -10,8 +10,6 @@ import { map, scan } from 'rxjs'; import { set } from '@kbn/safer-lodash-set'; import { Logger } from '@kbn/core/server'; import { JsonObject } from '@kbn/utility-types'; -import { TaskStore } from '../task_store'; -import { TaskPollingLifecycle } from '../polling_lifecycle'; import { createWorkloadAggregator, summarizeWorkloadStat, @@ -37,11 +35,9 @@ import { import { ConfigStat, createConfigurationAggregator } from './configuration_statistics'; import { TaskManagerConfig } 
from '../config'; -import { ManagedConfiguration } from '../lib/create_managed_configuration'; -import { EphemeralTaskLifecycle } from '../ephemeral_task_lifecycle'; import { CapacityEstimationStat, withCapacityEstimate } from './capacity_estimation'; -import { AdHocTaskCounter } from '../lib/adhoc_task_counter'; import { AggregatedStatProvider } from '../lib/runtime_statistics_aggregator'; +import { CreateMonitoringStatsOpts } from '.'; export interface MonitoringStats { last_update: string; @@ -81,26 +77,28 @@ export interface RawMonitoringStats { }; } -export function createAggregators( - taskStore: TaskStore, - elasticsearchAndSOAvailability$: Observable, - config: TaskManagerConfig, - managedConfig: ManagedConfiguration, - logger: Logger, - adHocTaskCounter: AdHocTaskCounter, - taskPollingLifecycle?: TaskPollingLifecycle, - ephemeralTaskLifecycle?: EphemeralTaskLifecycle -): AggregatedStatProvider { +export function createAggregators({ + taskStore, + elasticsearchAndSOAvailability$, + config, + managedConfig, + logger, + taskDefinitions, + adHocTaskCounter, + taskPollingLifecycle, + ephemeralTaskLifecycle, +}: CreateMonitoringStatsOpts): AggregatedStatProvider { const aggregators: AggregatedStatProvider[] = [ createConfigurationAggregator(config, managedConfig), - createWorkloadAggregator( + createWorkloadAggregator({ taskStore, elasticsearchAndSOAvailability$, - config.monitored_aggregated_stats_refresh_rate, - config.poll_interval, - logger - ), + refreshInterval: config.monitored_aggregated_stats_refresh_rate, + pollInterval: config.poll_interval, + logger, + taskDefinitions, + }), ]; if (taskPollingLifecycle) { aggregators.push( @@ -118,7 +116,7 @@ export function createAggregators( createEphemeralTaskAggregator( ephemeralTaskLifecycle, config.monitored_stats_running_average_window, - config.max_workers + managedConfig.startingCapacity ) ); } @@ -126,8 +124,7 @@ export function createAggregators( } export function createMonitoringStatsStream( - provider$: AggregatedStatProvider, - config: TaskManagerConfig + provider$: AggregatedStatProvider ): Observable { const initialStats = { last_update: new Date().toISOString(), diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.test.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_calculators.test.ts similarity index 98% rename from x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.test.ts rename to x-pack/plugins/task_manager/server/monitoring/task_run_calculators.test.ts index b5f6be8b7524d..46df2b1b21d42 100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_calculators.test.ts @@ -12,7 +12,7 @@ import { calculateFrequency, createRunningAveragedStat, createMapOfRunningAveragedStats, -} from './task_run_calcultors'; +} from './task_run_calculators'; describe('calculateRunningAverage', () => { test('calculates the running average and median of a window of values', async () => { diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_calculators.ts similarity index 100% rename from x-pack/plugins/task_manager/server/monitoring/task_run_calcultors.ts rename to x-pack/plugins/task_manager/server/monitoring/task_run_calculators.ts diff --git a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts index cd75436e1c33a..01bb051426440 
100644 --- a/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/task_run_statistics.ts @@ -35,7 +35,7 @@ import { calculateFrequency, createRunningAveragedStat, createMapOfRunningAveragedStats, -} from './task_run_calcultors'; +} from './task_run_calculators'; import { HealthStatus } from './monitoring_stats_stream'; import { TaskPollingLifecycle } from '../polling_lifecycle'; import { TaskExecutionFailureThreshold, TaskManagerConfig } from '../config'; diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts index 7ef860efa783a..2289c00b6405e 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.test.ts @@ -15,13 +15,14 @@ import { padBuckets, estimateRecurringTaskScheduling, } from './workload_statistics'; -import { ConcreteTaskInstance } from '../task'; +import { ConcreteTaskInstance, TaskCost } from '../task'; import { times } from 'lodash'; import { taskStoreMock } from '../task_store.mock'; import { of, Subject } from 'rxjs'; import { sleep } from '../test_utils'; import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey'; +import { TaskTypeDictionary } from '../task_type_dictionary'; type ResponseWithAggs = Omit, 'aggregations'> & { aggregations: WorkloadAggregationResponse; @@ -32,52 +33,98 @@ const asApiResponse = (body: ResponseWithAggs) => .createSuccessTransportRequestPromise(body as estypes.SearchResponse) .then((res) => res.body as ResponseWithAggs); +const logger = loggingSystemMock.create().get(); + +const definitions = new TaskTypeDictionary(logger); +definitions.registerTaskDefinitions({ + report: { + title: 'report', + cost: TaskCost.ExtraLarge, + createTaskRunner: jest.fn(), + }, + foo: { + title: 'foo', + createTaskRunner: jest.fn(), + }, + bar: { + title: 'bar', + cost: TaskCost.Tiny, + createTaskRunner: jest.fn(), + }, +}); describe('Workload Statistics Aggregator', () => { + beforeEach(() => { + jest.resetAllMocks(); + }); + test('queries the Task Store at a fixed interval for the current workload', async () => { const taskStore = taskStoreMock.create({}); taskStore.aggregate.mockResolvedValue( asApiResponse({ - hits: { - hits: [], - max_score: 0, - total: { value: 0, relation: 'eq' }, - }, + hits: { hits: [], max_score: 0, total: { value: 3, relation: 'eq' } }, took: 1, timed_out: false, - _shards: { - total: 1, - successful: 1, - skipped: 1, - failed: 0, - }, + _shards: { total: 1, successful: 1, skipped: 1, failed: 0 }, aggregations: { taskType: { - buckets: [], + buckets: [ + { + key: 'foo', + doc_count: 1, + status: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [{ key: 'idle', doc_count: 1 }], + }, + }, + { + key: 'bar', + doc_count: 1, + status: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [{ key: 'claiming', doc_count: 1 }], + }, + }, + { + key: 'report', + doc_count: 1, + status: { + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, + buckets: [{ key: 'idle', doc_count: 1 }], + }, + }, + ], doc_count_error_upper_bound: 0, sum_other_doc_count: 0, }, schedule: { - buckets: [], + buckets: [{ key: '1m', doc_count: 8 }], doc_count_error_upper_bound: 0, sum_other_doc_count: 0, }, nonRecurringTasks: { - doc_count: 13, - }, - ownerIds: { - ownerIds: { - value: 1, + doc_count: 1, + taskType: { + buckets: [{ 
key: 'report', doc_count: 1 }], + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, }, }, + ownerIds: { ownerIds: { value: 1 } }, // The `FiltersAggregate` doesn't cover the case of a nested `AggregationsAggregationContainer`, in which `FiltersAggregate` // would not have a `buckets` property, but rather a keyed property that's inferred from the request. // @ts-expect-error idleTasks: { doc_count: 0, overdue: { - doc_count: 0, - nonRecurring: { - doc_count: 0, + doc_count: 1, + nonRecurring: { doc_count: 0 }, + taskTypes: { + buckets: [{ key: 'foo', doc_count: 1 }], + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, }, }, scheduleDensity: { @@ -89,9 +136,7 @@ describe('Workload Statistics Aggregator', () => { to: 1.601651976274e12, to_as_string: '2020-10-02T15:19:36.274Z', doc_count: 0, - histogram: { - buckets: [], - }, + histogram: { buckets: [] }, }, ], }, @@ -100,87 +145,51 @@ describe('Workload Statistics Aggregator', () => { }) ); - const workloadAggregator = createWorkloadAggregator( + const workloadAggregator = createWorkloadAggregator({ taskStore, - of(true), - 10, - 3000, - loggingSystemMock.create().get() - ); + elasticsearchAndSOAvailability$: of(true), + refreshInterval: 10, + pollInterval: 3000, + logger, + taskDefinitions: definitions, + }); return new Promise((resolve) => { workloadAggregator.pipe(first()).subscribe(() => { expect(taskStore.aggregate).toHaveBeenCalledWith({ aggs: { taskType: { - terms: { size: 100, field: 'task.taskType' }, - aggs: { - status: { - terms: { field: 'task.status' }, - }, - }, + terms: { size: 3, field: 'task.taskType' }, + aggs: { status: { terms: { field: 'task.status' } } }, }, schedule: { - terms: { - field: 'task.schedule.interval', - size: 100, - }, + terms: { field: 'task.schedule.interval', size: 100 }, }, nonRecurringTasks: { - missing: { field: 'task.schedule' }, + missing: { field: 'task.schedule.interval' }, + aggs: { taskType: { terms: { size: 3, field: 'task.taskType' } } }, }, ownerIds: { - filter: { - range: { - 'task.startedAt': { - gte: 'now-1w/w', - }, - }, - }, - aggs: { - ownerIds: { - cardinality: { - field: 'task.ownerId', - }, - }, - }, + filter: { range: { 'task.startedAt': { gte: 'now-1w/w' } } }, + aggs: { ownerIds: { cardinality: { field: 'task.ownerId' } } }, }, idleTasks: { - filter: { - term: { 'task.status': 'idle' }, - }, + filter: { term: { 'task.status': 'idle' } }, aggs: { scheduleDensity: { - range: { - field: 'task.runAt', - ranges: [{ from: 'now', to: 'now+1m' }], - }, + range: { field: 'task.runAt', ranges: [{ from: 'now', to: 'now+1m' }] }, aggs: { histogram: { - date_histogram: { - field: 'task.runAt', - fixed_interval: '3s', - }, - aggs: { - interval: { - terms: { - field: 'task.schedule.interval', - }, - }, - }, + date_histogram: { field: 'task.runAt', fixed_interval: '3s' }, + aggs: { interval: { terms: { field: 'task.schedule.interval' } } }, }, }, }, overdue: { - filter: { - range: { - 'task.runAt': { lt: 'now' }, - }, - }, + filter: { range: { 'task.runAt': { lt: 'now' } } }, aggs: { - nonRecurring: { - missing: { field: 'task.schedule' }, - }, + nonRecurring: { missing: { field: 'task.schedule.interval' } }, + taskTypes: { terms: { size: 3, field: 'task.taskType' } }, }, }, }, @@ -194,36 +203,18 @@ describe('Workload Statistics Aggregator', () => { const mockAggregatedResult = () => asApiResponse({ - hits: { - hits: [], - max_score: 0, - total: { value: 4, relation: 'eq' }, - }, + hits: { hits: [], max_score: 0, total: { value: 4, relation: 'eq' } }, took: 1, timed_out: false, - 
_shards: { - total: 1, - successful: 1, - skipped: 1, - failed: 0, - }, + _shards: { total: 1, successful: 1, skipped: 1, failed: 0 }, aggregations: { schedule: { doc_count_error_upper_bound: 0, sum_other_doc_count: 0, buckets: [ - { - key: '3600s', - doc_count: 1, - }, - { - key: '60s', - doc_count: 1, - }, - { - key: '720m', - doc_count: 1, - }, + { key: '3600s', doc_count: 1 }, + { key: '60s', doc_count: 1 }, + { key: '720m', doc_count: 1 }, ], }, taskType: { @@ -231,66 +222,55 @@ describe('Workload Statistics Aggregator', () => { sum_other_doc_count: 0, buckets: [ { - key: 'actions_telemetry', + key: 'foo', doc_count: 2, status: { doc_count_error_upper_bound: 0, sum_other_doc_count: 0, - buckets: [ - { - key: 'idle', - doc_count: 2, - }, - ], + buckets: [{ key: 'idle', doc_count: 2 }], }, }, { - key: 'alerting_telemetry', + key: 'bar', doc_count: 1, status: { doc_count_error_upper_bound: 0, sum_other_doc_count: 0, - buckets: [ - { - key: 'idle', - doc_count: 1, - }, - ], + buckets: [{ key: 'idle', doc_count: 1 }], }, }, { - key: 'session_cleanup', + key: 'report', doc_count: 1, status: { doc_count_error_upper_bound: 0, sum_other_doc_count: 0, - buckets: [ - { - key: 'idle', - doc_count: 1, - }, - ], + buckets: [{ key: 'idle', doc_count: 1 }], }, }, ], }, nonRecurringTasks: { - doc_count: 13, - }, - ownerIds: { - ownerIds: { - value: 1, + doc_count: 1, + taskType: { + buckets: [{ key: 'report', doc_count: 1 }], + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, }, }, + ownerIds: { ownerIds: { value: 1 } }, // The `FiltersAggregate` doesn't cover the case of a nested `AggregationsAggregationContainer`, in which `FiltersAggregate` // would not have a `buckets` property, but rather a keyed property that's inferred from the request. // @ts-expect-error idleTasks: { - doc_count: 13, + doc_count: 3, overdue: { - doc_count: 6, - nonRecurring: { - doc_count: 6, + doc_count: 2, + nonRecurring: { doc_count: 1 }, + taskTypes: { + buckets: [{ key: 'foo', doc_count: 1 }], + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, }, }, scheduleDensity: { @@ -306,23 +286,25 @@ describe('Workload Statistics Aggregator', () => { const taskStore = taskStoreMock.create({}); taskStore.aggregate.mockResolvedValue(mockAggregatedResult()); - const workloadAggregator = createWorkloadAggregator( + const workloadAggregator = createWorkloadAggregator({ taskStore, - of(true), - 10, - 3000, - loggingSystemMock.create().get() - ); + elasticsearchAndSOAvailability$: of(true), + refreshInterval: 10, + pollInterval: 3000, + logger, + taskDefinitions: definitions, + }); return new Promise((resolve) => { workloadAggregator.pipe(first()).subscribe((result) => { expect(result.key).toEqual('workload'); expect(result.value).toMatchObject({ count: 4, + cost: 15, task_types: { - actions_telemetry: { count: 2, status: { idle: 2 } }, - alerting_telemetry: { count: 1, status: { idle: 1 } }, - session_cleanup: { count: 1, status: { idle: 1 } }, + foo: { count: 2, cost: 4, status: { idle: 2 } }, + bar: { count: 1, cost: 1, status: { idle: 1 } }, + report: { count: 1, cost: 10, status: { idle: 1 } }, }, }); resolve(); @@ -336,13 +318,14 @@ describe('Workload Statistics Aggregator', () => { const availability$ = new Subject(); - const workloadAggregator = createWorkloadAggregator( + const workloadAggregator = createWorkloadAggregator({ taskStore, - availability$, - 10, - 3000, - loggingSystemMock.create().get() - ); + elasticsearchAndSOAvailability$: of(true), + refreshInterval: 10, + pollInterval: 3000, + logger, + 
taskDefinitions: definitions, + }); return new Promise(async (resolve, reject) => { try { @@ -350,25 +333,11 @@ describe('Workload Statistics Aggregator', () => { expect(result.key).toEqual('workload'); expect(result.value).toMatchObject({ count: 4, + cost: 15, task_types: { - actions_telemetry: { - count: 2, - status: { - idle: 2, - }, - }, - alerting_telemetry: { - count: 1, - status: { - idle: 1, - }, - }, - session_cleanup: { - count: 1, - status: { - idle: 1, - }, - }, + foo: { count: 2, cost: 4, status: { idle: 2 } }, + bar: { count: 1, cost: 1, status: { idle: 1 } }, + report: { count: 1, cost: 10, status: { idle: 1 } }, }, }); resolve(); @@ -389,19 +358,22 @@ describe('Workload Statistics Aggregator', () => { const taskStore = taskStoreMock.create({}); taskStore.aggregate.mockResolvedValue(mockAggregatedResult()); - const workloadAggregator = createWorkloadAggregator( + const workloadAggregator = createWorkloadAggregator({ taskStore, - of(true), - 10, - 3000, - loggingSystemMock.create().get() - ); + elasticsearchAndSOAvailability$: of(true), + refreshInterval: 10, + pollInterval: 3000, + logger, + taskDefinitions: definitions, + }); return new Promise((resolve) => { workloadAggregator.pipe(first()).subscribe((result) => { expect(result.key).toEqual('workload'); expect(result.value).toMatchObject({ - overdue: 6, + overdue: 2, + overdue_cost: 2, + overdue_non_recurring: 1, }); resolve(); }); @@ -412,13 +384,14 @@ describe('Workload Statistics Aggregator', () => { const taskStore = taskStoreMock.create({}); taskStore.aggregate.mockResolvedValue(mockAggregatedResult()); - const workloadAggregator = createWorkloadAggregator( + const workloadAggregator = createWorkloadAggregator({ taskStore, - of(true), - 10, - 3000, - loggingSystemMock.create().get() - ); + elasticsearchAndSOAvailability$: of(true), + refreshInterval: 10, + pollInterval: 3000, + logger, + taskDefinitions: definitions, + }); return new Promise((resolve) => { workloadAggregator.pipe(first()).subscribe((result) => { @@ -440,13 +413,14 @@ describe('Workload Statistics Aggregator', () => { const taskStore = taskStoreMock.create({}); taskStore.aggregate.mockResolvedValue(mockAggregatedResult()); - const workloadAggregator = createWorkloadAggregator( + const workloadAggregator = createWorkloadAggregator({ taskStore, - of(true), - 60 * 1000, - 3000, - loggingSystemMock.create().get() - ); + elasticsearchAndSOAvailability$: of(true), + refreshInterval: 60 * 1000, + pollInterval: 3000, + logger, + taskDefinitions: definitions, + }); return new Promise((resolve) => { workloadAggregator.pipe(first()).subscribe(() => { @@ -478,13 +452,14 @@ describe('Workload Statistics Aggregator', () => { const taskStore = taskStoreMock.create({}); taskStore.aggregate.mockResolvedValue(mockAggregatedResult()); - const workloadAggregator = createWorkloadAggregator( + const workloadAggregator = createWorkloadAggregator({ taskStore, - of(true), - 15 * 60 * 1000, - 3000, - loggingSystemMock.create().get() - ); + elasticsearchAndSOAvailability$: of(true), + refreshInterval: 15 * 60 * 1000, + pollInterval: 3000, + logger, + taskDefinitions: definitions, + }); return new Promise((resolve) => { workloadAggregator.pipe(first()).subscribe((result) => { @@ -517,42 +492,41 @@ describe('Workload Statistics Aggregator', () => { const taskStore = taskStoreMock.create({}); taskStore.aggregate .mockResolvedValueOnce( - mockAggregatedResult().then((res) => - setTaskTypeCount(res, 'alerting_telemetry', { - idle: 2, - }) - ) + mockAggregatedResult().then((res) => 
setTaskTypeCount(res, 'foo', { idle: 2 })) ) .mockRejectedValueOnce(new Error('Elasticsearch has gone poof')) .mockResolvedValueOnce( - mockAggregatedResult().then((res) => - setTaskTypeCount(res, 'alerting_telemetry', { - idle: 1, - failed: 1, - }) - ) + mockAggregatedResult().then((res) => setTaskTypeCount(res, 'foo', { idle: 1, failed: 1 })) ); - const logger = loggingSystemMock.create().get(); - const workloadAggregator = createWorkloadAggregator(taskStore, of(true), 10, 3000, logger); + const workloadAggregator = createWorkloadAggregator({ + taskStore, + elasticsearchAndSOAvailability$: of(true), + refreshInterval: 10, + pollInterval: 3000, + logger, + taskDefinitions: definitions, + }); return new Promise((resolve, reject) => { workloadAggregator.pipe(take(2), bufferCount(2)).subscribe((results) => { expect(results[0].key).toEqual('workload'); expect(results[0].value).toMatchObject({ - count: 5, + count: 4, + cost: 15, task_types: { - actions_telemetry: { count: 2, status: { idle: 2 } }, - alerting_telemetry: { count: 2, status: { idle: 2 } }, - session_cleanup: { count: 1, status: { idle: 1 } }, + bar: { count: 1, cost: 1, status: { idle: 1 } }, + report: { count: 1, cost: 10, status: { idle: 1 } }, + foo: { count: 2, cost: 4, status: { idle: 2 } }, }, }); expect(results[1].key).toEqual('workload'); expect(results[1].value).toMatchObject({ - count: 5, + count: 4, + cost: 15, task_types: { - actions_telemetry: { count: 2, status: { idle: 2 } }, - alerting_telemetry: { count: 2, status: { idle: 1, failed: 1 } }, - session_cleanup: { count: 1, status: { idle: 1 } }, + bar: { count: 1, cost: 1, status: { idle: 1 } }, + report: { count: 1, cost: 10, status: { idle: 1 } }, + foo: { count: 2, cost: 4, status: { idle: 1, failed: 1 } }, }, }); resolve(); @@ -567,49 +541,27 @@ describe('Workload Statistics Aggregator', () => { const taskStore = taskStoreMock.create({}); taskStore.aggregate.mockResolvedValue( asApiResponse({ - hits: { - hits: [], - max_score: 0, - total: { value: 4, relation: 'eq' }, - }, + hits: { hits: [], max_score: 0, total: { value: 4, relation: 'eq' } }, took: 1, timed_out: false, - _shards: { - total: 1, - successful: 1, - skipped: 1, - failed: 0, - }, + _shards: { total: 1, successful: 1, skipped: 1, failed: 0 }, aggregations: { schedule: { doc_count_error_upper_bound: 0, sum_other_doc_count: 0, buckets: [ // repeats each cycle - { - key: `${pollingIntervalInSeconds}s`, - doc_count: 1, - }, - { - key: `10s`, // 6 times per minute - doc_count: 20, - }, - { - key: `60s`, // 1 times per minute - doc_count: 10, - }, - { - key: '15m', // 4 times per hour - doc_count: 90, - }, - { - key: '720m', // 2 times per day - doc_count: 10, - }, - { - key: '3h', // 8 times per day - doc_count: 100, - }, + { key: `${pollingIntervalInSeconds}s`, doc_count: 1 }, + // 6 times per minute + { key: `10s`, doc_count: 20 }, + // 1 times per minute + { key: `60s`, doc_count: 10 }, + // 4 times per hour + { key: '15m', doc_count: 90 }, + // 2 times per day + { key: '720m', doc_count: 10 }, + // 8 times per day + { key: '3h', doc_count: 100 }, ], }, taskType: { @@ -619,12 +571,13 @@ describe('Workload Statistics Aggregator', () => { }, nonRecurringTasks: { doc_count: 13, - }, - ownerIds: { - ownerIds: { - value: 3, + taskType: { + buckets: [{ key: 'report', doc_count: 13 }], + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, }, }, + ownerIds: { ownerIds: { value: 3 } }, // The `FiltersAggregate` doesn't cover the case of a nested `AggregationContainer`, in which `FiltersAggregate` // 
would not have a `buckets` property, but rather a keyed property that's inferred from the request. // @ts-expect-error @@ -632,8 +585,11 @@ describe('Workload Statistics Aggregator', () => { doc_count: 13, overdue: { doc_count: 6, - nonRecurring: { - doc_count: 0, + nonRecurring: { doc_count: 0 }, + taskTypes: { + buckets: [{ key: 'foo', doc_count: 6 }], + doc_count_error_upper_bound: 0, + sum_other_doc_count: 0, }, }, scheduleDensity: { @@ -646,13 +602,14 @@ describe('Workload Statistics Aggregator', () => { }) ); - const workloadAggregator = createWorkloadAggregator( + const workloadAggregator = createWorkloadAggregator({ taskStore, - of(true), - 10, - pollingIntervalInSeconds * 1000, - loggingSystemMock.create().get() - ); + elasticsearchAndSOAvailability$: of(true), + refreshInterval: 10, + pollInterval: pollingIntervalInSeconds * 1000, + logger, + taskDefinitions: definitions, + }); return new Promise((resolve) => { workloadAggregator.pipe(first()).subscribe((result) => { @@ -660,7 +617,7 @@ describe('Workload Statistics Aggregator', () => { expect(result.value).toMatchObject({ capacity_requirements: { - // these are buckets of required capacity, rather than aggregated requirmenets. + // these are buckets of required capacity, rather than aggregated requirements. per_minute: 150, per_hour: 360, per_day: 820, @@ -675,14 +632,14 @@ describe('Workload Statistics Aggregator', () => { const refreshInterval = 1000; const taskStore = taskStoreMock.create({}); - const logger = loggingSystemMock.create().get(); - const workloadAggregator = createWorkloadAggregator( + const workloadAggregator = createWorkloadAggregator({ taskStore, - of(true), + elasticsearchAndSOAvailability$: of(true), refreshInterval, - 3000, - logger - ); + pollInterval: 3000, + logger, + taskDefinitions: definitions, + }); return new Promise((resolve, reject) => { let errorWasThrowAt = 0; @@ -694,9 +651,7 @@ describe('Workload Statistics Aggregator', () => { reject(new Error(`Elasticsearch is still poof`)); } - return setTaskTypeCount(await mockAggregatedResult(), 'alerting_telemetry', { - idle: 2, - }); + return setTaskTypeCount(await mockAggregatedResult(), 'foo', { idle: 2 }); }); workloadAggregator.pipe(take(2), bufferCount(2)).subscribe((results) => { @@ -799,7 +754,7 @@ describe('estimateRecurringTaskScheduling', () => { }); describe('padBuckets', () => { - test('returns zeroed out bucklets when there are no buckets in the histogram', async () => { + test('returns zeroed out buckets when there are no buckets in the histogram', async () => { expect( padBuckets(10, 3000, { key: '2020-10-02T19:47:28.128Z-2020-10-02T19:48:28.128Z', diff --git a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts index 6c372ce0fc453..e437b420c04f5 100644 --- a/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts +++ b/x-pack/plugins/task_manager/server/monitoring/workload_statistics.ts @@ -16,7 +16,9 @@ import { AggregatedStatProvider } from '../lib/runtime_statistics_aggregator'; import { parseIntervalAsSecond, asInterval, parseIntervalAsMillisecond } from '../lib/intervals'; import { HealthStatus } from './monitoring_stats_stream'; import { TaskStore } from '../task_store'; -import { createRunningAveragedStat } from './task_run_calcultors'; +import { createRunningAveragedStat } from './task_run_calculators'; +import { TaskTypeDictionary } from '../task_type_dictionary'; +import { TaskCost } from '../task'; interface StatusStat extends 
JsonObject { [status: string]: number; @@ -24,16 +26,20 @@ interface StatusStat extends JsonObject { interface TaskTypeStat extends JsonObject { [taskType: string]: { count: number; + cost: number; status: StatusStat; }; } interface RawWorkloadStat extends JsonObject { count: number; + cost: number; task_types: TaskTypeStat; schedule: Array<[string, number]>; non_recurring: number; + non_recurring_cost: number; overdue: number; + overdue_cost: number; overdue_non_recurring: number; estimated_schedule_density: number[]; capacity_requirements: CapacityRequirements; @@ -109,22 +115,34 @@ type ScheduleDensityResult = AggregationResultOf< type ScheduledIntervals = ScheduleDensityResult['histogram']['buckets'][0]; // Set an upper bound just in case a customer sets a really high refresh rate -const MAX_SHCEDULE_DENSITY_BUCKETS = 50; +const MAX_SCHEDULE_DENSITY_BUCKETS = 50; + +interface CreateWorkloadAggregatorOpts { + taskStore: TaskStore; + elasticsearchAndSOAvailability$: Observable; + refreshInterval: number; + pollInterval: number; + logger: Logger; + taskDefinitions: TaskTypeDictionary; +} -export function createWorkloadAggregator( - taskStore: TaskStore, - elasticsearchAndSOAvailability$: Observable, - refreshInterval: number, - pollInterval: number, - logger: Logger -): AggregatedStatProvider { +export function createWorkloadAggregator({ + taskStore, + elasticsearchAndSOAvailability$, + refreshInterval, + pollInterval, + logger, + taskDefinitions, +}: CreateWorkloadAggregatorOpts): AggregatedStatProvider { // calculate scheduleDensity going two refreshIntervals or 1 minute into into the future // (the longer of the two) const scheduleDensityBuckets = Math.min( Math.max(Math.round(60000 / pollInterval), Math.round((refreshInterval * 2) / pollInterval)), - MAX_SHCEDULE_DENSITY_BUCKETS + MAX_SCHEDULE_DENSITY_BUCKETS ); + const totalNumTaskDefinitions = taskDefinitions.getAllTypes().length; + const taskTypeTermAggSize = Math.min(totalNumTaskDefinitions, 10000); const ownerIdsQueue = createRunningAveragedStat(scheduleDensityBuckets); return combineLatest([timer(0, refreshInterval), elasticsearchAndSOAvailability$]).pipe( @@ -133,39 +151,24 @@ export function createWorkloadAggregator( taskStore.aggregate({ aggs: { taskType: { - terms: { size: 100, field: 'task.taskType' }, - aggs: { - status: { - terms: { field: 'task.status' }, - }, - }, + terms: { size: taskTypeTermAggSize, field: 'task.taskType' }, + aggs: { status: { terms: { field: 'task.status' } } }, }, schedule: { terms: { field: 'task.schedule.interval', size: 100 }, }, nonRecurringTasks: { - missing: { field: 'task.schedule' }, - }, - ownerIds: { - filter: { - range: { - 'task.startedAt': { - gte: 'now-1w/w', - }, - }, - }, + missing: { field: 'task.schedule.interval' }, aggs: { - ownerIds: { - cardinality: { - field: 'task.ownerId', - }, - }, + taskType: { terms: { size: taskTypeTermAggSize, field: 'task.taskType' } }, }, }, + ownerIds: { + filter: { range: { 'task.startedAt': { gte: 'now-1w/w' } } }, + aggs: { ownerIds: { cardinality: { field: 'task.ownerId' } } }, + }, idleTasks: { - filter: { - term: { 'task.status': 'idle' }, - }, + filter: { term: { 'task.status': 'idle' } }, aggs: { scheduleDensity: { // create a window of upcoming tasks @@ -187,7 +190,7 @@ export function createWorkloadAggregator( field: 'task.runAt', fixed_interval: asInterval(pollInterval), }, - // break down each bucket in the historgram by schedule + // break down each bucket in the histogram by schedule aggs: { interval: { terms: { field: 
'task.schedule.interval' }, @@ -197,15 +200,10 @@ export function createWorkloadAggregator( }, }, overdue: { - filter: { - range: { - 'task.runAt': { lt: 'now' }, - }, - }, + filter: { range: { 'task.runAt': { lt: 'now' } } }, aggs: { - nonRecurring: { - missing: { field: 'task.schedule' }, - }, + taskTypes: { terms: { size: taskTypeTermAggSize, field: 'task.taskType' } }, + nonRecurring: { missing: { field: 'task.schedule.interval' } }, }, }, }, @@ -226,11 +224,13 @@ export function createWorkloadAggregator( const taskTypes = aggregations.taskType.buckets; const nonRecurring = aggregations.nonRecurringTasks.doc_count; + const nonRecurringTaskTypes = aggregations.nonRecurringTasks.taskType.buckets; const ownerIds = aggregations.ownerIds.ownerIds.value; const { overdue: { doc_count: overdue, + taskTypes: { buckets: taskTypesOverdue = [] } = {}, nonRecurring: { doc_count: overdueNonRecurring }, }, scheduleDensity: { buckets: [scheduleDensity] = [] } = {}, @@ -243,6 +243,7 @@ export function createWorkloadAggregator( asSeconds: parseIntervalAsSecond(schedule.key as string), count: schedule.doc_count, }; + accm.schedules.push(parsedSchedule); if (parsedSchedule.asSeconds <= 60) { accm.cadence.perMinute += @@ -257,11 +258,7 @@ export function createWorkloadAggregator( return accm; }, { - cadence: { - perMinute: 0, - perHour: 0, - perDay: 0, - }, + cadence: { perMinute: 0, perHour: 0, perDay: 0 }, schedules: [] as Array<{ interval: string; asSeconds: number; @@ -270,20 +267,36 @@ export function createWorkloadAggregator( } ); + const totalNonRecurringCost = getTotalCost(nonRecurringTaskTypes, taskDefinitions); + const totalOverdueCost = getTotalCost(taskTypesOverdue, taskDefinitions); + + let totalCost = 0; + const taskTypeSummary = taskTypes.reduce((acc, bucket) => { + const value = bucket as TaskTypeWithStatusBucket; + const cost = + value.doc_count * taskDefinitions.get(value.key as string)?.cost ?? 
TaskCost.Normal; + totalCost += cost; + return Object.assign(acc, { + [value.key as string]: { + count: value.doc_count, + cost, + status: mapValues(keyBy(value.status.buckets, 'key'), 'doc_count'), + }, + }); + }, {}); + const summary: WorkloadStat = { count, - task_types: mapValues(keyBy(taskTypes, 'key'), ({ doc_count: docCount, status }) => { - return { - count: docCount, - status: mapValues(keyBy(status.buckets, 'key'), 'doc_count'), - }; - }), + cost: totalCost, + task_types: taskTypeSummary, non_recurring: nonRecurring, + non_recurring_cost: totalNonRecurringCost, owner_ids: ownerIdsQueue(ownerIds), schedule: schedules .sort((scheduleLeft, scheduleRight) => scheduleLeft.asSeconds - scheduleRight.asSeconds) .map((schedule) => [schedule.interval, schedule.count]), overdue, + overdue_cost: totalOverdueCost, overdue_non_recurring: overdueNonRecurring, estimated_schedule_density: padBuckets( scheduleDensityBuckets, @@ -457,40 +470,37 @@ export interface WorkloadAggregationResponse { taskType: TaskTypeAggregation; schedule: ScheduleAggregation; idleTasks: IdleTasksAggregation; - nonRecurringTasks: { - doc_count: number; - }; - ownerIds: { - ownerIds: { - value: number; - }; - }; + nonRecurringTasks: { doc_count: number; taskType: TaskTypeAggregation }; + ownerIds: { ownerIds: { value: number } }; [otherAggs: string]: estypes.AggregationsAggregate; } + +export type TaskTypeWithStatusBucket = TaskTypeBucket & { + status: { + buckets: Array<{ + doc_count: number; + key: string | number; + }>; + doc_count_error_upper_bound?: number | undefined; + sum_other_doc_count?: number | undefined; + }; +}; + +export interface TaskTypeBucket { + doc_count: number; + key: string | number; +} + // @ts-expect-error key doesn't accept a string export interface TaskTypeAggregation extends estypes.AggregationsFiltersAggregate { - buckets: Array<{ - doc_count: number; - key: string | number; - status: { - buckets: Array<{ - doc_count: number; - key: string | number; - }>; - doc_count_error_upper_bound?: number | undefined; - sum_other_doc_count?: number | undefined; - }; - }>; + buckets: Array; doc_count_error_upper_bound?: number | undefined; sum_other_doc_count?: number | undefined; } // @ts-expect-error key doesn't accept a string export interface ScheduleAggregation extends estypes.AggregationsFiltersAggregate { - buckets: Array<{ - doc_count: number; - key: string | number; - }>; + buckets: Array<{ doc_count: number; key: string | number }>; doc_count_error_upper_bound?: number | undefined; sum_other_doc_count?: number | undefined; } @@ -518,9 +528,8 @@ export interface IdleTasksAggregation extends estypes.AggregationsFiltersAggrega }; overdue: { doc_count: number; - nonRecurring: { - doc_count: number; - }; + nonRecurring: { doc_count: number }; + taskTypes: TaskTypeAggregation; }; } @@ -537,3 +546,11 @@ interface DateRangeBucket { from_as_string?: string; doc_count: number; } + +function getTotalCost(taskTypeBuckets: TaskTypeBucket[], definitions: TaskTypeDictionary): number { + let cost = 0; + for (const bucket of taskTypeBuckets) { + cost += bucket.doc_count * definitions.get(bucket.key as string)?.cost ?? 
TaskCost.Normal; + } + return cost; +} diff --git a/x-pack/plugins/task_manager/server/plugin.test.ts b/x-pack/plugins/task_manager/server/plugin.test.ts index 7b80920a57559..0a93a97fdaf52 100644 --- a/x-pack/plugins/task_manager/server/plugin.test.ts +++ b/x-pack/plugins/task_manager/server/plugin.test.ts @@ -38,7 +38,6 @@ jest.mock('./ephemeral_task_lifecycle', () => { const coreStart = coreMock.createStart(); const pluginInitializerContextParams = { - max_workers: 10, max_attempts: 9, poll_interval: 3000, version_conflict_threshold: 80, diff --git a/x-pack/plugins/task_manager/server/plugin.ts b/x-pack/plugins/task_manager/server/plugin.ts index 1926b48b31ea6..8b0c50c59b203 100644 --- a/x-pack/plugins/task_manager/server/plugin.ts +++ b/x-pack/plugins/task_manager/server/plugin.ts @@ -270,8 +270,7 @@ export class TaskManagerPlugin const managedConfiguration = createManagedConfiguration({ logger: this.logger, errors$: taskStore.errors$, - startingMaxWorkers: this.config!.max_workers, - startingPollInterval: this.config!.poll_interval, + config: this.config!, }); // Only poll for tasks if configured to run tasks @@ -310,16 +309,17 @@ export class TaskManagerPlugin }); } - createMonitoringStats( + createMonitoringStats({ taskStore, - this.elasticsearchAndSOAvailability$!, - this.config!, - managedConfiguration, - this.logger, - this.adHocTaskCounter, - this.taskPollingLifecycle, - this.ephemeralTaskLifecycle - ).subscribe((stat) => this.monitoringStats$.next(stat)); + elasticsearchAndSOAvailability$: this.elasticsearchAndSOAvailability$!, + config: this.config!, + managedConfig: managedConfiguration, + logger: this.logger, + adHocTaskCounter: this.adHocTaskCounter, + taskDefinitions: this.definitions, + taskPollingLifecycle: this.taskPollingLifecycle, + ephemeralTaskLifecycle: this.ephemeralTaskLifecycle, + }).subscribe((stat) => this.monitoringStats$.next(stat)); metricsStream({ config: this.config!, diff --git a/x-pack/plugins/task_manager/server/polling/delay_on_claim_conflicts.test.ts b/x-pack/plugins/task_manager/server/polling/delay_on_claim_conflicts.test.ts index f06c43bc15587..11741aeadcf2d 100644 --- a/x-pack/plugins/task_manager/server/polling/delay_on_claim_conflicts.test.ts +++ b/x-pack/plugins/task_manager/server/polling/delay_on_claim_conflicts.test.ts @@ -22,10 +22,10 @@ describe('delayOnClaimConflicts', () => { 'initializes with a delay of 0', fakeSchedulers(async () => { const pollInterval = 100; - const maxWorkers = 10; + const capacity = 10; const taskLifecycleEvents$ = new Subject(); const delays = delayOnClaimConflicts( - of(maxWorkers), + of(capacity), of(pollInterval), taskLifecycleEvents$, 80, @@ -42,11 +42,11 @@ describe('delayOnClaimConflicts', () => { 'emits a random delay whenever p50 of claim clashes exceed 80% of available max_workers', fakeSchedulers(async () => { const pollInterval = 100; - const maxWorkers = 10; + const capacity = 10; const taskLifecycleEvents$ = new Subject(); const delays$ = firstValueFrom( - delayOnClaimConflicts(of(maxWorkers), of(pollInterval), taskLifecycleEvents$, 80, 2).pipe( + delayOnClaimConflicts(of(capacity), of(pollInterval), taskLifecycleEvents$, 80, 2).pipe( take(2), bufferCount(2) ) @@ -60,7 +60,6 @@ describe('delayOnClaimConflicts', () => { tasksUpdated: 0, tasksConflicted: 8, tasksClaimed: 0, - tasksRejected: 0, }, docs: [], }) @@ -94,7 +93,6 @@ describe('delayOnClaimConflicts', () => { tasksUpdated: 0, tasksConflicted: 8, tasksClaimed: 0, - tasksRejected: 0, }, docs: [], }) @@ -111,7 +109,6 @@ 
describe('delayOnClaimConflicts', () => { tasksUpdated: 0, tasksConflicted: 10, tasksClaimed: 0, - tasksRejected: 0, }, docs: [], }) @@ -137,18 +134,14 @@ describe('delayOnClaimConflicts', () => { 'doesnt emit a new delay when conflicts have reduced', fakeSchedulers(async () => { const pollInterval = 100; - const maxWorkers = 10; + const capacity = 10; const taskLifecycleEvents$ = new Subject(); const handler = jest.fn(); - delayOnClaimConflicts( - of(maxWorkers), - of(pollInterval), - taskLifecycleEvents$, - 80, - 2 - ).subscribe(handler); + delayOnClaimConflicts(of(capacity), of(pollInterval), taskLifecycleEvents$, 80, 2).subscribe( + handler + ); await sleep(0); expect(handler).toHaveBeenCalledWith(0); @@ -161,7 +154,6 @@ describe('delayOnClaimConflicts', () => { tasksUpdated: 0, tasksConflicted: 8, tasksClaimed: 0, - tasksRejected: 0, }, docs: [], }) @@ -182,7 +174,6 @@ describe('delayOnClaimConflicts', () => { tasksUpdated: 0, tasksConflicted: 7, tasksClaimed: 0, - tasksRejected: 0, }, docs: [], }) @@ -201,7 +192,6 @@ describe('delayOnClaimConflicts', () => { tasksUpdated: 0, tasksConflicted: 9, tasksClaimed: 0, - tasksRejected: 0, }, docs: [], }) diff --git a/x-pack/plugins/task_manager/server/polling/delay_on_claim_conflicts.ts b/x-pack/plugins/task_manager/server/polling/delay_on_claim_conflicts.ts index f491d58fc59ee..21b16b1a8d5c5 100644 --- a/x-pack/plugins/task_manager/server/polling/delay_on_claim_conflicts.ts +++ b/x-pack/plugins/task_manager/server/polling/delay_on_claim_conflicts.ts @@ -19,13 +19,14 @@ import { ManagedConfiguration } from '../lib/create_managed_configuration'; import { TaskLifecycleEvent } from '../polling_lifecycle'; import { isTaskPollingCycleEvent } from '../task_events'; import { ClaimAndFillPoolResult } from '../lib/fill_pool'; -import { createRunningAveragedStat } from '../monitoring/task_run_calcultors'; +import { createRunningAveragedStat } from '../monitoring/task_run_calculators'; +import { getCapacityInWorkers } from '../task_pool'; /** * Emits a delay amount in ms to apply to polling whenever the task store exceeds a threshold of claim claimClashes */ export function delayOnClaimConflicts( - maxWorkersConfiguration$: ManagedConfiguration['maxWorkersConfiguration$'], + capacityConfiguration$: ManagedConfiguration['capacityConfiguration$'], pollIntervalConfiguration$: ManagedConfiguration['pollIntervalConfiguration$'], taskLifecycleEvents$: Observable<TaskLifecycleEvent>, claimClashesPercentageThreshold: number, @@ -37,7 +38,7 @@ export function delayOnClaimConflicts( merge( of(0), combineLatest([ - maxWorkersConfiguration$, + capacityConfiguration$, pollIntervalConfiguration$, taskLifecycleEvents$.pipe( map<TaskLifecycleEvent, Option<number>>((taskEvent: TaskLifecycleEvent) => @@ -51,7 +52,10 @@ export function delayOnClaimConflicts( map((claimClashes: Option<number>) => (claimClashes as Some<number>).value) ), ]).pipe( - map(([maxWorkers, pollInterval, latestClaimConflicts]) => { + map(([capacity, pollInterval, latestClaimConflicts]) => { + // convert capacity to maxWorkers + const maxWorkers = getCapacityInWorkers(capacity); + // add latest claimConflict count to queue claimConflictQueue(latestClaimConflicts); diff --git a/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts b/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts index baf45cb65ea1e..e804f1c166cee 100644 --- a/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts +++ b/x-pack/plugins/task_manager/server/polling_lifecycle.test.ts @@ -20,6 +20,8 @@ import { asOk, Err, isErr, isOk, Result } from './lib/result_type'; import { 
FillPoolResult } from './lib/fill_pool'; import { ElasticsearchResponseError } from './lib/identify_es_error'; import { executionContextServiceMock } from '@kbn/core/server/mocks'; +import { TaskCost } from './task'; +import { CLAIM_STRATEGY_MGET } from './config'; import { TaskPartitioner } from './lib/task_partitioner'; import { KibanaDiscoveryService } from './kibana_discovery_service'; @@ -44,7 +46,6 @@ describe('TaskPollingLifecycle', () => { const taskManagerOpts = { config: { enabled: true, - max_workers: 10, index: 'foo', max_attempts: 9, poll_interval: 6000000, @@ -90,7 +91,8 @@ describe('TaskPollingLifecycle', () => { unusedTypes: [], definitions: new TaskTypeDictionary(taskManagerLogger), middleware: createInitialMiddleware(), - maxWorkersConfiguration$: of(100), + startingCapacity: 20, + capacityConfiguration$: of(20), pollIntervalConfiguration$: of(100), executionContext, taskPartitioner: new TaskPartitioner('test', {} as KibanaDiscoveryService), @@ -105,12 +107,23 @@ describe('TaskPollingLifecycle', () => { afterEach(() => clock.restore()); describe('start', () => { + taskManagerOpts.definitions.registerTaskDefinitions({ + report: { + title: 'report', + maxConcurrency: 1, + cost: TaskCost.ExtraLarge, + createTaskRunner: jest.fn(), + }, + quickReport: { + title: 'quickReport', + maxConcurrency: 5, + createTaskRunner: jest.fn(), + }, + }); + test('begins polling once the ES and SavedObjects services are available', () => { const elasticsearchAndSOAvailability$ = new Subject(); - new TaskPollingLifecycle({ - ...taskManagerOpts, - elasticsearchAndSOAvailability$, - }); + new TaskPollingLifecycle({ ...taskManagerOpts, elasticsearchAndSOAvailability$ }); clock.tick(150); expect(mockTaskClaiming.claimAvailableTasksIfCapacityIsAvailable).not.toHaveBeenCalled(); @@ -121,56 +134,71 @@ describe('TaskPollingLifecycle', () => { expect(mockTaskClaiming.claimAvailableTasksIfCapacityIsAvailable).toHaveBeenCalled(); }); - test('provides TaskClaiming with the capacity available', () => { + test('provides TaskClaiming with the capacity available when strategy = CLAIM_STRATEGY_DEFAULT', () => { const elasticsearchAndSOAvailability$ = new Subject(); - const maxWorkers$ = new Subject(); - taskManagerOpts.definitions.registerTaskDefinitions({ - report: { - title: 'report', - maxConcurrency: 1, - createTaskRunner: jest.fn(), - }, - quickReport: { - title: 'quickReport', - maxConcurrency: 5, - createTaskRunner: jest.fn(), - }, - }); + const capacity$ = new Subject(); new TaskPollingLifecycle({ ...taskManagerOpts, elasticsearchAndSOAvailability$, - maxWorkersConfiguration$: maxWorkers$, + capacityConfiguration$: capacity$, }); const taskClaimingGetCapacity = (TaskClaiming as jest.Mock).mock - .calls[0][0].getCapacity; + .calls[0][0].getAvailableCapacity; - maxWorkers$.next(20); - expect(taskClaimingGetCapacity()).toEqual(20); + capacity$.next(40); + expect(taskClaimingGetCapacity()).toEqual(40); expect(taskClaimingGetCapacity('report')).toEqual(1); expect(taskClaimingGetCapacity('quickReport')).toEqual(5); - maxWorkers$.next(30); - expect(taskClaimingGetCapacity()).toEqual(30); + capacity$.next(60); + expect(taskClaimingGetCapacity()).toEqual(60); expect(taskClaimingGetCapacity('report')).toEqual(1); expect(taskClaimingGetCapacity('quickReport')).toEqual(5); - maxWorkers$.next(2); - expect(taskClaimingGetCapacity()).toEqual(2); + capacity$.next(4); + expect(taskClaimingGetCapacity()).toEqual(4); expect(taskClaimingGetCapacity('report')).toEqual(1); - 
expect(taskClaimingGetCapacity('quickReport')).toEqual(2); + expect(taskClaimingGetCapacity('quickReport')).toEqual(4); }); - }); - describe('stop', () => { - test('stops polling once the ES and SavedObjects services become unavailable', () => { + test('provides TaskClaiming with the capacity available when strategy = CLAIM_STRATEGY_MGET', () => { const elasticsearchAndSOAvailability$ = new Subject(); + const capacity$ = new Subject(); + new TaskPollingLifecycle({ - elasticsearchAndSOAvailability$, ...taskManagerOpts, + config: { ...taskManagerOpts.config, claim_strategy: CLAIM_STRATEGY_MGET }, + elasticsearchAndSOAvailability$, + capacityConfiguration$: capacity$, }); + const taskClaimingGetCapacity = (TaskClaiming as jest.Mock).mock + .calls[0][0].getAvailableCapacity; + + capacity$.next(40); + expect(taskClaimingGetCapacity()).toEqual(80); + expect(taskClaimingGetCapacity('report')).toEqual(10); + expect(taskClaimingGetCapacity('quickReport')).toEqual(10); + + capacity$.next(60); + expect(taskClaimingGetCapacity()).toEqual(120); + expect(taskClaimingGetCapacity('report')).toEqual(10); + expect(taskClaimingGetCapacity('quickReport')).toEqual(10); + + capacity$.next(4); + expect(taskClaimingGetCapacity()).toEqual(8); + expect(taskClaimingGetCapacity('report')).toEqual(8); + expect(taskClaimingGetCapacity('quickReport')).toEqual(8); + }); + }); + + describe('stop', () => { + test('stops polling once the ES and SavedObjects services become unavailable', () => { + const elasticsearchAndSOAvailability$ = new Subject(); + new TaskPollingLifecycle({ elasticsearchAndSOAvailability$, ...taskManagerOpts }); + elasticsearchAndSOAvailability$.next(true); clock.tick(150); @@ -216,7 +244,7 @@ describe('TaskPollingLifecycle', () => { of( asOk({ docs: [], - stats: { tasksUpdated: 0, tasksConflicted: 0, tasksClaimed: 0, tasksRejected: 0 }, + stats: { tasksUpdated: 0, tasksConflicted: 0, tasksClaimed: 0 }, }) ) ); @@ -298,7 +326,47 @@ describe('TaskPollingLifecycle', () => { of( asOk({ docs: [], - stats: { tasksUpdated: 0, tasksConflicted: 0, tasksClaimed: 0, tasksRejected: 0 }, + stats: { tasksUpdated: 0, tasksConflicted: 0, tasksClaimed: 0 }, + }) + ) + ); + const elasticsearchAndSOAvailability$ = new Subject(); + const taskPollingLifecycle = new TaskPollingLifecycle({ + ...taskManagerOpts, + elasticsearchAndSOAvailability$, + }); + + const emittedEvents: TaskLifecycleEvent[] = []; + + taskPollingLifecycle.events.subscribe((event: TaskLifecycleEvent) => + emittedEvents.push(event) + ); + + elasticsearchAndSOAvailability$.next(true); + expect(mockTaskClaiming.claimAvailableTasksIfCapacityIsAvailable).toHaveBeenCalled(); + await retryUntil('workerUtilizationEvent emitted', () => { + return !!emittedEvents.find( + (event: TaskLifecycleEvent) => event.id === 'workerUtilization' + ); + }); + + const workerUtilizationEvent = emittedEvents.find( + (event: TaskLifecycleEvent) => event.id === 'workerUtilization' + ); + expect(workerUtilizationEvent).toEqual({ + id: 'workerUtilization', + type: 'TASK_MANAGER_STAT', + event: { tag: 'ok', value: 0 }, + }); + }); + + test('should set utilization to max when capacity is not fully reached but there are tasks left unclaimed', async () => { + clock.restore(); + mockTaskClaiming.claimAvailableTasksIfCapacityIsAvailable.mockImplementation(() => + of( + asOk({ + docs: [], + stats: { tasksUpdated: 0, tasksConflicted: 0, tasksClaimed: 0, tasksLeftUnclaimed: 2 }, }) ) ); @@ -321,6 +389,15 @@ describe('TaskPollingLifecycle', () => { (event: TaskLifecycleEvent) => event.id === 
'workerUtilization' ); }); + + const workerUtilizationEvent = emittedEvents.find( + (event: TaskLifecycleEvent) => event.id === 'workerUtilization' + ); + expect(workerUtilizationEvent).toEqual({ + id: 'workerUtilization', + type: 'TASK_MANAGER_STAT', + event: { tag: 'ok', value: 100 }, + }); }); test('should emit event when polling error occurs', async () => { diff --git a/x-pack/plugins/task_manager/server/polling_lifecycle.ts b/x-pack/plugins/task_manager/server/polling_lifecycle.ts index 3b9c5621da0b9..f13a7ad20806c 100644 --- a/x-pack/plugins/task_manager/server/polling_lifecycle.ts +++ b/x-pack/plugins/task_manager/server/polling_lifecycle.ts @@ -45,6 +45,8 @@ import { TaskClaiming } from './queries/task_claiming'; import { ClaimOwnershipResult } from './task_claimers'; import { TaskPartitioner } from './lib/task_partitioner'; +const MAX_BUFFER_OPERATIONS = 100; + export interface ITaskEventEmitter { get events(): Observable; } @@ -101,7 +103,7 @@ export class TaskPollingLifecycle implements ITaskEventEmitter this.events$.next(event); this.bufferedStore = new BufferedTaskStore(this.store, { - bufferMaxOperations: config.max_workers, + bufferMaxOperations: MAX_BUFFER_OPERATIONS, logger, }); this.pool = new TaskPool({ logger, - maxWorkers$: maxWorkersConfiguration$, + strategy: config.claim_strategy, + capacity$: capacityConfiguration$, + definitions: this.definitions, }); this.pool.load.subscribe(emitEvent); @@ -142,17 +146,7 @@ export class TaskPollingLifecycle implements ITaskEventEmitter - taskType && this.definitions.get(taskType)?.maxConcurrency - ? Math.max( - Math.min( - this.pool.availableWorkers, - this.definitions.get(taskType)!.maxConcurrency! - - this.pool.getOccupiedWorkersByType(taskType) - ), - 0 - ) - : this.pool.availableWorkers, + getAvailableCapacity: (taskType?: string) => this.pool.availableCapacity(taskType), taskPartitioner, }); // pipe taskClaiming events into the lifecycle event stream @@ -163,7 +157,7 @@ export class TaskPollingLifecycle implements ITaskEventEmitter | undefined; if (claimStrategy === CLAIM_STRATEGY_DEFAULT) { pollIntervalDelay$ = delayOnClaimConflicts( - maxWorkersConfiguration$, + capacityConfiguration$, pollIntervalConfiguration$, this.events$, config.version_conflict_threshold, @@ -177,19 +171,22 @@ export class TaskPollingLifecycle implements ITaskEventEmitter { - const capacity = this.pool.availableWorkers; + const capacity = this.pool.availableCapacity(); if (!capacity) { + const usedCapacityPercentage = this.pool.usedCapacityPercentage; + // if there isn't capacity, emit a load event so that we can expose how often // high load causes the poller to skip work (work isn't called when there is no capacity) - this.emitEvent(asTaskManagerStatEvent('load', asOk(this.pool.workerLoad))); + this.emitEvent(asTaskManagerStatEvent('load', asOk(usedCapacityPercentage))); // Emit event indicating task manager utilization - this.emitEvent(asTaskManagerStatEvent('workerUtilization', asOk(this.pool.workerLoad))); + this.emitEvent(asTaskManagerStatEvent('workerUtilization', asOk(usedCapacityPercentage))); } return capacity; }, work: this.pollForWork, }); + this.subscribeToPoller(poller.events$); elasticsearchAndSOAvailability$.subscribe((areESAndSOAvailable) => { @@ -262,7 +259,7 @@ export class TaskPollingLifecycle implements ITaskEventEmitter { + mapOk((results: TimedFillPoolResult) => { // Emit event indicating task manager utilization % at the end of a polling cycle - // This represents the number of workers busy + number of tasks claimed in this 
cycle - this.emitEvent(asTaskManagerStatEvent('workerUtilization', asOk(this.pool.workerLoad))); + + // Get the actual utilization as a percentage + let tmUtilization = this.pool.usedCapacityPercentage; + + // Check whether there are any tasks left unclaimed + // If we're not at capacity and there are unclaimed tasks, then + // there must be high cost tasks that need to be claimed + // Artificially inflate the utilization to represent the unclaimed load + if (tmUtilization < 100 && (results.stats?.tasksLeftUnclaimed ?? 0) > 0) { + tmUtilization = 100; + } + + this.emitEvent(asTaskManagerStatEvent('workerUtilization', asOk(tmUtilization))); }) ) ) diff --git a/x-pack/plugins/task_manager/server/queries/task_claiming.test.ts b/x-pack/plugins/task_manager/server/queries/task_claiming.test.ts index bc4adb71dd4a1..de57a73f80533 100644 --- a/x-pack/plugins/task_manager/server/queries/task_claiming.test.ts +++ b/x-pack/plugins/task_manager/server/queries/task_claiming.test.ts @@ -80,7 +80,7 @@ describe('TaskClaiming', () => { unusedTypes: [], taskStore: taskStoreMock.create({ taskManagerId: '' }), maxAttempts: 2, - getCapacity: () => 10, + getAvailableCapacity: () => 10, taskPartitioner, }); @@ -130,7 +130,7 @@ describe('TaskClaiming', () => { unusedTypes: [], taskStore: taskStoreMock.create({ taskManagerId: '' }), maxAttempts: 2, - getCapacity: () => 10, + getAvailableCapacity: () => 10, taskPartitioner, }); diff --git a/x-pack/plugins/task_manager/server/queries/task_claiming.ts b/x-pack/plugins/task_manager/server/queries/task_claiming.ts index 188f47b0d2d2f..f5ef18452509b 100644 --- a/x-pack/plugins/task_manager/server/queries/task_claiming.ts +++ b/x-pack/plugins/task_manager/server/queries/task_claiming.ts @@ -38,7 +38,7 @@ export interface TaskClaimingOpts { taskStore: TaskStore; maxAttempts: number; excludedTaskTypes: string[]; - getCapacity: (taskType?: string) => number; + getAvailableCapacity: (taskType?: string) => number; taskPartitioner: TaskPartitioner; } @@ -87,7 +87,7 @@ export class TaskClaiming { private definitions: TaskTypeDictionary; private events$: Subject; private taskStore: TaskStore; - private getCapacity: (taskType?: string) => number; + private getAvailableCapacity: (taskType?: string) => number; private logger: Logger; private readonly taskClaimingBatchesByType: TaskClaimingBatches; private readonly taskMaxAttempts: Record; @@ -106,7 +106,7 @@ export class TaskClaiming { this.definitions = opts.definitions; this.maxAttempts = opts.maxAttempts; this.taskStore = opts.taskStore; - this.getCapacity = opts.getCapacity; + this.getAvailableCapacity = opts.getAvailableCapacity; this.logger = opts.logger.get('taskClaiming'); this.taskClaimingBatchesByType = this.partitionIntoClaimingBatches(this.definitions); this.taskMaxAttempts = Object.fromEntries(this.normalizeMaxAttempts(this.definitions)); @@ -170,13 +170,13 @@ export class TaskClaiming { public claimAvailableTasksIfCapacityIsAvailable( claimingOptions: Omit ): Observable> { - if (this.getCapacity()) { + if (this.getAvailableCapacity()) { const opts: TaskClaimerOpts = { batches: this.getClaimingBatches(), claimOwnershipUntil: claimingOptions.claimOwnershipUntil, taskStore: this.taskStore, events$: this.events$, - getCapacity: this.getCapacity, + getCapacity: this.getAvailableCapacity, unusedTypes: this.unusedTypes, definitions: this.definitions, taskMaxAttempts: this.taskMaxAttempts, diff --git a/x-pack/plugins/task_manager/server/routes/health.test.ts b/x-pack/plugins/task_manager/server/routes/health.test.ts index 
a97d99079bc58..9c08c5b5fb4c4 100644 --- a/x-pack/plugins/task_manager/server/routes/health.test.ts +++ b/x-pack/plugins/task_manager/server/routes/health.test.ts @@ -823,7 +823,8 @@ function mockHealthStats(overrides = {}) { configuration: { timestamp: new Date().toISOString(), value: { - max_workers: 10, + capacity: { config: 10, as_cost: 20, as_workers: 10 }, + claim_strategy: 'default', poll_interval: 3000, request_capacity: 1000, monitored_aggregated_stats_refresh_rate: 5000, @@ -841,16 +842,19 @@ function mockHealthStats(overrides = {}) { timestamp: new Date().toISOString(), value: { count: 4, + cost: 8, task_types: { - actions_telemetry: { count: 2, status: { idle: 2 } }, - alerting_telemetry: { count: 1, status: { idle: 1 } }, - session_cleanup: { count: 1, status: { idle: 1 } }, + actions_telemetry: { count: 2, cost: 4, status: { idle: 2 } }, + alerting_telemetry: { count: 1, cost: 2, status: { idle: 1 } }, + session_cleanup: { count: 1, cost: 2, status: { idle: 1 } }, }, schedule: [], overdue: 0, + overdue_cost: 2, overdue_non_recurring: 0, estimatedScheduleDensity: [], non_recurring: 20, + non_recurring_cost: 40, owner_ids: [0, 0, 0, 1, 2, 0, 0, 2, 2, 2, 1, 2, 1, 1], estimated_schedule_density: [], capacity_requirements: { diff --git a/x-pack/plugins/task_manager/server/task.ts b/x-pack/plugins/task_manager/server/task.ts index fae99bb8f1f5b..96df4a703c5f7 100644 --- a/x-pack/plugins/task_manager/server/task.ts +++ b/x-pack/plugins/task_manager/server/task.ts @@ -16,6 +16,12 @@ export enum TaskPriority { Normal = 50, } +export enum TaskCost { + Tiny = 1, + Normal = 2, + ExtraLarge = 10, +} + /* * Type definitions and validations for tasks. */ @@ -127,6 +133,10 @@ export const taskDefinitionSchema = schema.object( * Priority of this task type. Defaults to "NORMAL" if not defined */ priority: schema.maybe(schema.number()), + /** + * Cost to run this task type. Defaults to "Normal". + */ + cost: schema.number({ defaultValue: TaskCost.Normal }), /** * An optional more detailed description of what this task does. */ @@ -172,7 +182,7 @@ export const taskDefinitionSchema = schema.object( paramsSchema: schema.maybe(schema.any()), }, { - validate({ timeout, priority }) { + validate({ timeout, priority, cost }) { if (!isInterval(timeout) || isErr(tryAsResult(() => parseIntervalAsMillisecond(timeout)))) { return `Invalid timeout "${timeout}". Timeout must be of the form "{number}{cadance}" where number is an integer. Example: 5m.`; } @@ -182,6 +192,12 @@ export const taskDefinitionSchema = schema.object( .filter((key) => isNaN(Number(key))) .map((key) => `${key} => ${TaskPriority[key as keyof typeof TaskPriority]}`)}`; } + + if (cost && (!isNumber(cost) || !(cost in TaskCost))) { + return `Invalid cost "${cost}". 
Cost must be one of ${Object.keys(TaskCost) + .filter((key) => isNaN(Number(key))) + .map((key) => `${key} => ${TaskCost[key as keyof typeof TaskCost]}`)}`; + } }, } ); diff --git a/x-pack/plugins/task_manager/server/task_claimers/index.ts b/x-pack/plugins/task_manager/server/task_claimers/index.ts index 1caa6e2addb0f..134c72041f96f 100644 --- a/x-pack/plugins/task_manager/server/task_claimers/index.ts +++ b/x-pack/plugins/task_manager/server/task_claimers/index.ts @@ -37,6 +37,7 @@ export interface ClaimOwnershipResult { tasksUpdated: number; tasksConflicted: number; tasksClaimed: number; + tasksLeftUnclaimed?: number; }; docs: ConcreteTaskInstance[]; timing?: TaskTiming; @@ -61,13 +62,12 @@ export function getTaskClaimer(logger: Logger, strategy: string): TaskClaimerFn return claimAvailableTasksDefault; } -export function getEmptyClaimOwnershipResult() { +export function getEmptyClaimOwnershipResult(): ClaimOwnershipResult { return { stats: { tasksUpdated: 0, tasksConflicted: 0, tasksClaimed: 0, - tasksRejected: 0, }, docs: [], }; diff --git a/x-pack/plugins/task_manager/server/task_claimers/strategy_default.test.ts b/x-pack/plugins/task_manager/server/task_claimers/strategy_default.test.ts index 8aa206bbe1872..d58fd83486efa 100644 --- a/x-pack/plugins/task_manager/server/task_claimers/strategy_default.test.ts +++ b/x-pack/plugins/task_manager/server/task_claimers/strategy_default.test.ts @@ -133,7 +133,7 @@ describe('TaskClaiming', () => { excludedTaskTypes, unusedTypes: unusedTaskTypes, maxAttempts: taskClaimingOpts.maxAttempts ?? 2, - getCapacity: taskClaimingOpts.getCapacity ?? (() => 10), + getAvailableCapacity: taskClaimingOpts.getAvailableCapacity ?? (() => 10), taskPartitioner, ...taskClaimingOpts, }); @@ -158,7 +158,7 @@ describe('TaskClaiming', () => { excludedTaskTypes?: string[]; unusedTaskTypes?: string[]; }) { - const getCapacity = taskClaimingOpts.getCapacity ?? (() => 10); + const getCapacity = taskClaimingOpts.getAvailableCapacity ?? 
(() => 10); const { taskClaiming, store } = initialiseTestClaiming({ storeOpts, taskClaimingOpts, @@ -447,7 +447,7 @@ if (doc['task.runAt'].size()!=0) { }, taskClaimingOpts: { maxAttempts, - getCapacity: (type) => { + getAvailableCapacity: (type) => { switch (type) { case 'limitedToOne': case 'anotherLimitedToOne': @@ -577,7 +577,7 @@ if (doc['task.runAt'].size()!=0) { }, taskClaimingOpts: { maxAttempts, - getCapacity: (type) => { + getAvailableCapacity: (type) => { switch (type) { case 'limitedToTwo': return 2; @@ -686,7 +686,7 @@ if (doc['task.runAt'].size()!=0) { }, taskClaimingOpts: { maxAttempts, - getCapacity: (type) => { + getAvailableCapacity: (type) => { switch (type) { case 'limitedToOne': case 'anotherLimitedToOne': @@ -1139,7 +1139,7 @@ if (doc['task.runAt'].size()!=0) { storeOpts: { taskManagerId, }, - taskClaimingOpts: { getCapacity: () => maxDocs }, + taskClaimingOpts: { getAvailableCapacity: () => maxDocs }, claimingOpts: { claimOwnershipUntil, }, @@ -1219,9 +1219,9 @@ if (doc['task.runAt'].size()!=0) { function instantiateStoreWithMockedApiResponses({ taskManagerId = uuidv4(), definitions = taskDefinitions, - getCapacity = () => 10, + getAvailableCapacity = () => 10, tasksClaimed, - }: Partial> & { + }: Partial> & { taskManagerId?: string; tasksClaimed?: ConcreteTaskInstance[][]; } = {}) { @@ -1254,7 +1254,7 @@ if (doc['task.runAt'].size()!=0) { unusedTypes: [], taskStore, maxAttempts: 2, - getCapacity, + getAvailableCapacity, taskPartitioner, }); diff --git a/x-pack/plugins/task_manager/server/task_claimers/strategy_mget.test.ts b/x-pack/plugins/task_manager/server/task_claimers/strategy_mget.test.ts index b58ea02893c10..2c4b5fd6a96c6 100644 --- a/x-pack/plugins/task_manager/server/task_claimers/strategy_mget.test.ts +++ b/x-pack/plugins/task_manager/server/task_claimers/strategy_mget.test.ts @@ -15,10 +15,11 @@ import { ConcreteTaskInstance, ConcreteTaskInstanceVersion, TaskPriority, + TaskCost, } from '../task'; import { SearchOpts, StoreOpts } from '../task_store'; import { asTaskClaimEvent, TaskEvent } from '../task_events'; -import { asOk, isOk, unwrap } from '../lib/result_type'; +import { asOk, asErr, isOk, unwrap } from '../lib/result_type'; import { TaskTypeDictionary } from '../task_type_dictionary'; import { mockLogger } from '../test_utils'; import { @@ -33,6 +34,7 @@ import apm from 'elastic-apm-node'; import { TASK_MANAGER_TRANSACTION_TYPE } from '../task_running'; import { ClaimOwnershipResult } from '.'; import { FillPoolResult } from '../lib/fill_pool'; +import { SavedObjectsErrorHelpers } from '@kbn/core/server'; import { TaskPartitioner } from '../lib/task_partitioner'; import type { MustNotCondition } from '../queries/query_clauses'; import { @@ -52,6 +54,7 @@ jest.mock('../constants', () => ({ 'anotherLimitedToOne', 'limitedToTwo', 'limitedToFive', + 'yawn', ], })); @@ -74,14 +77,18 @@ const taskDefinitions = new TaskTypeDictionary(taskManagerLogger); taskDefinitions.registerTaskDefinitions({ report: { title: 'report', + cost: TaskCost.Normal, createTaskRunner: jest.fn(), }, dernstraight: { title: 'dernstraight', + cost: TaskCost.ExtraLarge, createTaskRunner: jest.fn(), }, yawn: { title: 'yawn', + cost: TaskCost.Tiny, + maxConcurrency: 1, createTaskRunner: jest.fn(), }, }); @@ -110,6 +117,17 @@ describe('TaskClaiming', () => { }); describe('claimAvailableTasks', () => { + function getVersionMapsFromTasks(tasks: ConcreteTaskInstance[]) { + const versionMap = new Map(); + const docLatestVersions = new Map(); + for (const task of tasks) { + 
versionMap.set(task.id, { esId: task.id, seqNo: 32, primaryTerm: 32 }); + docLatestVersions.set(`task:${task.id}`, { esId: task.id, seqNo: 32, primaryTerm: 32 }); + } + + return { versionMap, docLatestVersions }; + } + function initialiseTestClaiming({ storeOpts = {}, taskClaimingOpts = {}, @@ -130,20 +148,27 @@ describe('TaskClaiming', () => { store.convertToSavedObjectIds.mockImplementation((ids) => ids.map((id) => `task:${id}`)); if (hits == null) hits = [generateFakeTasks(1)]; + + const docVersion = []; if (versionMaps == null) { - versionMaps = [new Map()]; + versionMaps = []; for (const oneHit of hits) { const map = new Map(); - versionMaps.push(map); + const mapWithTaskPrefix = new Map(); for (const task of oneHit) { map.set(task.id, { esId: task.id, seqNo: 32, primaryTerm: 32 }); + mapWithTaskPrefix.set(`task:${task.id}`, { esId: task.id, seqNo: 32, primaryTerm: 32 }); } + versionMaps.push(map); + docVersion.push(mapWithTaskPrefix); } } for (let i = 0; i < hits.length; i++) { store.fetch.mockResolvedValueOnce({ docs: hits[i], versionMap: versionMaps[i] }); - store.getDocVersions.mockResolvedValueOnce(versionMaps[i]); + store.getDocVersions.mockResolvedValueOnce(docVersion[i]); + const oneBulkGetResult = hits[i].map((hit) => asOk(hit)); + store.bulkGet.mockResolvedValueOnce(oneBulkGetResult); const oneBulkResult = hits[i].map((hit) => asOk(hit)); store.bulkUpdate.mockResolvedValueOnce(oneBulkResult); } @@ -156,7 +181,7 @@ describe('TaskClaiming', () => { excludedTaskTypes, unusedTypes: unusedTaskTypes, maxAttempts: taskClaimingOpts.maxAttempts ?? 2, - getCapacity: taskClaimingOpts.getCapacity ?? (() => 10), + getAvailableCapacity: taskClaimingOpts.getAvailableCapacity ?? (() => 10), taskPartitioner, ...taskClaimingOpts, }); @@ -203,6 +228,14 @@ describe('TaskClaiming', () => { return unwrap(resultOrErr) as ClaimOwnershipResult; }); + expect(apm.startTransaction).toHaveBeenCalledWith( + TASK_MANAGER_MARK_AS_CLAIMED, + TASK_MANAGER_TRANSACTION_TYPE + ); + expect(mockApmTrans.end).toHaveBeenCalledWith('success'); + + expect(store.fetch.mock.calls).toMatchObject({}); + expect(store.getDocVersions.mock.calls).toMatchObject({}); return results.map((result, index) => ({ result, args: { @@ -289,8 +322,1250 @@ describe('TaskClaiming', () => { expect(result).toMatchObject({}); }); + test('should limit claimed tasks based on task cost and available capacity', async () => { + const store = taskStoreMock.create({ taskManagerId: 'test-test' }); + store.convertToSavedObjectIds.mockImplementation((ids) => ids.map((id) => `task:${id}`)); + + const fetchedTasks = [ + mockInstance({ id: `id-1`, taskType: 'report' }), // total cost = 2 + mockInstance({ id: `id-2`, taskType: 'report' }), // total cost = 4 + mockInstance({ id: `id-3`, taskType: 'yawn' }), // total cost = 5 + mockInstance({ id: `id-4`, taskType: 'dernstraight' }), // claiming this will exceed the available capacity + mockInstance({ id: `id-5`, taskType: 'report' }), + mockInstance({ id: `id-6`, taskType: 'report' }), + ]; + + const { versionMap, docLatestVersions } = getVersionMapsFromTasks(fetchedTasks); + store.fetch.mockResolvedValueOnce({ docs: fetchedTasks, versionMap }); + store.getDocVersions.mockResolvedValueOnce(docLatestVersions); + + store.bulkGet.mockResolvedValueOnce( + [fetchedTasks[0], fetchedTasks[1], fetchedTasks[2]].map(asOk) + ); + store.bulkUpdate.mockResolvedValueOnce( + [fetchedTasks[0], fetchedTasks[1], fetchedTasks[2]].map(asOk) + ); + + const taskClaiming = new TaskClaiming({ + logger: taskManagerLogger, + 
strategy: CLAIM_STRATEGY_MGET, + definitions: taskDefinitions, + taskStore: store, + excludedTaskTypes: [], + unusedTypes: [], + maxAttempts: 2, + getAvailableCapacity: () => 10, + taskPartitioner, + }); + + const [resultOrErr] = await getAllAsPromise( + taskClaiming.claimAvailableTasksIfCapacityIsAvailable({ claimOwnershipUntil: new Date() }) + ); + + if (!isOk(resultOrErr)) { + expect(resultOrErr).toBe(undefined); + } + + const result = unwrap(resultOrErr) as ClaimOwnershipResult; + + expect(apm.startTransaction).toHaveBeenCalledWith( + TASK_MANAGER_MARK_AS_CLAIMED, + TASK_MANAGER_TRANSACTION_TYPE + ); + expect(mockApmTrans.end).toHaveBeenCalledWith('success'); + + expect(taskManagerLogger.debug).toHaveBeenCalledWith( + 'task claimer claimed: 3; stale: 0; conflicts: 0; missing: 0; capacity reached: 3; updateErrors: 0; removed: 0;', + { tags: ['claimAvailableTasksMget'] } + ); + + expect(store.fetch.mock.calls[0][0]).toMatchObject({ size: 40, seq_no_primary_term: true }); + expect(store.getDocVersions).toHaveBeenCalledWith([ + 'task:id-1', + 'task:id-2', + 'task:id-3', + 'task:id-4', + 'task:id-5', + 'task:id-6', + ]); + expect(store.bulkUpdate).toHaveBeenCalledTimes(1); + expect(store.bulkUpdate).toHaveBeenCalledWith( + [ + { + ...fetchedTasks[0], + ownerId: 'test-test', + retryAt: fetchedTasks[0].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[1], + ownerId: 'test-test', + retryAt: fetchedTasks[1].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[2], + ownerId: 'test-test', + retryAt: fetchedTasks[2].runAt, + status: 'claiming', + }, + ], + { validate: false, excludeLargeFields: true } + ); + expect(store.bulkGet).toHaveBeenCalledWith(['id-1', 'id-2', 'id-3']); + + expect(result.stats).toEqual({ + tasksClaimed: 3, + tasksConflicted: 0, + tasksUpdated: 3, + tasksLeftUnclaimed: 3, + }); + expect(result.docs.length).toEqual(3); + }); + + test('should not claim tasks of removed type', async () => { + const store = taskStoreMock.create({ taskManagerId: 'test-test' }); + store.convertToSavedObjectIds.mockImplementation((ids) => ids.map((id) => `task:${id}`)); + + const fetchedTasks = [ + mockInstance({ id: `id-1`, taskType: 'report' }), + mockInstance({ id: `id-2`, taskType: 'report' }), + mockInstance({ id: `id-3`, taskType: 'yawn' }), + ]; + + const { versionMap, docLatestVersions } = getVersionMapsFromTasks(fetchedTasks); + store.fetch.mockResolvedValueOnce({ docs: fetchedTasks, versionMap }); + store.getDocVersions.mockResolvedValueOnce(docLatestVersions); + + store.bulkGet.mockResolvedValueOnce([fetchedTasks[2]].map(asOk)); + store.bulkUpdate.mockResolvedValueOnce([fetchedTasks[2]].map(asOk)); + store.bulkUpdate.mockResolvedValueOnce([fetchedTasks[0], fetchedTasks[1]].map(asOk)); + + const taskClaiming = new TaskClaiming({ + logger: taskManagerLogger, + strategy: CLAIM_STRATEGY_MGET, + definitions: taskDefinitions, + taskStore: store, + excludedTaskTypes: [], + unusedTypes: ['report'], + maxAttempts: 2, + getAvailableCapacity: () => 10, + taskPartitioner, + }); + + const [resultOrErr] = await getAllAsPromise( + taskClaiming.claimAvailableTasksIfCapacityIsAvailable({ claimOwnershipUntil: new Date() }) + ); + + if (!isOk(resultOrErr)) { + expect(resultOrErr).toBe(undefined); + } + + const result = unwrap(resultOrErr) as ClaimOwnershipResult; + + expect(apm.startTransaction).toHaveBeenCalledWith( + TASK_MANAGER_MARK_AS_CLAIMED, + TASK_MANAGER_TRANSACTION_TYPE + ); + expect(mockApmTrans.end).toHaveBeenCalledWith('success'); + + 
expect(taskManagerLogger.debug).toHaveBeenCalledWith( + 'task claimer claimed: 1; stale: 0; conflicts: 0; missing: 0; capacity reached: 0; updateErrors: 0; removed: 2;', + { tags: ['claimAvailableTasksMget'] } + ); + + expect(store.fetch.mock.calls[0][0]).toMatchObject({ size: 40, seq_no_primary_term: true }); + expect(store.getDocVersions).toHaveBeenCalledWith(['task:id-1', 'task:id-2', 'task:id-3']); + expect(store.bulkUpdate).toHaveBeenCalledTimes(2); + expect(store.bulkUpdate).toHaveBeenNthCalledWith( + 1, + [ + { + ...fetchedTasks[2], + ownerId: 'test-test', + retryAt: fetchedTasks[2].runAt, + status: 'claiming', + }, + ], + { validate: false, excludeLargeFields: true } + ); + expect(store.bulkUpdate).toHaveBeenNthCalledWith( + 2, + [ + { + ...fetchedTasks[0], + status: 'unrecognized', + }, + { + ...fetchedTasks[1], + status: 'unrecognized', + }, + ], + { validate: false, excludeLargeFields: true } + ); + expect(store.bulkGet).toHaveBeenCalledWith(['id-3']); + + expect(result.stats).toEqual({ + tasksClaimed: 1, + tasksConflicted: 0, + tasksUpdated: 1, + tasksLeftUnclaimed: 0, + }); + expect(result.docs.length).toEqual(1); + }); + + test('should log warning if error updating single removed task as unrecognized', async () => { + const store = taskStoreMock.create({ taskManagerId: 'test-test' }); + store.convertToSavedObjectIds.mockImplementation((ids) => ids.map((id) => `task:${id}`)); + + const fetchedTasks = [ + mockInstance({ id: `id-1`, taskType: 'report' }), + mockInstance({ id: `id-2`, taskType: 'report' }), + mockInstance({ id: `id-3`, taskType: 'yawn' }), + ]; + + const { versionMap, docLatestVersions } = getVersionMapsFromTasks(fetchedTasks); + store.fetch.mockResolvedValueOnce({ docs: fetchedTasks, versionMap }); + store.getDocVersions.mockResolvedValueOnce(docLatestVersions); + + store.bulkGet.mockResolvedValueOnce([fetchedTasks[2]].map(asOk)); + store.bulkUpdate.mockResolvedValueOnce([fetchedTasks[2]].map(asOk)); + store.bulkUpdate.mockResolvedValueOnce([ + asOk(fetchedTasks[0]), + // @ts-expect-error + asErr({ + type: 'task', + id: fetchedTasks[1].id, + error: SavedObjectsErrorHelpers.createBadRequestError(), + }), + ]); + + const taskClaiming = new TaskClaiming({ + logger: taskManagerLogger, + strategy: CLAIM_STRATEGY_MGET, + definitions: taskDefinitions, + taskStore: store, + excludedTaskTypes: [], + unusedTypes: ['report'], + maxAttempts: 2, + getAvailableCapacity: () => 10, + taskPartitioner, + }); + + const [resultOrErr] = await getAllAsPromise( + taskClaiming.claimAvailableTasksIfCapacityIsAvailable({ claimOwnershipUntil: new Date() }) + ); + + if (!isOk(resultOrErr)) { + expect(resultOrErr).toBe(undefined); + } + + const result = unwrap(resultOrErr) as ClaimOwnershipResult; + + expect(apm.startTransaction).toHaveBeenCalledWith( + TASK_MANAGER_MARK_AS_CLAIMED, + TASK_MANAGER_TRANSACTION_TYPE + ); + expect(mockApmTrans.end).toHaveBeenCalledWith('success'); + + expect(taskManagerLogger.warn).toHaveBeenCalledWith( + 'Error updating task id-2:task to mark as unrecognized during claim: Bad Request', + { tags: ['claimAvailableTasksMget'] } + ); + expect(taskManagerLogger.debug).toHaveBeenCalledWith( + 'task claimer claimed: 1; stale: 0; conflicts: 0; missing: 0; capacity reached: 0; updateErrors: 0; removed: 1;', + { tags: ['claimAvailableTasksMget'] } + ); + + expect(store.fetch.mock.calls[0][0]).toMatchObject({ size: 40, seq_no_primary_term: true }); + expect(store.getDocVersions).toHaveBeenCalledWith(['task:id-1', 'task:id-2', 'task:id-3']); + 
expect(store.bulkUpdate).toHaveBeenCalledTimes(2); + expect(store.bulkUpdate).toHaveBeenNthCalledWith( + 1, + [ + { + ...fetchedTasks[2], + ownerId: 'test-test', + retryAt: fetchedTasks[2].runAt, + status: 'claiming', + }, + ], + { validate: false, excludeLargeFields: true } + ); + expect(store.bulkUpdate).toHaveBeenNthCalledWith( + 2, + [ + { + ...fetchedTasks[0], + status: 'unrecognized', + }, + { + ...fetchedTasks[1], + status: 'unrecognized', + }, + ], + { validate: false, excludeLargeFields: true } + ); + expect(store.bulkGet).toHaveBeenCalledWith(['id-3']); + + expect(result.stats).toEqual({ + tasksClaimed: 1, + tasksConflicted: 0, + tasksUpdated: 1, + tasksLeftUnclaimed: 0, + }); + expect(result.docs.length).toEqual(1); + }); + + test('should log warning if error updating all removed tasks as unrecognized', async () => { + const store = taskStoreMock.create({ taskManagerId: 'test-test' }); + store.convertToSavedObjectIds.mockImplementation((ids) => ids.map((id) => `task:${id}`)); + + const fetchedTasks = [ + mockInstance({ id: `id-1`, taskType: 'report' }), + mockInstance({ id: `id-2`, taskType: 'report' }), + mockInstance({ id: `id-3`, taskType: 'yawn' }), + ]; + + const { versionMap, docLatestVersions } = getVersionMapsFromTasks(fetchedTasks); + store.fetch.mockResolvedValueOnce({ docs: fetchedTasks, versionMap }); + store.getDocVersions.mockResolvedValueOnce(docLatestVersions); + + store.bulkGet.mockResolvedValueOnce([fetchedTasks[2]].map(asOk)); + store.bulkUpdate.mockResolvedValueOnce([fetchedTasks[2]].map(asOk)); + store.bulkUpdate.mockRejectedValueOnce(new Error('Oh no')); + + const taskClaiming = new TaskClaiming({ + logger: taskManagerLogger, + strategy: CLAIM_STRATEGY_MGET, + definitions: taskDefinitions, + taskStore: store, + excludedTaskTypes: [], + unusedTypes: ['report'], + maxAttempts: 2, + getAvailableCapacity: () => 10, + taskPartitioner, + }); + + const [resultOrErr] = await getAllAsPromise( + taskClaiming.claimAvailableTasksIfCapacityIsAvailable({ claimOwnershipUntil: new Date() }) + ); + + if (!isOk(resultOrErr)) { + expect(resultOrErr).toBe(undefined); + } + + const result = unwrap(resultOrErr) as ClaimOwnershipResult; + + expect(apm.startTransaction).toHaveBeenCalledWith( + TASK_MANAGER_MARK_AS_CLAIMED, + TASK_MANAGER_TRANSACTION_TYPE + ); + expect(mockApmTrans.end).toHaveBeenCalledWith('success'); + + expect(taskManagerLogger.warn).toHaveBeenCalledWith( + 'Error updating tasks to mark as unrecognized during claim: Error: Oh no', + { tags: ['claimAvailableTasksMget'] } + ); + expect(taskManagerLogger.debug).toHaveBeenCalledWith( + 'task claimer claimed: 1; stale: 0; conflicts: 0; missing: 0; capacity reached: 0; updateErrors: 0; removed: 0;', + { tags: ['claimAvailableTasksMget'] } + ); + + expect(store.fetch.mock.calls[0][0]).toMatchObject({ size: 40, seq_no_primary_term: true }); + expect(store.getDocVersions).toHaveBeenCalledWith(['task:id-1', 'task:id-2', 'task:id-3']); + expect(store.bulkGet).toHaveBeenCalledWith(['id-3']); + expect(store.bulkUpdate).toHaveBeenCalledTimes(2); + expect(store.bulkUpdate).toHaveBeenNthCalledWith( + 1, + [ + { + ...fetchedTasks[2], + ownerId: 'test-test', + retryAt: fetchedTasks[2].runAt, + status: 'claiming', + }, + ], + { validate: false, excludeLargeFields: true } + ); + expect(store.bulkUpdate).toHaveBeenNthCalledWith( + 2, + [ + { + ...fetchedTasks[0], + status: 'unrecognized', + }, + { + ...fetchedTasks[1], + status: 'unrecognized', + }, + ], + { validate: false, excludeLargeFields: true } + ); + + 
expect(result.stats).toEqual({ + tasksClaimed: 1, + tasksConflicted: 0, + tasksUpdated: 1, + tasksLeftUnclaimed: 0, + }); + expect(result.docs.length).toEqual(1); + }); + + test('should handle no tasks to claim', async () => { + const store = taskStoreMock.create({ taskManagerId: 'test-test' }); + store.convertToSavedObjectIds.mockImplementation((ids) => ids.map((id) => `task:${id}`)); + + const fetchedTasks: ConcreteTaskInstance[] = []; + + const { versionMap } = getVersionMapsFromTasks(fetchedTasks); + store.fetch.mockResolvedValueOnce({ docs: fetchedTasks, versionMap }); + + const taskClaiming = new TaskClaiming({ + logger: taskManagerLogger, + strategy: CLAIM_STRATEGY_MGET, + definitions: taskDefinitions, + taskStore: store, + excludedTaskTypes: [], + unusedTypes: [], + maxAttempts: 2, + getAvailableCapacity: () => 10, + taskPartitioner, + }); + + const [resultOrErr] = await getAllAsPromise( + taskClaiming.claimAvailableTasksIfCapacityIsAvailable({ claimOwnershipUntil: new Date() }) + ); + + if (!isOk(resultOrErr)) { + expect(resultOrErr).toBe(undefined); + } + + const result = unwrap(resultOrErr) as ClaimOwnershipResult; + + expect(apm.startTransaction).toHaveBeenCalledWith( + TASK_MANAGER_MARK_AS_CLAIMED, + TASK_MANAGER_TRANSACTION_TYPE + ); + expect(mockApmTrans.end).toHaveBeenCalledWith('success'); + + expect(taskManagerLogger.debug).not.toHaveBeenCalled(); + + expect(store.fetch.mock.calls[0][0]).toMatchObject({ size: 40, seq_no_primary_term: true }); + expect(store.getDocVersions).not.toHaveBeenCalled(); + expect(store.bulkGet).not.toHaveBeenCalled(); + expect(store.bulkUpdate).not.toHaveBeenCalled(); + + expect(result.stats).toEqual({ + tasksClaimed: 0, + tasksConflicted: 0, + tasksUpdated: 0, + }); + expect(result.docs.length).toEqual(0); + }); + + test('should handle tasks with no search version', async () => { + const store = taskStoreMock.create({ taskManagerId: 'test-test' }); + store.convertToSavedObjectIds.mockImplementation((ids) => ids.map((id) => `task:${id}`)); + + const fetchedTasks = [ + mockInstance({ id: `id-1`, taskType: 'report' }), + mockInstance({ id: `id-2`, taskType: 'report' }), + mockInstance({ id: `id-3`, taskType: 'yawn' }), + ]; + + const { versionMap, docLatestVersions } = getVersionMapsFromTasks(fetchedTasks); + versionMap.delete('id-1'); + store.fetch.mockResolvedValueOnce({ docs: fetchedTasks, versionMap }); + store.getDocVersions.mockResolvedValueOnce(docLatestVersions); + + store.bulkGet.mockResolvedValueOnce([fetchedTasks[1], fetchedTasks[2]].map(asOk)); + store.bulkUpdate.mockResolvedValueOnce([fetchedTasks[1], fetchedTasks[2]].map(asOk)); + + const taskClaiming = new TaskClaiming({ + logger: taskManagerLogger, + strategy: CLAIM_STRATEGY_MGET, + definitions: taskDefinitions, + taskStore: store, + excludedTaskTypes: [], + unusedTypes: [], + maxAttempts: 2, + getAvailableCapacity: () => 10, + taskPartitioner, + }); + + const [resultOrErr] = await getAllAsPromise( + taskClaiming.claimAvailableTasksIfCapacityIsAvailable({ claimOwnershipUntil: new Date() }) + ); + + if (!isOk(resultOrErr)) { + expect(resultOrErr).toBe(undefined); + } + + const result = unwrap(resultOrErr) as ClaimOwnershipResult; + + expect(apm.startTransaction).toHaveBeenCalledWith( + TASK_MANAGER_MARK_AS_CLAIMED, + TASK_MANAGER_TRANSACTION_TYPE + ); + expect(mockApmTrans.end).toHaveBeenCalledWith('success'); + + expect(taskManagerLogger.debug).toHaveBeenCalledWith( + 'task claimer claimed: 2; stale: 0; conflicts: 0; missing: 1; capacity reached: 0; updateErrors: 0; removed: 0;', + 
{ tags: ['claimAvailableTasksMget'] } + ); + + expect(store.fetch.mock.calls[0][0]).toMatchObject({ size: 40, seq_no_primary_term: true }); + expect(store.getDocVersions).toHaveBeenCalledWith(['task:id-1', 'task:id-2', 'task:id-3']); + expect(store.bulkUpdate).toHaveBeenCalledTimes(1); + expect(store.bulkUpdate).toHaveBeenCalledWith( + [ + { + ...fetchedTasks[1], + ownerId: 'test-test', + retryAt: fetchedTasks[1].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[2], + ownerId: 'test-test', + retryAt: fetchedTasks[2].runAt, + status: 'claiming', + }, + ], + { validate: false, excludeLargeFields: true } + ); + expect(store.bulkGet).toHaveBeenCalledWith(['id-2', 'id-3']); + + expect(result.stats).toEqual({ + tasksClaimed: 2, + tasksConflicted: 0, + tasksUpdated: 2, + tasksLeftUnclaimed: 0, + }); + expect(result.docs.length).toEqual(2); + }); + + test('should handle tasks with no latest version', async () => { + const store = taskStoreMock.create({ taskManagerId: 'test-test' }); + store.convertToSavedObjectIds.mockImplementation((ids) => ids.map((id) => `task:${id}`)); + + const fetchedTasks = [ + mockInstance({ id: `id-1`, taskType: 'report' }), + mockInstance({ id: `id-2`, taskType: 'report' }), + mockInstance({ id: `id-3`, taskType: 'yawn' }), + ]; + + const { versionMap, docLatestVersions } = getVersionMapsFromTasks(fetchedTasks); + docLatestVersions.delete('task:id-1'); + store.fetch.mockResolvedValueOnce({ docs: fetchedTasks, versionMap }); + store.getDocVersions.mockResolvedValueOnce(docLatestVersions); + + store.bulkGet.mockResolvedValueOnce([fetchedTasks[1], fetchedTasks[2]].map(asOk)); + store.bulkUpdate.mockResolvedValueOnce([fetchedTasks[1], fetchedTasks[2]].map(asOk)); + + const taskClaiming = new TaskClaiming({ + logger: taskManagerLogger, + strategy: CLAIM_STRATEGY_MGET, + definitions: taskDefinitions, + taskStore: store, + excludedTaskTypes: [], + unusedTypes: [], + maxAttempts: 2, + getAvailableCapacity: () => 10, + taskPartitioner, + }); + + const [resultOrErr] = await getAllAsPromise( + taskClaiming.claimAvailableTasksIfCapacityIsAvailable({ claimOwnershipUntil: new Date() }) + ); + + if (!isOk(resultOrErr)) { + expect(resultOrErr).toBe(undefined); + } + + const result = unwrap(resultOrErr) as ClaimOwnershipResult; + + expect(apm.startTransaction).toHaveBeenCalledWith( + TASK_MANAGER_MARK_AS_CLAIMED, + TASK_MANAGER_TRANSACTION_TYPE + ); + expect(mockApmTrans.end).toHaveBeenCalledWith('success'); + + expect(taskManagerLogger.debug).toHaveBeenCalledWith( + 'task claimer claimed: 2; stale: 0; conflicts: 0; missing: 1; capacity reached: 0; updateErrors: 0; removed: 0;', + { tags: ['claimAvailableTasksMget'] } + ); + + expect(store.fetch.mock.calls[0][0]).toMatchObject({ size: 40, seq_no_primary_term: true }); + expect(store.getDocVersions).toHaveBeenCalledWith(['task:id-1', 'task:id-2', 'task:id-3']); + expect(store.bulkUpdate).toHaveBeenCalledTimes(1); + expect(store.bulkUpdate).toHaveBeenCalledWith( + [ + { + ...fetchedTasks[1], + ownerId: 'test-test', + retryAt: fetchedTasks[1].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[2], + ownerId: 'test-test', + retryAt: fetchedTasks[2].runAt, + status: 'claiming', + }, + ], + { validate: false, excludeLargeFields: true } + ); + expect(store.bulkGet).toHaveBeenCalledWith(['id-2', 'id-3']); + + expect(result.stats).toEqual({ + tasksClaimed: 2, + tasksConflicted: 0, + tasksUpdated: 2, + tasksLeftUnclaimed: 0, + }); + expect(result.docs.length).toEqual(2); + }); + + test('should handle stale tasks', async () => { + const 
store = taskStoreMock.create({ taskManagerId: 'test-test' }); + store.convertToSavedObjectIds.mockImplementation((ids) => ids.map((id) => `task:${id}`)); + + const fetchedTasks = [ + mockInstance({ id: `id-1`, taskType: 'report' }), + mockInstance({ id: `id-2`, taskType: 'report' }), + mockInstance({ id: `id-3`, taskType: 'yawn' }), + ]; + + const { versionMap, docLatestVersions } = getVersionMapsFromTasks(fetchedTasks); + docLatestVersions.set('task:id-1', { esId: 'task:id-1', seqNo: 33, primaryTerm: 33 }); + store.fetch.mockResolvedValueOnce({ docs: fetchedTasks, versionMap }); + store.getDocVersions.mockResolvedValueOnce(docLatestVersions); + + store.bulkGet.mockResolvedValueOnce([fetchedTasks[1], fetchedTasks[2]].map(asOk)); + store.bulkUpdate.mockResolvedValueOnce([fetchedTasks[1], fetchedTasks[2]].map(asOk)); + + const taskClaiming = new TaskClaiming({ + logger: taskManagerLogger, + strategy: CLAIM_STRATEGY_MGET, + definitions: taskDefinitions, + taskStore: store, + excludedTaskTypes: [], + unusedTypes: [], + maxAttempts: 2, + getAvailableCapacity: () => 10, + taskPartitioner, + }); + + const [resultOrErr] = await getAllAsPromise( + taskClaiming.claimAvailableTasksIfCapacityIsAvailable({ claimOwnershipUntil: new Date() }) + ); + + if (!isOk(resultOrErr)) { + expect(resultOrErr).toBe(undefined); + } + + const result = unwrap(resultOrErr) as ClaimOwnershipResult; + + expect(apm.startTransaction).toHaveBeenCalledWith( + TASK_MANAGER_MARK_AS_CLAIMED, + TASK_MANAGER_TRANSACTION_TYPE + ); + expect(mockApmTrans.end).toHaveBeenCalledWith('success'); + + expect(taskManagerLogger.debug).toHaveBeenCalledWith( + 'task claimer claimed: 2; stale: 1; conflicts: 1; missing: 0; capacity reached: 0; updateErrors: 0; removed: 0;', + { tags: ['claimAvailableTasksMget'] } + ); + + expect(store.fetch.mock.calls[0][0]).toMatchObject({ size: 40, seq_no_primary_term: true }); + expect(store.getDocVersions).toHaveBeenCalledWith(['task:id-1', 'task:id-2', 'task:id-3']); + expect(store.bulkUpdate).toHaveBeenCalledTimes(1); + expect(store.bulkUpdate).toHaveBeenCalledWith( + [ + { + ...fetchedTasks[1], + ownerId: 'test-test', + retryAt: fetchedTasks[1].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[2], + ownerId: 'test-test', + retryAt: fetchedTasks[2].runAt, + status: 'claiming', + }, + ], + { validate: false, excludeLargeFields: true } + ); + expect(store.bulkGet).toHaveBeenCalledWith(['id-2', 'id-3']); + + expect(result.stats).toEqual({ + tasksClaimed: 2, + tasksConflicted: 1, + tasksUpdated: 2, + tasksLeftUnclaimed: 0, + }); + expect(result.docs.length).toEqual(2); + }); + + test('should correctly handle limited concurrency tasks', async () => { + const store = taskStoreMock.create({ taskManagerId: 'test-test' }); + store.convertToSavedObjectIds.mockImplementation((ids) => ids.map((id) => `task:${id}`)); + + const fetchedTasks = [ + mockInstance({ id: `id-1`, taskType: 'report' }), + mockInstance({ id: `id-2`, taskType: 'report' }), + mockInstance({ id: `id-3`, taskType: 'yawn' }), + mockInstance({ id: `id-4`, taskType: 'yawn' }), + mockInstance({ id: `id-5`, taskType: 'report' }), + mockInstance({ id: `id-6`, taskType: 'yawn' }), + ]; + + const { versionMap, docLatestVersions } = getVersionMapsFromTasks(fetchedTasks); + store.fetch.mockResolvedValueOnce({ docs: fetchedTasks, versionMap }); + store.getDocVersions.mockResolvedValueOnce(docLatestVersions); + + store.bulkGet.mockResolvedValueOnce( + [fetchedTasks[0], fetchedTasks[1], fetchedTasks[2], fetchedTasks[4]].map(asOk) + ); + 
store.bulkUpdate.mockResolvedValueOnce( + [fetchedTasks[0], fetchedTasks[1], fetchedTasks[2], fetchedTasks[4]].map(asOk) + ); + + const taskClaiming = new TaskClaiming({ + logger: taskManagerLogger, + strategy: CLAIM_STRATEGY_MGET, + definitions: taskDefinitions, + taskStore: store, + excludedTaskTypes: [], + unusedTypes: [], + maxAttempts: 2, + getAvailableCapacity: () => 10, + taskPartitioner, + }); + + const [resultOrErr] = await getAllAsPromise( + taskClaiming.claimAvailableTasksIfCapacityIsAvailable({ claimOwnershipUntil: new Date() }) + ); + + if (!isOk(resultOrErr)) { + expect(resultOrErr).toBe(undefined); + } + + const result = unwrap(resultOrErr) as ClaimOwnershipResult; + + expect(apm.startTransaction).toHaveBeenCalledWith( + TASK_MANAGER_MARK_AS_CLAIMED, + TASK_MANAGER_TRANSACTION_TYPE + ); + expect(mockApmTrans.end).toHaveBeenCalledWith('success'); + + expect(taskManagerLogger.debug).toHaveBeenCalledWith( + 'task claimer claimed: 4; stale: 0; conflicts: 0; missing: 0; capacity reached: 0; updateErrors: 0; removed: 0;', + { tags: ['claimAvailableTasksMget'] } + ); + + expect(store.fetch.mock.calls[0][0]).toMatchObject({ size: 40, seq_no_primary_term: true }); + expect(store.getDocVersions).toHaveBeenCalledWith([ + 'task:id-1', + 'task:id-2', + 'task:id-3', + 'task:id-4', + 'task:id-5', + 'task:id-6', + ]); + expect(store.bulkUpdate).toHaveBeenCalledTimes(1); + expect(store.bulkUpdate).toHaveBeenCalledWith( + [ + { + ...fetchedTasks[0], + ownerId: 'test-test', + retryAt: fetchedTasks[1].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[1], + ownerId: 'test-test', + retryAt: fetchedTasks[1].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[2], + ownerId: 'test-test', + retryAt: fetchedTasks[2].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[4], + ownerId: 'test-test', + retryAt: fetchedTasks[1].runAt, + status: 'claiming', + }, + ], + { validate: false, excludeLargeFields: true } + ); + expect(store.bulkGet).toHaveBeenCalledWith(['id-1', 'id-2', 'id-3', 'id-5']); + + expect(result.stats).toEqual({ + tasksClaimed: 4, + tasksConflicted: 0, + tasksUpdated: 4, + tasksLeftUnclaimed: 0, + }); + expect(result.docs.length).toEqual(4); + }); + + test('should handle individual errors when bulk getting the full task doc', async () => { + const store = taskStoreMock.create({ taskManagerId: 'test-test' }); + store.convertToSavedObjectIds.mockImplementation((ids) => ids.map((id) => `task:${id}`)); + + const fetchedTasks = [ + mockInstance({ id: `id-1`, taskType: 'report' }), + mockInstance({ id: `id-2`, taskType: 'report' }), + mockInstance({ id: `id-3`, taskType: 'yawn' }), + mockInstance({ id: `id-4`, taskType: 'report' }), + ]; + + const { versionMap, docLatestVersions } = getVersionMapsFromTasks(fetchedTasks); + store.fetch.mockResolvedValueOnce({ docs: fetchedTasks, versionMap }); + store.getDocVersions.mockResolvedValueOnce(docLatestVersions); + store.bulkUpdate.mockResolvedValueOnce( + [fetchedTasks[0], fetchedTasks[1], fetchedTasks[2], fetchedTasks[3]].map(asOk) + ); + store.bulkGet.mockResolvedValueOnce([ + asOk(fetchedTasks[0]), + // @ts-expect-error + asErr({ + type: 'task', + id: fetchedTasks[1].id, + error: new Error('Oh no'), + }), + asOk(fetchedTasks[2]), + asOk(fetchedTasks[3]), + ]); + + const taskClaiming = new TaskClaiming({ + logger: taskManagerLogger, + strategy: CLAIM_STRATEGY_MGET, + definitions: taskDefinitions, + taskStore: store, + excludedTaskTypes: [], + unusedTypes: [], + maxAttempts: 2, + getAvailableCapacity: () => 10, + taskPartitioner, + }); 
+ + const [resultOrErr] = await getAllAsPromise( + taskClaiming.claimAvailableTasksIfCapacityIsAvailable({ claimOwnershipUntil: new Date() }) + ); + + if (!isOk(resultOrErr)) { + expect(resultOrErr).toBe(undefined); + } + + const result = unwrap(resultOrErr) as ClaimOwnershipResult; + + expect(apm.startTransaction).toHaveBeenCalledWith( + TASK_MANAGER_MARK_AS_CLAIMED, + TASK_MANAGER_TRANSACTION_TYPE + ); + expect(mockApmTrans.end).toHaveBeenCalledWith('success'); + + expect(taskManagerLogger.debug).toHaveBeenCalledWith( + 'task claimer claimed: 3; stale: 0; conflicts: 0; missing: 0; capacity reached: 0; updateErrors: 0; removed: 0;', + { tags: ['claimAvailableTasksMget'] } + ); + expect(taskManagerLogger.warn).toHaveBeenCalledWith( + 'Error getting full task id-2:task during claim: Oh no', + { tags: ['claimAvailableTasksMget'] } + ); + + expect(store.fetch.mock.calls[0][0]).toMatchObject({ size: 40, seq_no_primary_term: true }); + expect(store.getDocVersions).toHaveBeenCalledWith([ + 'task:id-1', + 'task:id-2', + 'task:id-3', + 'task:id-4', + ]); + expect(store.bulkUpdate).toHaveBeenCalledTimes(1); + expect(store.bulkUpdate).toHaveBeenCalledWith( + [ + { + ...fetchedTasks[0], + ownerId: 'test-test', + retryAt: fetchedTasks[0].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[1], + ownerId: 'test-test', + retryAt: fetchedTasks[2].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[2], + ownerId: 'test-test', + retryAt: fetchedTasks[2].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[3], + ownerId: 'test-test', + retryAt: fetchedTasks[3].runAt, + status: 'claiming', + }, + ], + { validate: false, excludeLargeFields: true } + ); + expect(store.bulkGet).toHaveBeenCalledWith(['id-1', 'id-2', 'id-3', 'id-4']); + + expect(result.stats).toEqual({ + tasksClaimed: 3, + tasksConflicted: 0, + tasksUpdated: 3, + tasksLeftUnclaimed: 0, + }); + expect(result.docs.length).toEqual(3); + }); + + test('should handle error when bulk getting all full task docs', async () => { + const store = taskStoreMock.create({ taskManagerId: 'test-test' }); + store.convertToSavedObjectIds.mockImplementation((ids) => ids.map((id) => `task:${id}`)); + + const fetchedTasks = [ + mockInstance({ id: `id-1`, taskType: 'report' }), + mockInstance({ id: `id-2`, taskType: 'report' }), + mockInstance({ id: `id-3`, taskType: 'yawn' }), + mockInstance({ id: `id-4`, taskType: 'report' }), + ]; + + const { versionMap, docLatestVersions } = getVersionMapsFromTasks(fetchedTasks); + store.fetch.mockResolvedValueOnce({ docs: fetchedTasks, versionMap }); + store.getDocVersions.mockResolvedValueOnce(docLatestVersions); + store.bulkUpdate.mockResolvedValueOnce( + [fetchedTasks[0], fetchedTasks[1], fetchedTasks[2], fetchedTasks[3]].map(asOk) + ); + store.bulkGet.mockRejectedValueOnce(new Error('oh no')); + + const taskClaiming = new TaskClaiming({ + logger: taskManagerLogger, + strategy: CLAIM_STRATEGY_MGET, + definitions: taskDefinitions, + taskStore: store, + excludedTaskTypes: [], + unusedTypes: [], + maxAttempts: 2, + getAvailableCapacity: () => 10, + taskPartitioner, + }); + + const [resultOrErr] = await getAllAsPromise( + taskClaiming.claimAvailableTasksIfCapacityIsAvailable({ claimOwnershipUntil: new Date() }) + ); + + if (!isOk(resultOrErr)) { + expect(resultOrErr).toBe(undefined); + } + + const result = unwrap(resultOrErr) as ClaimOwnershipResult; + + expect(apm.startTransaction).toHaveBeenCalledWith( + TASK_MANAGER_MARK_AS_CLAIMED, + TASK_MANAGER_TRANSACTION_TYPE + ); + 
expect(mockApmTrans.end).toHaveBeenCalledWith('success'); + + expect(taskManagerLogger.debug).toHaveBeenCalledWith( + 'task claimer claimed: 0; stale: 0; conflicts: 0; missing: 0; capacity reached: 0; updateErrors: 0; removed: 0;', + { tags: ['claimAvailableTasksMget'] } + ); + expect(taskManagerLogger.warn).toHaveBeenCalledWith( + 'Error getting full task documents during claim: Error: oh no', + { tags: ['claimAvailableTasksMget'] } + ); + + expect(store.fetch.mock.calls[0][0]).toMatchObject({ size: 40, seq_no_primary_term: true }); + expect(store.getDocVersions).toHaveBeenCalledWith([ + 'task:id-1', + 'task:id-2', + 'task:id-3', + 'task:id-4', + ]); + expect(store.bulkUpdate).toHaveBeenCalledTimes(1); + expect(store.bulkUpdate).toHaveBeenCalledWith( + [ + { + ...fetchedTasks[0], + ownerId: 'test-test', + retryAt: fetchedTasks[0].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[1], + ownerId: 'test-test', + retryAt: fetchedTasks[2].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[2], + ownerId: 'test-test', + retryAt: fetchedTasks[2].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[3], + ownerId: 'test-test', + retryAt: fetchedTasks[3].runAt, + status: 'claiming', + }, + ], + { validate: false, excludeLargeFields: true } + ); + expect(store.bulkGet).toHaveBeenCalledWith(['id-1', 'id-2', 'id-3', 'id-4']); + + expect(result.stats).toEqual({ + tasksClaimed: 0, + tasksConflicted: 0, + tasksUpdated: 0, + tasksLeftUnclaimed: 0, + }); + expect(result.docs.length).toEqual(0); + }); + + test('should handle individual errors when bulk updating the task doc', async () => { + const store = taskStoreMock.create({ taskManagerId: 'test-test' }); + store.convertToSavedObjectIds.mockImplementation((ids) => ids.map((id) => `task:${id}`)); + + const fetchedTasks = [ + mockInstance({ id: `id-1`, taskType: 'report' }), + mockInstance({ id: `id-2`, taskType: 'report' }), + mockInstance({ id: `id-3`, taskType: 'yawn' }), + mockInstance({ id: `id-4`, taskType: 'report' }), + ]; + + const { versionMap, docLatestVersions } = getVersionMapsFromTasks(fetchedTasks); + store.fetch.mockResolvedValueOnce({ docs: fetchedTasks, versionMap }); + store.getDocVersions.mockResolvedValueOnce(docLatestVersions); + store.bulkUpdate.mockResolvedValueOnce([ + asOk(fetchedTasks[0]), + // @ts-expect-error + asErr({ + type: 'task', + id: fetchedTasks[1].id, + error: new Error('Oh no'), + }), + asOk(fetchedTasks[2]), + asOk(fetchedTasks[3]), + ]); + store.bulkGet.mockResolvedValueOnce([ + asOk(fetchedTasks[0]), + asOk(fetchedTasks[2]), + asOk(fetchedTasks[3]), + ]); + + const taskClaiming = new TaskClaiming({ + logger: taskManagerLogger, + strategy: CLAIM_STRATEGY_MGET, + definitions: taskDefinitions, + taskStore: store, + excludedTaskTypes: [], + unusedTypes: [], + maxAttempts: 2, + getAvailableCapacity: () => 10, + taskPartitioner, + }); + + const [resultOrErr] = await getAllAsPromise( + taskClaiming.claimAvailableTasksIfCapacityIsAvailable({ claimOwnershipUntil: new Date() }) + ); + + if (!isOk(resultOrErr)) { + expect(resultOrErr).toBe(undefined); + } + + const result = unwrap(resultOrErr) as ClaimOwnershipResult; + + expect(apm.startTransaction).toHaveBeenCalledWith( + TASK_MANAGER_MARK_AS_CLAIMED, + TASK_MANAGER_TRANSACTION_TYPE + ); + expect(mockApmTrans.end).toHaveBeenCalledWith('success'); + + expect(taskManagerLogger.debug).toHaveBeenCalledWith( + 'task claimer claimed: 3; stale: 0; conflicts: 0; missing: 0; capacity reached: 0; updateErrors: 1; removed: 0;', + { tags: ['claimAvailableTasksMget'] } + ); 
+ expect(taskManagerLogger.warn).toHaveBeenCalledWith( + 'Error updating task id-2:task during claim: Oh no', + { tags: ['claimAvailableTasksMget'] } + ); + + expect(store.fetch.mock.calls[0][0]).toMatchObject({ size: 40, seq_no_primary_term: true }); + expect(store.getDocVersions).toHaveBeenCalledWith([ + 'task:id-1', + 'task:id-2', + 'task:id-3', + 'task:id-4', + ]); + expect(store.bulkUpdate).toHaveBeenCalledTimes(1); + expect(store.bulkUpdate).toHaveBeenCalledWith( + [ + { + ...fetchedTasks[0], + ownerId: 'test-test', + retryAt: fetchedTasks[0].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[1], + ownerId: 'test-test', + retryAt: fetchedTasks[1].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[2], + ownerId: 'test-test', + retryAt: fetchedTasks[2].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[3], + ownerId: 'test-test', + retryAt: fetchedTasks[3].runAt, + status: 'claiming', + }, + ], + { validate: false, excludeLargeFields: true } + ); + expect(store.bulkGet).toHaveBeenCalledWith(['id-1', 'id-3', 'id-4']); + + expect(result.stats).toEqual({ + tasksClaimed: 3, + tasksConflicted: 0, + tasksUpdated: 3, + tasksLeftUnclaimed: 0, + }); + expect(result.docs.length).toEqual(3); + }); + + test('should handle error when bulk updating all task docs', async () => { + const store = taskStoreMock.create({ taskManagerId: 'test-test' }); + store.convertToSavedObjectIds.mockImplementation((ids) => ids.map((id) => `task:${id}`)); + + const fetchedTasks = [ + mockInstance({ id: `id-1`, taskType: 'report' }), + mockInstance({ id: `id-2`, taskType: 'report' }), + mockInstance({ id: `id-3`, taskType: 'yawn' }), + mockInstance({ id: `id-4`, taskType: 'report' }), + ]; + + const { versionMap, docLatestVersions } = getVersionMapsFromTasks(fetchedTasks); + store.fetch.mockResolvedValueOnce({ docs: fetchedTasks, versionMap }); + store.getDocVersions.mockResolvedValueOnce(docLatestVersions); + store.bulkUpdate.mockRejectedValueOnce(new Error('oh no')); + store.bulkGet.mockResolvedValueOnce([]); + + const taskClaiming = new TaskClaiming({ + logger: taskManagerLogger, + strategy: CLAIM_STRATEGY_MGET, + definitions: taskDefinitions, + taskStore: store, + excludedTaskTypes: [], + unusedTypes: [], + maxAttempts: 2, + getAvailableCapacity: () => 10, + taskPartitioner, + }); + + const [resultOrErr] = await getAllAsPromise( + taskClaiming.claimAvailableTasksIfCapacityIsAvailable({ claimOwnershipUntil: new Date() }) + ); + + if (!isOk(resultOrErr)) { + expect(resultOrErr).toBe(undefined); + } + + const result = unwrap(resultOrErr) as ClaimOwnershipResult; + + expect(apm.startTransaction).toHaveBeenCalledWith( + TASK_MANAGER_MARK_AS_CLAIMED, + TASK_MANAGER_TRANSACTION_TYPE + ); + expect(mockApmTrans.end).toHaveBeenCalledWith('success'); + + expect(taskManagerLogger.debug).toHaveBeenCalledWith( + 'task claimer claimed: 0; stale: 0; conflicts: 0; missing: 0; capacity reached: 0; updateErrors: 0; removed: 0;', + { tags: ['claimAvailableTasksMget'] } + ); + expect(taskManagerLogger.warn).toHaveBeenCalledWith( + 'Error updating tasks during claim: Error: oh no', + { tags: ['claimAvailableTasksMget'] } + ); + + expect(store.fetch.mock.calls[0][0]).toMatchObject({ size: 40, seq_no_primary_term: true }); + expect(store.getDocVersions).toHaveBeenCalledWith([ + 'task:id-1', + 'task:id-2', + 'task:id-3', + 'task:id-4', + ]); + expect(store.bulkUpdate).toHaveBeenCalledTimes(1); + expect(store.bulkUpdate).toHaveBeenCalledWith( + [ + { + ...fetchedTasks[0], + ownerId: 'test-test', + retryAt: 
fetchedTasks[0].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[1], + ownerId: 'test-test', + retryAt: fetchedTasks[1].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[2], + ownerId: 'test-test', + retryAt: fetchedTasks[2].runAt, + status: 'claiming', + }, + { + ...fetchedTasks[3], + ownerId: 'test-test', + retryAt: fetchedTasks[3].runAt, + status: 'claiming', + }, + ], + { validate: false, excludeLargeFields: true } + ); + expect(store.bulkGet).toHaveBeenCalledWith([]); + + expect(result.stats).toEqual({ + tasksClaimed: 0, + tasksConflicted: 0, + tasksUpdated: 0, + tasksLeftUnclaimed: 0, + }); + expect(result.docs.length).toEqual(0); + }); + test('it should filter for specific partitions and tasks without partitions', async () => { const taskManagerId = uuidv4(); + const definitions = new TaskTypeDictionary(mockLogger()); + definitions.registerTaskDefinitions({ + foo: { + title: 'foo', + createTaskRunner: jest.fn(), + }, + bar: { + title: 'bar', + createTaskRunner: jest.fn(), + }, + }); const [ { args: { @@ -300,6 +1575,7 @@ describe('TaskClaiming', () => { ] = await testClaimAvailableTasks({ storeOpts: { taskManagerId, + definitions, }, taskClaimingOpts: {}, claimingOpts: { @@ -355,9 +1631,8 @@ describe('TaskClaiming', () => { Object { "terms": Object { "task.taskType": Array [ - "report", - "dernstraight", - "yawn", + "foo", + "bar", ], }, }, @@ -498,9 +1773,9 @@ describe('TaskClaiming', () => { function instantiateStoreWithMockedApiResponses({ taskManagerId = uuidv4(), definitions = taskDefinitions, - getCapacity = () => 10, + getAvailableCapacity = () => 10, tasksClaimed, - }: Partial> & { + }: Partial> & { taskManagerId?: string; tasksClaimed?: ConcreteTaskInstance[][]; } = {}) { @@ -533,7 +1808,7 @@ describe('TaskClaiming', () => { unusedTypes: [], taskStore, maxAttempts: 2, - getCapacity, + getAvailableCapacity, taskPartitioner, }); diff --git a/x-pack/plugins/task_manager/server/task_claimers/strategy_mget.ts b/x-pack/plugins/task_manager/server/task_claimers/strategy_mget.ts index 362c38166339f..7962fdd2b6f8a 100644 --- a/x-pack/plugins/task_manager/server/task_claimers/strategy_mget.ts +++ b/x-pack/plugins/task_manager/server/task_claimers/strategy_mget.ts @@ -7,9 +7,11 @@ // Basic operation of this task claimer: // - search for candidate tasks to run, more than we actually can run +// - initial search returns a slimmer task document for I/O efficiency (no params or state) // - for each task found, do an mget to get the current seq_no and primary_term // - if the mget result doesn't match the search result, the task is stale -// - from the non-stale search results, return as many as we can run +// - from the non-stale search results, return as many as we can run based on available +// capacity and the cost of each task type to run import { SavedObjectsErrorHelpers } from '@kbn/core/server'; @@ -18,7 +20,7 @@ import { Subject, Observable } from 'rxjs'; import { TaskTypeDictionary } from '../task_type_dictionary'; import { TaskClaimerOpts, ClaimOwnershipResult, getEmptyClaimOwnershipResult } from '.'; -import { ConcreteTaskInstance, TaskStatus, ConcreteTaskInstanceVersion } from '../task'; +import { ConcreteTaskInstance, TaskStatus, ConcreteTaskInstanceVersion, TaskCost } from '../task'; import { TASK_MANAGER_TRANSACTION_TYPE } from '../task_running'; import { isLimited, @@ -112,7 +114,10 @@ async function claimAvailableTasks(opts: TaskClaimerOpts): Promise { - if (task.retryAt != null && new Date(task.retryAt).getTime() < Date.now()) { - task.scheduledAt = 
task.retryAt; - } else { - task.scheduledAt = task.runAt; - } - task.retryAt = claimOwnershipUntil; - task.ownerId = taskStore.taskManagerId; - task.status = TaskStatus.Claiming; + // apply capacity constraint to candidate tasks + const tasksToRun: ConcreteTaskInstance[] = []; + const leftOverTasks: ConcreteTaskInstance[] = []; + + let capacityAccumulator = 0; + for (const task of candidateTasks) { + const taskCost = definitions.get(task.taskType)?.cost ?? TaskCost.Normal; + if (capacityAccumulator + taskCost <= initialCapacity) { + tasksToRun.push(task); + capacityAccumulator += taskCost; + } else { + leftOverTasks.push(task); + capacityAccumulator = initialCapacity; + } + } - return task; + // build the updated task objects we'll claim + const taskUpdates: ConcreteTaskInstance[] = []; + for (const task of tasksToRun) { + taskUpdates.push({ + ...task, + scheduledAt: + task.retryAt != null && new Date(task.retryAt).getTime() < Date.now() + ? task.retryAt + : task.runAt, + status: TaskStatus.Claiming, + retryAt: claimOwnershipUntil, + ownerId: taskStore.taskManagerId, }); + } // perform the task object updates, deal with errors - const finalResults: ConcreteTaskInstance[] = []; + const updatedTasks: ConcreteTaskInstance[] = []; let conflicts = staleTasks.length; let bulkErrors = 0; try { - const updateResults = await taskStore.bulkUpdate(taskUpdates, { validate: false }); + const updateResults = await taskStore.bulkUpdate(taskUpdates, { + validate: false, + excludeLargeFields: true, + }); for (const updateResult of updateResults) { if (isOk(updateResult)) { - finalResults.push(updateResult.value); + updatedTasks.push(updateResult.value); } else { const { id, type, error } = updateResult.error; @@ -209,6 +233,27 @@ async function claimAvailableTasks(opts: TaskClaimerOpts): Promise task.id))).reduce< + ConcreteTaskInstance[] + >((acc, task) => { + if (isOk(task)) { + acc.push(task.value); + } else { + const { id, type, error } = task.error; + logger.warn( + `Error getting full task ${id}:${type} during claim: ${error.message}`, + logMeta + ); + } + return acc; + }, []); + } catch (err) { + logger.warn(`Error getting full task documents during claim: ${err}`, logMeta); + } + // separate update for removed tasks; shouldn't happen often, so unlikely // a performance concern, and keeps the rest of the logic simpler let removedCount = 0; @@ -220,7 +265,10 @@ async function claimAvailableTasks(opts: TaskClaimerOpts): Promise { - beforeEach(() => { - jest.useFakeTimers(); - jest.setSystemTime(new Date(2021, 12, 30)); - }); - - afterEach(() => { - jest.useRealTimers(); - }); - - test('occupiedWorkers are a sum of running tasks', async () => { - const pool = new TaskPool({ - maxWorkers$: of(200), - logger: loggingSystemMock.create().get(), - }); - - const result = await pool.run([{ ...mockTask() }, { ...mockTask() }, { ...mockTask() }]); - - expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); - expect(pool.occupiedWorkers).toEqual(3); - }); - - test('availableWorkers are a function of total_capacity - occupiedWorkers', async () => { - const pool = new TaskPool({ - maxWorkers$: of(10), - logger: loggingSystemMock.create().get(), - }); - - const result = await pool.run([{ ...mockTask() }, { ...mockTask() }, { ...mockTask() }]); - - expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); - expect(pool.availableWorkers).toEqual(7); - }); - - test('availableWorkers is 0 until maxWorkers$ pushes a value', async () => { - const maxWorkers$ = new Subject(); - const pool = new 
TaskPool({ - maxWorkers$, - logger: loggingSystemMock.create().get(), - }); - - expect(pool.availableWorkers).toEqual(0); - maxWorkers$.next(10); - expect(pool.availableWorkers).toEqual(10); - }); - - test('does not run tasks that are beyond its available capacity', async () => { - const pool = new TaskPool({ - maxWorkers$: of(2), - logger: loggingSystemMock.create().get(), - }); - - const shouldRun = mockRun(); - const shouldNotRun = mockRun(); - - const result = await pool.run([ - { ...mockTask(), run: shouldRun }, - { ...mockTask(), run: shouldRun }, - { ...mockTask(), run: shouldNotRun }, - ]); - - expect(result).toEqual(TaskPoolRunResult.RanOutOfCapacity); - expect(pool.availableWorkers).toEqual(0); - expect(shouldRun).toHaveBeenCalledTimes(2); - expect(shouldNotRun).not.toHaveBeenCalled(); - }); - - test('should log when marking a Task as running fails', async () => { - const logger = loggingSystemMock.create().get(); - const pool = new TaskPool({ - maxWorkers$: of(2), - logger, - }); - - const taskFailedToMarkAsRunning = mockTask(); - taskFailedToMarkAsRunning.markTaskAsRunning.mockImplementation(async () => { - throw new Error(`Mark Task as running has failed miserably`); - }); - - const result = await pool.run([mockTask(), taskFailedToMarkAsRunning, mockTask()]); - - expect((logger as jest.Mocked).error.mock.calls[0]).toMatchInlineSnapshot(` - Array [ - "Failed to mark Task TaskType \\"shooooo\\" as running: Mark Task as running has failed miserably", - ] - `); - - expect(result).toEqual(TaskPoolRunResult.RunningAtCapacity); - }); - - test('should log when running a Task fails', async () => { - const logger = loggingSystemMock.create().get(); - const pool = new TaskPool({ - maxWorkers$: of(3), - logger, - }); - - const taskFailedToRun = mockTask(); - taskFailedToRun.run.mockImplementation(async () => { - throw new Error(`Run Task has failed miserably`); - }); - - const result = await pool.run([mockTask(), taskFailedToRun, mockTask()]); - - expect((logger as jest.Mocked).warn.mock.calls[0]).toMatchInlineSnapshot(` - Array [ - "Task TaskType \\"shooooo\\" failed in attempt to run: Run Task has failed miserably", - ] - `); - - expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); - }); - - test('should not log when running a Task fails due to the Task SO having been deleted while in flight', async () => { - const logger = loggingSystemMock.create().get(); - const pool = new TaskPool({ - maxWorkers$: of(3), - logger, - }); - - const taskFailedToRun = mockTask(); - taskFailedToRun.run.mockImplementation(async () => { - throw SavedObjectsErrorHelpers.createGenericNotFoundError('task', taskFailedToRun.id); - }); - - const result = await pool.run([mockTask(), taskFailedToRun, mockTask()]); - - expect(logger.debug).toHaveBeenCalledWith( - `Task TaskType "shooooo" failed in attempt to run: Saved object [task/${taskFailedToRun.id}] not found` - ); - expect(logger.warn).not.toHaveBeenCalled(); - - expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); - }); - - test('Running a task which fails still takes up capacity', async () => { - const logger = loggingSystemMock.create().get(); - const pool = new TaskPool({ - maxWorkers$: of(1), - logger, - }); - - const taskFailedToRun = mockTask(); - taskFailedToRun.run.mockImplementation(async () => { - await sleep(0); - throw new Error(`Run Task has failed miserably`); - }); - - const result = await pool.run([taskFailedToRun, mockTask()]); - - expect(result).toEqual(TaskPoolRunResult.RanOutOfCapacity); - }); - - test('clears up 
capacity when a task completes', async () => { - const pool = new TaskPool({ - maxWorkers$: of(1), - logger: loggingSystemMock.create().get(), - }); - - const firstWork = resolvable(); - const firstRun = sinon.spy(async () => { - await sleep(0); - firstWork.resolve(); - return asOk({ state: {} }); - }); - const secondWork = resolvable(); - const secondRun = sinon.spy(async () => { - await sleep(0); - secondWork.resolve(); - return asOk({ state: {} }); - }); - - const result = await pool.run([ - { ...mockTask(), run: firstRun }, - { ...mockTask(), run: secondRun }, - ]); - - expect(result).toEqual(TaskPoolRunResult.RanOutOfCapacity); - expect(pool.occupiedWorkers).toEqual(1); - expect(pool.availableWorkers).toEqual(0); - - await firstWork; - sinon.assert.calledOnce(firstRun); - sinon.assert.notCalled(secondRun); - - expect(pool.occupiedWorkers).toEqual(0); - await pool.run([{ ...mockTask(), run: secondRun }]); - expect(pool.occupiedWorkers).toEqual(1); - - expect(pool.availableWorkers).toEqual(0); - - await secondWork; - - expect(pool.occupiedWorkers).toEqual(0); - expect(pool.availableWorkers).toEqual(1); - sinon.assert.calledOnce(secondRun); - }); - - test('run cancels expired tasks prior to running new tasks', async () => { - const logger = loggingSystemMock.create().get(); - const pool = new TaskPool({ - maxWorkers$: of(2), - logger, - }); - - const haltUntilWeAfterFirstRun = resolvable(); - const taskHasExpired = resolvable(); - const haltTaskSoThatItCanBeCanceled = resolvable(); - - const shouldRun = sinon.spy(() => Promise.resolve()); - const shouldNotRun = sinon.spy(() => Promise.resolve()); - const now = new Date(); - const result = await pool.run([ - { - ...mockTask({ id: '1' }), - async run() { - await haltUntilWeAfterFirstRun; - this.isExpired = true; - taskHasExpired.resolve(); - await haltTaskSoThatItCanBeCanceled; - return asOk({ state: {} }); - }, - get expiration() { - return now; - }, - get startedAt() { - // 5 and a half minutes - return moment(now).subtract(5, 'm').subtract(30, 's').toDate(); - }, - cancel: shouldRun, - }, - { - ...mockTask({ id: '2' }), - async run() { - // halt here so that we can verify that this task is counted in `occupiedWorkers` - await haltUntilWeAfterFirstRun; - return asOk({ state: {} }); - }, - cancel: shouldNotRun, - }, - ]); - - expect(result).toEqual(TaskPoolRunResult.RunningAtCapacity); - expect(pool.occupiedWorkers).toEqual(2); - expect(pool.availableWorkers).toEqual(0); - - // release first stage in task so that it has time to expire, but not complete - haltUntilWeAfterFirstRun.resolve(); - await taskHasExpired; - - expect(await pool.run([{ ...mockTask({ id: '3' }) }])).toBeTruthy(); - - sinon.assert.calledOnce(shouldRun); - sinon.assert.notCalled(shouldNotRun); - - expect(pool.occupiedWorkers).toEqual(1); - expect(pool.availableWorkers).toEqual(1); - - haltTaskSoThatItCanBeCanceled.resolve(); - - expect(logger.warn).toHaveBeenCalledWith( - `Cancelling task TaskType "shooooo" as it expired at ${now.toISOString()} after running for 05m 30s (with timeout set at 5m).` - ); - }); - - test('calls to availableWorkers ensures we cancel expired tasks', async () => { - const pool = new TaskPool({ - maxWorkers$: of(1), - logger: loggingSystemMock.create().get(), - }); - - const taskIsRunning = resolvable(); - const taskHasExpired = resolvable(); - const cancel = sinon.spy(() => Promise.resolve()); - const now = new Date(); - expect( - await pool.run([ - { - ...mockTask(), - async run() { - await sleep(10); - this.isExpired = true; - 
taskIsRunning.resolve(); - await taskHasExpired; - return asOk({ state: {} }); - }, - get expiration() { - return new Date(now.getTime() + 10); - }, - get startedAt() { - return now; - }, - cancel, - }, - ]) - ).toEqual(TaskPoolRunResult.RunningAtCapacity); - - await taskIsRunning; - - sinon.assert.notCalled(cancel); - expect(pool.occupiedWorkers).toEqual(1); - // The call to `availableWorkers` will clear the expired task so it's 1 instead of 0 - expect(pool.availableWorkers).toEqual(1); - sinon.assert.calledOnce(cancel); - - expect(pool.occupiedWorkers).toEqual(0); - expect(pool.availableWorkers).toEqual(1); - // ensure cancel isn't called twice - sinon.assert.calledOnce(cancel); - taskHasExpired.resolve(); - }); - - test('logs if cancellation errors', async () => { - const logger = loggingSystemMock.create().get(); - const pool = new TaskPool({ - logger, - maxWorkers$: of(20), - }); - - const cancelled = resolvable(); - const result = await pool.run([ - { - ...mockTask(), - async run() { - this.isExpired = true; - await sleep(10); - return asOk({ state: {} }); - }, - async cancel() { - cancelled.resolve(); - throw new Error('Dern!'); - }, - toString: () => '"shooooo!"', - }, - ]); - - expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); - await pool.run([]); - - expect(pool.occupiedWorkers).toEqual(0); - - // Allow the task to cancel... - await cancelled; - - expect((logger as jest.Mocked).error.mock.calls[0][0]).toMatchInlineSnapshot( - `"Failed to cancel task \\"shooooo!\\": Error: Dern!"` - ); - }); - - test('only allows one task with the same id in the task pool', async () => { - const logger = loggingSystemMock.create().get(); - const pool = new TaskPool({ - maxWorkers$: of(2), - logger, - }); - - const shouldRun = mockRun(); - const shouldNotRun = mockRun(); - - const taskId = uuidv4(); - const task1 = mockTask({ id: taskId, run: shouldRun }); - const task2 = mockTask({ - id: taskId, - run: shouldNotRun, - isSameTask() { - return true; - }, - }); - - await pool.run([task1]); - await pool.run([task2]); - - expect(shouldRun).toHaveBeenCalledTimes(1); - expect(shouldNotRun).not.toHaveBeenCalled(); - }); - - // This test is from https://github.com/elastic/kibana/issues/172116 - // It's not clear how to reproduce the actual error, but it is easy to - // reproduce with the wacky test below. It does log the exact error - // from that issue, without the corresponding fix in task_pool.ts - test('works when available workers is 0 but there are tasks to run', async () => { - const logger = loggingSystemMock.create().get(); - const pool = new TaskPool({ - maxWorkers$: of(2), - logger, - }); - - const shouldRun = mockRun(); - - const taskId = uuidv4(); - const task1 = mockTask({ id: taskId, run: shouldRun }); - - // we need to alternate the values of `availableWorkers`. First it - // should be 0, then 1, then 0, then 1, etc. This will cause task_pool.run - // to partition tasks (0 to run, everything as leftover), then at the - // end of run(), to check if it should recurse, it should be > 0. 
- let awValue = 1; - Object.defineProperty(pool, 'availableWorkers', { - get() { - return ++awValue % 2; - }, - }); - - const result = await pool.run([task1]); - expect(result).toBe(TaskPoolRunResult.RanOutOfCapacity); - - expect((logger as jest.Mocked).warn.mock.calls[0]).toMatchInlineSnapshot(` - Array [ - "task pool run attempts exceeded 3; assuming ran out of capacity; availableWorkers: 0, tasksToRun: 0, leftOverTasks: 1, maxWorkers: 2, occupiedWorkers: 0, workerLoad: 0", - ] - `); - }); - - function mockRun() { - return jest.fn(async () => { - await sleep(0); - return asOk({ state: {} }); - }); - } - - function mockTask(overrides = {}) { - return { - isExpired: false, - taskExecutionId: uuidv4(), - id: uuidv4(), - cancel: async () => undefined, - markTaskAsRunning: jest.fn(async () => true), - run: mockRun(), - stage: TaskRunningStage.PENDING, - toString: () => `TaskType "shooooo"`, - isAdHocTaskAndOutOfAttempts: false, - removeTask: jest.fn(), - get expiration() { - return new Date(); - }, - get startedAt() { - return new Date(); - }, - get definition() { - return { - type: '', - title: '', - timeout: '5m', - createTaskRunner: jest.fn(), - }; - }, - isSameTask() { - return false; - }, - ...overrides, - }; - } -}); diff --git a/x-pack/plugins/task_manager/server/task_pool/capacity.mock.ts b/x-pack/plugins/task_manager/server/task_pool/capacity.mock.ts new file mode 100644 index 0000000000000..ed3fd3b07f07c --- /dev/null +++ b/x-pack/plugins/task_manager/server/task_pool/capacity.mock.ts @@ -0,0 +1,21 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ +const createCapacityMock = () => { + return jest.fn().mockImplementation(() => { + return { + determineTasksToRunBasedOnCapacity: jest.fn(), + getUsedCapacityByType: jest.fn(), + usedCapacityPercentage: jest.fn(), + usedCapacity: jest.fn(), + capacity: jest.fn(), + }; + }); +}; + +export const capacityMock = { + create: createCapacityMock(), +}; diff --git a/x-pack/plugins/task_manager/server/task_pool/cost_capacity.test.ts b/x-pack/plugins/task_manager/server/task_pool/cost_capacity.test.ts new file mode 100644 index 0000000000000..b40c6eb2af37d --- /dev/null +++ b/x-pack/plugins/task_manager/server/task_pool/cost_capacity.test.ts @@ -0,0 +1,171 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import { loggingSystemMock } from '@kbn/core/server/mocks'; +import { of, Subject } from 'rxjs'; +import { TaskCost } from '../task'; +import { CostCapacity } from './cost_capacity'; +import { mockTask } from './test_utils'; + +const logger = loggingSystemMock.create().get(); + +describe('CostCapacity', () => { + beforeEach(() => { + jest.resetAllMocks(); + }); + + test('capacity responds to changes from capacity$ observable', () => { + const capacity$ = new Subject(); + const pool = new CostCapacity({ capacity$, logger }); + + expect(pool.capacity).toBe(0); + + capacity$.next(20); + expect(pool.capacity).toBe(40); + + capacity$.next(16); + expect(pool.capacity).toBe(32); + + expect(logger.debug).toHaveBeenCalledTimes(2); + expect(logger.debug).toHaveBeenNthCalledWith( + 1, + `Task pool now using 40 as the max allowed cost which is based on a capacity of 20` + ); + expect(logger.debug).toHaveBeenNthCalledWith( + 2, + `Task pool now using 32 as the max allowed cost which is based on a capacity of 16` + ); + }); + + test('usedCapacity returns the sum of costs of tasks in the pool', () => { + const pool = new CostCapacity({ capacity$: of(10), logger }); + + const tasksInPool = new Map([ + ['1', { ...mockTask() }], + ['2', { ...mockTask({}, { cost: TaskCost.Tiny }) }], + ['3', { ...mockTask() }], + ]); + + expect(pool.usedCapacity(tasksInPool)).toBe(5); + }); + + test('usedCapacityPercentage returns the percentage of capacity used based on cost of tasks in the pool', () => { + const pool = new CostCapacity({ capacity$: of(10), logger }); + + const tasksInPool = new Map([ + ['1', { ...mockTask() }], + ['2', { ...mockTask({}, { cost: TaskCost.Tiny }) }], + ['3', { ...mockTask() }], + ]); + + expect(pool.usedCapacityPercentage(tasksInPool)).toBe(25); + }); + + test('usedCapacityByType returns the sum of costs of tasks of the specified type in the pool', () => { + const pool = new CostCapacity({ capacity$: of(10), logger }); + + const tasksInPool = [ + { ...mockTask({}, { type: 'type1' }) }, + { ...mockTask({}, { type: 'type1', cost: TaskCost.Tiny }) }, + { ...mockTask({}, { type: 'type2' }) }, + ]; + + expect(pool.getUsedCapacityByType(tasksInPool, 'type1')).toBe(3); + expect(pool.getUsedCapacityByType(tasksInPool, 'type2')).toBe(2); + expect(pool.getUsedCapacityByType(tasksInPool, 'type3')).toBe(0); + }); + + test('availableCapacity returns the full available capacity when no task type is defined', () => { + const pool = new CostCapacity({ capacity$: of(10), logger }); + + const tasksInPool = new Map([ + ['1', { ...mockTask() }], + ['2', { ...mockTask({}, { cost: TaskCost.Tiny }) }], + ['3', { ...mockTask() }], + ]); + + expect(pool.availableCapacity(tasksInPool)).toBe(15); + }); + + test('availableCapacity returns the full available capacity when task type with no maxConcurrency is provided', () => { + const pool = new CostCapacity({ capacity$: of(10), logger }); + + const tasksInPool = new Map([ + ['1', { ...mockTask() }], + ['2', { ...mockTask({}, { cost: TaskCost.Tiny }) }], + ['3', { ...mockTask() }], + ]); + + expect( + pool.availableCapacity(tasksInPool, { + type: 'type1', + cost: TaskCost.Normal, + createTaskRunner: jest.fn(), + timeout: '5m', + }) + ).toBe(15); + }); + + test('availableCapacity returns the available capacity for the task type when task type with maxConcurrency is provided', () => { + const pool = new CostCapacity({ capacity$: of(10), logger }); + + const tasksInPool = new Map([ + ['1', { ...mockTask({}, { type: 'type1' }) }], + ['2', { ...mockTask({}, { cost:
TaskCost.Tiny }) }], + ['3', { ...mockTask() }], + ]); + + expect( + pool.availableCapacity(tasksInPool, { + type: 'type1', + maxConcurrency: 3, + cost: TaskCost.Normal, + createTaskRunner: jest.fn(), + timeout: '5m', + }) + ).toBe(4); + }); + + describe('determineTasksToRunBasedOnCapacity', () => { + test('runs all tasks if there is capacity', () => { + const pool = new CostCapacity({ capacity$: of(10), logger }); + const tasks = [{ ...mockTask() }, { ...mockTask() }, { ...mockTask() }]; + const [tasksToRun, leftoverTasks] = pool.determineTasksToRunBasedOnCapacity(tasks, 20); + + expect(tasksToRun).toEqual(tasks); + expect(leftoverTasks).toEqual([]); + }); + + test('runs tasks in order until capacity is reached', () => { + const pool = new CostCapacity({ capacity$: of(10), logger }); + const tasks = [ + { ...mockTask() }, + { ...mockTask() }, + { ...mockTask() }, + { ...mockTask({}, { cost: TaskCost.ExtraLarge }) }, + { ...mockTask({}, { cost: TaskCost.ExtraLarge }) }, + // technically have capacity for these tasks if we skip the previous task, but we're running + // in order to avoid possibly starving large cost tasks + { ...mockTask() }, + { ...mockTask() }, + ]; + const [tasksToRun, leftoverTasks] = pool.determineTasksToRunBasedOnCapacity(tasks, 20); + + expect(tasksToRun).toEqual([tasks[0], tasks[1], tasks[2], tasks[3]]); + expect(leftoverTasks).toEqual([tasks[4], tasks[5], tasks[6]]); + }); + + test('does not run tasks if there is no capacity', () => { + const pool = new CostCapacity({ capacity$: of(10), logger }); + const tasks = [{ ...mockTask() }, { ...mockTask() }, { ...mockTask() }]; + const [tasksToRun, leftoverTasks] = pool.determineTasksToRunBasedOnCapacity(tasks, 1); + + expect(tasksToRun).toEqual([]); + expect(leftoverTasks).toEqual(tasks); + }); + }); +}); diff --git a/x-pack/plugins/task_manager/server/task_pool/cost_capacity.ts b/x-pack/plugins/task_manager/server/task_pool/cost_capacity.ts new file mode 100644 index 0000000000000..8073cb374c5ff --- /dev/null +++ b/x-pack/plugins/task_manager/server/task_pool/cost_capacity.ts @@ -0,0 +1,109 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { Logger } from '@kbn/core/server'; +import { TaskDefinition } from '../task'; +import { TaskRunner } from '../task_running'; +import { CapacityOpts, ICapacity } from './types'; +import { getCapacityInCost } from './utils'; + +export class CostCapacity implements ICapacity { + private maxAllowedCost: number = 0; + private logger: Logger; + + constructor(opts: CapacityOpts) { + this.logger = opts.logger; + opts.capacity$.subscribe((capacity) => { + // Capacity config describes the number of normal-cost tasks that can be + // run simultaneously. Multiply by the cost of a normal-cost task to determine + // the maximum allowed cost + this.maxAllowedCost = getCapacityInCost(capacity); + this.logger.debug( + `Task pool now using ${this.maxAllowedCost} as the max allowed cost which is based on a capacity of ${capacity}` + ); + }); + } + + public get capacity(): number { + return this.maxAllowedCost; + } + + /** + * Gets how much capacity is currently in use.
+ */ + public usedCapacity(tasksInPool: Map) { + let result = 0; + tasksInPool.forEach((task) => { + result += task.definition.cost; + }); + return result; + } + + /** + * Gets % of capacity in use + */ + public usedCapacityPercentage(tasksInPool: Map) { + return this.capacity ? Math.round((this.usedCapacity(tasksInPool) * 100) / this.capacity) : 100; + } + + /** + * Gets how much capacity is currently in use by each type. + */ + public getUsedCapacityByType(tasksInPool: TaskRunner[], type: string) { + return tasksInPool.reduce( + (count, runningTask) => + runningTask.definition.type === type ? count + runningTask.definition.cost : count, + 0 + ); + } + + public availableCapacity( + tasksInPool: Map, + taskDefinition?: TaskDefinition | null + ): number { + const allAvailableCapacity = this.capacity - this.usedCapacity(tasksInPool); + if (taskDefinition && taskDefinition.maxConcurrency) { + // calculate the max capacity that can be used for this task type based on cost + const maxCapacityForType = taskDefinition.maxConcurrency * taskDefinition.cost; + return Math.max( + Math.min( + allAvailableCapacity, + maxCapacityForType - + this.getUsedCapacityByType([...tasksInPool.values()], taskDefinition.type) + ), + 0 + ); + } + + return allAvailableCapacity; + } + + public determineTasksToRunBasedOnCapacity( + tasks: TaskRunner[], + availableCapacity: number + ): [TaskRunner[], TaskRunner[]] { + const tasksToRun: TaskRunner[] = []; + const leftOverTasks: TaskRunner[] = []; + + let capacityAccumulator = 0; + for (const task of tasks) { + const taskCost = task.definition.cost; + if (capacityAccumulator + taskCost <= availableCapacity) { + tasksToRun.push(task); + capacityAccumulator += taskCost; + } else { + leftOverTasks.push(task); + // Don't claim further tasks even if lower cost tasks are next. + // It may be an extra large task and we need to make room for it + // for the next claiming cycle + capacityAccumulator = availableCapacity; + } + } + + return [tasksToRun, leftOverTasks]; + } +} diff --git a/x-pack/plugins/task_manager/server/task_pool/index.ts b/x-pack/plugins/task_manager/server/task_pool/index.ts new file mode 100644 index 0000000000000..979a4536639a6 --- /dev/null +++ b/x-pack/plugins/task_manager/server/task_pool/index.ts @@ -0,0 +1,9 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +export { TaskPool, TaskPoolRunResult } from './task_pool'; +export { getCapacityInCost, getCapacityInWorkers } from './utils'; diff --git a/x-pack/plugins/task_manager/server/task_pool.mock.ts b/x-pack/plugins/task_manager/server/task_pool/task_pool.mock.ts similarity index 58% rename from x-pack/plugins/task_manager/server/task_pool.mock.ts rename to x-pack/plugins/task_manager/server/task_pool/task_pool.mock.ts index 77568c8c6cdfa..00c3cfae16317 100644 --- a/x-pack/plugins/task_manager/server/task_pool.mock.ts +++ b/x-pack/plugins/task_manager/server/task_pool/task_pool.mock.ts @@ -8,16 +8,14 @@ import { TaskPool } from './task_pool'; const defaultGetCapacityOverride: () => Partial<{ load: number; - occupiedWorkers: number; - workerLoad: number; - max: number; - availableWorkers: number; + usedCapacity: number; + usedCapacityPercentage: number; + availableCapacity: number; }> = () => ({ load: 0, - occupiedWorkers: 0, - workerLoad: 0, - max: 10, - availableWorkers: 10, + usedCapacity: 0, + usedCapacityPercentage: 0, + availableCapacity: 20, }); const createTaskPoolMock = (getCapacityOverride = defaultGetCapacityOverride) => { @@ -25,19 +23,16 @@ const createTaskPoolMock = (getCapacityOverride = defaultGetCapacityOverride) => get load() { return getCapacityOverride().load ?? 0; }, - get occupiedWorkers() { - return getCapacityOverride().occupiedWorkers ?? 0; + get usedCapacity() { + return getCapacityOverride().usedCapacity ?? 0; }, - get workerLoad() { - return getCapacityOverride().workerLoad ?? 0; + get usedCapacityPercentage() { + return getCapacityOverride().usedCapacityPercentage ?? 0; }, - get max() { - return getCapacityOverride().max ?? 10; + availableCapacity() { + return getCapacityOverride().availableCapacity ?? 20; }, - get availableWorkers() { - return getCapacityOverride().availableWorkers ?? 10; - }, - getOccupiedWorkersByType: jest.fn(), + getUsedCapacityByType: jest.fn(), run: jest.fn(), cancelRunningTasks: jest.fn(), } as unknown as jest.Mocked; diff --git a/x-pack/plugins/task_manager/server/task_pool/task_pool.test.ts b/x-pack/plugins/task_manager/server/task_pool/task_pool.test.ts new file mode 100644 index 0000000000000..e2936b7ccec0a --- /dev/null +++ b/x-pack/plugins/task_manager/server/task_pool/task_pool.test.ts @@ -0,0 +1,867 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ +import sinon from 'sinon'; +import { of, Subject } from 'rxjs'; +import { TaskPool, TaskPoolRunResult } from './task_pool'; +import { resolvable, sleep } from '../test_utils'; +import { loggingSystemMock } from '@kbn/core/server/mocks'; +import { Logger } from '@kbn/core/server'; +import { asOk } from '../lib/result_type'; +import { SavedObjectsErrorHelpers } from '@kbn/core/server'; +import moment from 'moment'; +import { v4 as uuidv4 } from 'uuid'; +import { TaskCost } from '../task'; +import * as CostCapacityModule from './cost_capacity'; +import * as WorkerCapacityModule from './worker_capacity'; +import { capacityMock } from './capacity.mock'; +import { CLAIM_STRATEGY_DEFAULT, CLAIM_STRATEGY_MGET } from '../config'; +import { mockRun, mockTask } from './test_utils'; +import { TaskTypeDictionary } from '../task_type_dictionary'; + +jest.mock('../constants', () => ({ + CONCURRENCY_ALLOW_LIST_BY_TASK_TYPE: ['report', 'quickReport'], +})); + +describe('TaskPool', () => { + const costCapacityMock = capacityMock.create(); + const workerCapacityMock = capacityMock.create(); + const logger = loggingSystemMock.create().get(); + + const definitions = new TaskTypeDictionary(logger); + definitions.registerTaskDefinitions({ + report: { + title: 'report', + maxConcurrency: 1, + cost: TaskCost.ExtraLarge, + createTaskRunner: jest.fn(), + }, + quickReport: { + title: 'quickReport', + maxConcurrency: 5, + createTaskRunner: jest.fn(), + }, + }); + + beforeEach(() => { + jest.resetAllMocks(); + jest.useFakeTimers(); + jest.setSystemTime(new Date(2021, 12, 30)); + }); + + afterEach(() => { + jest.useRealTimers(); + }); + + describe('uses the correct capacity calculator based on the strategy', () => { + let costCapacitySpy: jest.SpyInstance; + let workerCapacitySpy: jest.SpyInstance; + beforeEach(() => { + costCapacitySpy = jest + .spyOn(CostCapacityModule, 'CostCapacity') + .mockImplementation(() => costCapacityMock); + + workerCapacitySpy = jest + .spyOn(WorkerCapacityModule, 'WorkerCapacity') + .mockImplementation(() => workerCapacityMock); + }); + + afterEach(() => { + costCapacitySpy.mockRestore(); + workerCapacitySpy.mockRestore(); + }); + + test('uses CostCapacity to calculate capacity when strategy is mget', () => { + new TaskPool({ capacity$: of(20), definitions, logger, strategy: CLAIM_STRATEGY_MGET }); + + expect(CostCapacityModule.CostCapacity).toHaveBeenCalledTimes(1); + expect(WorkerCapacityModule.WorkerCapacity).not.toHaveBeenCalled(); + }); + + test('uses WorkerCapacity to calculate capacity when strategy is default', () => { + new TaskPool({ capacity$: of(20), definitions, logger, strategy: CLAIM_STRATEGY_DEFAULT }); + + expect(CostCapacityModule.CostCapacity).not.toHaveBeenCalled(); + expect(WorkerCapacityModule.WorkerCapacity).toHaveBeenCalledTimes(1); + }); + + test('uses WorkerCapacity to calculate capacity when strategy is unrecognized', () => { + new TaskPool({ capacity$: of(20), definitions, logger, strategy: 'any old strategy' }); + + expect(CostCapacityModule.CostCapacity).not.toHaveBeenCalled(); + expect(WorkerCapacityModule.WorkerCapacity).toHaveBeenCalledTimes(1); + }); + }); + + describe('with CLAIM_STRATEGY_DEFAULT', () => { + test('usedCapacity is the number running tasks', async () => { + const pool = new TaskPool({ + capacity$: of(10), + definitions, + logger, + strategy: CLAIM_STRATEGY_DEFAULT, + }); + + const result = await pool.run([{ ...mockTask() }, { ...mockTask() }, { ...mockTask() }]); + + expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); + 
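The capacity figures asserted in these TaskPool tests are expressed in two different units depending on the claim strategy. A rough cheat sheet, assuming the conversion implied by the CostCapacity tests above (a normal-cost task costs 2, so a capacity of 10 becomes a cost budget of 20); the helper names below are illustrative, not part of the change.

// Default (worker-based) strategy: capacity is a worker count, one unit per running task.
//   capacity 10, three tasks running -> usedCapacity 3, availableCapacity 7
// mget (cost-based) strategy: capacity is converted into a cost budget.
//   capacity 10 -> max allowed cost 20; three normal tasks -> usedCapacity 6, availableCapacity 14
const ASSUMED_NORMAL_TASK_COST = 2; // inferred from the CostCapacity expectations in this patch

function workerBasedAvailable(capacity: number, runningTaskCount: number): number {
  return capacity - runningTaskCount;
}

function costBasedAvailable(capacity: number, runningCostTotal: number): number {
  return capacity * ASSUMED_NORMAL_TASK_COST - runningCostTotal;
}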
expect(pool.usedCapacity).toEqual(3); + }); + + test('availableCapacity are a function of total_capacity - usedCapacity', async () => { + const pool = new TaskPool({ + capacity$: of(10), + definitions, + logger, + strategy: CLAIM_STRATEGY_DEFAULT, + }); + + const result = await pool.run([{ ...mockTask() }, { ...mockTask() }, { ...mockTask() }]); + + expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); + expect(pool.availableCapacity()).toEqual(7); + }); + + test('availableCapacity is 0 until capacity$ pushes a value', async () => { + const capacity$ = new Subject(); + const pool = new TaskPool({ + capacity$, + definitions, + logger, + strategy: CLAIM_STRATEGY_DEFAULT, + }); + + expect(pool.availableCapacity()).toEqual(0); + capacity$.next(10); + expect(pool.availableCapacity()).toEqual(10); + }); + + test('does not run tasks that are beyond its available capacity', async () => { + const pool = new TaskPool({ + capacity$: of(2), + definitions, + logger, + strategy: CLAIM_STRATEGY_DEFAULT, + }); + + const shouldRun = mockRun(); + const shouldNotRun = mockRun(); + + const result = await pool.run([ + { ...mockTask(), run: shouldRun }, + { ...mockTask(), run: shouldRun }, + { ...mockTask(), run: shouldNotRun }, + ]); + + expect(result).toEqual(TaskPoolRunResult.RanOutOfCapacity); + expect(pool.availableCapacity()).toEqual(0); + expect(shouldRun).toHaveBeenCalledTimes(2); + expect(shouldNotRun).not.toHaveBeenCalled(); + }); + + test('should log when marking a Task as running fails', async () => { + const pool = new TaskPool({ + capacity$: of(3), + definitions, + logger, + strategy: CLAIM_STRATEGY_DEFAULT, + }); + + const taskFailedToMarkAsRunning = mockTask(); + taskFailedToMarkAsRunning.markTaskAsRunning.mockImplementation(async () => { + throw new Error(`Mark Task as running has failed miserably`); + }); + + const result = await pool.run([mockTask(), taskFailedToMarkAsRunning, mockTask()]); + + expect((logger as jest.Mocked).error.mock.calls[0]).toMatchInlineSnapshot(` + Array [ + "Failed to mark Task TaskType \\"shooooo\\" as running: Mark Task as running has failed miserably", + ] + `); + + expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); + }); + + test('should log when running a Task fails', async () => { + const pool = new TaskPool({ + capacity$: of(3), + definitions, + logger, + strategy: CLAIM_STRATEGY_DEFAULT, + }); + + const taskFailedToRun = mockTask(); + taskFailedToRun.run.mockImplementation(async () => { + throw new Error(`Run Task has failed miserably`); + }); + + const result = await pool.run([mockTask(), taskFailedToRun, mockTask()]); + + expect((logger as jest.Mocked).warn.mock.calls[0]).toMatchInlineSnapshot(` + Array [ + "Task TaskType \\"shooooo\\" failed in attempt to run: Run Task has failed miserably", + ] + `); + + expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); + }); + + test('should not log when running a Task fails due to the Task SO having been deleted while in flight', async () => { + const pool = new TaskPool({ + capacity$: of(3), + definitions, + logger, + strategy: CLAIM_STRATEGY_DEFAULT, + }); + + const taskFailedToRun = mockTask(); + taskFailedToRun.run.mockImplementation(async () => { + throw SavedObjectsErrorHelpers.createGenericNotFoundError('task', taskFailedToRun.id); + }); + + const result = await pool.run([mockTask(), taskFailedToRun, mockTask()]); + + expect(logger.debug).toHaveBeenCalledWith( + `Task TaskType "shooooo" failed in attempt to run: Saved object [task/${taskFailedToRun.id}] not found` + ); + 
expect(logger.warn).not.toHaveBeenCalled(); + + expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); + }); + + test('Running a task which fails still takes up capacity', async () => { + const pool = new TaskPool({ + capacity$: of(1), + definitions, + logger, + strategy: CLAIM_STRATEGY_DEFAULT, + }); + + const taskFailedToRun = mockTask(); + taskFailedToRun.run.mockImplementation(async () => { + await sleep(0); + throw new Error(`Run Task has failed miserably`); + }); + + const result = await pool.run([taskFailedToRun, mockTask()]); + + expect(result).toEqual(TaskPoolRunResult.RanOutOfCapacity); + }); + + test('clears up capacity when a task completes', async () => { + const pool = new TaskPool({ + capacity$: of(1), + definitions, + logger, + strategy: CLAIM_STRATEGY_DEFAULT, + }); + + const firstWork = resolvable(); + const firstRun = sinon.spy(async () => { + await sleep(0); + firstWork.resolve(); + return asOk({ state: {} }); + }); + const secondWork = resolvable(); + const secondRun = sinon.spy(async () => { + await sleep(0); + secondWork.resolve(); + return asOk({ state: {} }); + }); + + const result = await pool.run([ + { ...mockTask(), run: firstRun }, + { ...mockTask(), run: secondRun }, + ]); + + expect(result).toEqual(TaskPoolRunResult.RanOutOfCapacity); + expect(pool.usedCapacity).toEqual(1); + expect(pool.availableCapacity()).toEqual(0); + + await firstWork; + sinon.assert.calledOnce(firstRun); + sinon.assert.notCalled(secondRun); + + expect(pool.usedCapacity).toEqual(0); + await pool.run([{ ...mockTask(), run: secondRun }]); + expect(pool.usedCapacity).toEqual(1); + + expect(pool.availableCapacity()).toEqual(0); + + await secondWork; + + expect(pool.usedCapacity).toEqual(0); + expect(pool.availableCapacity()).toEqual(1); + sinon.assert.calledOnce(secondRun); + }); + + test('run cancels expired tasks prior to running new tasks', async () => { + const pool = new TaskPool({ + capacity$: of(2), + definitions, + logger, + strategy: CLAIM_STRATEGY_DEFAULT, + }); + + const haltUntilWeAfterFirstRun = resolvable(); + const taskHasExpired = resolvable(); + const haltTaskSoThatItCanBeCanceled = resolvable(); + + const shouldRun = sinon.spy(() => Promise.resolve()); + const shouldNotRun = sinon.spy(() => Promise.resolve()); + const now = new Date(); + const result = await pool.run([ + { + ...mockTask({ id: '1' }), + async run() { + await haltUntilWeAfterFirstRun; + this.isExpired = true; + taskHasExpired.resolve(); + await haltTaskSoThatItCanBeCanceled; + return asOk({ state: {} }); + }, + get expiration() { + return now; + }, + get startedAt() { + // 5 and a half minutes + return moment(now).subtract(5, 'm').subtract(30, 's').toDate(); + }, + cancel: shouldRun, + }, + { + ...mockTask({ id: '2' }), + async run() { + // halt here so that we can verify that this task is counted in `occupiedWorkers` + await haltUntilWeAfterFirstRun; + return asOk({ state: {} }); + }, + cancel: shouldNotRun, + }, + ]); + + expect(result).toEqual(TaskPoolRunResult.RunningAtCapacity); + expect(pool.usedCapacity).toEqual(2); + expect(pool.availableCapacity()).toEqual(0); + + // release first stage in task so that it has time to expire, but not complete + haltUntilWeAfterFirstRun.resolve(); + await taskHasExpired; + + expect(await pool.run([{ ...mockTask({ id: '3' }) }])).toBeTruthy(); + + sinon.assert.calledOnce(shouldRun); + sinon.assert.notCalled(shouldNotRun); + + expect(pool.usedCapacity).toEqual(1); + expect(pool.availableCapacity()).toEqual(1); + + haltTaskSoThatItCanBeCanceled.resolve(); + + 
expect(logger.warn).toHaveBeenCalledWith( + `Cancelling task TaskType "shooooo" as it expired at ${now.toISOString()} after running for 05m 30s (with timeout set at 5m).` + ); + }); + + test('calls to availableWorkers ensures we cancel expired tasks', async () => { + const pool = new TaskPool({ + capacity$: of(1), + definitions, + logger, + strategy: CLAIM_STRATEGY_DEFAULT, + }); + + const taskIsRunning = resolvable(); + const taskHasExpired = resolvable(); + const cancel = sinon.spy(() => Promise.resolve()); + const now = new Date(); + expect( + await pool.run([ + { + ...mockTask(), + async run() { + await sleep(10); + this.isExpired = true; + taskIsRunning.resolve(); + await taskHasExpired; + return asOk({ state: {} }); + }, + get expiration() { + return new Date(now.getTime() + 10); + }, + get startedAt() { + return now; + }, + cancel, + }, + ]) + ).toEqual(TaskPoolRunResult.RunningAtCapacity); + + await taskIsRunning; + + sinon.assert.notCalled(cancel); + expect(pool.usedCapacity).toEqual(1); + // The call to `availableCapacity` will clear the expired task so it's 1 instead of 0 + expect(pool.availableCapacity()).toEqual(1); + sinon.assert.calledOnce(cancel); + + expect(pool.usedCapacity).toEqual(0); + expect(pool.availableCapacity()).toEqual(1); + // ensure cancel isn't called twice + sinon.assert.calledOnce(cancel); + taskHasExpired.resolve(); + }); + + test('logs if cancellation errors', async () => { + const pool = new TaskPool({ + capacity$: of(10), + definitions, + logger, + strategy: CLAIM_STRATEGY_DEFAULT, + }); + + const cancelled = resolvable(); + const result = await pool.run([ + { + ...mockTask(), + async run() { + this.isExpired = true; + await sleep(10); + return asOk({ state: {} }); + }, + async cancel() { + cancelled.resolve(); + throw new Error('Dern!'); + }, + toString: () => '"shooooo!"', + }, + ]); + + expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); + await pool.run([]); + + expect(pool.usedCapacity).toEqual(0); + + // Allow the task to cancel... 
+ await cancelled; + + expect((logger as jest.Mocked).error.mock.calls[0][0]).toMatchInlineSnapshot( + `"Failed to cancel task \\"shooooo!\\": Error: Dern!"` + ); + }); + + test('only allows one task with the same id in the task pool', async () => { + const pool = new TaskPool({ + capacity$: of(2), + definitions, + logger, + strategy: CLAIM_STRATEGY_DEFAULT, + }); + + const shouldRun = mockRun(); + const shouldNotRun = mockRun(); + + const taskId = uuidv4(); + const task1 = mockTask({ id: taskId, run: shouldRun }); + const task2 = mockTask({ + id: taskId, + run: shouldNotRun, + isSameTask() { + return true; + }, + }); + + await pool.run([task1]); + await pool.run([task2]); + + expect(shouldRun).toHaveBeenCalledTimes(1); + expect(shouldNotRun).not.toHaveBeenCalled(); + }); + }); + + describe('with CLAIM_STRATEGY_MGET', () => { + test('usedCapacity is the sum of the cost of running tasks', async () => { + const pool = new TaskPool({ + capacity$: of(10), + definitions, + logger, + strategy: CLAIM_STRATEGY_MGET, + }); + + const result = await pool.run([{ ...mockTask() }, { ...mockTask() }, { ...mockTask() }]); + + expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); + expect(pool.usedCapacity).toEqual(3 * TaskCost.Normal); + }); + + test('availableCapacity are a function of total_capacity - usedCapacity', async () => { + const pool = new TaskPool({ + capacity$: of(10), + definitions, + logger, + strategy: CLAIM_STRATEGY_MGET, + }); + + const result = await pool.run([{ ...mockTask() }, { ...mockTask() }, { ...mockTask() }]); + + expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); + expect(pool.availableCapacity()).toEqual(14); + }); + + test('availableCapacity is 0 until capacity$ pushes a value', async () => { + const capacity$ = new Subject(); + const pool = new TaskPool({ capacity$, definitions, logger, strategy: CLAIM_STRATEGY_MGET }); + + expect(pool.availableCapacity()).toEqual(0); + capacity$.next(20); + expect(pool.availableCapacity()).toEqual(40); + }); + + test('does not run tasks that are beyond its available capacity', async () => { + const pool = new TaskPool({ + capacity$: of(2), + definitions, + logger, + strategy: CLAIM_STRATEGY_MGET, + }); + + const shouldRun = mockRun(); + const shouldNotRun = mockRun(); + + const result = await pool.run([ + { ...mockTask(), run: shouldRun }, + { ...mockTask(), run: shouldRun }, + { ...mockTask(), run: shouldNotRun }, + ]); + + expect(result).toEqual(TaskPoolRunResult.RanOutOfCapacity); + expect(pool.availableCapacity()).toEqual(0); + expect(shouldRun).toHaveBeenCalledTimes(2); + expect(shouldNotRun).not.toHaveBeenCalled(); + }); + + test('should log when marking a Task as running fails', async () => { + const pool = new TaskPool({ + capacity$: of(6), + definitions, + logger, + strategy: CLAIM_STRATEGY_MGET, + }); + + const taskFailedToMarkAsRunning = mockTask(); + taskFailedToMarkAsRunning.markTaskAsRunning.mockImplementation(async () => { + throw new Error(`Mark Task as running has failed miserably`); + }); + + const result = await pool.run([mockTask(), taskFailedToMarkAsRunning, mockTask()]); + + expect((logger as jest.Mocked).error.mock.calls[0]).toMatchInlineSnapshot(` + Array [ + "Failed to mark Task TaskType \\"shooooo\\" as running: Mark Task as running has failed miserably", + ] + `); + + expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); + }); + + test('should log when running a Task fails', async () => { + const pool = new TaskPool({ + capacity$: of(3), + definitions, + logger, + strategy: 
CLAIM_STRATEGY_MGET, + }); + + const taskFailedToRun = mockTask(); + taskFailedToRun.run.mockImplementation(async () => { + throw new Error(`Run Task has failed miserably`); + }); + + const result = await pool.run([mockTask(), taskFailedToRun, mockTask()]); + + expect((logger as jest.Mocked).warn.mock.calls[0]).toMatchInlineSnapshot(` + Array [ + "Task TaskType \\"shooooo\\" failed in attempt to run: Run Task has failed miserably", + ] + `); + + expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); + }); + + test('should not log when running a Task fails due to the Task SO having been deleted while in flight', async () => { + const pool = new TaskPool({ + capacity$: of(3), + definitions, + logger, + strategy: CLAIM_STRATEGY_MGET, + }); + + const taskFailedToRun = mockTask(); + taskFailedToRun.run.mockImplementation(async () => { + throw SavedObjectsErrorHelpers.createGenericNotFoundError('task', taskFailedToRun.id); + }); + + const result = await pool.run([mockTask(), taskFailedToRun, mockTask()]); + + expect(logger.debug).toHaveBeenCalledWith( + `Task TaskType "shooooo" failed in attempt to run: Saved object [task/${taskFailedToRun.id}] not found` + ); + expect(logger.warn).not.toHaveBeenCalled(); + + expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); + }); + + test('Running a task which fails still takes up capacity', async () => { + const pool = new TaskPool({ + capacity$: of(1), + definitions, + logger, + strategy: CLAIM_STRATEGY_MGET, + }); + + const taskFailedToRun = mockTask(); + taskFailedToRun.run.mockImplementation(async () => { + await sleep(0); + throw new Error(`Run Task has failed miserably`); + }); + + const result = await pool.run([taskFailedToRun, mockTask()]); + + expect(result).toEqual(TaskPoolRunResult.RanOutOfCapacity); + }); + + test('clears up capacity when a task completes', async () => { + const pool = new TaskPool({ + capacity$: of(1), + definitions, + logger, + strategy: CLAIM_STRATEGY_MGET, + }); + + const firstWork = resolvable(); + const firstRun = sinon.spy(async () => { + await sleep(0); + firstWork.resolve(); + return asOk({ state: {} }); + }); + const secondWork = resolvable(); + const secondRun = sinon.spy(async () => { + await sleep(0); + secondWork.resolve(); + return asOk({ state: {} }); + }); + + const result = await pool.run([ + { ...mockTask(), run: firstRun }, + { ...mockTask(), run: secondRun }, + ]); + + expect(result).toEqual(TaskPoolRunResult.RanOutOfCapacity); + expect(pool.usedCapacity).toEqual(2); + expect(pool.availableCapacity()).toEqual(0); + + await firstWork; + sinon.assert.calledOnce(firstRun); + sinon.assert.notCalled(secondRun); + + expect(pool.usedCapacity).toEqual(0); + await pool.run([{ ...mockTask(), run: secondRun }]); + expect(pool.usedCapacity).toEqual(2); + + expect(pool.availableCapacity()).toEqual(0); + + await secondWork; + + expect(pool.usedCapacity).toEqual(0); + expect(pool.availableCapacity()).toEqual(2); + sinon.assert.calledOnce(secondRun); + }); + + test('run cancels expired tasks prior to running new tasks', async () => { + const pool = new TaskPool({ + capacity$: of(2), + definitions, + logger, + strategy: CLAIM_STRATEGY_MGET, + }); + + const haltUntilWeAfterFirstRun = resolvable(); + const taskHasExpired = resolvable(); + const haltTaskSoThatItCanBeCanceled = resolvable(); + + const shouldRun = sinon.spy(() => Promise.resolve()); + const shouldNotRun = sinon.spy(() => Promise.resolve()); + const now = new Date(); + const result = await pool.run([ + { + ...mockTask({ id: '1' }), + async 
run() { + await haltUntilWeAfterFirstRun; + this.isExpired = true; + taskHasExpired.resolve(); + await haltTaskSoThatItCanBeCanceled; + return asOk({ state: {} }); + }, + get expiration() { + return now; + }, + get startedAt() { + // 5 and a half minutes + return moment(now).subtract(5, 'm').subtract(30, 's').toDate(); + }, + cancel: shouldRun, + }, + { + ...mockTask({ id: '2' }), + async run() { + // halt here so that we can verify that this task is counted in `occupiedWorkers` + await haltUntilWeAfterFirstRun; + return asOk({ state: {} }); + }, + cancel: shouldNotRun, + }, + ]); + + expect(result).toEqual(TaskPoolRunResult.RunningAtCapacity); + expect(pool.usedCapacity).toEqual(4); + expect(pool.availableCapacity()).toEqual(0); + + // release first stage in task so that it has time to expire, but not complete + haltUntilWeAfterFirstRun.resolve(); + await taskHasExpired; + + expect(await pool.run([{ ...mockTask({ id: '3' }) }])).toBeTruthy(); + + sinon.assert.calledOnce(shouldRun); + sinon.assert.notCalled(shouldNotRun); + + expect(pool.usedCapacity).toEqual(2); + expect(pool.availableCapacity()).toEqual(2); + + haltTaskSoThatItCanBeCanceled.resolve(); + + expect(logger.warn).toHaveBeenCalledWith( + `Cancelling task TaskType "shooooo" as it expired at ${now.toISOString()} after running for 05m 30s (with timeout set at 5m).` + ); + }); + + test('calls to availableWorkers ensures we cancel expired tasks', async () => { + const pool = new TaskPool({ + capacity$: of(1), + definitions, + logger, + strategy: CLAIM_STRATEGY_MGET, + }); + + const taskIsRunning = resolvable(); + const taskHasExpired = resolvable(); + const cancel = sinon.spy(() => Promise.resolve()); + const now = new Date(); + expect( + await pool.run([ + { + ...mockTask(), + async run() { + await sleep(10); + this.isExpired = true; + taskIsRunning.resolve(); + await taskHasExpired; + return asOk({ state: {} }); + }, + get expiration() { + return new Date(now.getTime() + 10); + }, + get startedAt() { + return now; + }, + cancel, + }, + ]) + ).toEqual(TaskPoolRunResult.RunningAtCapacity); + + await taskIsRunning; + + sinon.assert.notCalled(cancel); + expect(pool.usedCapacity).toEqual(2); + // The call to `availableCapacity` will clear the expired task so it's 2 instead of 0 + expect(pool.availableCapacity()).toEqual(2); + sinon.assert.calledOnce(cancel); + + expect(pool.usedCapacity).toEqual(0); + expect(pool.availableCapacity()).toEqual(2); + // ensure cancel isn't called twice + sinon.assert.calledOnce(cancel); + taskHasExpired.resolve(); + }); + + test('logs if cancellation errors', async () => { + const pool = new TaskPool({ + capacity$: of(10), + definitions, + logger, + strategy: CLAIM_STRATEGY_MGET, + }); + + const cancelled = resolvable(); + const result = await pool.run([ + { + ...mockTask(), + async run() { + this.isExpired = true; + await sleep(10); + return asOk({ state: {} }); + }, + async cancel() { + cancelled.resolve(); + throw new Error('Dern!'); + }, + toString: () => '"shooooo!"', + }, + ]); + + expect(result).toEqual(TaskPoolRunResult.RunningAllClaimedTasks); + await pool.run([]); + + expect(pool.usedCapacity).toEqual(0); + + // Allow the task to cancel... 
+ await cancelled; + + expect((logger as jest.Mocked).error.mock.calls[0][0]).toMatchInlineSnapshot( + `"Failed to cancel task \\"shooooo!\\": Error: Dern!"` + ); + }); + + test('only allows one task with the same id in the task pool', async () => { + const pool = new TaskPool({ + capacity$: of(2), + definitions, + logger, + strategy: CLAIM_STRATEGY_MGET, + }); + + const shouldRun = mockRun(); + const shouldNotRun = mockRun(); + + const taskId = uuidv4(); + const task1 = mockTask({ id: taskId, run: shouldRun }); + const task2 = mockTask({ + id: taskId, + run: shouldNotRun, + isSameTask() { + return true; + }, + }); + + await pool.run([task1]); + await pool.run([task2]); + + expect(shouldRun).toHaveBeenCalledTimes(1); + expect(shouldNotRun).not.toHaveBeenCalled(); + }); + }); +}); diff --git a/x-pack/plugins/task_manager/server/task_pool.ts b/x-pack/plugins/task_manager/server/task_pool/task_pool.ts similarity index 73% rename from x-pack/plugins/task_manager/server/task_pool.ts rename to x-pack/plugins/task_manager/server/task_pool/task_pool.ts index c0784f0458f72..493aaa77ab41b 100644 --- a/x-pack/plugins/task_manager/server/task_pool.ts +++ b/x-pack/plugins/task_manager/server/task_pool/task_pool.ts @@ -13,13 +13,20 @@ import { Observable, Subject } from 'rxjs'; import moment, { Duration } from 'moment'; import { padStart } from 'lodash'; import { Logger } from '@kbn/core/server'; -import { TaskRunner } from './task_running'; -import { isTaskSavedObjectNotFoundError } from './lib/is_task_not_found_error'; -import { TaskManagerStat } from './task_events'; +import { TaskRunner } from '../task_running'; +import { isTaskSavedObjectNotFoundError } from '../lib/is_task_not_found_error'; +import { TaskManagerStat } from '../task_events'; +import { ICapacity } from './types'; +import { CLAIM_STRATEGY_MGET } from '../config'; +import { WorkerCapacity } from './worker_capacity'; +import { CostCapacity } from './cost_capacity'; +import { TaskTypeDictionary } from '../task_type_dictionary'; -interface Opts { - maxWorkers$: Observable; +interface TaskPoolOpts { + capacity$: Observable; + definitions: TaskTypeDictionary; logger: Logger; + strategy: string; } export enum TaskPoolRunResult { @@ -34,31 +41,43 @@ export enum TaskPoolRunResult { } const VERSION_CONFLICT_MESSAGE = 'Task has been claimed by another Kibana service'; -const MAX_RUN_ATTEMPTS = 3; /** * Runs tasks in batches, taking costs into account. */ export class TaskPool { - private maxWorkers: number = 0; private tasksInPool = new Map(); private logger: Logger; private load$ = new Subject(); + private definitions: TaskTypeDictionary; + private capacityCalculator: ICapacity; /** * Creates an instance of TaskPool. * * @param {Opts} opts - * @prop {number} maxWorkers - The total number of workers / work slots available - * (e.g. maxWorkers is 4, then 2 tasks of cost 2 can run at a time, or 4 tasks of cost 1) + * @prop {number} capacity - The total capacity available + * (e.g. capacity is 4, then 2 tasks of cost 2 can run at a time, or 4 tasks of cost 1) * @prop {Logger} logger - The task manager logger. 
*/ - constructor(opts: Opts) { + constructor(opts: TaskPoolOpts) { this.logger = opts.logger; - opts.maxWorkers$.subscribe((maxWorkers) => { - this.logger.debug(`Task pool now using ${maxWorkers} as the max worker value`); - this.maxWorkers = maxWorkers; - }); + this.definitions = opts.definitions; + + switch (opts.strategy) { + case CLAIM_STRATEGY_MGET: + this.capacityCalculator = new CostCapacity({ + capacity$: opts.capacity$, + logger: this.logger, + }); + break; + + default: + this.capacityCalculator = new WorkerCapacity({ + capacity$: opts.capacity$, + logger: this.logger, + }); + } } public get load(): Observable { @@ -66,38 +85,39 @@ export class TaskPool { } /** - * Gets how many workers are currently in use. + * Gets how much capacity is currently in use. */ - public get occupiedWorkers() { - return this.tasksInPool.size; + public get usedCapacity() { + return this.capacityCalculator.usedCapacity(this.tasksInPool); } /** - * Gets % of workers in use + * Gets how much capacity is currently in use as a percentage */ - public get workerLoad() { - return this.maxWorkers ? Math.round((this.occupiedWorkers * 100) / this.maxWorkers) : 100; + public get usedCapacityPercentage() { + return this.capacityCalculator.usedCapacityPercentage(this.tasksInPool); } /** - * Gets how many workers are currently available. + * Gets how much capacity is currently available. */ - public get availableWorkers() { + public availableCapacity(taskType?: string) { // cancel expired task whenever a call is made to check for capacity // this ensures that we don't end up with a queue of hung tasks causing both // the poller and the pool from hanging due to lack of capacity this.cancelExpiredTasks(); - return this.maxWorkers - this.occupiedWorkers; + + return this.capacityCalculator.availableCapacity( + this.tasksInPool, + taskType ? this.definitions.get(taskType) : null + ); } /** - * Gets how many workers are currently in use by type. + * Gets how much capacity is currently in use by each type. */ - public getOccupiedWorkersByType(type: string) { - return [...this.tasksInPool.values()].reduce( - (count, runningTask) => (runningTask.definition.type === type ? ++count : count), - 0 - ); + public getUsedCapacityByType(type: string) { + return this.capacityCalculator.getUsedCapacityByType([...this.tasksInPool.values()], type); } /** @@ -108,26 +128,14 @@ export class TaskPool { * @param {TaskRunner[]} tasks * @returns {Promise} */ - public async run(tasks: TaskRunner[], attempt = 1): Promise { - // Note `this.availableWorkers` is a getter with side effects, so we just want + public async run(tasks: TaskRunner[]): Promise { + // Note `this.availableCapacity` has side effects, so we just want // to call it once for this bit of the code. 
- const availableWorkers = this.availableWorkers; - const [tasksToRun, leftOverTasks] = partitionListByCount(tasks, availableWorkers); - - if (attempt > MAX_RUN_ATTEMPTS) { - const stats = [ - `availableWorkers: ${availableWorkers}`, - `tasksToRun: ${tasksToRun.length}`, - `leftOverTasks: ${leftOverTasks.length}`, - `maxWorkers: ${this.maxWorkers}`, - `occupiedWorkers: ${this.occupiedWorkers}`, - `workerLoad: ${this.workerLoad}`, - ].join(', '); - this.logger.warn( - `task pool run attempts exceeded ${MAX_RUN_ATTEMPTS}; assuming ran out of capacity; ${stats}` - ); - return TaskPoolRunResult.RanOutOfCapacity; - } + const availableCapacity = this.availableCapacity(); + const [tasksToRun, leftOverTasks] = this.capacityCalculator.determineTasksToRunBasedOnCapacity( + tasks, + availableCapacity + ); if (tasksToRun.length) { await Promise.all( @@ -163,11 +171,10 @@ export class TaskPool { } if (leftOverTasks.length) { - if (this.availableWorkers) { - return this.run(leftOverTasks, attempt + 1); - } + // leave any leftover tasks + // they will be available for claiming in 30 seconds return TaskPoolRunResult.RanOutOfCapacity; - } else if (!this.availableWorkers) { + } else if (!this.availableCapacity()) { return TaskPoolRunResult.RunningAtCapacity; } return TaskPoolRunResult.RunningAllClaimedTasks; @@ -242,11 +249,6 @@ export class TaskPool { } } -function partitionListByCount(list: T[], count: number): [T[], T[]] { - const listInCount = list.splice(0, count); - return [listInCount, list]; -} - function durationAsString(duration: Duration): string { const [m, s] = [duration.minutes(), duration.seconds()].map((value) => padStart(`${value}`, 2, '0') diff --git a/x-pack/plugins/task_manager/server/task_pool/test_utils.ts b/x-pack/plugins/task_manager/server/task_pool/test_utils.ts new file mode 100644 index 0000000000000..b518ed7b8f8f5 --- /dev/null +++ b/x-pack/plugins/task_manager/server/task_pool/test_utils.ts @@ -0,0 +1,53 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ +import { v4 as uuidv4 } from 'uuid'; +import { asOk } from '../lib/result_type'; +import { sleep } from '../test_utils'; +import { TaskRunningStage } from '../task_running'; +import { TaskCost } from '../task'; + +export function mockRun() { + return jest.fn(async () => { + await sleep(0); + return asOk({ state: {} }); + }); +} + +export function mockTask(overrides = {}, definitionOverrides = {}) { + return { + isExpired: false, + taskExecutionId: uuidv4(), + id: uuidv4(), + cancel: async () => undefined, + markTaskAsRunning: jest.fn(async () => true), + run: mockRun(), + stage: TaskRunningStage.PENDING, + toString: () => `TaskType "shooooo"`, + isAdHocTaskAndOutOfAttempts: false, + removeTask: jest.fn(), + get expiration() { + return new Date(); + }, + get startedAt() { + return new Date(); + }, + get definition() { + return { + type: '', + title: '', + timeout: '5m', + cost: TaskCost.Normal, + createTaskRunner: jest.fn(), + ...definitionOverrides, + }; + }, + isSameTask() { + return false; + }, + ...overrides, + }; +} diff --git a/x-pack/plugins/task_manager/server/task_pool/types.ts b/x-pack/plugins/task_manager/server/task_pool/types.ts new file mode 100644 index 0000000000000..759af4f6d6e70 --- /dev/null +++ b/x-pack/plugins/task_manager/server/task_pool/types.ts @@ -0,0 +1,31 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { Observable } from 'rxjs'; +import { Logger } from '@kbn/core/server'; +import { TaskRunner } from '../task_running'; +import { TaskDefinition } from '../task'; + +export interface ICapacity { + get capacity(): number; + availableCapacity( + tasksInPool: Map, + taskDefinition?: TaskDefinition | null + ): number; + usedCapacity(tasksInPool: Map): number; + usedCapacityPercentage(tasksInPool: Map): number; + getUsedCapacityByType(tasksInPool: TaskRunner[], type: string): number; + determineTasksToRunBasedOnCapacity( + tasks: TaskRunner[], + availableCapacity: number + ): [TaskRunner[], TaskRunner[]]; +} + +export interface CapacityOpts { + capacity$: Observable; + logger: Logger; +} diff --git a/x-pack/plugins/task_manager/server/task_pool/utils.ts b/x-pack/plugins/task_manager/server/task_pool/utils.ts new file mode 100644 index 0000000000000..d4c89be46e02d --- /dev/null +++ b/x-pack/plugins/task_manager/server/task_pool/utils.ts @@ -0,0 +1,16 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +import { TaskCost } from '../task'; + +// When configured capacity is the number of normal cost tasks that this Kibana +// can run, the total available workers equals the capacity +export const getCapacityInWorkers = (capacity: number) => capacity; + +// When configured capacity is the number of normal cost tasks that this Kibana +// can run, the total available cost equals the capacity multiplied by the cost of a normal task +export const getCapacityInCost = (capacity: number) => capacity * TaskCost.Normal; diff --git a/x-pack/plugins/task_manager/server/task_pool/worker_capacity.test.ts b/x-pack/plugins/task_manager/server/task_pool/worker_capacity.test.ts new file mode 100644 index 0000000000000..7ed7485ccdd52 --- /dev/null +++ b/x-pack/plugins/task_manager/server/task_pool/worker_capacity.test.ts @@ -0,0 +1,176 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { loggingSystemMock } from '@kbn/core/server/mocks'; +import { of, Subject } from 'rxjs'; +import { TaskCost } from '../task'; +import { mockTask } from './test_utils'; +import { WorkerCapacity } from './worker_capacity'; + +const logger = loggingSystemMock.create().get(); + +describe('WorkerCapacity', () => { + beforeEach(() => { + jest.resetAllMocks(); + }); + + test('workers set based on capacity responds to changes from capacity$ observable', () => { + const capacity$ = new Subject(); + const pool = new WorkerCapacity({ capacity$, logger }); + + expect(pool.capacity).toBe(0); + + capacity$.next(20); + expect(pool.capacity).toBe(20); + + capacity$.next(16); + expect(pool.capacity).toBe(16); + + capacity$.next(25); + expect(pool.capacity).toBe(25); + + expect(logger.debug).toHaveBeenCalledTimes(3); + expect(logger.debug).toHaveBeenNthCalledWith( + 1, + 'Task pool now using 20 as the max worker value which is based on a capacity of 20' + ); + expect(logger.debug).toHaveBeenNthCalledWith( + 2, + 'Task pool now using 16 as the max worker value which is based on a capacity of 16' + ); + expect(logger.debug).toHaveBeenNthCalledWith( + 3, + 'Task pool now using 25 as the max worker value which is based on a capacity of 25' + ); + }); + + test('usedCapacity returns the number of tasks in the pool', () => { + const pool = new WorkerCapacity({ capacity$: of(10), logger }); + + const tasksInPool = new Map([ + ['1', { ...mockTask() }], + ['2', { ...mockTask({}, { cost: TaskCost.Tiny }) }], + ['3', { ...mockTask() }], + ]); + + expect(pool.usedCapacity(tasksInPool)).toBe(3); + }); + + test('usedCapacityPercentage returns the percentage of workers in use by tasks in the pool', () => { + const pool = new WorkerCapacity({ capacity$: of(10), logger }); + + const tasksInPool = new Map([ + ['1', { ...mockTask() }], + ['2', { ...mockTask({}, { cost: TaskCost.Tiny }) }], + ['3', { ...mockTask() }], + ]); + + expect(pool.usedCapacityPercentage(tasksInPool)).toBe(30); + }); + + test('usedCapacityByType returns the number of tasks of specified type in the pool', () => { + const pool = new WorkerCapacity({ capacity$: of(10), logger }); + + const tasksInPool = [ + { ...mockTask({}, { type: 'type1' }) }, + { ...mockTask({}, { type: 'type1', cost: TaskCost.Tiny }) }, + { ...mockTask({}, { type: 'type2' }) }, + ]; + + expect(pool.getUsedCapacityByType(tasksInPool, 'type1')).toBe(2); + 
expect(pool.getUsedCapacityByType(tasksInPool, 'type2')).toBe(1); + expect(pool.getUsedCapacityByType(tasksInPool, 'type3')).toBe(0); + }); + + test('availableCapacity returns the overall number of available workers when no task type is defined', () => { + const pool = new WorkerCapacity({ capacity$: of(10), logger }); + + const tasksInPool = new Map([ + ['1', { ...mockTask() }], + ['2', { ...mockTask({}, { cost: TaskCost.Tiny }) }], + ['3', { ...mockTask() }], + ]); + + expect(pool.availableCapacity(tasksInPool)).toBe(7); + }); + + test('availableCapacity returns the overall number of available workers when task type with no maxConcurrency is provided', () => { + const pool = new WorkerCapacity({ capacity$: of(10), logger }); + + const tasksInPool = new Map([ + ['1', { ...mockTask() }], + ['2', { ...mockTask({}, { cost: TaskCost.Tiny }) }], + ['3', { ...mockTask() }], + ]); + + expect( + pool.availableCapacity(tasksInPool, { + type: 'type1', + cost: TaskCost.Normal, + createTaskRunner: jest.fn(), + timeout: '5m', + }) + ).toBe(7); + }); + + test('availableCapacity returns the number of available workers for the task type when task type with maxConcurrency is provided', () => { + const pool = new WorkerCapacity({ capacity$: of(10), logger }); + + const tasksInPool = new Map([ + ['1', { ...mockTask({}, { type: 'type1' }) }], + ['2', { ...mockTask({}, { cost: TaskCost.Tiny }) }], + ['3', { ...mockTask() }], + ]); + + expect( + pool.availableCapacity(tasksInPool, { + type: 'type1', + maxConcurrency: 3, + cost: TaskCost.Normal, + createTaskRunner: jest.fn(), + timeout: '5m', + }) + ).toBe(2); + }); + + describe('determineTasksToRunBasedOnCapacity', () => { + test('runs all tasks if there are workers available', () => { + const pool = new WorkerCapacity({ capacity$: of(10), logger }); + const tasks = [{ ...mockTask() }, { ...mockTask() }, { ...mockTask() }]; + const [tasksToRun, leftoverTasks] = pool.determineTasksToRunBasedOnCapacity(tasks, 10); + + expect(tasksToRun).toEqual(tasks); + expect(leftoverTasks).toEqual([]); + }); + + test('splits tasks if there are more tasks than available workers', () => { + const pool = new WorkerCapacity({ capacity$: of(10), logger }); + const tasks = [ + { ...mockTask() }, + { ...mockTask() }, + { ...mockTask() }, + { ...mockTask({}, { cost: TaskCost.ExtraLarge }) }, + { ...mockTask({}, { cost: TaskCost.ExtraLarge }) }, + { ...mockTask() }, + { ...mockTask() }, + ]; + const [tasksToRun, leftoverTasks] = pool.determineTasksToRunBasedOnCapacity(tasks, 5); + + expect(tasksToRun).toEqual([tasks[0], tasks[1], tasks[2], tasks[3], tasks[4]]); + expect(leftoverTasks).toEqual([tasks[5], tasks[6]]); + }); + + test('does not run tasks if there is no capacity', () => { + const pool = new WorkerCapacity({ capacity$: of(10), logger }); + const tasks = [{ ...mockTask() }, { ...mockTask() }, { ...mockTask() }]; + const [tasksToRun, leftoverTasks] = pool.determineTasksToRunBasedOnCapacity(tasks, 0); + + expect(tasksToRun).toEqual([]); + expect(leftoverTasks).toEqual(tasks); + }); + }); +}); diff --git a/x-pack/plugins/task_manager/server/task_pool/worker_capacity.ts b/x-pack/plugins/task_manager/server/task_pool/worker_capacity.ts new file mode 100644 index 0000000000000..13de1cb266add --- /dev/null +++ b/x-pack/plugins/task_manager/server/task_pool/worker_capacity.ts @@ -0,0 +1,95 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { Logger } from '@kbn/core/server'; +import { TaskRunner } from '../task_running'; +import { CapacityOpts, ICapacity } from './types'; +import { TaskDefinition } from '../task'; +import { getCapacityInWorkers } from './utils'; + +export class WorkerCapacity implements ICapacity { + private workers: number = 0; + private logger: Logger; + + constructor(opts: CapacityOpts) { + this.logger = opts.logger; + opts.capacity$.subscribe((capacity) => { + // Capacity config describes the number of normal-cost tasks that can be + // run simultaneously. This directly corresponds to the number of workers to use. + this.workers = getCapacityInWorkers(capacity); + this.logger.debug( + `Task pool now using ${this.workers} as the max worker value which is based on a capacity of ${capacity}` + ); + }); + } + + public get capacity(): number { + return this.workers; + } + + /** + * Gets how many workers are currently in use. + */ + public usedCapacity(tasksInPool: Map) { + return tasksInPool.size; + } + + /** + * Gets % of workers in use + */ + public usedCapacityPercentage(tasksInPool: Map) { + return this.capacity ? Math.round((this.usedCapacity(tasksInPool) * 100) / this.capacity) : 100; + } + + /** + * Gets how many workers are currently in use by each type. + */ + public getUsedCapacityByType(tasksInPool: TaskRunner[], type: string) { + return tasksInPool.reduce( + (count, runningTask) => (runningTask.definition.type === type ? ++count : count), + 0 + ); + } + + public availableCapacity( + tasksInPool: Map, + taskDefinition?: TaskDefinition | null + ): number { + const allAvailableCapacity = this.capacity - this.usedCapacity(tasksInPool); + if (taskDefinition && taskDefinition.maxConcurrency) { + // calculate the max workers that can be used for this task type + return Math.max( + Math.min( + allAvailableCapacity, + taskDefinition.maxConcurrency - + this.getUsedCapacityByType([...tasksInPool.values()], taskDefinition.type) + ), + 0 + ); + } + + return allAvailableCapacity; + } + + public determineTasksToRunBasedOnCapacity( + tasks: TaskRunner[], + availableCapacity: number + ): [TaskRunner[], TaskRunner[]] { + const tasksToRun: TaskRunner[] = []; + const leftOverTasks: TaskRunner[] = []; + + for (let i = 0; i < tasks.length; i++) { + if (i >= availableCapacity) { + leftOverTasks.push(tasks[i]); + } else { + tasksToRun.push(tasks[i]); + } + } + + return [tasksToRun, leftOverTasks]; + } +} diff --git a/x-pack/plugins/task_manager/server/task_store.test.ts b/x-pack/plugins/task_manager/server/task_store.test.ts index afde0ae91ea55..9bc1a64140647 100644 --- a/x-pack/plugins/task_manager/server/task_store.test.ts +++ b/x-pack/plugins/task_manager/server/task_store.test.ts @@ -8,7 +8,7 @@ import { schema } from '@kbn/config-schema'; import { Client } from '@elastic/elasticsearch'; import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey'; -import _ from 'lodash'; +import _, { omit } from 'lodash'; import { first } from 'rxjs'; import { @@ -18,7 +18,7 @@ import { SerializedConcreteTaskInstance, } from './task'; import { elasticsearchServiceMock, savedObjectsServiceMock } from '@kbn/core/server/mocks'; -import { TaskStore, SearchOpts, AggregationOpts } from './task_store'; +import { TaskStore, SearchOpts, AggregationOpts, taskInstanceToAttributes } from './task_store'; import { savedObjectsRepositoryMock } from '@kbn/core/server/mocks'; import {
SavedObjectAttributes, SavedObjectsErrorHelpers } from '@kbn/core/server'; import { TaskTypeDictionary } from './task_type_dictionary'; @@ -292,12 +292,16 @@ describe('TaskStore', () => { }); }); - async function testFetch(opts?: SearchOpts, hits: Array> = []) { + async function testFetch( + opts?: SearchOpts, + hits: Array> = [], + limitResponse: boolean = false + ) { childEsClient.search.mockResponse({ hits: { hits, total: hits.length }, } as estypes.SearchResponse); - const result = await store.fetch(opts); + const result = await store.fetch(opts, limitResponse); expect(childEsClient.search).toHaveBeenCalledTimes(1); @@ -342,6 +346,18 @@ describe('TaskStore', () => { await expect(store.fetch()).rejects.toThrowErrorMatchingInlineSnapshot(`"Failure"`); expect(await firstErrorPromise).toMatchInlineSnapshot(`[Error: Failure]`); }); + + test('excludes state and params from source when excludeState is true', async () => { + const { args } = await testFetch({}, [], true); + expect(args).toMatchObject({ + index: 'tasky', + body: { + sort: [{ 'task.runAt': 'asc' }], + query: { term: { type: 'task' } }, + }, + _source_excludes: ['task.state', 'task.params'], + }); + }); }); describe('aggregate', () => { @@ -615,10 +631,11 @@ describe('TaskStore', () => { describe('bulkUpdate', () => { let store: TaskStore; + const logger = mockLogger(); beforeAll(() => { store = new TaskStore({ - logger: mockLogger(), + logger, index: 'tasky', taskManagerId: '', serializer, @@ -671,6 +688,125 @@ describe('TaskStore', () => { expect(mockGetValidatedTaskInstanceForUpdating).toHaveBeenCalledWith(task, { validate: false, }); + + expect(savedObjectsClient.bulkUpdate).toHaveBeenCalledWith( + [ + { + id: task.id, + type: 'task', + version: task.version, + attributes: taskInstanceToAttributes(task, task.id), + }, + ], + { refresh: false } + ); + }); + + test(`validates whenever validate:true is passed-in`, async () => { + const task = { + runAt: mockedDate, + scheduledAt: mockedDate, + startedAt: null, + retryAt: null, + id: 'task:324242', + params: { hello: 'world' }, + state: { foo: 'bar' }, + taskType: 'report', + attempts: 3, + status: 'idle' as TaskStatus, + version: '123', + ownerId: null, + traceparent: '', + }; + + savedObjectsClient.bulkUpdate.mockResolvedValue({ + saved_objects: [ + { + id: '324242', + type: 'task', + attributes: { + ...task, + state: '{"foo":"bar"}', + params: '{"hello":"world"}', + }, + references: [], + version: '123', + }, + ], + }); + + await store.bulkUpdate([task], { validate: true }); + + expect(mockGetValidatedTaskInstanceForUpdating).toHaveBeenCalledWith(task, { + validate: true, + }); + + expect(savedObjectsClient.bulkUpdate).toHaveBeenCalledWith( + [ + { + id: task.id, + type: 'task', + version: task.version, + attributes: taskInstanceToAttributes(task, task.id), + }, + ], + { refresh: false } + ); + }); + + test(`logs warning and doesn't validate whenever excludeLargeFields option is passed-in`, async () => { + const task = { + runAt: mockedDate, + scheduledAt: mockedDate, + startedAt: null, + retryAt: null, + id: 'task:324242', + params: { hello: 'world' }, + state: { foo: 'bar' }, + taskType: 'report', + attempts: 3, + status: 'idle' as TaskStatus, + version: '123', + ownerId: null, + traceparent: '', + }; + + savedObjectsClient.bulkUpdate.mockResolvedValue({ + saved_objects: [ + { + id: '324242', + type: 'task', + attributes: { + ...task, + state: '{"foo":"bar"}', + params: '{"hello":"world"}', + }, + references: [], + version: '123', + }, + ], + }); + + await 
store.bulkUpdate([task], { validate: true, excludeLargeFields: true }); + + expect(logger.warn).toHaveBeenCalledWith( + `Skipping validation for bulk update because excludeLargeFields=true.` + ); + expect(mockGetValidatedTaskInstanceForUpdating).toHaveBeenCalledWith(task, { + validate: false, + }); + + expect(savedObjectsClient.bulkUpdate).toHaveBeenCalledWith( + [ + { + id: task.id, + type: 'task', + version: task.version, + attributes: omit(taskInstanceToAttributes(task, task.id), ['state', 'params']), + }, + ], + { refresh: false } + ); + }); test('pushes error from saved objects client to errors$', async () => { diff --git a/x-pack/plugins/task_manager/server/task_store.ts b/x-pack/plugins/task_manager/server/task_store.ts index e0ad3dfae149a..9b58d7bc3c18b 100644 --- a/x-pack/plugins/task_manager/server/task_store.ts +++ b/x-pack/plugins/task_manager/server/task_store.ts @@ -84,6 +84,11 @@ export interface FetchResult { versionMap: Map; } +export interface BulkUpdateOpts { + validate: boolean; + excludeLargeFields?: boolean; +} + export type BulkUpdateResult = Result< ConcreteTaskInstance, { type: string; id: string; error: SavedObjectError } @@ -108,6 +113,7 @@ export class TaskStore { public readonly taskManagerId: string; public readonly errors$ = new Subject(); public readonly taskValidator: TaskValidator; + private readonly logger: Logger; private esClient: ElasticsearchClient; private esClientWithoutRetries: ElasticsearchClient; @@ -134,6 +140,7 @@ export class TaskStore { this.serializer = opts.serializer; this.savedObjectsRepository = opts.savedObjectsRepository; this.adHocTaskCounter = opts.adHocTaskCounter; + this.logger = opts.logger; this.taskValidator = new TaskValidator({ logger: opts.logger, definitions: opts.definitions, @@ -232,15 +239,13 @@ export class TaskStore { * Fetches a list of scheduled tasks with default sorting. * * @param opts - The query options used to filter tasks + * @param limitResponse - Whether to exclude the task state and params from the source for a smaller response payload */ - public async fetch({ - sort = [{ 'task.runAt': 'asc' }], - ...opts - }: SearchOpts = {}): Promise { - return this.search({ - ...opts, - sort, - }); + public async fetch( + { sort = [{ 'task.runAt': 'asc' }], ...opts }: SearchOpts = {}, + limitResponse: boolean = false + ): Promise { + return this.search({ ...opts, sort }, limitResponse); } /** @@ -296,13 +301,23 @@ export class TaskStore { */ public async bulkUpdate( docs: ConcreteTaskInstance[], - options: { validate: boolean } + { validate, excludeLargeFields = false }: BulkUpdateOpts ): Promise { + // if we're excluding large fields (state and params), we cannot apply validation so log a warning + if (validate && excludeLargeFields) { + validate = false; + this.logger.warn(`Skipping validation for bulk update because excludeLargeFields=true.`); + } + const attributesByDocId = docs.reduce((attrsById, doc) => { const taskInstance = this.taskValidator.getValidatedTaskInstanceForUpdating(doc, { - validate: options.validate, + validate, }); - attrsById.set(doc.id, taskInstanceToAttributes(taskInstance, doc.id)); + const taskAttributes = taskInstanceToAttributes(taskInstance, doc.id); + attrsById.set( + doc.id, + excludeLargeFields ?
omit(taskAttributes, 'state', 'params') : taskAttributes + ); return attrsById; }, new Map()); @@ -342,7 +357,7 @@ export class TaskStore { ), }); const result = this.taskValidator.getValidatedTaskInstanceFromReading(taskInstance, { - validate: options.validate, + validate, }); return asOk(result); }); @@ -489,18 +504,20 @@ export class TaskStore { } } - private async search(opts: SearchOpts = {}): Promise { + private async search( + opts: SearchOpts = {}, + limitResponse: boolean = false + ): Promise { const { query } = ensureQueryOnlyReturnsTaskObjects(opts); try { const result = await this.esClientWithoutRetries.search({ index: this.index, ignore_unavailable: true, - body: { - ...opts, - query, - }, + body: { ...opts, query }, + ...(limitResponse ? { _source_excludes: ['task.state', 'task.params'] } : {}), }); + const { hits: { hits: tasks }, } = result; @@ -627,7 +644,10 @@ export function correctVersionConflictsForContinuation( return maxDocs && versionConflicts + updated > maxDocs ? maxDocs - updated : versionConflicts; } -function taskInstanceToAttributes(doc: TaskInstance, id: string): SerializedConcreteTaskInstance { +export function taskInstanceToAttributes( + doc: TaskInstance, + id: string +): SerializedConcreteTaskInstance { return { ...omit(doc, 'id', 'version'), params: JSON.stringify(doc.params || {}), diff --git a/x-pack/plugins/task_manager/server/task_type_dictionary.test.ts b/x-pack/plugins/task_manager/server/task_type_dictionary.test.ts index d1b44a7577025..ac34107e5d013 100644 --- a/x-pack/plugins/task_manager/server/task_type_dictionary.test.ts +++ b/x-pack/plugins/task_manager/server/task_type_dictionary.test.ts @@ -6,7 +6,7 @@ */ import { get } from 'lodash'; -import { RunContext, TaskDefinition, TaskPriority } from './task'; +import { RunContext, TaskCost, TaskDefinition, TaskPriority } from './task'; import { mockLogger } from './test_utils'; import { sanitizeTaskDefinitions, @@ -53,6 +53,7 @@ describe('taskTypeDictionary', () => { const logger = mockLogger(); beforeEach(() => { + jest.resetAllMocks(); definitions = new TaskTypeDictionary(logger); }); @@ -64,6 +65,7 @@ describe('taskTypeDictionary', () => { expect(result).toMatchInlineSnapshot(` Array [ Object { + "cost": 2, "createTaskRunner": [Function], "description": "one super cool task", "timeout": "5m", @@ -71,6 +73,7 @@ describe('taskTypeDictionary', () => { "type": "test_task_type_0", }, Object { + "cost": 2, "createTaskRunner": [Function], "description": "one super cool task", "timeout": "5m", @@ -78,6 +81,7 @@ describe('taskTypeDictionary', () => { "type": "test_task_type_1", }, Object { + "cost": 2, "createTaskRunner": [Function], "description": "one super cool task", "timeout": "5m", @@ -224,6 +228,7 @@ describe('taskTypeDictionary', () => { createTaskRunner: expect.any(Function), maxConcurrency: 2, priority: 1, + cost: 2, timeout: '5m', title: 'foo', type: 'foo', @@ -249,6 +254,44 @@ describe('taskTypeDictionary', () => { ); }); + it('uses task cost if specified', () => { + definitions.registerTaskDefinitions({ + foo: { + title: 'foo', + maxConcurrency: 2, + cost: TaskCost.ExtraLarge, + createTaskRunner: jest.fn(), + }, + }); + expect(definitions.get('foo')).toEqual({ + createTaskRunner: expect.any(Function), + maxConcurrency: 2, + cost: 10, + timeout: '5m', + title: 'foo', + type: 'foo', + }); + }); + + it('does not register task with invalid cost schema', () => { + definitions.registerTaskDefinitions({ + foo: { + title: 'foo', + maxConcurrency: 2, + cost: 23, + createTaskRunner: jest.fn(), + }, + 
}); + expect(logger.error).toHaveBeenCalledWith( + `Could not sanitize task definitions: Invalid cost \"23\". Cost must be one of Tiny => 1,Normal => 2,ExtraLarge => 10` + ); + expect(() => { + definitions.get('foo'); + }).toThrowErrorMatchingInlineSnapshot( + `"Unsupported task type \\"foo\\". Supported types are "` + ); + }); + it('throws error when registering duplicate task type', () => { definitions.registerTaskDefinitions({ foo: { diff --git a/x-pack/plugins/task_manager/server/task_type_dictionary.ts b/x-pack/plugins/task_manager/server/task_type_dictionary.ts index f45cbad172d5a..0a2368860c0cf 100644 --- a/x-pack/plugins/task_manager/server/task_type_dictionary.ts +++ b/x-pack/plugins/task_manager/server/task_type_dictionary.ts @@ -7,7 +7,13 @@ import { ObjectType } from '@kbn/config-schema'; import { Logger } from '@kbn/core/server'; -import { TaskDefinition, taskDefinitionSchema, TaskRunCreatorFunction, TaskPriority } from './task'; +import { + TaskDefinition, + taskDefinitionSchema, + TaskRunCreatorFunction, + TaskPriority, + TaskCost, +} from './task'; import { CONCURRENCY_ALLOW_LIST_BY_TASK_TYPE } from './constants'; /** @@ -50,6 +56,10 @@ export interface TaskRegisterDefinition { * claimed before low priority */ priority?: TaskPriority; + /** + * An optional definition of the cost associated with running the task. + */ + cost?: TaskCost; /** * An optional more detailed description of what this task does. */ diff --git a/x-pack/plugins/task_manager/server/usage/task_manager_usage_collector.test.ts b/x-pack/plugins/task_manager/server/usage/task_manager_usage_collector.test.ts index 019d8bd47c57a..067a32c8a486d 100644 --- a/x-pack/plugins/task_manager/server/usage/task_manager_usage_collector.test.ts +++ b/x-pack/plugins/task_manager/server/usage/task_manager_usage_collector.test.ts @@ -174,7 +174,8 @@ function getMockMonitoredHealth(overrides = {}): MonitoredHealth { timestamp: new Date().toISOString(), status: HealthStatus.OK, value: { - max_workers: 10, + capacity: { config: 10, as_cost: 20, as_workers: 10 }, + claim_strategy: 'default', poll_interval: 3000, request_capacity: 1000, monitored_aggregated_stats_refresh_rate: 5000, @@ -193,16 +194,19 @@ function getMockMonitoredHealth(overrides = {}): MonitoredHealth { status: HealthStatus.OK, value: { count: 4, + cost: 8, task_types: { - actions_telemetry: { count: 2, status: { idle: 2 } }, - alerting_telemetry: { count: 1, status: { idle: 1 } }, - session_cleanup: { count: 1, status: { idle: 1 } }, + actions_telemetry: { count: 2, cost: 4, status: { idle: 2 } }, + alerting_telemetry: { count: 1, cost: 2, status: { idle: 1 } }, + session_cleanup: { count: 1, cost: 2, status: { idle: 1 } }, }, schedule: [], overdue: 0, + overdue_cost: 0, overdue_non_recurring: 0, estimatedScheduleDensity: [], non_recurring: 20, + non_recurring_cost: 40, owner_ids: 2, estimated_schedule_density: [], capacity_requirements: { diff --git a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts index 066a004df3814..44d2257f8a957 100644 --- a/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts +++ b/x-pack/test/plugin_api_integration/test_suites/task_manager/health_route.ts @@ -140,7 +140,12 @@ export default function ({ getService }: FtrProviderContext) { }, }, request_capacity: 1000, - max_workers: 10, + capacity: { + config: 10, + as_workers: 10, + as_cost: 20, + }, + claim_strategy: 'default', }); }); diff --git 
a/x-pack/test/task_manager_claimer_mget/test_suites/task_manager/health_route.ts b/x-pack/test/task_manager_claimer_mget/test_suites/task_manager/health_route.ts index 066a004df3814..aa4f68e1fedd8 100644 --- a/x-pack/test/task_manager_claimer_mget/test_suites/task_manager/health_route.ts +++ b/x-pack/test/task_manager_claimer_mget/test_suites/task_manager/health_route.ts @@ -140,7 +140,12 @@ export default function ({ getService }: FtrProviderContext) { }, }, request_capacity: 1000, - max_workers: 10, + capacity: { + config: 10, + as_workers: 10, + as_cost: 20, + }, + claim_strategy: 'unsafe_mget', }); });
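For quick reference, the capacity-to-workers and capacity-to-cost conversions introduced in `task_pool/utils.ts` above can be exercised in isolation. This is a minimal standalone sketch, not part of the patch: it re-declares `TaskCost` and the two helpers locally (using the Tiny => 1, Normal => 2, ExtraLarge => 10 values shown in the task_type_dictionary tests) so the numbers asserted in the health route and TaskPool tests are easy to reproduce.

```ts
// Standalone sketch; mirrors task_pool/utils.ts and the TaskCost values used in this patch.
enum TaskCost {
  Tiny = 1,
  Normal = 2,
  ExtraLarge = 10,
}

// `capacity` is the number of normal-cost tasks a Kibana node may run at once.
const getCapacityInWorkers = (capacity: number): number => capacity;
const getCapacityInCost = (capacity: number): number => capacity * TaskCost.Normal;

// With xpack.task_manager.capacity: 10 (the value used in the health route tests above):
console.log(getCapacityInWorkers(10)); // 10 -> reported as capacity.as_workers
console.log(getCapacityInCost(10)); // 20 -> reported as capacity.as_cost

// Under the mget strategy, used capacity is the summed cost of running tasks, so three
// normal-cost tasks consume 6 of the 20 cost units and availableCapacity() returns 14,
// matching the "availableCapacity are a function of total_capacity - usedCapacity" test.
const runningCosts = [TaskCost.Normal, TaskCost.Normal, TaskCost.Normal];
const usedCost = runningCosts.reduce((sum, cost) => sum + cost, 0); // 6
console.log(getCapacityInCost(10) - usedCost); // 14
```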
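The cost-based claim path also has to decide which candidate tasks fit into the remaining cost budget. The `ICapacity.determineTasksToRunBasedOnCapacity` signature in `task_pool/types.ts` returns a `[tasksToRun, leftOverTasks]` pair; the `CostCapacity` implementation itself is not shown in this excerpt, so the following is only a rough sketch of cost-based partitioning consistent with the mget TaskPool tests above (capacity 2 => 4 cost units => two normal-cost tasks run and the third is left over), not the literal cost_capacity.ts code.

```ts
// Illustrative only: partitions tasks by accumulated cost against an available cost budget.
interface CostedTask {
  cost: number; // e.g. TaskCost.Normal === 2
}

function determineTasksToRunBasedOnCapacity<T extends CostedTask>(
  tasks: T[],
  availableCapacity: number
): [T[], T[]] {
  const tasksToRun: T[] = [];
  const leftOverTasks: T[] = [];
  let usedCost = 0;

  for (const task of tasks) {
    // Admit a task only while its cost still fits within the remaining budget.
    if (usedCost + task.cost <= availableCapacity) {
      usedCost += task.cost;
      tasksToRun.push(task);
    } else {
      leftOverTasks.push(task);
    }
  }

  return [tasksToRun, leftOverTasks];
}

// capacity$ of(2) => 4 cost units: two normal tasks run, one is left for the next claim cycle,
// as asserted by "does not run tasks that are beyond its available capacity".
const [toRun, leftOver] = determineTasksToRunBasedOnCapacity(
  [{ cost: 2 }, { cost: 2 }, { cost: 2 }],
  4
);
console.log(toRun.length, leftOver.length); // 2 1
```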