From bc2d5026fd0987199100625c8a98b207def04e71 Mon Sep 17 00:00:00 2001 From: Trent Mick Date: Wed, 27 Oct 2021 13:42:54 -0700 Subject: [PATCH 1/2] feat: impl contextPropagationOnly config var; change disableSend behaviour Implement contextPropagationOnly per spec (it does what disableSend did before this change). Change disableSend to just disable comms with APM server, but otherwise *not* attempt to reduce work. https://github.com/elastic/apm/blob/master/specs/agents/transport.md#context_propagation_only-configuration --- CHANGELOG.asciidoc | 10 ++ docs/configuration.asciidoc | 36 ++++- index.d.ts | 1 + lib/agent.js | 4 +- lib/config.js | 7 +- lib/instrumentation/index.js | 14 +- lib/instrumentation/transaction.js | 4 +- lib/metrics/index.js | 2 +- lib/noop-transport.js | 5 +- test/config.test.js | 1 + test/context-propagation-only.test.js | 211 ++++++++++++++++++++++++++ test/disable-send.test.js | 135 +--------------- 12 files changed, 275 insertions(+), 155 deletions(-) create mode 100644 test/context-propagation-only.test.js diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index 15d0ac7ae0..4c756b8363 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -36,6 +36,15 @@ Notes: [float] ===== Features +* Add <> configuration + option. This supports the use case of using the APM agent to propagate HTTP + trace-context and to support log-correlation (adding `trace.id` et al fields + to log records) **without** an APM server, and to otherwise reduce the + processing time of the agent. ({issues}2393[#2393]) ++ +This is similar to <>, but differs in that +`contextPropagationOnly: true` tells the agent to skip unnecessary work. + [float] ===== Bug fixes @@ -256,6 +265,7 @@ The fields affected by `longFieldMaxLength` are: * Add instrumentation of all AWS S3 methods when using the https://www.npmjs.com/package/aws-sdk[JavaScript AWS SDK v2] (`aws-sdk`). + * Add <> configuration option. This supports some use cases using the APM agent **without** an APM server. ({issues}2101[#2101]) diff --git a/docs/configuration.asciidoc b/docs/configuration.asciidoc index 8de930f8af..29dc81eade 100644 --- a/docs/configuration.asciidoc +++ b/docs/configuration.asciidoc @@ -148,6 +148,23 @@ See {kibana-ref}/filters.html#environment-selector[environment selector] in the NOTE: This feature is fully supported in the APM app in Kibana versions >= 7.2. You must use the query bar to filter for a specific environment in versions prior to 7.2. + +[[context-propagation-only]] +==== `contextPropagationOnly` + +* *Type:* Boolean +* *Default:* `false` +* *Env:* `ELASTIC_APM_CONTEXT_PROPAGATION_ONLY` + +If set to `true`, the agent will reduce its work to the minimum required to +support automatic https://w3c.github.io/trace-context/[HTTP trace-context] +propagation (for distributed tracing) and log correlation. The agent will not +communicate with APM server (no tracing data is forwarded, no central +configuration is retrieved) and will not collect metrics. This setting allows +using the APM agent with a service that cannot use APM server. Usage is expected +to be rare. + + [[disable-send]] ==== `disableSend` @@ -155,14 +172,17 @@ You must use the query bar to filter for a specific environment in versions prio * *Default:* `false` * *Env:* `ELASTIC_APM_DISABLE_SEND` -If set to `true`, the agent will work as usual, except for any task requiring -communication with the APM server. Events will be dropped and the agent won't be -able to receive central configuration, which means that any other configuration -cannot be changed in this state without restarting the service. Example uses -for this setting are: maintaining the ability to create traces and log -trace/transaction/span IDs through the log correlation feature, and getting -automatic distributed tracing via the https://w3c.github.io/trace-context/[W3C -HTTP headers]. +If set to `true`, the agent will work as usual, except that it will not attempt +to communicate with APM server. Tracing and metrics data will be dropped and the +agent won't be able to receive central configuration, which means that any other +configuration cannot be changed in this state without restarting the service. + +This setting is similar to <> in functionality. However, `disableSend` does not +attempt to reduce time spent collecting tracing data. A use case for this +setting is in CI environments, to test agent functionality without requiring a +configured APM server. + [[instrument]] ==== `instrument` diff --git a/index.d.ts b/index.d.ts index 6ebb9106fb..a831134341 100644 --- a/index.d.ts +++ b/index.d.ts @@ -225,6 +225,7 @@ declare namespace apm { cloudProvider?: string; configFile?: string; containerId?: string; + contextPropagationOnly?: boolean; disableInstrumentations?: string | string[]; disableSend?: boolean; environment?: string; diff --git a/lib/agent.js b/lib/agent.js index fa2f9c1ace..58ebf99339 100644 --- a/lib/agent.js +++ b/lib/agent.js @@ -408,8 +408,8 @@ Agent.prototype.captureError = function (err, opts, cb) { const id = errors.generateErrorId() - // Quick out if disableSend=true, no point in the processing time. - if (this._conf.disableSend) { + // Avoid unneeded error/stack processing if only propagating trace-context. + if (this._conf.contextPropagationOnly) { if (cb) { process.nextTick(cb, null, id) } diff --git a/lib/config.js b/lib/config.js index 4114bf12c2..091fdca1ce 100644 --- a/lib/config.js +++ b/lib/config.js @@ -53,6 +53,7 @@ var DEFAULTS = { centralConfig: true, cloudProvider: 'auto', containerId: undefined, + contextPropagationOnly: false, disableInstrumentations: [], disableSend: false, environment: process.env.NODE_ENV || 'development', @@ -109,8 +110,9 @@ var ENV_TABLE = { captureHeaders: 'ELASTIC_APM_CAPTURE_HEADERS', captureSpanStackTraces: 'ELASTIC_APM_CAPTURE_SPAN_STACK_TRACES', centralConfig: 'ELASTIC_APM_CENTRAL_CONFIG', - containerId: 'ELASTIC_APM_CONTAINER_ID', cloudProvider: 'ELASTIC_APM_CLOUD_PROVIDER', + containerId: 'ELASTIC_APM_CONTAINER_ID', + contextPropagationOnly: 'ELASTIC_APM_CONTEXT_PROPAGATION_ONLY', disableInstrumentations: 'ELASTIC_APM_DISABLE_INSTRUMENTATIONS', disableSend: 'ELASTIC_APM_DISABLE_SEND', environment: 'ELASTIC_APM_ENVIRONMENT', @@ -175,6 +177,7 @@ var BOOL_OPTS = [ 'captureHeaders', 'captureSpanStackTraces', 'centralConfig', + 'contextPropagationOnly', 'disableSend', 'errorOnAbortedRequests', 'filterHttpHeaders', @@ -279,7 +282,7 @@ class Config { this.metricsInterval = 0 } - if (this.disableSend) { + if (this.disableSend || this.contextPropagationOnly) { this.transport = function createNoopTransport (conf, agent) { return new NoopTransport() } diff --git a/lib/instrumentation/index.js b/lib/instrumentation/index.js index d0d50b72a8..a6d6f12d23 100644 --- a/lib/instrumentation/index.js +++ b/lib/instrumentation/index.js @@ -270,11 +270,12 @@ Instrumentation.prototype.addEndedTransaction = function (transaction) { this._log.debug({ ctxmgr: this._runCtxMgr.toString() }, 'addEndedTransaction(%s)', transaction.name) } - if (agent._conf.disableSend) { - // Save effort if disableSend=true. This one log.trace related to - // disableSend is included as a possible log hint to future debugging for - // why events are not being sent to APM server. - agent.logger.trace('disableSend: skip sendTransaction') + // Avoid transaction filtering time if only propagating trace-context. + if (agent._conf.contextPropagationOnly) { + // This one log.trace related to contextPropagationOnly is included as a + // possible log hint to future debugging for why events are not being sent + // to APM server. + agent.logger.trace('contextPropagationOnly: skip sendTransaction') return } @@ -306,7 +307,8 @@ Instrumentation.prototype.addEndedSpan = function (span) { } this._log.debug({ ctxmgr: this._runCtxMgr.toString() }, 'addEndedSpan(%s)', span.name) - if (agent._conf.disableSend) { + // Avoid span encoding time if only propagating trace-context. + if (agent._conf.contextPropagationOnly) { return } diff --git a/lib/instrumentation/transaction.js b/lib/instrumentation/transaction.js index ef7ad1133d..59bfc0e79d 100644 --- a/lib/instrumentation/transaction.js +++ b/lib/instrumentation/transaction.js @@ -288,8 +288,8 @@ Transaction.prototype._captureBreakdown = function (span) { const metrics = agent._metrics const conf = agent._conf - // Quick out if disableSend=true, no point in the processing time. - if (conf.disableSend) { + // Avoid unneeded breakdown metrics processing if only propagating trace context. + if (conf.contextPropagationOnly) { return } diff --git a/lib/metrics/index.js b/lib/metrics/index.js index 7355587025..821ddd6e45 100644 --- a/lib/metrics/index.js +++ b/lib/metrics/index.js @@ -23,7 +23,7 @@ class Metrics { start (refTimers) { const metricsInterval = this[agentSymbol]._conf.metricsInterval - const enabled = metricsInterval !== 0 && !this[agentSymbol]._conf.disableSend + const enabled = metricsInterval !== 0 && !this[agentSymbol]._conf.contextPropagationOnly if (enabled) { this[registrySymbol] = new MetricsRegistry(this[agentSymbol], { reporterOptions: { diff --git a/lib/noop-transport.js b/lib/noop-transport.js index 501a6f971d..9324d02a8d 100644 --- a/lib/noop-transport.js +++ b/lib/noop-transport.js @@ -1,7 +1,10 @@ 'use strict' // A no-op (does nothing) Agent transport -- i.e. the APM server client API -// provided by elastic-apm-http-client. This is used when `disableSend=true`. +// provided by elastic-apm-http-client. +// +// This is used for some configurations (when `disableSend=true` or when +// `contextPropagationOnly=true`) and in some tests. class NoopTransport { config (opts) {} diff --git a/test/config.test.js b/test/config.test.js index 2b99d226f2..d92c77b5a3 100644 --- a/test/config.test.js +++ b/test/config.test.js @@ -58,6 +58,7 @@ var optionFixtures = [ ['captureSpanStackTraces', 'CAPTURE_SPAN_STACK_TRACES', true], ['centralConfig', 'CENTRAL_CONFIG', true], ['containerId', 'CONTAINER_ID'], + ['contextPropagationOnly', 'CONTEXT_PROPAGATION_ONLY', false], ['disableSend', 'DISABLE_SEND', false], ['disableInstrumentations', 'DISABLE_INSTRUMENTATIONS', []], ['environment', 'ENVIRONMENT', 'development'], diff --git a/test/context-propagation-only.test.js b/test/context-propagation-only.test.js new file mode 100644 index 0000000000..5508a66734 --- /dev/null +++ b/test/context-propagation-only.test.js @@ -0,0 +1,211 @@ +'use strict' + +// Test the behaviour of the agent configured with 'contextPropagationOnly=true'. + +const { Writable } = require('stream') + +const ecsFormat = require('@elastic/ecs-pino-format') +let http // required below, *after* apm.start() +const pino = require('pino') +const tape = require('tape') + +const apm = require('../') +const { NoopTransport } = require('../lib/noop-transport') +const { MockAPMServer } = require('./_mock_apm_server') + +tape.test('contextPropagationOnly', function (suite) { + let server + let serverUrl + const METRICS_INTERVAL_S = 1 + + suite.test('setup', function (t) { + server = new MockAPMServer() + server.start(function (serverUrl_) { + serverUrl = serverUrl_ + t.comment('mock APM serverUrl: ' + serverUrl) + apm.start({ + serverUrl, + serviceName: 'test-disable-send', + logLevel: 'off', + captureExceptions: false, + metricsInterval: `${METRICS_INTERVAL_S}s`, // Try to force getting metrics soon. + transport: function customTransport (conf, agent) { + throw new Error('custom transport should not get called with contextPropagationOnly=true') + }, + contextPropagationOnly: true + }) + t.comment('apm started') + + // Intentionally delay import of 'http' until after we've started the + // APM agent so HTTP requests are instrumented. + http = require('http') + + t.end() + }) + }) + + suite.test('transport should be NoopTransport if contextPropagationOnly=true', function (t) { + t.ok(apm._transport instanceof NoopTransport, 'agent transport is NoopTransport') + t.end() + }) + + // Send a request: + // client -> serviceB -> serviceA + // and assert that: + // 1. the traceparent header automatically propagates the same trace_id + // between services B and A, and + // 2. log enrichment (adding trace.id et al) still works + suite.test('ensure distributed tracing and log enrichment still works', function (t) { + let headersA + let traceparentA + let traceparentB + + class LogCapturer extends Writable { + constructor (options) { + super(options) + this.chunks = [] + } + + _write (chunk, _encoding, cb) { + this.chunks.push(chunk) + cb() + } + + getLogRecords () { + return this.chunks + .join('') + .split('\n') + .filter(line => line.trim()) + .map(line => JSON.parse(line)) + } + } + const logStream = new LogCapturer() + + // `_elasticApm` is an internal/test-only option added in + // @elastic/ecs-pino-format@1.2.0 to allow passing in the import agent + // package that isn't `require`able at "elastic-apm-node". + const log = pino(ecsFormat({ _elasticApm: apm }), logStream) + const logA = log.child({ 'event.module': 'serviceA' }) + const logB = log.child({ 'event.module': 'serviceB' }) + + const serviceA = http.createServer(function (req, res) { + logA.info('handle request') + headersA = req.headers + traceparentA = apm.currentTraceparent + t.comment(`service A traceparent: ${traceparentA}`) + res.setHeader('Service', 'A') + res.end('the answer is 42') + }) + serviceA.listen(function () { + const urlA = 'http://localhost:' + serviceA.address().port + + const serviceB = http.createServer(function (req, res) { + logB.info('handle request') + traceparentB = apm.currentTraceparent + t.comment(`service B traceparent: ${traceparentB}`) + res.setHeader('Service', 'B') + http.get(urlA, function (resA) { + let bodyA = '' + resA.on('data', function (chunk) { + bodyA += chunk + }) + resA.on('end', function () { + res.setHeader('Service', 'B') + res.end('from service A: ' + bodyA) + }) + }) + }) + serviceB.listen(function () { + const urlB = 'http://localhost:' + serviceB.address().port + + log.info('send client request') + t.equal(apm.currentTraceparent, null, + 'there is no current traceparent before our client request') + http.get(urlB, function (resB) { + log.info('client got response') + let bodyB = '' + resB.on('data', function (chunk) { + bodyB += chunk + }) + resB.on('end', function () { + t.equal(bodyB, 'from service A: the answer is 42', + 'got expected body from client request') + const traceId = traceparentA.split('-')[1] + t.equal(traceId, traceparentB.split('-')[1], + 'the trace_id from apm.currentTraceparent in service A and B requests match') + t.equal(headersA.tracestate, 'es=s:1', + 'service A request got expected "tracestate" header') + + const recs = logStream.getLogRecords() + t.equal(recs[0].trace, undefined, + `log record 0 "${recs[0].message}" has no trace.id because trace has not yet started`) + t.equal(recs[1]['event.module'], 'serviceB', + `log record 1 "${recs[1].message}" is from serviceB`) + t.equal(recs[1].trace.id, traceId, + `log record 1 "${recs[1].message}" has trace.id set ${traceId}`) + t.equal(recs[2]['event.module'], 'serviceA', + `log record 2 "${recs[1].message}" is from serviceA`) + t.equal(recs[2].trace.id, traceId, + `log record 2 "${recs[2].message}" has trace.id set ${traceId}`) + + t.equal(server.events.length, 0, 'no events sent to APM server intake') + + serviceB.close() + serviceA.close() + t.end() + }) + }) + }) + }) + }) + + suite.test('transactions and spans are not sent to APM server', function (t) { + const tx = apm.startTransaction('mytx') + const s = tx.startSpan('myspan') + setImmediate(function () { + s.end() + tx.end() + apm.flush(function onFlushed () { + t.ok(tx.id, 'transaction has an id: ' + tx.id) + t.ok(s.id, 'span has an id: ' + s.id) + t.equal(server.events.length, 0, 'no events sent to APM server intake') + t.end() + }) + }) + }) + + suite.test('errors are not sent to APM server', function (t) { + const start = process.hrtime() + apm.captureError(new Error('myboom'), function (_, errId) { + const duration = process.hrtime(start) + + t.ok(errId, 'apm.captureError still calls back with an error id: ' + errId) + + // Test captureError speed as a proxy for testing that it avoids + // stacktrace collection when contextPropagationOnly=true. It isn't a + // perfect way to test that. + const durationMs = duration[0] / 1e3 + duration[1] / 1e6 + const THRESHOLD_MS = 3 // Is this long enough for slow CI? + t.ok(durationMs < THRESHOLD_MS, `captureError is fast (<${THRESHOLD_MS}ms): ${durationMs}ms`) + + t.equal(server.events.length, 0, 'no events sent to APM server intake') + t.end() + }) + }) + + suite.test('metrics are not sent to APM server', function (t) { + setTimeout(function afterMetricsInterval () { + t.equal(server.events.length, 0, + 'after metricsInterval, no events sent to APM server intake') + t.end() + }, METRICS_INTERVAL_S * 1000) + }) + + suite.test('teardown', function (t) { + server.close() + t.end() + apm.destroy() + }) + + suite.end() +}) diff --git a/test/disable-send.test.js b/test/disable-send.test.js index a06f414f11..e33034690e 100644 --- a/test/disable-send.test.js +++ b/test/disable-send.test.js @@ -2,11 +2,6 @@ // Test the behaviour of the agent configured with 'disableSend=true'. -const { Writable } = require('stream') - -const ecsFormat = require('@elastic/ecs-pino-format') -let http // required below, *after* apm.start() -const pino = require('pino') const tape = require('tape') const apm = require('../') @@ -30,135 +25,20 @@ tape.test('disableSend', function (suite) { captureExceptions: false, metricsInterval: `${METRICS_INTERVAL_S}s`, // Try to force getting metrics soon. transport: function customTransport (conf, agent) { - throw new Error('custom transport should not get called with disableSend') + throw new Error('custom transport should not get called with disableSend=true') }, disableSend: true }) t.comment('apm started') - - // Intentionally delay import of 'http' until after we've started the - // APM agent so HTTP requests are instrumented. - http = require('http') - t.end() }) }) - suite.test('transport should be NoopTransport if disableSend', function (t) { + suite.test('transport should be NoopTransport if disableSend=true', function (t) { t.ok(apm._transport instanceof NoopTransport, 'agent transport is NoopTransport') t.end() }) - // Send a request: - // client -> serviceB -> serviceA - // and assert that: - // 1. the traceparent header automatically propagates the same trace_id - // between services B and A, and - // 2. log enrichment (adding trace.id et al) still works - suite.test('ensure distributed tracing and log enrichment still works', function (t) { - let headersA - let traceparentA - let traceparentB - - class LogCapturer extends Writable { - constructor (options) { - super(options) - this.chunks = [] - } - - _write (chunk, _encoding, cb) { - this.chunks.push(chunk) - cb() - } - - getLogRecords () { - return this.chunks - .join('') - .split('\n') - .filter(line => line.trim()) - .map(line => JSON.parse(line)) - } - } - const logStream = new LogCapturer() - - // `_elasticApm` is an internal/test-only option added in - // @elastic/ecs-pino-format@1.2.0 to allow passing in the import agent - // package that isn't `require`able at "elastic-apm-node". - const log = pino(ecsFormat({ _elasticApm: apm }), logStream) - const logA = log.child({ 'event.module': 'serviceA' }) - const logB = log.child({ 'event.module': 'serviceB' }) - - const serviceA = http.createServer(function (req, res) { - logA.info('handle request') - headersA = req.headers - traceparentA = apm.currentTraceparent - t.comment(`service A traceparent: ${traceparentA}`) - res.setHeader('Service', 'A') - res.end('the answer is 42') - }) - serviceA.listen(function () { - const urlA = 'http://localhost:' + serviceA.address().port - - const serviceB = http.createServer(function (req, res) { - logB.info('handle request') - traceparentB = apm.currentTraceparent - t.comment(`service B traceparent: ${traceparentB}`) - res.setHeader('Service', 'B') - http.get(urlA, function (resA) { - let bodyA = '' - resA.on('data', function (chunk) { - bodyA += chunk - }) - resA.on('end', function () { - res.setHeader('Service', 'B') - res.end('from service A: ' + bodyA) - }) - }) - }) - serviceB.listen(function () { - const urlB = 'http://localhost:' + serviceB.address().port - - log.info('send client request') - t.equal(apm.currentTraceparent, null, - 'there is no current traceparent before our client request') - http.get(urlB, function (resB) { - log.info('client got response') - let bodyB = '' - resB.on('data', function (chunk) { - bodyB += chunk - }) - resB.on('end', function () { - t.equal(bodyB, 'from service A: the answer is 42', - 'got expected body from client request') - const traceId = traceparentA.split('-')[1] - t.equal(traceId, traceparentB.split('-')[1], - 'the trace_id from apm.currentTraceparent in service A and B requests match') - t.equal(headersA.tracestate, 'es=s:1', - 'service A request got expected "tracestate" header') - - const recs = logStream.getLogRecords() - t.equal(recs[0].trace, undefined, - `log record 0 "${recs[0].message}" has no trace.id because trace has not yet started`) - t.equal(recs[1]['event.module'], 'serviceB', - `log record 1 "${recs[1].message}" is from serviceB`) - t.equal(recs[1].trace.id, traceId, - `log record 1 "${recs[1].message}" has trace.id set ${traceId}`) - t.equal(recs[2]['event.module'], 'serviceA', - `log record 2 "${recs[1].message}" is from serviceA`) - t.equal(recs[2].trace.id, traceId, - `log record 2 "${recs[2].message}" has trace.id set ${traceId}`) - - t.equal(server.events.length, 0, 'no events sent to APM server intake') - - serviceB.close() - serviceA.close() - t.end() - }) - }) - }) - }) - }) - suite.test('transactions and spans are not sent to APM server', function (t) { const tx = apm.startTransaction('mytx') const s = tx.startSpan('myspan') @@ -175,19 +55,8 @@ tape.test('disableSend', function (suite) { }) suite.test('errors are not sent to APM server', function (t) { - const start = process.hrtime() apm.captureError(new Error('myboom'), function (_, errId) { - const duration = process.hrtime(start) - t.ok(errId, 'apm.captureError still calls back with an error id: ' + errId) - - // Test captureError speed as a proxy for testing that it avoids - // stacktrace collection when disableSend=true. It isn't a perfect way - // to test that. - const durationMs = duration[0] / 1e3 + duration[1] / 1e6 - const THRESHOLD_MS = 3 // Is this long enough for slow CI? - t.ok(durationMs < THRESHOLD_MS, `captureError is fast (<${THRESHOLD_MS}ms): ${durationMs}ms`) - t.equal(server.events.length, 0, 'no events sent to APM server intake') t.end() }) From 834be192a5e9df21a06135e63919d9de815884a2 Mon Sep 17 00:00:00 2001 From: Trent Mick Date: Fri, 29 Oct 2021 10:58:44 -0700 Subject: [PATCH 2/2] fix changelog spacing after merge and clarify the disableSend behaviour change in this version --- CHANGELOG.asciidoc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index b78e3928c9..630e054d27 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -33,6 +33,16 @@ Notes: [float] ===== Breaking changes +* Change <> to no longer skip internal processing + work. It now *only* disables communication with APM Server. Use + <> if your use case is + to limit the APM agent's processing to the minimum to support context + propagation and log correlation. ++ +This is listed under "Breaking changes" as a heads-up. The only possible +negative result of this `disableSend` change is some extra CPU processing time +by the agent. There is no outward functionality change. + [float] ===== Features @@ -44,6 +54,7 @@ Notes: + This is similar to <>, but differs in that `contextPropagationOnly: true` tells the agent to skip unnecessary work. + * The User-Agent header used for communication with APM Server now includes the `serviceName` and `serviceVersion`. For some users this can be https://github.com/elastic/apm/issues/509[helpful for APM Server log analysis].