Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use sampling on timeline events #4861

Merged
merged 3 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 2 additions & 7 deletions integration-tests/profiler/profiler.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,9 @@ async function gatherNetworkTimelineEvents (cwd, scriptFilePath, eventType, args
const proc = fork(path.join(cwd, scriptFilePath), args, {
cwd,
env: {
DD_PROFILING_PROFILERS: 'wall',
DD_PROFILING_EXPORTERS: 'file',
DD_PROFILING_ENABLED: 1,
DD_PROFILING_EXPERIMENTAL_TIMELINE_ENABLED: 1
DD_INTERNAL_PROFILING_TIMELINE_SAMPLING_ENABLED: 0 // capture all events
}
})

Expand Down Expand Up @@ -205,12 +204,8 @@ describe('profiler', () => {
const proc = fork(path.join(cwd, 'profiler/codehotspots.js'), {
cwd,
env: {
DD_PROFILING_PROFILERS: 'wall',
DD_PROFILING_EXPORTERS: 'file',
DD_PROFILING_ENABLED: 1,
DD_PROFILING_CODEHOTSPOTS_ENABLED: 1,
DD_PROFILING_ENDPOINT_COLLECTION_ENABLED: 1,
DD_PROFILING_EXPERIMENTAL_TIMELINE_ENABLED: 1
DD_PROFILING_ENABLED: 1
}
})

Expand Down
3 changes: 3 additions & 0 deletions packages/dd-trace/src/profiling/config.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class Config {
const {
DD_AGENT_HOST,
DD_ENV,
DD_INTERNAL_PROFILING_TIMELINE_SAMPLING_ENABLED, // used for testing
DD_PROFILING_CODEHOTSPOTS_ENABLED,
DD_PROFILING_CPU_ENABLED,
DD_PROFILING_DEBUG_SOURCE_MAPS,
Expand Down Expand Up @@ -175,6 +176,8 @@ class Config {
DD_PROFILING_EXPERIMENTAL_TIMELINE_ENABLED, samplingContextsAvailable))
logExperimentalVarDeprecation('TIMELINE_ENABLED')
checkOptionWithSamplingContextAllowed(this.timelineEnabled, 'Timeline view')
this.timelineSamplingEnabled = isTrue(coalesce(options.timelineSamplingEnabled,
DD_INTERNAL_PROFILING_TIMELINE_SAMPLING_ENABLED, true))

this.codeHotspotsEnabled = isTrue(coalesce(options.codeHotspotsEnabled,
DD_PROFILING_CODEHOTSPOTS_ENABLED,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@ const { performance } = require('perf_hooks')
// We are leveraging the TracingPlugin class for its functionality to bind
// start/error/finish methods to the appropriate diagnostic channels.
class EventPlugin extends TracingPlugin {
constructor (eventHandler) {
constructor (eventHandler, eventFilter) {
super()
this.eventHandler = eventHandler
this.eventFilter = eventFilter
this.store = storage('profiling')
this.entryType = this.constructor.entryType
}
Expand All @@ -30,17 +31,20 @@ class EventPlugin extends TracingPlugin {
}
const duration = performance.now() - startTime

const context = this.activeSpan?.context()
const _ddSpanId = context?.toSpanId()
const _ddRootSpanId = context?._trace.started[0]?.context().toSpanId() || _ddSpanId

const event = {
entryType: this.entryType,
startTime,
duration,
_ddSpanId,
_ddRootSpanId
duration
}

if (!this.eventFilter(event)) {
return
}

const context = this.activeSpan?.context()
event._ddSpanId = context?.toSpanId()
event._ddRootSpanId = context?._trace.started[0]?.context().toSpanId() || event._ddSpanId

this.eventHandler(this.extendEvent(event, startEvent))
}
}
Expand Down
55 changes: 47 additions & 8 deletions packages/dd-trace/src/profiling/profilers/events.js
Original file line number Diff line number Diff line change
Expand Up @@ -254,10 +254,10 @@ class NodeApiEventSource {
}

class DatadogInstrumentationEventSource {
constructor (eventHandler) {
constructor (eventHandler, eventFilter) {
this.plugins = ['dns_lookup', 'dns_lookupservice', 'dns_resolve', 'dns_reverse', 'net'].map(m => {
const Plugin = require(`./event_plugins/${m}`)
return new Plugin(eventHandler)
return new Plugin(eventHandler, eventFilter)
})

this.started = false
Expand Down Expand Up @@ -292,29 +292,68 @@ class CompositeEventSource {
}
}

function createPossionProcessSamplingFilter (samplingIntervalMillis) {
let nextSamplingInstant = performance.now()
let currentSamplingInstant = 0
setNextSamplingInstant()

return event => {
const endTime = event.startTime + event.duration
while (endTime >= nextSamplingInstant) {
setNextSamplingInstant()
}
// An event is sampled if it started before, and ended on or after a sampling instant. The above
// while loop will ensure that the ending invariant is always true for the current sampling
// instant so we don't have to test for it below. Across calls, the invariant also holds as long
// as the events arrive in endTime order. This is true for events coming from
// DatadogInstrumentationEventSource; they will be ordered by endTime by virtue of this method
// being invoked synchronously with the plugins' finish() handler which evaluates
// performance.now(). OTOH, events coming from NodeAPIEventSource (GC in typical setup) might be
// somewhat delayed as they are queued by Node, so they can arrive out of order with regard to
// events coming from the non-queued source. By omitting the endTime check, we will pass through
// some short events that started and ended before the current sampling instant. OTOH, if we
// were to check for this.currentSamplingInstant <= endTime, we would discard some long events
// that also ended before the current sampling instant. We'd rather err on the side of including
// some short events than excluding some long events.
return event.startTime < currentSamplingInstant
}

function setNextSamplingInstant () {
currentSamplingInstant = nextSamplingInstant
nextSamplingInstant -= Math.log(1 - Math.random()) * samplingIntervalMillis
}
}

/**
* This class generates pprof files with timeline events. It combines an event
* source with an event serializer.
* source with a sampling event filter and an event serializer.
*/
class EventsProfiler {
constructor (options = {}) {
this.type = 'events'
this.eventSerializer = new EventSerializer()

const eventHandler = event => {
this.eventSerializer.addEvent(event)
const eventHandler = event => this.eventSerializer.addEvent(event)
const eventFilter = options.timelineSamplingEnabled
// options.samplingInterval comes in microseconds, we need millis
? createPossionProcessSamplingFilter((options.samplingInterval ?? 1e6 / 99) / 1000)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was surprised to see that we never define options.samplingInterval in the config and therefore rely on the defaults set in WallProfiler / EventsProfiler

: _ => true
const filteringEventHandler = event => {
if (eventFilter(event)) {
eventHandler(event)
}
}

if (options.codeHotspotsEnabled) {
// Use Datadog instrumentation to collect events with span IDs. Still use
// Node API for GC events.
this.eventSource = new CompositeEventSource([
new DatadogInstrumentationEventSource(eventHandler),
new NodeApiEventSource(eventHandler, ['gc'])
new DatadogInstrumentationEventSource(eventHandler, eventFilter),
new NodeApiEventSource(filteringEventHandler, ['gc'])
])
} else {
// Use Node API instrumentation to collect events without span IDs
this.eventSource = new NodeApiEventSource(eventHandler)
this.eventSource = new NodeApiEventSource(filteringEventHandler)
}
}

Expand Down
Loading