From 850b12a688cd55bcec5ed750203881378247e272 Mon Sep 17 00:00:00 2001 From: Richard Startin Date: Tue, 22 Oct 2024 12:39:40 +0100 Subject: [PATCH 1/5] apply rate limit to queue events --- .../java/concurrent/QueueTimerHelper.java | 19 +++++++++- .../instrumentation/jfr/WindowSampler.java | 15 -------- .../jfr/backpressure/BackpressureSampler.java | 2 ++ .../DirectAllocationSampler.java | 2 ++ .../jfr/exceptions/ExceptionSampler.java | 2 ++ .../api/sampling/PerRecordingRateLimiter.java | 36 +++++++++++++++++++ 6 files changed, 60 insertions(+), 16 deletions(-) create mode 100644 internal-api/src/main/java/datadog/trace/api/sampling/PerRecordingRateLimiter.java diff --git a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/java/concurrent/QueueTimerHelper.java b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/java/concurrent/QueueTimerHelper.java index ce488c1f1c3..56d7e215cb2 100644 --- a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/java/concurrent/QueueTimerHelper.java +++ b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/java/concurrent/QueueTimerHelper.java @@ -1,13 +1,27 @@ package datadog.trace.bootstrap.instrumentation.java.concurrent; +import datadog.trace.api.config.ProfilingConfig; import datadog.trace.api.profiling.QueueTiming; import datadog.trace.api.profiling.Timer; +import datadog.trace.api.sampling.PerRecordingRateLimiter; import datadog.trace.bootstrap.ContextStore; +import datadog.trace.bootstrap.config.provider.ConfigProvider; import datadog.trace.bootstrap.instrumentation.api.AgentTracer; import datadog.trace.bootstrap.instrumentation.jfr.InstrumentationBasedProfiling; +import java.time.Duration; +import java.time.temporal.ChronoUnit; public class QueueTimerHelper { + private static final PerRecordingRateLimiter RATE_LIMITER = + new PerRecordingRateLimiter( + Duration.of(500, ChronoUnit.MILLIS), + 10_000, // hard limit on queue events + ConfigProvider.getInstance() + .getInteger( + ProfilingConfig.PROFILING_UPLOAD_PERIOD, + ProfilingConfig.PROFILING_UPLOAD_PERIOD_DEFAULT)); + public static void startQueuingTimer( ContextStore taskContextStore, Class schedulerClass, T task) { State state = taskContextStore.get(task); @@ -17,7 +31,10 @@ public static void startQueuingTimer( public static void startQueuingTimer(State state, Class schedulerClass, Object task) { // avoid calling this before JFR is initialised because it will lead to reading the wrong // TSC frequency before JFR has set it up properly - if (task != null && state != null && InstrumentationBasedProfiling.isJFRReady()) { + if (task != null + && state != null + && InstrumentationBasedProfiling.isJFRReady() + && RATE_LIMITER.permit()) { QueueTiming timing = (QueueTiming) AgentTracer.get().getProfilingContext().start(Timer.TimerType.QUEUEING); timing.setTask(task); diff --git a/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/WindowSampler.java b/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/WindowSampler.java index f6489a29c2f..d49dfd83ad5 100644 --- a/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/WindowSampler.java +++ b/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/WindowSampler.java @@ -2,7 +2,6 @@ import datadog.trace.api.sampling.AdaptiveSampler; import java.time.Duration; -import java.time.temporal.ChronoUnit; import jdk.jfr.Event; import jdk.jfr.EventType; @@ -24,18 +23,4 @@ public void start() { public boolean sample() { return sampleType.isEnabled() && sampler.sample(); } - - protected static int samplingWindowsPerRecording( - long uploadPeriodSeconds, Duration samplingWindow) { - /* - * Java8 doesn't have dividedBy#Duration so we have to implement poor man's version. - * None of these durations should be big enough to warrant dealing with bigints. - * We also do not care about nanoseconds here. - */ - return (int) - Math.min( - Duration.of(uploadPeriodSeconds, ChronoUnit.SECONDS).toMillis() - / samplingWindow.toMillis(), - Integer.MAX_VALUE); - } } diff --git a/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/backpressure/BackpressureSampler.java b/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/backpressure/BackpressureSampler.java index 1469b71f28e..279d7f86f4b 100644 --- a/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/backpressure/BackpressureSampler.java +++ b/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/backpressure/BackpressureSampler.java @@ -1,5 +1,7 @@ package datadog.trace.bootstrap.instrumentation.jfr.backpressure; +import static datadog.trace.api.sampling.PerRecordingRateLimiter.samplingWindowsPerRecording; + import datadog.trace.api.Config; import datadog.trace.bootstrap.instrumentation.jfr.WindowSampler; import java.time.Duration; diff --git a/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/directallocation/DirectAllocationSampler.java b/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/directallocation/DirectAllocationSampler.java index fdf50dd581e..41bfe82a84f 100644 --- a/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/directallocation/DirectAllocationSampler.java +++ b/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/directallocation/DirectAllocationSampler.java @@ -1,5 +1,7 @@ package datadog.trace.bootstrap.instrumentation.jfr.directallocation; +import static datadog.trace.api.sampling.PerRecordingRateLimiter.samplingWindowsPerRecording; + import datadog.trace.api.Config; import datadog.trace.bootstrap.instrumentation.jfr.WindowSampler; import java.time.Duration; diff --git a/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/exceptions/ExceptionSampler.java b/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/exceptions/ExceptionSampler.java index 71ea45a278f..2dfae36a634 100644 --- a/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/exceptions/ExceptionSampler.java +++ b/dd-java-agent/agent-bootstrap/src/main/java11/datadog/trace/bootstrap/instrumentation/jfr/exceptions/ExceptionSampler.java @@ -1,5 +1,7 @@ package datadog.trace.bootstrap.instrumentation.jfr.exceptions; +import static datadog.trace.api.sampling.PerRecordingRateLimiter.samplingWindowsPerRecording; + import datadog.trace.api.Config; import datadog.trace.bootstrap.instrumentation.jfr.WindowSampler; import java.time.Duration; diff --git a/internal-api/src/main/java/datadog/trace/api/sampling/PerRecordingRateLimiter.java b/internal-api/src/main/java/datadog/trace/api/sampling/PerRecordingRateLimiter.java new file mode 100644 index 00000000000..c3a7b7cbcc4 --- /dev/null +++ b/internal-api/src/main/java/datadog/trace/api/sampling/PerRecordingRateLimiter.java @@ -0,0 +1,36 @@ +package datadog.trace.api.sampling; + +import java.time.Duration; +import java.time.temporal.ChronoUnit; + +public class PerRecordingRateLimiter { + + private final AdaptiveSampler sampler; + + public PerRecordingRateLimiter(Duration windowDuration, int limit, int recordingLength) { + int lookback = samplingWindowsPerRecording(recordingLength, windowDuration); + int samplesPerWindow = limit / samplingWindowsPerRecording(recordingLength, windowDuration); + sampler = new AdaptiveSampler(windowDuration, samplesPerWindow, lookback, 16, false); + } + + public void start() { + sampler.start(); + } + + public boolean permit() { + return sampler.sample(); + } + + public static int samplingWindowsPerRecording(long uploadPeriodSeconds, Duration samplingWindow) { + /* + * Java8 doesn't have dividedBy#Duration so we have to implement poor man's version. + * None of these durations should be big enough to warrant dealing with bigints. + * We also do not care about nanoseconds here. + */ + return (int) + Math.min( + Duration.of(uploadPeriodSeconds, ChronoUnit.SECONDS).toMillis() + / samplingWindow.toMillis(), + Integer.MAX_VALUE); + } +} From 60ecdaa7e3a0fa40491d9f2273f4b8c1e6c5edc8 Mon Sep 17 00:00:00 2001 From: Richard Startin Date: Mon, 28 Oct 2024 13:28:50 +0000 Subject: [PATCH 2/5] test for PerRecordingRateLimiter --- .../java/concurrent/QueueTimerHelper.java | 9 ++--- .../api/sampling/PerRecordingRateLimiter.java | 20 +++++++---- .../sampling/PerRecordingRateLimiterTest.java | 34 +++++++++++++++++++ 3 files changed, 52 insertions(+), 11 deletions(-) create mode 100644 internal-api/src/test/java/datadog/trace/api/sampling/PerRecordingRateLimiterTest.java diff --git a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/java/concurrent/QueueTimerHelper.java b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/java/concurrent/QueueTimerHelper.java index 56d7e215cb2..0dcb62d4c4f 100644 --- a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/java/concurrent/QueueTimerHelper.java +++ b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/java/concurrent/QueueTimerHelper.java @@ -17,10 +17,11 @@ public class QueueTimerHelper { new PerRecordingRateLimiter( Duration.of(500, ChronoUnit.MILLIS), 10_000, // hard limit on queue events - ConfigProvider.getInstance() - .getInteger( - ProfilingConfig.PROFILING_UPLOAD_PERIOD, - ProfilingConfig.PROFILING_UPLOAD_PERIOD_DEFAULT)); + Duration.ofSeconds( + ConfigProvider.getInstance() + .getInteger( + ProfilingConfig.PROFILING_UPLOAD_PERIOD, + ProfilingConfig.PROFILING_UPLOAD_PERIOD_DEFAULT))); public static void startQueuingTimer( ContextStore taskContextStore, Class schedulerClass, T task) { diff --git a/internal-api/src/main/java/datadog/trace/api/sampling/PerRecordingRateLimiter.java b/internal-api/src/main/java/datadog/trace/api/sampling/PerRecordingRateLimiter.java index c3a7b7cbcc4..42020a11923 100644 --- a/internal-api/src/main/java/datadog/trace/api/sampling/PerRecordingRateLimiter.java +++ b/internal-api/src/main/java/datadog/trace/api/sampling/PerRecordingRateLimiter.java @@ -7,18 +7,24 @@ public class PerRecordingRateLimiter { private final AdaptiveSampler sampler; - public PerRecordingRateLimiter(Duration windowDuration, int limit, int recordingLength) { - int lookback = samplingWindowsPerRecording(recordingLength, windowDuration); - int samplesPerWindow = limit / samplingWindowsPerRecording(recordingLength, windowDuration); - sampler = new AdaptiveSampler(windowDuration, samplesPerWindow, lookback, 16, false); + public PerRecordingRateLimiter(Duration windowDuration, int limit, Duration recordingLength) { + this(windowDuration, limit, recordingLength, 16); } - public void start() { - sampler.start(); + public PerRecordingRateLimiter( + Duration windowDuration, int limit, Duration recordingLength, int budgetLookback) { + int lookback = samplingWindowsPerRecording(recordingLength.getSeconds(), windowDuration); + int samplesPerWindow = + limit / samplingWindowsPerRecording(recordingLength.getSeconds(), windowDuration); + sampler = new AdaptiveSampler(windowDuration, samplesPerWindow, lookback, budgetLookback, true); } public boolean permit() { - return sampler.sample(); + if (!sampler.sample()) { + return false; + } + return true; + // return sampler.sample(); } public static int samplingWindowsPerRecording(long uploadPeriodSeconds, Duration samplingWindow) { diff --git a/internal-api/src/test/java/datadog/trace/api/sampling/PerRecordingRateLimiterTest.java b/internal-api/src/test/java/datadog/trace/api/sampling/PerRecordingRateLimiterTest.java new file mode 100644 index 00000000000..2b32c9c2cb5 --- /dev/null +++ b/internal-api/src/test/java/datadog/trace/api/sampling/PerRecordingRateLimiterTest.java @@ -0,0 +1,34 @@ +package datadog.trace.api.sampling; + +import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.time.Duration; +import java.util.Arrays; +import org.junit.jupiter.api.Test; + +public class PerRecordingRateLimiterTest { + + @Test + public void testLimitApplied() { + Duration window = Duration.ofMillis(100); + int limit = 20; + Duration recordingDurationSeconds = Duration.ofSeconds(1); + PerRecordingRateLimiter rateLimiter = + new PerRecordingRateLimiter(window, limit, recordingDurationSeconds, 5); + // no rate limiting is applied during the first window + int[] slots = new int[(int) (recordingDurationSeconds.toMillis() / window.toMillis())]; + long start = System.nanoTime(); + while (true) { + int slot = (int) (NANOSECONDS.toMillis(System.nanoTime() - start) / window.toMillis()); + if (slot >= slots.length) { + break; + } + if (rateLimiter.permit()) { + slots[slot]++; + } + } + assertTrue(Arrays.stream(slots).max().orElse(limit + 1) <= limit); + assertTrue(Arrays.stream(slots).sum() <= 2 * limit); + } +} From 715316cf610384b6402b0793fb72225a5df0adbe Mon Sep 17 00:00:00 2001 From: Richard Startin Date: Tue, 29 Oct 2024 10:20:33 +0000 Subject: [PATCH 3/5] graal native image --- .../NativeImageGeneratorRunnerInstrumentation.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dd-java-agent/instrumentation/graal/native-image/src/main/java/datadog/trace/instrumentation/graal/nativeimage/NativeImageGeneratorRunnerInstrumentation.java b/dd-java-agent/instrumentation/graal/native-image/src/main/java/datadog/trace/instrumentation/graal/nativeimage/NativeImageGeneratorRunnerInstrumentation.java index c8a53764447..28b6fed47df 100644 --- a/dd-java-agent/instrumentation/graal/native-image/src/main/java/datadog/trace/instrumentation/graal/nativeimage/NativeImageGeneratorRunnerInstrumentation.java +++ b/dd-java-agent/instrumentation/graal/native-image/src/main/java/datadog/trace/instrumentation/graal/nativeimage/NativeImageGeneratorRunnerInstrumentation.java @@ -98,6 +98,10 @@ public static void onEnter(@Advice.Argument(value = 0, readOnly = false) String[ + "datadog.trace.bootstrap.instrumentation.java.concurrent.ConcurrentState:build_time," + "datadog.trace.bootstrap.instrumentation.java.concurrent.ExcludeFilter:build_time," + "datadog.trace.bootstrap.instrumentation.java.concurrent.QueueTimeHelper:build_time," + + "datadog.trace.api.sampling.AdaptiveSampler:build_time," + + "datadog.trace.api.sampling.AdaptiveSampler$Counts:build_time," + + "datadog.trace.api.sampling.AdaptiveSampler$RollWindowTask:build_time," + + "datadog.trace.api.sampling.PerRecordingRateLimiter:build_time," + "datadog.trace.bootstrap.instrumentation.java.concurrent.TPEHelper:build_time," + "datadog.trace.bootstrap.instrumentation.jfr.exceptions.ExceptionCountEvent:build_time," + "datadog.trace.bootstrap.instrumentation.jfr.exceptions.ExceptionSampleEvent:build_time," From 58474f862cee47a682257a5209b9176e934cc75c Mon Sep 17 00:00:00 2001 From: Richard Startin Date: Tue, 29 Oct 2024 11:44:36 +0000 Subject: [PATCH 4/5] remove debugging code --- .../datadog/trace/api/sampling/PerRecordingRateLimiter.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/internal-api/src/main/java/datadog/trace/api/sampling/PerRecordingRateLimiter.java b/internal-api/src/main/java/datadog/trace/api/sampling/PerRecordingRateLimiter.java index 42020a11923..fdee9232459 100644 --- a/internal-api/src/main/java/datadog/trace/api/sampling/PerRecordingRateLimiter.java +++ b/internal-api/src/main/java/datadog/trace/api/sampling/PerRecordingRateLimiter.java @@ -20,11 +20,7 @@ public PerRecordingRateLimiter( } public boolean permit() { - if (!sampler.sample()) { - return false; - } - return true; - // return sampler.sample(); + return sampler.sample(); } public static int samplingWindowsPerRecording(long uploadPeriodSeconds, Duration samplingWindow) { From 116c7d8357757bb3bb68281797e4d4c2ef012e3a Mon Sep 17 00:00:00 2001 From: Richard Startin Date: Tue, 29 Oct 2024 12:15:31 +0000 Subject: [PATCH 5/5] no queue time on graal native image - requires too much build time initialisation to support rate limiting --- .../java/concurrent/QueueTimerHelper.java | 29 ++++++++++++------- ...veImageGeneratorRunnerInstrumentation.java | 4 --- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/java/concurrent/QueueTimerHelper.java b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/java/concurrent/QueueTimerHelper.java index 0dcb62d4c4f..e05f1da155a 100644 --- a/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/java/concurrent/QueueTimerHelper.java +++ b/dd-java-agent/agent-bootstrap/src/main/java/datadog/trace/bootstrap/instrumentation/java/concurrent/QueueTimerHelper.java @@ -1,5 +1,6 @@ package datadog.trace.bootstrap.instrumentation.java.concurrent; +import datadog.trace.api.Platform; import datadog.trace.api.config.ProfilingConfig; import datadog.trace.api.profiling.QueueTiming; import datadog.trace.api.profiling.Timer; @@ -13,15 +14,19 @@ public class QueueTimerHelper { - private static final PerRecordingRateLimiter RATE_LIMITER = - new PerRecordingRateLimiter( - Duration.of(500, ChronoUnit.MILLIS), - 10_000, // hard limit on queue events - Duration.ofSeconds( - ConfigProvider.getInstance() - .getInteger( - ProfilingConfig.PROFILING_UPLOAD_PERIOD, - ProfilingConfig.PROFILING_UPLOAD_PERIOD_DEFAULT))); + private static final class RateLimiterHolder { + // indirection to prevent needing to instantiate the class and its transitive dependencies + // in graal native image + private static final PerRecordingRateLimiter RATE_LIMITER = + new PerRecordingRateLimiter( + Duration.of(500, ChronoUnit.MILLIS), + 10_000, // hard limit on queue events + Duration.ofSeconds( + ConfigProvider.getInstance() + .getInteger( + ProfilingConfig.PROFILING_UPLOAD_PERIOD, + ProfilingConfig.PROFILING_UPLOAD_PERIOD_DEFAULT))); + } public static void startQueuingTimer( ContextStore taskContextStore, Class schedulerClass, T task) { @@ -30,12 +35,16 @@ public static void startQueuingTimer( } public static void startQueuingTimer(State state, Class schedulerClass, Object task) { + if (Platform.isNativeImageBuilder()) { + // explicitly not supported for Graal native image + return; + } // avoid calling this before JFR is initialised because it will lead to reading the wrong // TSC frequency before JFR has set it up properly if (task != null && state != null && InstrumentationBasedProfiling.isJFRReady() - && RATE_LIMITER.permit()) { + && RateLimiterHolder.RATE_LIMITER.permit()) { QueueTiming timing = (QueueTiming) AgentTracer.get().getProfilingContext().start(Timer.TimerType.QUEUEING); timing.setTask(task); diff --git a/dd-java-agent/instrumentation/graal/native-image/src/main/java/datadog/trace/instrumentation/graal/nativeimage/NativeImageGeneratorRunnerInstrumentation.java b/dd-java-agent/instrumentation/graal/native-image/src/main/java/datadog/trace/instrumentation/graal/nativeimage/NativeImageGeneratorRunnerInstrumentation.java index 28b6fed47df..c8a53764447 100644 --- a/dd-java-agent/instrumentation/graal/native-image/src/main/java/datadog/trace/instrumentation/graal/nativeimage/NativeImageGeneratorRunnerInstrumentation.java +++ b/dd-java-agent/instrumentation/graal/native-image/src/main/java/datadog/trace/instrumentation/graal/nativeimage/NativeImageGeneratorRunnerInstrumentation.java @@ -98,10 +98,6 @@ public static void onEnter(@Advice.Argument(value = 0, readOnly = false) String[ + "datadog.trace.bootstrap.instrumentation.java.concurrent.ConcurrentState:build_time," + "datadog.trace.bootstrap.instrumentation.java.concurrent.ExcludeFilter:build_time," + "datadog.trace.bootstrap.instrumentation.java.concurrent.QueueTimeHelper:build_time," - + "datadog.trace.api.sampling.AdaptiveSampler:build_time," - + "datadog.trace.api.sampling.AdaptiveSampler$Counts:build_time," - + "datadog.trace.api.sampling.AdaptiveSampler$RollWindowTask:build_time," - + "datadog.trace.api.sampling.PerRecordingRateLimiter:build_time," + "datadog.trace.bootstrap.instrumentation.java.concurrent.TPEHelper:build_time," + "datadog.trace.bootstrap.instrumentation.jfr.exceptions.ExceptionCountEvent:build_time," + "datadog.trace.bootstrap.instrumentation.jfr.exceptions.ExceptionSampleEvent:build_time,"