From 63d3bd50ce95e13d4cf68a17ff88e7ead00c03d3 Mon Sep 17 00:00:00 2001 From: David Schlosnagle Date: Fri, 6 Sep 2024 13:29:25 -0400 Subject: [PATCH 1/3] AutobatcherTelemetryComponents stores metric references --- .../AutobatcherTelemetryComponents.java | 62 ++++++++++++------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/atlasdb-autobatch/src/main/java/com/palantir/atlasdb/autobatch/AutobatcherTelemetryComponents.java b/atlasdb-autobatch/src/main/java/com/palantir/atlasdb/autobatch/AutobatcherTelemetryComponents.java index 57a3b0a8926..f29101de1e6 100644 --- a/atlasdb-autobatch/src/main/java/com/palantir/atlasdb/autobatch/AutobatcherTelemetryComponents.java +++ b/atlasdb-autobatch/src/main/java/com/palantir/atlasdb/autobatch/AutobatcherTelemetryComponents.java @@ -17,16 +17,25 @@ package com.palantir.atlasdb.autobatch; import com.codahale.metrics.Gauge; +import com.codahale.metrics.Histogram; import com.palantir.tritium.metrics.registry.TaggedMetricRegistry; import java.time.Duration; public final class AutobatcherTelemetryComponents { private final String safeLoggablePurpose; private final AutobatchOverheadMetrics overheadMetrics; + private final Histogram waitTimer; + private final Histogram runningTimer; + private final Histogram waitTimeHistogram; + private final Histogram totalTimer; private AutobatcherTelemetryComponents(String safeLoggablePurpose, AutobatchOverheadMetrics overheadMetrics) { this.safeLoggablePurpose = safeLoggablePurpose; this.overheadMetrics = overheadMetrics; + this.waitTimer = overheadMetrics.waitTimeNanos(); + this.runningTimer = overheadMetrics.runningTimeNanos(); + this.waitTimeHistogram = overheadMetrics.waitTimePercentage(); + this.totalTimer = overheadMetrics.totalTimeNanos(); } String getSafeLoggablePurpose() { @@ -45,19 +54,19 @@ void markWaitingTimeAndRunningTimeMetrics(Duration waitTime, Duration runningTim } void markWaitingTimeMetrics(Duration waitTime) { - overheadMetrics.waitTimeNanos().update(waitTime.toNanos()); + this.waitTimer.update(waitTime.toNanos()); } private void markRunningTimeMetrics(Duration runningTime) { - overheadMetrics.runningTimeNanos().update(runningTime.toNanos()); + this.runningTimer.update(runningTime.toNanos()); } private void markTotalTimeMetrics(long totalTimeNanos) { - overheadMetrics.totalTimeNanos().update(totalTimeNanos); + this.totalTimer.update(totalTimeNanos); } private void markWaitingTimePercentage(long waitTimePercentage) { - overheadMetrics.waitTimePercentage().update(waitTimePercentage); + this.waitTimeHistogram.update(waitTimePercentage); } public static AutobatcherTelemetryComponents create( @@ -67,26 +76,31 @@ public static AutobatcherTelemetryComponents create( .operationType(safeLoggablePurpose) .build(); - overheadMetrics.waitTimeNanosP1((Gauge) - () -> overheadMetrics.waitTimeNanos().getSnapshot().getValue(0.01)); - overheadMetrics.waitTimeNanosMedian((Gauge) - () -> overheadMetrics.waitTimeNanos().getSnapshot().getValue(0.5)); - - overheadMetrics.waitTimePercentageP1((Gauge) - () -> overheadMetrics.waitTimePercentage().getSnapshot().getValue(0.01)); - overheadMetrics.waitTimePercentageMedian((Gauge) - () -> overheadMetrics.waitTimePercentage().getSnapshot().getValue(0.5)); - - overheadMetrics.runningTimeNanosP1((Gauge) - () -> overheadMetrics.runningTimeNanos().getSnapshot().getValue(0.01)); - overheadMetrics.runningTimeNanosMedian((Gauge) - () -> overheadMetrics.runningTimeNanos().getSnapshot().getValue(0.5)); - - overheadMetrics.totalTimeNanosP1((Gauge) - () -> overheadMetrics.totalTimeNanos().getSnapshot().getValue(0.01)); - overheadMetrics.totalTimeNanosMedian((Gauge) - () -> overheadMetrics.totalTimeNanos().getSnapshot().getValue(0.5)); + AutobatcherTelemetryComponents autobatcherTelemetry = + new AutobatcherTelemetryComponents(safeLoggablePurpose, overheadMetrics); + autobatcherTelemetry.registerGauges(); + return autobatcherTelemetry; + } - return new AutobatcherTelemetryComponents(safeLoggablePurpose, overheadMetrics); + private void registerGauges() { + overheadMetrics.waitTimeNanosP1( + (Gauge) () -> waitTimer.getSnapshot().getValue(0.01)); + overheadMetrics.waitTimeNanosMedian( + (Gauge) () -> waitTimer.getSnapshot().getValue(0.5)); + + overheadMetrics.waitTimePercentageP1( + (Gauge) () -> waitTimeHistogram.getSnapshot().getValue(0.01)); + overheadMetrics.waitTimePercentageMedian( + (Gauge) () -> waitTimeHistogram.getSnapshot().getValue(0.5)); + + overheadMetrics.runningTimeNanosP1( + (Gauge) () -> runningTimer.getSnapshot().getValue(0.01)); + overheadMetrics.runningTimeNanosMedian( + (Gauge) () -> runningTimer.getSnapshot().getValue(0.5)); + + overheadMetrics.totalTimeNanosP1( + (Gauge) () -> totalTimer.getSnapshot().getValue(0.01)); + overheadMetrics.totalTimeNanosMedian( + (Gauge) () -> totalTimer.getSnapshot().getValue(0.5)); } } From 9f4405b4c18ab3f2853ae5b1ad304d12a91a645f Mon Sep 17 00:00:00 2001 From: David Schlosnagle Date: Sat, 7 Sep 2024 08:16:16 -0400 Subject: [PATCH 2/3] update tests --- .../AutobatcherTelemetryComponentsTest.java | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/atlasdb-autobatch/src/test/java/com/palantir/atlasdb/autobatch/AutobatcherTelemetryComponentsTest.java b/atlasdb-autobatch/src/test/java/com/palantir/atlasdb/autobatch/AutobatcherTelemetryComponentsTest.java index 13002f7d716..384395a9a97 100644 --- a/atlasdb-autobatch/src/test/java/com/palantir/atlasdb/autobatch/AutobatcherTelemetryComponentsTest.java +++ b/atlasdb-autobatch/src/test/java/com/palantir/atlasdb/autobatch/AutobatcherTelemetryComponentsTest.java @@ -116,19 +116,25 @@ private void assertTotalTimeMetricsAreReported(TaggedMetricRegistry registry, Lo } private void assertWaitTimeMetricsAreNotReported(TaggedMetricRegistry registry) { - assertThat(getWaitTimeHistogram(registry)).isNull(); + assertThat(getWaitTimeHistogram(registry)).isNotNull().satisfies(histogram -> assertThat(histogram.getCount()) + .isZero()); } private void assertRunningTimeMetricsAreNotReported(TaggedMetricRegistry registry) { - assertThat(getRunningTimeHistogram(registry)).isNull(); + assertThat(getRunningTimeHistogram(registry)) + .isNotNull() + .satisfies(histogram -> assertThat(histogram.getCount()).isZero()); } private void assertWaitTimePercentageMetricsAreNotReported(TaggedMetricRegistry registry) { - assertThat(getWaitTimePercentageHistogram(registry)).isNull(); + assertThat(getWaitTimePercentageHistogram(registry)) + .isNotNull() + .satisfies(histogram -> assertThat(histogram.getCount()).isZero()); } private void assertTotalTimeMetricsAreNotReported(TaggedMetricRegistry registry) { - assertThat(getTotalTimeHistogram(registry)).isNull(); + assertThat(getTotalTimeHistogram(registry)).isNotNull().satisfies(histogram -> assertThat(histogram.getCount()) + .isZero()); } private static Histogram getWaitTimeHistogram(TaggedMetricRegistry registry) { From 660e1483fcc4c6fb7b38f9adba8fd672d7e67112 Mon Sep 17 00:00:00 2001 From: David Schlosnagle Date: Sat, 7 Sep 2024 08:41:04 -0400 Subject: [PATCH 3/3] memoize gauge snapshots --- .../AutobatcherTelemetryComponents.java | 29 ++++++++++++++----- .../autobatch/DisruptorAutobatcherTest.java | 21 +++++++++----- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/atlasdb-autobatch/src/main/java/com/palantir/atlasdb/autobatch/AutobatcherTelemetryComponents.java b/atlasdb-autobatch/src/main/java/com/palantir/atlasdb/autobatch/AutobatcherTelemetryComponents.java index f29101de1e6..d77b537b62a 100644 --- a/atlasdb-autobatch/src/main/java/com/palantir/atlasdb/autobatch/AutobatcherTelemetryComponents.java +++ b/atlasdb-autobatch/src/main/java/com/palantir/atlasdb/autobatch/AutobatcherTelemetryComponents.java @@ -18,8 +18,11 @@ import com.codahale.metrics.Gauge; import com.codahale.metrics.Histogram; +import com.codahale.metrics.Snapshot; +import com.google.common.base.Suppliers; import com.palantir.tritium.metrics.registry.TaggedMetricRegistry; import java.time.Duration; +import java.util.function.Supplier; public final class AutobatcherTelemetryComponents { private final String safeLoggablePurpose; @@ -83,24 +86,34 @@ public static AutobatcherTelemetryComponents create( } private void registerGauges() { + // capturing a snapshot is expensive, so memoize for 10 milliseconds to reduce overhead during metrics read 50%. + Duration memoizeDuration = Duration.ofMillis(10); + Supplier waitTimerSnapshot = Suppliers.memoizeWithExpiration(waitTimer::getSnapshot, memoizeDuration); + Supplier waitTimeHistogramSnapshot = + Suppliers.memoizeWithExpiration(waitTimeHistogram::getSnapshot, memoizeDuration); + Supplier runningTimerSnapshot = + Suppliers.memoizeWithExpiration(runningTimer::getSnapshot, memoizeDuration); + Supplier totalTimerSnapshot = + Suppliers.memoizeWithExpiration(totalTimer::getSnapshot, memoizeDuration); + overheadMetrics.waitTimeNanosP1( - (Gauge) () -> waitTimer.getSnapshot().getValue(0.01)); + (Gauge) () -> waitTimerSnapshot.get().getValue(0.01)); overheadMetrics.waitTimeNanosMedian( - (Gauge) () -> waitTimer.getSnapshot().getValue(0.5)); + (Gauge) () -> waitTimerSnapshot.get().getValue(0.5)); overheadMetrics.waitTimePercentageP1( - (Gauge) () -> waitTimeHistogram.getSnapshot().getValue(0.01)); + (Gauge) () -> waitTimeHistogramSnapshot.get().getValue(0.01)); overheadMetrics.waitTimePercentageMedian( - (Gauge) () -> waitTimeHistogram.getSnapshot().getValue(0.5)); + (Gauge) () -> waitTimeHistogramSnapshot.get().getValue(0.5)); overheadMetrics.runningTimeNanosP1( - (Gauge) () -> runningTimer.getSnapshot().getValue(0.01)); + (Gauge) () -> runningTimerSnapshot.get().getValue(0.01)); overheadMetrics.runningTimeNanosMedian( - (Gauge) () -> runningTimer.getSnapshot().getValue(0.5)); + (Gauge) () -> runningTimerSnapshot.get().getValue(0.5)); overheadMetrics.totalTimeNanosP1( - (Gauge) () -> totalTimer.getSnapshot().getValue(0.01)); + (Gauge) () -> totalTimerSnapshot.get().getValue(0.01)); overheadMetrics.totalTimeNanosMedian( - (Gauge) () -> totalTimer.getSnapshot().getValue(0.5)); + (Gauge) () -> totalTimerSnapshot.get().getValue(0.5)); } } diff --git a/atlasdb-autobatch/src/test/java/com/palantir/atlasdb/autobatch/DisruptorAutobatcherTest.java b/atlasdb-autobatch/src/test/java/com/palantir/atlasdb/autobatch/DisruptorAutobatcherTest.java index 8f3f79cc1f7..a6b16cb082f 100644 --- a/atlasdb-autobatch/src/test/java/com/palantir/atlasdb/autobatch/DisruptorAutobatcherTest.java +++ b/atlasdb-autobatch/src/test/java/com/palantir/atlasdb/autobatch/DisruptorAutobatcherTest.java @@ -76,17 +76,24 @@ private void assertWaitTimeAndRunningTimeAndTotalTimeMetricsAreProduced( } private void assertNoWaitTimeAndRunningTimeMetricsAreProduced(TaggedMetricRegistry registry) { - assertThat(getWaitTimeHistogram(registry)).isNull(); - assertThat(getWaitTimePercentageHistogram(registry)).isNull(); - assertThat(getRunningTimeHistogram(registry)).isNull(); - assertThat(getTotalTimeHistogram(registry)).isNull(); + assertThat(getWaitTimeHistogram(registry)) + .satisfies(histogram -> assertThat(histogram.getCount()).isZero()); + assertThat(getWaitTimePercentageHistogram(registry)) + .satisfies(histogram -> assertThat(histogram.getCount()).isZero()); + assertThat(getRunningTimeHistogram(registry)) + .satisfies(histogram -> assertThat(histogram.getCount()).isZero()); + assertThat(getTotalTimeHistogram(registry)) + .satisfies(histogram -> assertThat(histogram.getCount()).isZero()); } private void assertOnlyWaitTimeMetricsAreProduced(TaggedMetricRegistry registry, int waitTimeNanos) { assertThat(getWaitTimeHistogram(registry).getSnapshot().getValues()).containsExactly(waitTimeNanos); - assertThat(getWaitTimePercentageHistogram(registry)).isNull(); - assertThat(getRunningTimeHistogram(registry)).isNull(); - assertThat(getTotalTimeHistogram(registry)).isNull(); + assertThat(getWaitTimePercentageHistogram(registry)) + .satisfies(histogram -> assertThat(histogram.getCount()).isZero()); + assertThat(getRunningTimeHistogram(registry)) + .satisfies(histogram -> assertThat(histogram.getCount()).isZero()); + assertThat(getTotalTimeHistogram(registry)) + .satisfies(histogram -> assertThat(histogram.getCount()).isZero()); } private static Histogram getWaitTimeHistogram(TaggedMetricRegistry registry) {