Skip to content

Commit

Permalink
instrument retrying channels scheduled executor service (#485)
Browse files Browse the repository at this point in the history
We produce metrics about requests that are going to be retried
  • Loading branch information
bulldozer-bot[bot] authored Mar 4, 2020
1 parent d520414 commit 46fadcd
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 26 deletions.
5 changes: 5 additions & 0 deletions changelog/@unreleased/pr-485.v2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
type: improvement
improvement:
description: Produce metrics about requests that are going to be retried
links:
- https://github.com/palantir/dialogue/pull/485
1 change: 1 addition & 0 deletions dialogue-core/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ dependencies {
compile 'com.palantir.tracing:tracing'
compile 'io.dropwizard.metrics:metrics-core'
compile 'com.palantir.safethreadlocalrandom:safe-thread-local-random'
implementation 'com.palantir.tritium:tritium-metrics'

testImplementation 'com.palantir.tracing:tracing-test-utils'
testImplementation 'com.palantir.safe-logging:preconditions-assertj'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import com.github.benmanes.caffeine.cache.Ticker;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.ListeningScheduledExecutorService;
import com.google.errorprone.annotations.CheckReturnValue;
import com.palantir.conjure.java.client.config.ClientConfiguration;
import com.palantir.dialogue.Channel;
Expand All @@ -32,6 +31,7 @@
import java.util.Collection;
import java.util.List;
import java.util.Random;
import java.util.concurrent.ScheduledExecutorService;
import java.util.function.Supplier;
import javax.annotation.Nullable;

Expand All @@ -51,7 +51,7 @@ public static final class Builder {
private final List<Channel> channels = new ArrayList<>();
private Ticker clock = Ticker.systemTicker();
private Random random = SafeThreadLocalRandom.get();
private Supplier<ListeningScheduledExecutorService> scheduler = RetryingChannel.sharedScheduler;
private Supplier<ScheduledExecutorService> scheduler = RetryingChannel.sharedScheduler;

@Nullable
private ClientConfiguration config;
Expand Down Expand Up @@ -80,7 +80,7 @@ Builder random(Random value) {
}

@VisibleForTesting
Builder scheduler(ListeningScheduledExecutorService value) {
Builder scheduler(ScheduledExecutorService value) {
this.scheduler = () -> value;
return this;
}
Expand Down Expand Up @@ -129,16 +129,14 @@ private static void preconditions(Collection<? extends Channel> channels, Client
}

private static Channel retryingChannel(
ClientConfiguration conf,
Channel channel,
Supplier<ListeningScheduledExecutorService> scheduler,
Random random) {
ClientConfiguration conf, Channel channel, Supplier<ScheduledExecutorService> scheduler, Random random) {
if (conf.maxNumRetries() == 0) {
return channel;
}

return new RetryingChannel(
channel,
conf.taggedMetricRegistry(),
conf.maxNumRetries(),
conf.backoffSlotSize(),
conf.serverQoS(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

package com.palantir.dialogue.core;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Suppliers;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
Expand All @@ -33,9 +34,13 @@
import com.palantir.logsafe.exceptions.SafeRuntimeException;
import com.palantir.tracing.DetachedSpan;
import com.palantir.tracing.Tracers;
import com.palantir.tritium.metrics.MetricRegistries;
import com.palantir.tritium.metrics.registry.DefaultTaggedMetricRegistry;
import com.palantir.tritium.metrics.registry.TaggedMetricRegistry;
import java.net.SocketTimeoutException;
import java.time.Duration;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.function.DoubleSupplier;
Expand All @@ -49,20 +54,19 @@
final class RetryingChannel implements Channel {

private static final Logger log = LoggerFactory.getLogger(RetryingChannel.class);
private static final String SCHEDULER_NAME = "dialogue-RetryingChannel-scheduler";

/*
* Shared single thread executor is reused between all retrying channels. If it becomes oversaturated
* we may wait longer than expected before resuming requests, but this is an
* edge case where services are already operating in a degraded state and we should not
* spam servers.
*/
static final Supplier<ListeningScheduledExecutorService> sharedScheduler =
Suppliers.memoize(() -> MoreExecutors.listeningDecorator(Tracers.wrap(
"dialogue-RetryingChannel-scheduler",
Executors.newSingleThreadScheduledExecutor(new ThreadFactoryBuilder()
.setNameFormat("dialogue-RetryingChannel-scheduler-%d")
.setDaemon(false)
.build()))));
static final Supplier<ScheduledExecutorService> sharedScheduler =
Suppliers.memoize(() -> Executors.newSingleThreadScheduledExecutor(new ThreadFactoryBuilder()
.setNameFormat(SCHEDULER_NAME + "-%d")
.setDaemon(false)
.build()));

private final ListeningScheduledExecutorService scheduler;
private final Channel delegate;
Expand All @@ -72,6 +76,7 @@ final class RetryingChannel implements Channel {
private final Duration backoffSlotSize;
private final DoubleSupplier jitter;

@VisibleForTesting
RetryingChannel(
Channel delegate,
int maxRetries,
Expand All @@ -80,6 +85,7 @@ final class RetryingChannel implements Channel {
ClientConfiguration.RetryOnTimeout retryOnTimeout) {
this(
delegate,
new DefaultTaggedMetricRegistry(),
maxRetries,
backoffSlotSize,
serverQoS,
Expand All @@ -90,18 +96,19 @@ final class RetryingChannel implements Channel {

RetryingChannel(
Channel delegate,
TaggedMetricRegistry metrics,
int maxRetries,
Duration backoffSlotSize,
ClientConfiguration.ServerQoS serverQoS,
ClientConfiguration.RetryOnTimeout retryOnTimeout,
ListeningScheduledExecutorService scheduler,
ScheduledExecutorService scheduler,
DoubleSupplier jitter) {
this.delegate = delegate;
this.maxRetries = maxRetries;
this.backoffSlotSize = backoffSlotSize;
this.serverQoS = serverQoS;
this.retryOnTimeout = retryOnTimeout;
this.scheduler = scheduler;
this.scheduler = instrument(scheduler, metrics);
this.jitter = jitter;
}

Expand Down Expand Up @@ -245,4 +252,10 @@ private static boolean shouldPropagateQos(ClientConfiguration.ServerQoS serverQo
throw new SafeIllegalStateException(
"Encountered unknown propagate QoS configuration", SafeArg.of("serverQoS", serverQoS));
}

private static ListeningScheduledExecutorService instrument(
ScheduledExecutorService delegate, TaggedMetricRegistry metrics) {
return MoreExecutors.listeningDecorator(
Tracers.wrap(SCHEDULER_NAME, MetricRegistries.instrument(metrics, delegate, SCHEDULER_NAME)));
}
}
19 changes: 12 additions & 7 deletions versions.lock
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ com.fasterxml.jackson.datatype:jackson-datatype-jdk8:2.10.2 (2 constraints: 2529
com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.10.2 (1 constraints: 7b1c92a4)
com.fasterxml.jackson.module:jackson-module-afterburner:2.10.2 (2 constraints: 2529c2cb)
com.github.ben-manes.caffeine:caffeine:2.8.1 (2 constraints: c017484f)
com.google.code.findbugs:jsr305:3.0.2 (7 constraints: b058251a)
com.google.code.findbugs:jsr305:3.0.2 (11 constraints: 6b9fbbdf)
com.google.errorprone:error_prone_annotations:2.3.4 (5 constraints: 4a4818d6)
com.google.guava:failureaccess:1.0.1 (1 constraints: 140ae1b4)
com.google.guava:guava:28.1-jre (7 constraints: 7e7647a3)
com.google.guava:guava:28.1-jre (9 constraints: d2974be9)
com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava (1 constraints: bd17c918)
com.google.j2objc:j2objc-annotations:1.3 (1 constraints: b809eda0)
com.netflix.concurrency-limits:concurrency-limits-core:0.2.0 (1 constraints: 0405f135)
Expand All @@ -23,19 +23,22 @@ com.palantir.conjure.java.runtime:client-config:4.67.0 (1 constraints: 4305643b)
com.palantir.conjure.java.runtime:conjure-java-jackson-serialization:4.67.0 (2 constraints: 6316b80f)
com.palantir.conjure.java.runtime:keystores:4.67.0 (2 constraints: 651921de)
com.palantir.ri:resource-identifier:1.1.0 (2 constraints: ed14f4b5)
com.palantir.safe-logging:preconditions:1.13.0 (11 constraints: d9b7864c)
com.palantir.safe-logging:safe-logging:1.13.0 (7 constraints: 81793449)
com.palantir.safe-logging:preconditions:1.13.0 (13 constraints: 03d73cdd)
com.palantir.safe-logging:safe-logging:1.13.0 (9 constraints: ab98172c)
com.palantir.safethreadlocalrandom:safe-thread-local-random:0.1.0 (1 constraints: 0305ee35)
com.palantir.tokens:auth-tokens:3.6.1 (3 constraints: 90270b2e)
com.palantir.tracing:tracing:4.4.0 (2 constraints: f41532e9)
com.palantir.tracing:tracing-api:4.4.0 (1 constraints: a00c4509)
com.palantir.tritium:tritium-registry:0.16.6 (1 constraints: 1f14d376)
com.palantir.tritium:tritium-api:0.16.6 (2 constraints: 3b1fb8bd)
com.palantir.tritium:tritium-core:0.16.6 (1 constraints: 451044a2)
com.palantir.tritium:tritium-metrics:0.16.6 (1 constraints: 3f053b3b)
com.palantir.tritium:tritium-registry:0.16.6 (3 constraints: a129110e)
com.squareup.okhttp3:mockwebserver:3.13.1 (1 constraints: 3a053f3b)
com.squareup.okhttp3:okhttp:3.13.1 (2 constraints: a014ba9d)
com.squareup.okio:okio:1.17.2 (1 constraints: 850cc309)
commons-codec:commons-codec:1.11 (1 constraints: f20f8881)
commons-logging:commons-logging:1.2 (3 constraints: 43254b24)
io.dropwizard.metrics:metrics-core:3.2.6 (2 constraints: a115c8c3)
io.dropwizard.metrics:metrics-core:3.2.6 (3 constraints: b325741c)
jakarta.annotation:jakarta.annotation-api:1.3.5 (1 constraints: f10f7399)
jakarta.ws.rs:jakarta.ws.rs-api:2.1.6 (2 constraints: fb144cb7)
junit:junit:4.12 (2 constraints: e213d85a)
Expand All @@ -48,10 +51,12 @@ org.checkerframework:checker-qual:3.1.0 (2 constraints: 161ab343)
org.codehaus.mojo:animal-sniffer-annotations:1.18 (1 constraints: ee09d9aa)
org.hamcrest:hamcrest:2.2 (3 constraints: f41da570)
org.hamcrest:hamcrest-core:2.2 (3 constraints: 2f17d637)
org.hdrhistogram:HdrHistogram:2.1.12 (1 constraints: 3e103aa2)
org.immutables:value:2.8.3 (1 constraints: 0f051036)
org.mockito:mockito-core:3.3.1 (2 constraints: ca13e664)
org.mpierce.metrics.reservoir:hdrhistogram-metrics-reservoir:1.1.2 (1 constraints: 0c10f891)
org.objenesis:objenesis:2.6 (1 constraints: b40a14bd)
org.slf4j:slf4j-api:1.7.30 (7 constraints: 9d63de0d)
org.slf4j:slf4j-api:1.7.30 (9 constraints: d382e5e7)

[Test dependencies]
com.google.code.findbugs:annotations:3.0.1 (1 constraints: 9e0aafc3)
Expand Down
6 changes: 3 additions & 3 deletions versions.props
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,20 @@ com.github.ben-manes.caffeine:caffeine = 2.8.1
com.google.code.findbugs:jsr305 = 3.0.2
com.google.guava:guava = 27.0.1-jre
com.netflix.concurrency-limits:* = 0.2.0
com.palantir.baseline:* = 0.39.1
com.palantir.conjure.java:* = 5.9.0
com.palantir.conjure.java.api:* = 2.11.1
com.palantir.conjure.java.runtime:* = 4.67.0
com.palantir.conjure.java:* = 5.9.0
com.palantir.conjure:conjure = 4.10.1
com.palantir.ri:resource-identifier = 1.1.0
com.palantir.safe-logging:* = 1.13.0
com.palantir.safethreadlocalrandom:safe-thread-local-random = 0.1.0
com.palantir.tokens:auth-tokens = 3.6.1
com.palantir.tracing:* = 4.4.0
com.palantir.tritium:* = 0.16.6
com.uber.nullaway:nullaway = 0.7.9
io.dropwizard.metrics:metrics-core = 3.2.6
org.immutables:* = 2.8.3
org.slf4j:* = 1.7.30
com.palantir.conjure:conjure = 4.10.1

# test deps
junit:junit = 4.12
Expand Down

0 comments on commit 46fadcd

Please sign in to comment.