From 4c4d03e79517a1bff94916909e6285a0ef21dc24 Mon Sep 17 00:00:00 2001 From: Ulf Adams Date: Mon, 17 Aug 2020 12:39:00 +0200 Subject: [PATCH] Add --grpc_keepalive_time/grpc_keepalive_timeout This allows controlling keep-alive pings in the gRPC library. We have seen cases where GCP's external TCP load balancer silently drops connections without telling the client. If this happens when the client is waiting for replies from the server (e.g., when all worker threads are in blocking 'execute' calls), then the client does not notice the connection loss and hangs. In our testing, the client unblocked after ~2 hours without this change, although we are not sure why (maybe a default Linux Kernel timeout on TCP connections?). With this flag set to 10 minutes, and running builds over night, we see repeated 10-minute gaps in remote service utilization, which seems pretty clear evidence that this is the underlying problem. The gRPC library has two settings: keep-alive time and keep-alive timeout. The keep-alive time is the time to wait after the most recent received packet before sending a keep-alive ping, and the keep-alive timeout is the time to wait for a reply before concluding that the connection is dead. The default keep-alive timeout setting is 20s, but the default keep-alive time setting is infinity (i.e., disable keep-alive pings). The gRPC documentation also says to ask the service owner before enabling keep-alive pings based on the concern that these could generate a lot of hidden traffic. I don't immediately see how these concerns apply to the REAPI, and the REAPI also does not have a single service owner. For now, I'm making this opt-in. This is difficult to test automatically because there is no easy way to drop a TCP connection without telling the other end point (for good reason!). Change-Id: I5d59737a21515b5d70c13cbdd5037f0a434ec74f --- .../lib/authandtls/AuthAndTLSOptions.java | 34 +++++++++++++++++++ .../build/lib/authandtls/GoogleAuthUtils.java | 5 +++ 2 files changed, 39 insertions(+) diff --git a/src/main/java/com/google/devtools/build/lib/authandtls/AuthAndTLSOptions.java b/src/main/java/com/google/devtools/build/lib/authandtls/AuthAndTLSOptions.java index d215fa8817ae9b..22dd94d1002d7e 100644 --- a/src/main/java/com/google/devtools/build/lib/authandtls/AuthAndTLSOptions.java +++ b/src/main/java/com/google/devtools/build/lib/authandtls/AuthAndTLSOptions.java @@ -15,11 +15,14 @@ package com.google.devtools.build.lib.authandtls; import com.google.devtools.common.options.Converters.CommaSeparatedOptionListConverter; +import com.google.devtools.common.options.Converters.DurationConverter; import com.google.devtools.common.options.Option; import com.google.devtools.common.options.OptionDocumentationCategory; import com.google.devtools.common.options.OptionEffectTag; import com.google.devtools.common.options.OptionMetadataTag; import com.google.devtools.common.options.OptionsBase; + +import java.time.Duration; import java.util.List; /** @@ -100,4 +103,35 @@ public class AuthAndTLSOptions extends OptionsBase { + "value a valid TLS authority." ) public String tlsAuthorityOverride; + + @Option( + name = "grpc_keepalive_time", + defaultValue = "null", + converter = DurationConverter.class, + documentationCategory = OptionDocumentationCategory.UNCATEGORIZED, + effectTags = {OptionEffectTag.UNKNOWN}, + help = + "Configures keep-alive pings for outgoing gRPC connections. If this is set, then " + + "Bazel sends pings after this much time of no read operations on the connection, " + + "but only if there is at least one pending gRPC call. Times are treated as second " + + "granularity; it is an error to set a value less than one second. By default, " + + "keep-alive pings are disabled. You should coordinate with the service owner " + + "before enabling this setting." + ) + public Duration grpcKeepaliveTime; + + @Option( + name = "grpc_keepalive_timeout", + defaultValue = "20s", + converter = DurationConverter.class, + documentationCategory = OptionDocumentationCategory.UNCATEGORIZED, + effectTags = {OptionEffectTag.UNKNOWN}, + help = + "Configures a keep-alive timeout for outgoing gRPC connections. If keep-alive pings are " + + "enabled with --grpc_keepalive_time, then Bazel times out a connection if it does " + + "not receive a ping reply after this much time. Times are treated as second " + + "granularity; it is an error to set a value less than one second. If keep-alive " + + "pings are disabled, then this setting is ignored." + ) + public Duration grpcKeepaliveTimeout; } diff --git a/src/main/java/com/google/devtools/build/lib/authandtls/GoogleAuthUtils.java b/src/main/java/com/google/devtools/build/lib/authandtls/GoogleAuthUtils.java index 9be58e6d01ba10..c3ec90e878ed90 100644 --- a/src/main/java/com/google/devtools/build/lib/authandtls/GoogleAuthUtils.java +++ b/src/main/java/com/google/devtools/build/lib/authandtls/GoogleAuthUtils.java @@ -41,6 +41,7 @@ import java.io.IOException; import java.io.InputStream; import java.util.List; +import java.util.concurrent.TimeUnit; import javax.annotation.Nullable; /** Utility methods for using {@link AuthAndTLSOptions} with Google Cloud. */ @@ -72,6 +73,10 @@ public static ManagedChannel newChannel( newNettyChannelBuilder(targetUrl, proxy) .negotiationType( isTlsEnabled(target) ? NegotiationType.TLS : NegotiationType.PLAINTEXT); + if (options.grpcKeepaliveTime != null) { + builder.keepAliveTime(options.grpcKeepaliveTime.getSeconds(), TimeUnit.SECONDS); + builder.keepAliveTimeout(options.grpcKeepaliveTimeout.getSeconds(), TimeUnit.SECONDS); + } if (interceptors != null) { builder.intercept(interceptors); }