Skip to content

Commit

Permalink
add more specific metrics and logging for DNS lookup failures (#2179)
Browse files Browse the repository at this point in the history
add more specific metrics and logging for DNS lookup failures
bjlaub authored Feb 28, 2024
1 parent 154a05a commit 409307a
Showing 6 changed files with 138 additions and 6 deletions.
5 changes: 5 additions & 0 deletions changelog/@unreleased/pr-2179.v2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
type: improvement
improvement:
description: try to log error messages when getaddrinfo fails
links:
- https://github.com/palantir/dialogue/pull/2179
2 changes: 2 additions & 0 deletions dialogue-clients/metrics.md
Original file line number Diff line number Diff line change
@@ -68,6 +68,8 @@ Dialogue DNS metrics.
- `success`: DNS resolution succeeded using `InetAddress.getAllByName`.
- `fallback`: DNS resolution using the primary mechanism failed, however addresses were available in the fallback cache.
- `failure`: No addresses could be resolved for the given hostname.
- `client.dns.failure` (meter): DNS resolver query failures.
- `error-type`: Describes the error type returned by getaddrinfo() when lookup fails.

### client.uri
Dialogue URI parsing metrics.
Original file line number Diff line number Diff line change
@@ -19,17 +19,26 @@
import com.google.common.collect.ImmutableSet;
import com.palantir.dialogue.core.DialogueDnsResolver;
import com.palantir.logsafe.Preconditions;
import com.palantir.logsafe.Safe;
import com.palantir.logsafe.SafeArg;
import com.palantir.logsafe.UnsafeArg;
import com.palantir.logsafe.logger.SafeLogger;
import com.palantir.logsafe.logger.SafeLoggerFactory;
import com.palantir.tritium.metrics.registry.TaggedMetricRegistry;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Objects;
import java.util.Optional;

enum DefaultDialogueDnsResolver implements DialogueDnsResolver {
INSTANCE;

final class DefaultDialogueDnsResolver implements DialogueDnsResolver {
private static final SafeLogger log = SafeLoggerFactory.get(DefaultDialogueDnsResolver.class);

private final ClientDnsMetrics metrics;

DefaultDialogueDnsResolver(TaggedMetricRegistry registry) {
this.metrics = ClientDnsMetrics.of(registry);
}

@Override
public ImmutableSet<InetAddress> resolve(String hostname) {
Preconditions.checkNotNull(hostname, "hostname is required");
@@ -41,8 +50,77 @@ public ImmutableSet<InetAddress> resolve(String hostname) {
}
return ImmutableSet.copyOf(results);
} catch (UnknownHostException e) {
log.warn("Unknown host '{}'", UnsafeArg.of("hostname", hostname), e);
GaiError gaiError = extractGaiError(e, hostname);
log.warn(
"Unknown host '{}'. {}: {}",
SafeArg.of("gaiErrorType", gaiError.name()),
SafeArg.of("gaiErrorMessage", gaiError.errorMessage()),
UnsafeArg.of("hostname", hostname),
e);
metrics.failure(gaiError.name()).mark();
return ImmutableSet.of();
}
}

// these strings were taken from glibc-2.39, but likely have not changed in quite a while
// strings may be different on BSD systems like macos
// TODO(dns): update this list to try to match against known strings on other platforms
private enum GaiError {
EAI_ADDRFAMILY("Address family for hostname not supported"),
EAI_AGAIN("Temporary failure in name resolution"),
EAI_BADFLAGS("Bad value for ai_flags"),
EAI_FAIL("Non-recoverable failure in name resolution"),
EAI_FAMILY("ai_family not supported"),
EAI_MEMORY("Memory allocation failure"),
EAI_NODATA("No address associated with hostname"),
EAI_NONAME("Name or service not known"),
EAI_SERVICE("Servname not supported for ai_socktype"),
EAI_SOCKTYPE("ai_socktype not supported"),
EAI_SYSTEM("System error"),
EAI_INPROGRESS("Processing request in progress"),
EAI_CANCELED("Request canceled"),
EAI_NOTCANCELED("Request not canceled"),
EAI_ALLDONE("All requests done"),
EAI_INTR("Interrupted by a signal"),
EAI_IDN_ENCODE("Parameter string not correctly encoded"),
EAI_OVERFLOW("Result too large for supplied buffer"),
CACHED(),
UNKNOWN();

private final Optional<String> errorMessage;

GaiError() {
this.errorMessage = Optional.empty();
}

GaiError(String errorMessage) {
this.errorMessage = Optional.of(errorMessage);
}

@Safe
String errorMessage() {
return errorMessage.orElseGet(this::name);
}
}

private static GaiError extractGaiError(UnknownHostException exception, String requestedHostname) {
if (exception.getMessage() == null) {
return GaiError.UNKNOWN;
}

try {
if (Objects.equals(requestedHostname, exception.getMessage())) {
return GaiError.CACHED;
}

for (GaiError error : GaiError.values()) {
if (error.errorMessage.isPresent() && exception.getMessage().contains(error.errorMessage.get())) {
return error;
}
}
return GaiError.UNKNOWN;
} catch (Exception e) {
return GaiError.UNKNOWN;
}
}
}
Original file line number Diff line number Diff line change
@@ -161,7 +161,7 @@ default ConjureRuntime runtime() {
@Value.Default
default DialogueDnsResolver dnsResolver() {
return new CachingFallbackDnsResolver(
new ProtocolVersionFilteringDialogueDnsResolver(DefaultDialogueDnsResolver.INSTANCE),
new ProtocolVersionFilteringDialogueDnsResolver(new DefaultDialogueDnsResolver(taggedMetrics())),
taggedMetrics());
}

6 changes: 6 additions & 0 deletions dialogue-clients/src/main/metrics/dialogue-core-metrics.yml
Original file line number Diff line number Diff line change
@@ -37,3 +37,9 @@ namespaces:
- value: failure
docs: No addresses could be resolved for the given hostname.
docs: DNS resolver query metrics, on a per-hostname basis.
failure:
type: meter
tags:
- name: error-type
docs: Describes the error type returned by getaddrinfo() when lookup fails.
docs: DNS resolver query failures.
Original file line number Diff line number Diff line change
@@ -18,11 +18,16 @@

import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.assertj.core.api.Assumptions.assumeThat;

import com.google.common.collect.ImmutableSet;
import com.palantir.dialogue.core.DialogueDnsResolver;
import com.palantir.logsafe.exceptions.SafeNullPointerException;
import com.palantir.tritium.metrics.registry.DefaultTaggedMetricRegistry;
import com.palantir.tritium.metrics.registry.TaggedMetricRegistry;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.UUID;
import org.junit.jupiter.api.Test;

class DefaultDialogueDnsResolverTest {
@@ -66,7 +71,43 @@ void malformedIpv6_noBrackets() {
assertThat(resolve("::z")).isEmpty();
}

@Test
void unknown_host() {
assumeThat(System.getProperty("os.name").toLowerCase().startsWith("linux"))
.describedAs("GAI Error Strings are only defined for Linux environments")
.isTrue();

TaggedMetricRegistry registry = new DefaultTaggedMetricRegistry();
DialogueDnsResolver resolver = new DefaultDialogueDnsResolver(registry);

String badHost = UUID.randomUUID() + ".palantir.com";
ImmutableSet<InetAddress> result = resolver.resolve(badHost);

assertThat(result).isEmpty();
ClientDnsMetrics metrics = ClientDnsMetrics.of(registry);
assertThat(metrics.failure("EAI_NONAME").getCount()).isEqualTo(1);
}

@Test
void unknown_host_from_cache() {
TaggedMetricRegistry registry = new DefaultTaggedMetricRegistry();
DialogueDnsResolver resolver = new DefaultDialogueDnsResolver(registry);
ClientDnsMetrics metrics = ClientDnsMetrics.of(registry);

String badHost = UUID.randomUUID() + ".palantir.com";
ImmutableSet<InetAddress> result = resolver.resolve(badHost);

assertThat(result).isEmpty();
assertThat(metrics.failure("EAI_NONAME").getCount()).isEqualTo(1);

// should resolve from cache
ImmutableSet<InetAddress> result2 = resolver.resolve(badHost);
assertThat(result2).isEmpty();
assertThat(metrics.failure("CACHED").getCount()).isEqualTo(1);
}

private static ImmutableSet<InetAddress> resolve(String hostname) {
return DefaultDialogueDnsResolver.INSTANCE.resolve(hostname);
DialogueDnsResolver resolver = new DefaultDialogueDnsResolver(new DefaultTaggedMetricRegistry());
return resolver.resolve(hostname);
}
}

0 comments on commit 409307a

Please sign in to comment.