From 0af6bd06bafd4f3712b70a1fafe4696eb4c6a1c1 Mon Sep 17 00:00:00 2001 From: Charles Date: Mon, 12 Dec 2022 15:06:31 -0800 Subject: [PATCH] emit failure type in attempt_failure_by_origin (#20349) --- .../metrics/lib/ApmTraceConstants.java | 5 ++++ .../io/airbyte/metrics/lib/MetricTags.java | 6 +++++ .../metrics/lib/OssMetricsRegistry.java | 2 +- ...obCreationAndStatusUpdateActivityImpl.java | 26 +++++++++++++++---- 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/airbyte-metrics/metrics-lib/src/main/java/io/airbyte/metrics/lib/ApmTraceConstants.java b/airbyte-metrics/metrics-lib/src/main/java/io/airbyte/metrics/lib/ApmTraceConstants.java index 5c534659e63f..fcdcc2e0b380 100644 --- a/airbyte-metrics/metrics-lib/src/main/java/io/airbyte/metrics/lib/ApmTraceConstants.java +++ b/airbyte-metrics/metrics-lib/src/main/java/io/airbyte/metrics/lib/ApmTraceConstants.java @@ -74,6 +74,11 @@ public static final class Tags { */ public static final String FAILURE_ORIGINS_KEY = "failure_origins"; + /** + * Name of the APM trace tag that holds the failure type(s) associated with the trace. + */ + public static final String FAILURE_TYPES_KEY = "failure_types"; + /** * Name of the APM trace tag that holds the job ID value associated with the trace. */ diff --git a/airbyte-metrics/metrics-lib/src/main/java/io/airbyte/metrics/lib/MetricTags.java b/airbyte-metrics/metrics-lib/src/main/java/io/airbyte/metrics/lib/MetricTags.java index 99e6fbe60469..4ffd6e2081c9 100644 --- a/airbyte-metrics/metrics-lib/src/main/java/io/airbyte/metrics/lib/MetricTags.java +++ b/airbyte-metrics/metrics-lib/src/main/java/io/airbyte/metrics/lib/MetricTags.java @@ -5,6 +5,7 @@ package io.airbyte.metrics.lib; import io.airbyte.config.FailureReason.FailureOrigin; +import io.airbyte.config.FailureReason.FailureType; import io.airbyte.db.instance.configs.jooq.generated.enums.ReleaseStage; import io.airbyte.db.instance.jobs.jooq.generated.enums.JobStatus; @@ -15,6 +16,7 @@ public class MetricTags { public static final String CONNECTION_ID = "connection_id"; public static final String FAILURE_ORIGIN = "failure_origin"; + public static final String FAILURE_TYPE = "failure_type"; public static final String JOB_ID = "job_id"; public static final String JOB_STATUS = "job_status"; public static final String RELEASE_STAGE = "release_stage"; @@ -32,6 +34,10 @@ public static String getFailureOrigin(final FailureOrigin origin) { return origin != null ? origin.value() : FailureOrigin.UNKNOWN.value(); } + public static String getFailureType(final FailureType origin) { + return origin != null ? origin.value() : UNKNOWN; + } + public static String getJobStatus(final JobStatus status) { return status != null ? status.getLiteral() : UNKNOWN; } diff --git a/airbyte-metrics/metrics-lib/src/main/java/io/airbyte/metrics/lib/OssMetricsRegistry.java b/airbyte-metrics/metrics-lib/src/main/java/io/airbyte/metrics/lib/OssMetricsRegistry.java index 3dfbbd95e302..1fae89c965c1 100644 --- a/airbyte-metrics/metrics-lib/src/main/java/io/airbyte/metrics/lib/OssMetricsRegistry.java +++ b/airbyte-metrics/metrics-lib/src/main/java/io/airbyte/metrics/lib/OssMetricsRegistry.java @@ -42,7 +42,7 @@ public enum OssMetricsRegistry implements MetricsRegistry { ATTEMPT_FAILED_BY_FAILURE_ORIGIN( MetricEmittingApps.WORKER, "attempt_failed_by_failure_origin", - "increments for every failure origin a failed attempt has. since a failure can have multiple origins, a single failure can be counted more than once. tagged by failure origin."), + "increments for every failure origin a failed attempt has. since a failure can have multiple origins, a single failure can be counted more than once. tagged by failure origin and failure type."), ATTEMPT_SUCCEEDED_BY_RELEASE_STAGE( MetricEmittingApps.WORKER, "attempt_succeeded_by_release_stage", diff --git a/airbyte-workers/src/main/java/io/airbyte/workers/temporal/scheduling/activities/JobCreationAndStatusUpdateActivityImpl.java b/airbyte-workers/src/main/java/io/airbyte/workers/temporal/scheduling/activities/JobCreationAndStatusUpdateActivityImpl.java index 617727bb4643..39b089f8fef2 100644 --- a/airbyte-workers/src/main/java/io/airbyte/workers/temporal/scheduling/activities/JobCreationAndStatusUpdateActivityImpl.java +++ b/airbyte-workers/src/main/java/io/airbyte/workers/temporal/scheduling/activities/JobCreationAndStatusUpdateActivityImpl.java @@ -9,6 +9,7 @@ import static io.airbyte.metrics.lib.ApmTraceConstants.Tags.ATTEMPT_NUMBER_KEY; import static io.airbyte.metrics.lib.ApmTraceConstants.Tags.CONNECTION_ID_KEY; import static io.airbyte.metrics.lib.ApmTraceConstants.Tags.FAILURE_ORIGINS_KEY; +import static io.airbyte.metrics.lib.ApmTraceConstants.Tags.FAILURE_TYPES_KEY; import static io.airbyte.metrics.lib.ApmTraceConstants.Tags.JOB_ID_KEY; import static io.airbyte.persistence.job.models.AttemptStatus.FAILED; @@ -504,11 +505,24 @@ private void trackCompletionForInternalFailure(final Long jobId, private void traceFailures(final AttemptFailureSummary failureSummary) { if (failureSummary != null) { if (CollectionUtils.isNotEmpty(failureSummary.getFailures())) { - ApmTraceUtils.addTagsToTrace(Map.of(FAILURE_ORIGINS_KEY, failureSummary.getFailures().stream().map(FailureReason::getFailureOrigin).map( - FailureOrigin::name).collect(Collectors.joining(",")))); + ApmTraceUtils.addTagsToTrace(Map.of( + FAILURE_TYPES_KEY, + failureSummary.getFailures() + .stream() + .map(FailureReason::getFailureType) + .map(MetricTags::getFailureType) + .collect(Collectors.joining(",")), + FAILURE_ORIGINS_KEY, + failureSummary.getFailures() + .stream() + .map(FailureReason::getFailureOrigin) + .map(FailureOrigin::name) + .collect(Collectors.joining(",")))); } } else { - ApmTraceUtils.addTagsToTrace(Map.of(FAILURE_ORIGINS_KEY, FailureOrigin.UNKNOWN.value())); + ApmTraceUtils.addTagsToTrace(Map.of( + FAILURE_TYPES_KEY, MetricTags.getFailureType(null), + FAILURE_ORIGINS_KEY, FailureOrigin.UNKNOWN.value())); } } @@ -521,11 +535,13 @@ private void trackFailures(final AttemptFailureSummary failureSummary) { if (failureSummary != null) { for (final FailureReason reason : failureSummary.getFailures()) { MetricClientFactory.getMetricClient().count(OssMetricsRegistry.ATTEMPT_FAILED_BY_FAILURE_ORIGIN, 1, - new MetricAttribute(MetricTags.FAILURE_ORIGIN, MetricTags.getFailureOrigin(reason.getFailureOrigin()))); + new MetricAttribute(MetricTags.FAILURE_ORIGIN, MetricTags.getFailureOrigin(reason.getFailureOrigin())), + new MetricAttribute(MetricTags.FAILURE_TYPE, MetricTags.getFailureType(reason.getFailureType()))); } } else { MetricClientFactory.getMetricClient().count(OssMetricsRegistry.ATTEMPT_FAILED_BY_FAILURE_ORIGIN, 1, - new MetricAttribute(MetricTags.FAILURE_ORIGIN, FailureOrigin.UNKNOWN.value())); + new MetricAttribute(MetricTags.FAILURE_ORIGIN, FailureOrigin.UNKNOWN.value()), + new MetricAttribute(MetricTags.FAILURE_TYPE, MetricTags.getFailureType(null))); } }