Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

emit failure type in attempt_failure_by_origin #20349

Merged
merged 4 commits into from
Dec 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ public static final class Tags {
*/
public static final String FAILURE_ORIGINS_KEY = "failure_origins";

/**
* Name of the APM trace tag that holds the failure type(s) associated with the trace.
*/
public static final String FAILURE_TYPES_KEY = "failure_types";

/**
* Name of the APM trace tag that holds the job ID value associated with the trace.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
package io.airbyte.metrics.lib;

import io.airbyte.config.FailureReason.FailureOrigin;
import io.airbyte.config.FailureReason.FailureType;
import io.airbyte.db.instance.configs.jooq.generated.enums.ReleaseStage;
import io.airbyte.db.instance.jobs.jooq.generated.enums.JobStatus;

Expand All @@ -15,6 +16,7 @@ public class MetricTags {

public static final String CONNECTION_ID = "connection_id";
public static final String FAILURE_ORIGIN = "failure_origin";
public static final String FAILURE_TYPE = "failure_type";
public static final String JOB_ID = "job_id";
public static final String JOB_STATUS = "job_status";
public static final String RELEASE_STAGE = "release_stage";
Expand All @@ -32,6 +34,10 @@ public static String getFailureOrigin(final FailureOrigin origin) {
return origin != null ? origin.value() : FailureOrigin.UNKNOWN.value();
}

/**
 * Converts a failure type into the string used as its tag value.
 *
 * @param failureType the failure type to convert; may be {@code null}
 * @return {@code failureType.value()}, or the {@code UNKNOWN} placeholder when
 *         {@code failureType} is {@code null}
 */
public static String getFailureType(final FailureType failureType) {
  return failureType != null ? failureType.value() : UNKNOWN;
}

/**
 * Converts a job status into the string used as its tag value.
 *
 * @param status the job status to convert; may be {@code null}
 * @return the status literal, or the {@code UNKNOWN} placeholder when {@code status} is
 *         {@code null}
 */
public static String getJobStatus(final JobStatus status) {
  if (status == null) {
    return UNKNOWN;
  }
  return status.getLiteral();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ public enum OssMetricsRegistry implements MetricsRegistry {
ATTEMPT_FAILED_BY_FAILURE_ORIGIN(
MetricEmittingApps.WORKER,
"attempt_failed_by_failure_origin",
"increments for every failure origin a failed attempt has. since a failure can have multiple origins, a single failure can be counted more than once. tagged by failure origin."),
"increments for every failure origin a failed attempt has. since a failure can have multiple origins, a single failure can be counted more than once. tagged by failure origin and failure type."),
ATTEMPT_SUCCEEDED_BY_RELEASE_STAGE(
MetricEmittingApps.WORKER,
"attempt_succeeded_by_release_stage",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import static io.airbyte.metrics.lib.ApmTraceConstants.Tags.ATTEMPT_NUMBER_KEY;
import static io.airbyte.metrics.lib.ApmTraceConstants.Tags.CONNECTION_ID_KEY;
import static io.airbyte.metrics.lib.ApmTraceConstants.Tags.FAILURE_ORIGINS_KEY;
import static io.airbyte.metrics.lib.ApmTraceConstants.Tags.FAILURE_TYPES_KEY;
import static io.airbyte.metrics.lib.ApmTraceConstants.Tags.JOB_ID_KEY;
import static io.airbyte.persistence.job.models.AttemptStatus.FAILED;

Expand Down Expand Up @@ -504,11 +505,24 @@ private void trackCompletionForInternalFailure(final Long jobId,
/**
 * Tags the current APM trace with the failure types and failure origins of an attempt.
 *
 * <ul>
 * <li>When {@code failureSummary} is {@code null}, both tags are set to their "unknown"
 * placeholder values.</li>
 * <li>When the summary holds at least one failure, each tag is the comma-separated list of the
 * corresponding value of every failure, in order.</li>
 * <li>When the summary is present but holds no failures, no tags are added.</li>
 * </ul>
 *
 * Both tags are written in a single {@code addTagsToTrace} call per branch; the origins tag is
 * not additionally written on its own beforehand.
 *
 * @param failureSummary the failure summary of the attempt; may be {@code null}
 */
private void traceFailures(final AttemptFailureSummary failureSummary) {
  if (failureSummary == null) {
    ApmTraceUtils.addTagsToTrace(Map.of(
        FAILURE_TYPES_KEY, MetricTags.getFailureType(null),
        FAILURE_ORIGINS_KEY, FailureOrigin.UNKNOWN.value()));
    return;
  }

  if (CollectionUtils.isNotEmpty(failureSummary.getFailures())) {
    final String failureTypes = failureSummary.getFailures()
        .stream()
        .map(FailureReason::getFailureType)
        .map(MetricTags::getFailureType)
        .collect(Collectors.joining(","));

    // A failure reason may carry no origin; fall back to UNKNOWN instead of dereferencing null,
    // mirroring the null-tolerant handling already applied to the failure type above.
    final String failureOrigins = failureSummary.getFailures()
        .stream()
        .map(FailureReason::getFailureOrigin)
        .map(origin -> origin != null ? origin.name() : FailureOrigin.UNKNOWN.name())
        .collect(Collectors.joining(","));

    ApmTraceUtils.addTagsToTrace(Map.of(
        FAILURE_TYPES_KEY, failureTypes,
        FAILURE_ORIGINS_KEY, failureOrigins));
  }
}

Expand All @@ -521,11 +535,13 @@ private void trackFailures(final AttemptFailureSummary failureSummary) {
if (failureSummary != null) {
for (final FailureReason reason : failureSummary.getFailures()) {
MetricClientFactory.getMetricClient().count(OssMetricsRegistry.ATTEMPT_FAILED_BY_FAILURE_ORIGIN, 1,
new MetricAttribute(MetricTags.FAILURE_ORIGIN, MetricTags.getFailureOrigin(reason.getFailureOrigin())));
new MetricAttribute(MetricTags.FAILURE_ORIGIN, MetricTags.getFailureOrigin(reason.getFailureOrigin())),
new MetricAttribute(MetricTags.FAILURE_TYPE, MetricTags.getFailureType(reason.getFailureType())));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we also add this to the traceFailures method right above the trackFailures one? That would add the information as a facet on the APM trace in addition to the metric.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1, this sounds like a good idea

}
} else {
MetricClientFactory.getMetricClient().count(OssMetricsRegistry.ATTEMPT_FAILED_BY_FAILURE_ORIGIN, 1,
new MetricAttribute(MetricTags.FAILURE_ORIGIN, FailureOrigin.UNKNOWN.value()));
new MetricAttribute(MetricTags.FAILURE_ORIGIN, FailureOrigin.UNKNOWN.value()),
new MetricAttribute(MetricTags.FAILURE_TYPE, MetricTags.getFailureType(null)));
}
}

Expand Down