-
Notifications
You must be signed in to change notification settings - Fork 325
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
upstream run level lineage implementation #2658
Changes from 4 commits
b4944d7
2007c82
059c7da
400644a
a118c06
50a8ed5
61da280
870557f
5f99805
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,6 +36,7 @@ | |
import lombok.Value; | ||
import lombok.extern.slf4j.Slf4j; | ||
import marquez.api.models.SortDirection; | ||
import marquez.common.models.RunId; | ||
import marquez.db.OpenLineageDao; | ||
import marquez.service.ServiceFactory; | ||
import marquez.service.models.BaseEvent; | ||
|
@@ -130,6 +131,23 @@ public Response getLineageEvents( | |
return Response.ok(new Events(events, totalCount)).build(); | ||
} | ||
|
||
@Timed | ||
@ResponseMetered | ||
@ExceptionMetered | ||
@GET | ||
@Consumes(APPLICATION_JSON) | ||
@Produces(APPLICATION_JSON) | ||
@Path("/runlineage/upstream") | ||
public Response getRunLineageUpstream( | ||
@QueryParam("runId") @NotNull RunId runId, | ||
@QueryParam("depth") @DefaultValue(DEFAULT_DEPTH) int depth, | ||
@QueryParam("facets") String facets) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why are we pulling this in? I don't see it being used in the service. Did you intend to do something in the TODO block you left in there? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, my idea is to be able to select which facets to return for each dataset_version, job_version, and run in the result. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I removed this parameter in this iteration. |
||
throwIfNotExists(runId); | ||
return Response.ok( | ||
lineageService.upstream(runId, depth, facets == null ? null : facets.split(","))) | ||
.build(); | ||
} | ||
|
||
@Value | ||
static class Events { | ||
@NonNull | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,15 +5,22 @@ | |
|
||
package marquez.db; | ||
|
||
import java.time.Instant; | ||
import java.util.Collection; | ||
import java.util.List; | ||
import java.util.Optional; | ||
import java.util.Set; | ||
import java.util.UUID; | ||
import javax.validation.constraints.NotNull; | ||
import marquez.common.models.DatasetName; | ||
import marquez.common.models.JobName; | ||
import marquez.common.models.NamespaceName; | ||
import marquez.common.models.RunId; | ||
import marquez.db.mappers.DatasetDataMapper; | ||
import marquez.db.mappers.JobDataMapper; | ||
import marquez.db.mappers.JobRowMapper; | ||
import marquez.db.mappers.RunMapper; | ||
import marquez.db.mappers.UpstreamRunRowMapper; | ||
import marquez.service.models.DatasetData; | ||
import marquez.service.models.JobData; | ||
import marquez.service.models.Run; | ||
|
@@ -25,8 +32,18 @@ | |
@RegisterRowMapper(JobDataMapper.class) | ||
@RegisterRowMapper(RunMapper.class) | ||
@RegisterRowMapper(JobRowMapper.class) | ||
@RegisterRowMapper(UpstreamRunRowMapper.class) | ||
public interface LineageDao { | ||
|
||
public record JobSummary(NamespaceName namespace, JobName name, UUID version) {} | ||
|
||
public record RunSummary(RunId id, Instant start, Instant end, String status) {} | ||
|
||
public record DatasetSummary( | ||
NamespaceName namespace, DatasetName name, UUID version, RunId producedByRunId) {} | ||
|
||
public record UpstreamRunRow(JobSummary job, RunSummary run, DatasetSummary input) {} | ||
|
||
/** | ||
* Fetch all of the jobs that consume or produce the datasets that are consumed or produced by the | ||
* input jobIds. This returns a single layer from the BFS using datasets as edges. Jobs that have | ||
|
@@ -154,4 +171,50 @@ SELECT DISTINCT on(r.job_name, r.namespace_name) r.*, jv.version as job_version | |
WHERE j.uuid in (<jobUuid>) OR j.symlink_target_uuid IN (<jobUuid>) | ||
ORDER BY r.job_name, r.namespace_name, created_at DESC""") | ||
List<Run> getCurrentRuns(@BindList Collection<UUID> jobUuid); | ||
|
||
@SqlQuery( | ||
""" | ||
WITH RECURSIVE | ||
upstream_runs( | ||
r_uuid, -- run uuid | ||
dataset_uuid, dataset_version_uuid, dataset_namespace, dataset_name, -- input dataset version to the run | ||
u_r_uuid, -- upstream run that produced that dataset version | ||
depth -- current depth of traversal | ||
) AS ( | ||
|
||
-- initial case: find the inputs of the initial runs | ||
select r.uuid, | ||
dv.dataset_uuid, dv."version", dv.namespace_name, dv.dataset_name, | ||
dv.run_uuid, | ||
0 AS depth -- starts at 0 | ||
FROM (SELECT :runId::uuid AS uuid) r -- initial run | ||
LEFT JOIN runs_input_mapping rim ON rim.run_uuid = r.uuid | ||
LEFT JOIN dataset_versions dv ON dv.uuid = rim.dataset_version_uuid | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just thinking out loud: wouldn't it be better to join dataset_versions after the recursion, at the bottom of the query, once all the runs are identified? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I thought of that as well, but we actually need dataset_versions in the recursion because this is where we find the run_uuid that produced the DV for the next iteration. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Idea: we could add that run uuid to runs_input_mapping at the same time as the dataset_version; that would allow us to join just on that table in the recursion. That'd be neat. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I didn't get the idea: do you want to add dataset_versions to runs_input_mapping, or run_uuid to dataset_versions? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The idea is that we could add dataset_versions.run_uuid to the runs_input_mapping table. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. But I would keep that for a future improvement. |
||
|
||
UNION | ||
|
||
-- recursion: find the inputs of the inputs found on the previous iteration and increase depth to know when to stop | ||
SELECT | ||
ur.u_r_uuid, | ||
dv2.dataset_uuid, dv2."version", dv2.namespace_name, dv2.dataset_name, | ||
dv2.run_uuid, | ||
ur.depth + 1 AS depth -- increase depth to check end condition | ||
FROM upstream_runs ur | ||
LEFT JOIN runs_input_mapping rim2 ON rim2.run_uuid = ur.u_r_uuid | ||
LEFT JOIN dataset_versions dv2 ON dv2.uuid = rim2.dataset_version_uuid | ||
-- end condition of the recursion: no input or depth is over the maximum set | ||
-- also avoid following cycles (merge statement) | ||
WHERE ur.u_r_uuid IS NOT NULL AND ur.u_r_uuid <> ur.r_uuid AND depth < :depth | ||
) | ||
|
||
-- present the result: use Distinct as we may have traversed the same edge multiple times if there are diamonds in the graph. | ||
SELECT DISTINCT ON (upstream_runs.r_uuid, upstream_runs.dataset_version_uuid, upstream_runs.u_r_uuid) | ||
upstream_runs.*, | ||
-- we add the run information after the recursion so that we join with the large run table only once | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ❤️ |
||
r.started_at, r.ended_at, r.current_run_state as state, | ||
r.job_uuid, r.job_version_uuid, r.namespace_name as job_namespace, r.job_name | ||
FROM upstream_runs, runs r where upstream_runs.r_uuid = r.uuid; | ||
; | ||
""") | ||
List<UpstreamRunRow> getUpstreamRuns(@NotNull UUID runId, int depth); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
/* | ||
* Copyright 2018-2023 contributors to the Marquez project | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package marquez.db.mappers; | ||
|
||
import static marquez.db.Columns.stringOrThrow; | ||
import static marquez.db.Columns.timestampOrThrow; | ||
import static marquez.db.Columns.uuidOrThrow; | ||
|
||
import java.sql.ResultSet; | ||
import java.sql.SQLException; | ||
import java.util.UUID; | ||
import lombok.NonNull; | ||
import marquez.common.models.DatasetName; | ||
import marquez.common.models.JobName; | ||
import marquez.common.models.NamespaceName; | ||
import marquez.common.models.RunId; | ||
import marquez.db.Columns; | ||
import marquez.db.LineageDao.DatasetSummary; | ||
import marquez.db.LineageDao.JobSummary; | ||
import marquez.db.LineageDao.RunSummary; | ||
import marquez.db.LineageDao.UpstreamRunRow; | ||
import org.jdbi.v3.core.mapper.RowMapper; | ||
import org.jdbi.v3.core.statement.StatementContext; | ||
|
||
public final class UpstreamRunRowMapper implements RowMapper<UpstreamRunRow> { | ||
@Override | ||
public UpstreamRunRow map(@NonNull ResultSet results, @NonNull StatementContext context) | ||
throws SQLException { | ||
return new UpstreamRunRow( | ||
new JobSummary( | ||
new NamespaceName(stringOrThrow(results, "job_namespace")), | ||
new JobName(stringOrThrow(results, "job_name")), | ||
UUID.fromString(stringOrThrow(results, "job_version_uuid"))), | ||
new RunSummary( | ||
new RunId(uuidOrThrow(results, "r_uuid")), | ||
timestampOrThrow(results, Columns.STARTED_AT), | ||
timestampOrThrow(results, Columns.ENDED_AT), | ||
stringOrThrow(results, Columns.STATE)), | ||
results.getObject("dataset_name") == null | ||
? null | ||
: new DatasetSummary( | ||
new NamespaceName(stringOrThrow(results, "dataset_namespace")), | ||
new DatasetName(stringOrThrow(results, "dataset_name")), | ||
UUID.fromString(stringOrThrow(results, "dataset_version_uuid")), | ||
new RunId(UUID.fromString(stringOrThrow(results, "u_r_uuid"))))); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please remember to document this in
openapi.spec
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done