From 94f01d9472a0dd24beb7e6416b3cdbefc7dda8e3 Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Mon, 29 Apr 2024 21:19:45 +0900 Subject: [PATCH 1/8] fix: improve lineage docs --- docs/api/tutorials/lineage.md | 119 ++++++++++++++++-- .../library/read_lineage_execute_graphql.py | 42 +++++++ .../examples/library/read_lineage_rest.py | 42 +------ 3 files changed, 156 insertions(+), 47 deletions(-) create mode 100644 metadata-ingestion/examples/library/read_lineage_execute_graphql.py diff --git a/docs/api/tutorials/lineage.md b/docs/api/tutorials/lineage.md index 13ec716b7870b..22e70275b18c8 100644 --- a/docs/api/tutorials/lineage.md +++ b/docs/api/tutorials/lineage.md @@ -15,6 +15,7 @@ This guide will show you how to - Add lineage between datasets. - Add column-level lineage between datasets. +- Read lineage. ## Prerequisites @@ -109,7 +110,7 @@ Expected Response: -### Expected Outcomes of Adding Lineage +### Expected Outcome You can now see the lineage between `fct_users_deleted` and `logging_events`. @@ -129,7 +130,7 @@ You can now see the lineage between `fct_users_deleted` and `logging_events`. -### Expected Outcome of Adding Column Level Lineage +### Expected Outcome You can now see the column-level lineage between datasets. Note that you have to enable `Show Columns` to be able to see the column-level lineage. @@ -137,18 +138,17 @@ You can now see the column-level lineage between datasets. Note that you have to

-## Read Lineage +## Read Lineage (Lineage Impact Analysis) ```graphql -query searchAcrossLineage { - searchAcrossLineage( +query scrollAcrossLineage { + scrollAcrossLineage( input: { query: "*" - urn: "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)" - start: 0 + urn: "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)" count: 10 direction: DOWNSTREAM orFilters: [ @@ -176,21 +176,72 @@ query searchAcrossLineage { } ``` -This example shows using lineage degrees as a filter, but additional search filters can be included here as well. - ```shell curl --location --request POST 'http://localhost:8080/api/graphql' \ --header 'Authorization: Bearer ' \ ---header 'Content-Type: application/json' --data-raw '{ { "query": "query searchAcrossLineage { searchAcrossLineage( input: { query: \"*\" urn: \"urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)\" start: 0 count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}" +--header 'Content-Type: application/json' --data-raw '{ { "query": "query scrollAcrossLineage { scrollAcrossLineage( input: { query: \"*\" urn: \"urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)\" count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}" }}' ``` +```python +{{ inline /metadata-ingestion/examples/library/read_lineage_execute_graphql.py show_path_as_comment }} +``` + + + + +This example shows using lineage degrees as a filter, but additional search filters can be included here as well. + +Note that `degree` means the number of hops in the lineage. 
For example, `degree: 1` means the immediate downstream entities, `degree: 2` means the entities that are two hops away, and so on. + +This will perform a multi-hop lineage search on the urn specified. For more information about the `scrollAcrossLineage` mutation, please refer to [scrollAcrossLineage](https://datahubproject.io/docs/graphql/queries/#scrollacrosslineage). + +### Expected Outcome + +As an outcome, you should see the downstream entities of `logging_events`. + +```graphql +{ + "data": { + "scrollAcrossLineage": { + "searchResults": [ + { + "degree": 1, + "entity": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_123)", + "type": "DATA_JOB" + } + }, + ... + { + "degree": 2, + "entity": { + "urn": "urn:li:mlPrimaryKey:(user_analytics,user_name)", + "type": "MLPRIMARY_KEY" + } + } + ] + } + }, + "extensions": {} +} +``` + +## Read Column-level Lineage + +You can also read column-level lineage via Python SDK. + + + + + + ```python {{ inline /metadata-ingestion/examples/library/read_lineage_rest.py show_path_as_comment }} ``` @@ -198,4 +249,50 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \ -This will perform a multi-hop lineage search on the urn specified. For more information about the `searchAcrossLineage` mutation, please refer to [searchAcrossLineage](https://datahubproject.io/docs/graphql/queries/#searchacrosslineage). +### Expected Outcome + +As a response, you will get the full lineage information like this. 
+ +```graphql +{ + "UpstreamLineageClass": { + "upstreams": [ + { + "UpstreamClass": { + "auditStamp": { + "AuditStampClass": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null, + "message": null + } + }, + "created": null, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)", + "type": "TRANSFORMED", + "properties": null, + "query": null + } + } + ], + "fineGrainedLineages": [ + { + "FineGrainedLineageClass": { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD),browser_id)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD),user_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD),browser)" + ], + "transformOperation": null, + "confidenceScore": 1.0, + "query": null + } + } + ] + } +} +``` \ No newline at end of file diff --git a/metadata-ingestion/examples/library/read_lineage_execute_graphql.py b/metadata-ingestion/examples/library/read_lineage_execute_graphql.py new file mode 100644 index 0000000000000..1a1d43fb2a33e --- /dev/null +++ b/metadata-ingestion/examples/library/read_lineage_execute_graphql.py @@ -0,0 +1,42 @@ +# read-modify-write requires access to the DataHubGraph (RestEmitter is not enough) +from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph + +gms_endpoint = "http://localhost:8080" +graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint)) + +# Query multiple aspects from entity +query = """ +query scrollAcrossLineage { + scrollAcrossLineage( + input: { + query: "*" + urn: "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)" + count: 10 + direction: DOWNSTREAM + orFilters: [ + { + and: [ + { + condition: EQUAL + negated: false + field: "degree" + values: ["1", "2", "3+"] + } + ] + } + ] + } + ) { + searchResults { + degree + entity { + urn + type 
+ } + } + } +} +""" +result = graph.execute_graphql(query=query) + +print(result) diff --git a/metadata-ingestion/examples/library/read_lineage_rest.py b/metadata-ingestion/examples/library/read_lineage_rest.py index bd9b4e8651dba..1767fa70c0b78 100644 --- a/metadata-ingestion/examples/library/read_lineage_rest.py +++ b/metadata-ingestion/examples/library/read_lineage_rest.py @@ -1,43 +1,13 @@ -# read-modify-write requires access to the DataHubGraph (RestEmitter is not enough) from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph +# Imports for metadata model classes +from datahub.metadata.schema_classes import UpstreamLineageClass + +# First we get the current owners gms_endpoint = "http://localhost:8080" graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint)) -# Query multiple aspects from entity -query = """ -query searchAcrossLineage { - searchAcrossLineage( - input: { - query: "*" - urn: "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)" - start: 0 - count: 10 - direction: DOWNSTREAM - orFilters: [ - { - and: [ - { - condition: EQUAL - negated: false - field: "degree" - values: ["1", "2", "3+"] - } - ] # Additional search filters can be included here as well - } - ] - } - ) { - searchResults { - degree - entity { - urn - type - } - } - } -} -""" -result = graph.execute_graphql(query=query) +urn = "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)" +result = graph.get_aspect(entity_urn=urn, aspect_type=UpstreamLineageClass) print(result) From 08565df2ab31ae2389574f11d1a4bfc3263ab850 Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Thu, 2 May 2024 19:34:34 +0900 Subject: [PATCH 2/8] feat: add non-dataset entities lineage --- docs/api/tutorials/lineage.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/api/tutorials/lineage.md b/docs/api/tutorials/lineage.md index 22e70275b18c8..b6d8db67bcb08 100644 --- a/docs/api/tutorials/lineage.md 
+++ b/docs/api/tutorials/lineage.md @@ -118,6 +118,7 @@ You can now see the lineage between `fct_users_deleted` and `logging_events`.

+ ## Add Column-level Lineage @@ -138,6 +139,19 @@ You can now see the column-level lineage between datasets. Note that you have to

+## Add Lineage to Non-Dataset Entities + +You can also add lineage to non-dataset entities, such as DataJobs, Charts, and Dashboards. +Please refer to the following examples. + +| Connection | Examples | A.K.A | +|---------------------|-------------------|-----------------| +| DataJob to DataFlow | - [lineage_job_dataflow.py](../../../metadata-ingestion/examples/library/lineage_job_dataflow.py) | | +| DataJob to Dataset | - [lineage_dataset_job_dataset.py](../../../metadata-ingestion/examples/library/lineage_dataset_job_dataset.py)
| Pipeline Lineage | +| Chart to Dashboard | - [lineage_chart_dashboard.py](../../../metadata-ingestion/examples/library/lineage_chart_dashboard.py) | | +| Chart to Dataset | - [lineage_dataset_chart.py](../../../metadata-ingestion/examples/library/lineage_dataset_chart.py) | | + + ## Read Lineage (Lineage Impact Analysis) From 9b3096f49b170d7c893d5d114c9341ae5859311c Mon Sep 17 00:00:00 2001 From: yoonhyejin <0327jane@gmail.com> Date: Mon, 20 May 2024 11:31:14 +0900 Subject: [PATCH 3/8] fix: reflect comments --- docs/api/tutorials/lineage.md | 24 ++++++++-- .../library/read_lineage_datajob_rest.py | 13 ++++++ ...e_rest.py => read_lineage_dataset_rest.py} | 2 +- .../library/read_lineage_execute_graphql.py | 46 ++++++++++--------- 4 files changed, 57 insertions(+), 28 deletions(-) create mode 100644 metadata-ingestion/examples/library/read_lineage_datajob_rest.py rename metadata-ingestion/examples/library/{read_lineage_rest.py => read_lineage_dataset_rest.py} (91%) diff --git a/docs/api/tutorials/lineage.md b/docs/api/tutorials/lineage.md index b6d8db67bcb08..5f82ad00acfed 100644 --- a/docs/api/tutorials/lineage.md +++ b/docs/api/tutorials/lineage.md @@ -189,6 +189,13 @@ query scrollAcrossLineage { } } ``` +:::info Degree +Note that `degree` means the number of hops in the lineage. For example, `degree: 1` means the immediate downstream entities, `degree: 2` means the entities that are two hops away, and so on. +::: + +The GraphQL example shows using lineage degrees as a filter, but additional search filters can be included here as well. +This will perform a multi-hop lineage search on the urn specified. For more information about the `scrollAcrossLineage` query, please refer to [scrollAcrossLineage](https://datahubproject.io/docs/graphql/queries/#scrollacrosslineage). 
+ @@ -206,15 +213,22 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \ ```python {{ inline /metadata-ingestion/examples/library/read_lineage_execute_graphql.py show_path_as_comment }} ``` +The Python SDK example shows how to read lineage of a dataset. Please note that the `aspect_type` parameter can vary depending on the entity type. +Below is a few examples of `aspect_type` for different entities. - - +|Entity|Aspect_type|Reference| +|-------|------------|-----| +|Dataset|`UpstreamLineageClass`| Link | +|Datajob|`DataJobInputOutputClass`| Link | +|Dashboard|`DashboardInfoClass`| Link | +|DataFlow|`DataFlowInfoClass`| Link | -This example shows using lineage degrees as a filter, but additional search filters can be included here as well. +You can find more information about the `aspect_type` in the `Outgoing relationship` that the entity consumes/produces. +Learn more about lineages of different entities in the [Add Lineage to Non-Dataset Entities](#add-lineage-to-non-dataset-entities) Section. -Note that `degree` means the number of hops in the lineage. For example, `degree: 1` means the immediate downstream entities, `degree: 2` means the entities that are two hops away, and so on. + +
-This will perform a multi-hop lineage search on the urn specified. For more information about the `scrollAcrossLineage` mutation, please refer to [scrollAcrossLineage](https://datahubproject.io/docs/graphql/queries/#scrollacrosslineage). ### Expected Outcome diff --git a/metadata-ingestion/examples/library/read_lineage_datajob_rest.py b/metadata-ingestion/examples/library/read_lineage_datajob_rest.py new file mode 100644 index 0000000000000..e23c1ee310641 --- /dev/null +++ b/metadata-ingestion/examples/library/read_lineage_datajob_rest.py @@ -0,0 +1,13 @@ +from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph + +# Imports for metadata model classes +from datahub.metadata.schema_classes import DataJobInputOutputClass + +# Get the current lineage for a datajob +gms_endpoint = "http://localhost:8080" +graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint)) + +urn = "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_123)" +result = graph.get_aspect(entity_urn=urn, aspect_type=DataJobInputOutputClass) + +print(result) diff --git a/metadata-ingestion/examples/library/read_lineage_rest.py b/metadata-ingestion/examples/library/read_lineage_dataset_rest.py similarity index 91% rename from metadata-ingestion/examples/library/read_lineage_rest.py rename to metadata-ingestion/examples/library/read_lineage_dataset_rest.py index 1767fa70c0b78..5e3e4b643e4fe 100644 --- a/metadata-ingestion/examples/library/read_lineage_rest.py +++ b/metadata-ingestion/examples/library/read_lineage_dataset_rest.py @@ -3,7 +3,7 @@ # Imports for metadata model classes from datahub.metadata.schema_classes import UpstreamLineageClass -# First we get the current owners +# Get the current lineage for a dataset gms_endpoint = "http://localhost:8080" graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint)) diff --git a/metadata-ingestion/examples/library/read_lineage_execute_graphql.py b/metadata-ingestion/examples/library/read_lineage_execute_graphql.py index 
1a1d43fb2a33e..8c5c36ca4b053 100644 --- a/metadata-ingestion/examples/library/read_lineage_execute_graphql.py +++ b/metadata-ingestion/examples/library/read_lineage_execute_graphql.py @@ -6,27 +6,8 @@ # Query multiple aspects from entity query = """ -query scrollAcrossLineage { - scrollAcrossLineage( - input: { - query: "*" - urn: "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)" - count: 10 - direction: DOWNSTREAM - orFilters: [ - { - and: [ - { - condition: EQUAL - negated: false - field: "degree" - values: ["1", "2", "3+"] - } - ] - } - ] - } - ) { +query scrollAcrossLineage($input: ScrollQueryInput!) { + scrollAcrossLineage(input: $input) { searchResults { degree entity { @@ -37,6 +18,27 @@ } } """ -result = graph.execute_graphql(query=query) + +variables = {"input": + { + "query": "*", + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)", + "count": 10, + "direction": "DOWNSTREAM", + "orFilters": [ + { + "and": [ + { + "condition": "EQUAL", + "negated": "false", + "field": "degree", + "values": ["1", "2", "3+"] + } + ] + } + ] + } + } +result = graph.execute_graphql(query=query, variables=variables) print(result) From 9277a80399692507af4907fb36e752d116f1e633 Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Mon, 20 May 2024 14:56:36 +0900 Subject: [PATCH 4/8] fix graphql query --- .../examples/library/read_lineage_execute_graphql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/examples/library/read_lineage_execute_graphql.py b/metadata-ingestion/examples/library/read_lineage_execute_graphql.py index 8c5c36ca4b053..438ace20c6233 100644 --- a/metadata-ingestion/examples/library/read_lineage_execute_graphql.py +++ b/metadata-ingestion/examples/library/read_lineage_execute_graphql.py @@ -6,7 +6,7 @@ # Query multiple aspects from entity query = """ -query scrollAcrossLineage($input: ScrollQueryInput!) { +query scrollAcrossLineage($input: ScrollAcrossLineageInput!) 
{ scrollAcrossLineage(input: $input) { searchResults { degree From 257b39eacb3cddd4ac9c5d10538fefa93e9a8918 Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Mon, 20 May 2024 15:01:43 +0900 Subject: [PATCH 5/8] fix reference for aspect_type --- docs/api/tutorials/lineage.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/api/tutorials/lineage.md b/docs/api/tutorials/lineage.md index 5f82ad00acfed..3242a68d9f2ca 100644 --- a/docs/api/tutorials/lineage.md +++ b/docs/api/tutorials/lineage.md @@ -216,14 +216,13 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \ The Python SDK example shows how to read lineage of a dataset. Please note that the `aspect_type` parameter can vary depending on the entity type. Below is a few examples of `aspect_type` for different entities. -|Entity|Aspect_type|Reference| -|-------|------------|-----| -|Dataset|`UpstreamLineageClass`| Link | -|Datajob|`DataJobInputOutputClass`| Link | -|Dashboard|`DashboardInfoClass`| Link | -|DataFlow|`DataFlowInfoClass`| Link | - -You can find more information about the `aspect_type` in the `Outgoing relationship` that the entity consumes/produces. +|Entity|Aspect_type| Reference | +|-------|------------|--------------------------------------------------------------------------| +|Dataset|`UpstreamLineageClass`| [Link](/docs/generated/metamodel/entities/dataset.md#upstreamlineage) | +|Datajob|`DataJobInputOutputClass`| [Link](/docs/generated/metamodel/entities/datajob.md#datajobinputoutput) | +|Dashboard|`DashboardInfoClass`| [Link](/docs/generated/metamodel/entities/dashboard.md#dashboardinfo) | +|DataFlow|`DataFlowInfoClass`| [Link](/docs/generated/metamodel/entities/dataflow.md#dataflowinfo) | + Learn more about lineages of different entities in the [Add Lineage to Non-Dataset Entities](#add-lineage-to-non-dataset-entities) Section. 
From 7280358215e8e3e3e2159a0ae2789a33a4183315 Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Mon, 20 May 2024 15:41:33 +0900 Subject: [PATCH 6/8] fix broken link --- docs/api/tutorials/lineage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api/tutorials/lineage.md b/docs/api/tutorials/lineage.md index 3242a68d9f2ca..65f8050896362 100644 --- a/docs/api/tutorials/lineage.md +++ b/docs/api/tutorials/lineage.md @@ -270,7 +270,7 @@ You can also read column-level lineage via Python SDK. ```python -{{ inline /metadata-ingestion/examples/library/read_lineage_rest.py show_path_as_comment }} +{{ inline /metadata-ingestion/examples/library/read_lineage_dataset_rest.py show_path_as_comment }} ``` From fc501d567938a7dd30aeb5a595fb21c6f4a31d13 Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Mon, 20 May 2024 16:51:37 +0900 Subject: [PATCH 7/8] fix broken link --- docs/api/tutorials/lineage.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/api/tutorials/lineage.md b/docs/api/tutorials/lineage.md index 65f8050896362..b2e7155489a66 100644 --- a/docs/api/tutorials/lineage.md +++ b/docs/api/tutorials/lineage.md @@ -219,9 +219,9 @@ Below is a few examples of `aspect_type` for different entities. 
|Entity|Aspect_type| Reference | |-------|------------|--------------------------------------------------------------------------| |Dataset|`UpstreamLineageClass`| [Link](/docs/generated/metamodel/entities/dataset.md#upstreamlineage) | -|Datajob|`DataJobInputOutputClass`| [Link](/docs/generated/metamodel/entities/datajob.md#datajobinputoutput) | +|Datajob|`DataJobInputOutputClass`| [Link](/docs/generated/metamodel/entities/dataJob.md#datajobinputoutput) | |Dashboard|`DashboardInfoClass`| [Link](/docs/generated/metamodel/entities/dashboard.md#dashboardinfo) | -|DataFlow|`DataFlowInfoClass`| [Link](/docs/generated/metamodel/entities/dataflow.md#dataflowinfo) | +|DataFlow|`DataFlowInfoClass`| [Link](/docs/generated/metamodel/entities/dataFlow.md#dataflowinfo) | Learn more about lineages of different entities in the [Add Lineage to Non-Dataset Entities](#add-lineage-to-non-dataset-entities) Section. From 7a1f1391dd6abe05e07241746f16b8b494a0d42b Mon Sep 17 00:00:00 2001 From: yoonhyejin <0327jane@gmail.com> Date: Wed, 22 May 2024 14:35:28 +0900 Subject: [PATCH 8/8] style: fix lint --- .../library/read_lineage_execute_graphql.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/metadata-ingestion/examples/library/read_lineage_execute_graphql.py b/metadata-ingestion/examples/library/read_lineage_execute_graphql.py index 438ace20c6233..7b7f8ef43f4f5 100644 --- a/metadata-ingestion/examples/library/read_lineage_execute_graphql.py +++ b/metadata-ingestion/examples/library/read_lineage_execute_graphql.py @@ -19,26 +19,26 @@ } """ -variables = {"input": - { - "query": "*", - "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)", - "count": 10, - "direction": "DOWNSTREAM", - "orFilters": [ - { - "and": [ - { - "condition": "EQUAL", - "negated": "false", - "field": "degree", - "values": ["1", "2", "3+"] - } - ] - } - ] - } +variables = { + "input": { + "query": "*", + "urn": 
"urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)", + "count": 10, + "direction": "DOWNSTREAM", + "orFilters": [ + { + "and": [ + { + "condition": "EQUAL", + "negated": False, + "field": "degree", + "values": ["1", "2", "3+"], + } + ] } + ], + } +} result = graph.execute_graphql(query=query, variables=variables) print(result)