datahub-project · treff7es · Oct 9, 2024 · Oct 7, 2024 · Oct 7, 2024 · Oct 7, 2024
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
@@ -1,5 +1,6 @@
 import logging
 import re
+from base64 import b32decode
 from collections import defaultdict
 from typing import Dict, Iterable, List, Optional, Set, Type, Union, cast
 
@@ -89,12 +90,13 @@
     HiveColumnToAvroConverter,
     get_schema_fields_for_hive_column,
 )
-from datahub.utilities.mapping import Constants
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.ratelimiter import RateLimiter
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
+ENCODED_TAG_PREFIX = "urn_li_encoded_tag_"
+
 logger: logging.Logger = logging.getLogger(__name__)
 # Handle table snapshots
 # See https://cloud.google.com/bigquery/docs/table-snapshots-intro.
@@ -194,6 +196,18 @@ def store_table_refs(self):
             or self.config.use_queries_v2
         )
 
+    def modified_base32decode(self, text_to_decode: str) -> str:
+        """Decode a prefixed, modified-base32 label value; return any other value unchanged."""
+        # When we sync from DataHub to BigQuery, we encode the tags as modified base32 strings.
+        # BigQuery labels only support lowercase letters, international characters, numbers, or underscores.
+        # So the encoding is lowercased and uses `_` instead of the padding character `=`; undo both here.
+        if not text_to_decode.startswith(ENCODED_TAG_PREFIX):
+            return text_to_decode
+        # Slice off only the leading prefix; `str.replace` would also drop any later occurrence.
+        encoded = text_to_decode[len(ENCODED_TAG_PREFIX) :].upper().replace("_", "=")
+        # b32decode raises binascii.Error on invalid base32; .decode raises UnicodeDecodeError on non-UTF-8.
+        return b32decode(encoded.encode("utf-8")).decode("utf-8")
+
     def get_project_workunits(
         self, project: BigqueryProject
     ) -> Iterable[MetadataWorkUnit]:
@@ -253,7 +267,7 @@ def gen_dataset_containers(
         tags_joined: Optional[List[str]] = None
         if tags and self.config.capture_dataset_label_as_tag:
             tags_joined = [
-                f"{k}:{v}"
+                self.make_tag_from_label(k, v)
                 for k, v in tags.items()
                 if is_tag_allowed(self.config.capture_dataset_label_as_tag, k)
             ]
@@ -662,6 +676,11 @@ def _process_snapshot(
             dataset_name=dataset_name,
         )
 
+    def make_tag_from_label(self, key: str, value: str) -> str:
+        if value.startswith(ENCODED_TAG_PREFIX):  # prefixed value: decode it and use the result as-is
+            return self.modified_base32decode(value)
+        return make_tag_urn(f"""{key}:{value}""")  # plain label: build a `key:value` tag URN
+
     def gen_table_dataset_workunits(
         self,
         table: BigqueryTable,
@@ -707,7 +726,7 @@ def gen_table_dataset_workunits(
             tags_to_add = []
             tags_to_add.extend(
                 [
-                    make_tag_urn(f"""{k}:{v}""")
+                    self.make_tag_from_label(k, v)
                     for k, v in table.labels.items()
                     if is_tag_allowed(self.config.capture_table_label_as_tag, k)
                 ]
@@ -733,7 +752,7 @@ def gen_view_dataset_workunits(
         tags_to_add = None
         if table.labels and self.config.capture_view_label_as_tag:
             tags_to_add = [
-                make_tag_urn(f"{k}:{v}")
+                self.make_tag_from_label(k, v)
                 for k, v in table.labels.items()
                 if is_tag_allowed(self.config.capture_view_label_as_tag, k)
             ]
@@ -922,11 +941,6 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]:
                             break
             else:
                 tags = []
-                if col.is_partition_column:
-                    tags.append(
-                        TagAssociationClass(make_tag_urn(Constants.TAG_PARTITION_KEY))
-                    )
-
                 if col.cluster_column_position is not None:
                     tags.append(
                         TagAssociationClass(
@@ -944,6 +958,7 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]:
                     type=SchemaFieldDataType(
                         self.BIGQUERY_FIELD_TYPE_MAPPINGS.get(col.data_type, NullType)()
                     ),
+                    isPartitioningKey=col.is_partition_column,
                     nativeDataType=col.data_type,
                     description=col.comment,
                     nullable=col.is_nullable,

diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py
@@ -290,8 +290,8 @@ def __init__(self, ctx: PipelineContext, config: MongoDBConfig):
 
         # See https://pymongo.readthedocs.io/en/stable/examples/datetimes.html#handling-out-of-range-datetimes
         self.mongo_client = MongoClient(
-            self.config.connect_uri, datetime_conversion="DATETIME_AUTO", **options
-        )  # type: ignore
+            self.config.connect_uri, datetime_conversion="DATETIME_AUTO", **options  # type: ignore
+        )
 
         # This cheaply tests the connection. For details, see
         # https://pymongo.readthedocs.io/en/stable/api/pymongo/mongo_client.html#pymongo.mongo_client.MongoClient

diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json
@@ -269,7 +269,8 @@
                             "actor": "urn:li:corpuser:datahub"
                         }
                     },
-                    "isPartOfKey": false
+                    "isPartOfKey": false,
+                    "isPartitioningKey": false
                 },
                 {
                     "fieldPath": "email",
@@ -296,7 +297,8 @@
                             "actor": "urn:li:corpuser:datahub"
                         }
                     },
-                    "isPartOfKey": false
+                    "isPartOfKey": false,
+                    "isPartitioningKey": false
                 }
             ]
         }
@@ -328,6 +330,29 @@
         "lastRunId": "no-run-id-provided"
     }
 },
+{
+    "entityType": "dataset",
+    "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
+    "changeType": "UPSERT",
+    "aspectName": "globalTags",
+    "aspect": {
+        "json": {
+            "tags": [
+                {
+                    "tag": "urn:li:tag:priority:high"
+                },
+                {
+                    "tag": "urn:li:tag:purchase"
+                }
+            ]
+        }
+    },
+    "systemMetadata": {
+        "lastObserved": 1643871600000,
+        "runId": "bigquery-2022_02_03-07_00_00",
+        "lastRunId": "no-run-id-provided"
+    }
+},
 {
     "entityType": "dataset",
     "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)",
@@ -463,7 +488,8 @@
                             }
                         ]
                     },
-                    "isPartOfKey": false
+                    "isPartOfKey": false,
+                    "isPartitioningKey": false
                 },
                 {
                     "fieldPath": "email",
@@ -479,7 +505,8 @@
                     "globalTags": {
                         "tags": []
                     },
-                    "isPartOfKey": false
+                    "isPartOfKey": false,
+                    "isPartitioningKey": false
                 }
             ]
         }
@@ -620,7 +647,8 @@
                             }
                         ]
                     },
-                    "isPartOfKey": false
+                    "isPartOfKey": false,
+                    "isPartitioningKey": false
                 },
                 {
                     "fieldPath": "email",
@@ -636,7 +664,8 @@
                     "globalTags": {
                         "tags": []
                     },
-                    "isPartOfKey": false
+                    "isPartOfKey": false,
+                    "isPartitioningKey": false
                 }
             ]
         }
@@ -1021,5 +1050,37 @@
         "runId": "bigquery-2022_02_03-07_00_00",
         "lastRunId": "no-run-id-provided"
     }
+},
+{
+    "entityType": "tag",
+    "entityUrn": "urn:li:tag:priority:high",
+    "changeType": "UPSERT",
+    "aspectName": "tagKey",
+    "aspect": {
+        "json": {
+            "name": "priority:high"
+        }
+    },
+    "systemMetadata": {
+        "lastObserved": 1643871600000,
+        "runId": "bigquery-2022_02_03-07_00_00",
+        "lastRunId": "no-run-id-provided"
+    }
+},
+{
+    "entityType": "tag",
+    "entityUrn": "urn:li:tag:purchase",
+    "changeType": "UPSERT",
+    "aspectName": "tagKey",
+    "aspect": {
+        "json": {
+            "name": "purchase"
+        }
+    },
+    "systemMetadata": {
+        "lastObserved": 1643871600000,
+        "runId": "bigquery-2022_02_03-07_00_00",
+        "lastRunId": "no-run-id-provided"
+    }
 }
 ]