feat(ingestion): Add Iceberg source #5010

Merged: 57 commits, May 26, 2022

Commits
15bfcdf
Iceberg source: Initial draft to gather review
cccs-eric Jan 28, 2022
62cea0f
Fixes from code review
cccs-eric Jan 29, 2022
d660f13
Linting fixes
cccs-eric Jan 29, 2022
b3eae50
Iceberg source: abstracting datalake walk process
cccs-eric Feb 3, 2022
fc1ad6c
Iceberg source: Initial draft to gather review
cccs-eric Jan 28, 2022
f52fcf4
Fixes from code review
cccs-eric Jan 29, 2022
0fe7e60
Linting fixes
cccs-eric Jan 29, 2022
4eac4c8
Iceberg source: abstracting datalake walk process
cccs-eric Feb 3, 2022
910752c
Iceberg source: Initial draft to gather review
cccs-eric Jan 28, 2022
16d4a1c
Fixes from code review
cccs-eric Jan 29, 2022
bef7db3
Linting fixes
cccs-eric Jan 29, 2022
bfb305b
Iceberg source: abstracting datalake walk process
cccs-eric Feb 3, 2022
c05615b
Add test cases for Iceberg source
cccs-eric Mar 16, 2022
3c3e950
Merge branch 'feature/iceberg_source' of github.com:CybercentreCanada…
cccs-eric Mar 16, 2022
7c4b546
Change how to handle nullables
cccs-eric Mar 16, 2022
6c9cd29
Convert variable names to snakecase
cccs-eric Mar 16, 2022
a26e93c
Merge remote-tracking branch 'origin/master' into feature/iceberg_source
cccs-eric Mar 18, 2022
e7d19d1
Add iceberg logo and register data platform.
cccs-eric Mar 20, 2022
87f74d0
Add unit and integration test for Iceberg
cccs-eric Apr 4, 2022
2e80c17
Refactor how iceberg profiling is done
cccs-eric Apr 22, 2022
7101156
add acryl-iceberg-legacy dependency
cccs-eric Apr 26, 2022
6113090
enable all unit test and add descriptions about failures.
cccs-eric Apr 27, 2022
938689c
Merge branch 'master' into feature/iceberg_source
cccs-eric Apr 27, 2022
6f325aa
Update to acryl-iceberg-legacy 0.0.3
cccs-eric Apr 27, 2022
e35f067
ensure CI only activates iceberg for 3.7+
shirshanka Apr 29, 2022
c0655be
remove iceberg from base list
shirshanka Apr 29, 2022
a06ee61
Disable iceberg integration test until avro bug is resolved
cccs-eric Apr 29, 2022
3c06e2c
fix lint issue
cccs-eric Apr 29, 2022
c6e1dbe
skip test when iceberg package not available
cccs-eric Apr 30, 2022
3a70f35
skip test when iceberg package not available
cccs-eric Apr 30, 2022
b995614
fix for iceberg error not finding field and add profiling test
cccs-eric May 4, 2022
4530f15
Merge branch 'master' into feature/iceberg_source
cccs-eric May 4, 2022
16bc529
improve assertion messages
cccs-eric May 4, 2022
1fa32d2
improve assertion messages
cccs-eric May 4, 2022
a5bb71b
add source documentation
cccs-eric May 5, 2022
91265f3
Merge branch 'master' into feature/iceberg_source
cccs-eric May 5, 2022
12ecdf2
Iceberg: Populate ArrayType's nestedType & Map's key&value types in t…
rslanka May 10, 2022
2f978b5
Merge pull request #1 from datahub-project/feature/iceberg_source
cccs-eric May 10, 2022
f408559
add extra unit tests for map and array nested types
cccs-eric May 10, 2022
fade9d7
Iceberg: Use logical type in the contained types for ArrayTypeClass &…
rslanka May 11, 2022
299a7f8
Merge pull request #2 from datahub-project/feature/iceberg_source
cccs-eric May 12, 2022
29a93e8
move main code out of init.py to iceberg.py
cccs-eric May 13, 2022
d4947b4
add support for ADLS client secret authentication
cccs-eric May 13, 2022
3feb7de
Code Review changes.
rslanka May 13, 2022
f10130c
Merge pull request #3 from datahub-project/feature/iceberg_source
cccs-eric May 16, 2022
d5e68c0
fixes from code review
cccs-eric May 16, 2022
41472fa
Merge branch 'master' into feature/iceberg_source
cccs-eric May 16, 2022
c4c8cdb
add iceberg dependency for tests
cccs-eric May 16, 2022
7a03df6
Merge branch 'feature/iceberg_source' of github.com:CybercentreCanada…
cccs-eric May 16, 2022
ee1d0cc
add iceberg dependency for tests
cccs-eric May 17, 2022
c6aed6a
fix profiling null proportion calculation
cccs-eric May 23, 2022
0dbae34
Merge branch 'master' into feature/iceberg_source
rslanka May 26, 2022
078611c
Fix mypy issue in iceberg_profiler.py.
rslanka May 26, 2022
dfb7f26
Merge branch 'master' into feature/iceberg_source
rslanka May 26, 2022
7b2cf33
Fix the integration test due to the missing freeze_time decorator.
rslanka May 26, 2022
4be5dc1
Update golden files due to changes to avro nested type population + o…
rslanka May 26, 2022
64efb13
Fix bigquery_usage golden files.
rslanka May 26, 2022
Binary file added datahub-web-react/src/images/iceberglogo.png
23 changes: 23 additions & 0 deletions metadata-ingestion/docs/sources/iceberg/iceberg.md
@@ -0,0 +1,23 @@
### Concept Mapping

<!-- This should be a manual mapping of concepts from the source to the DataHub Metadata Model -->
<!-- Authors should provide as much context as possible about how this mapping was generated, including assumptions made, known shortcuts, & any other caveats -->

This ingestion source maps the following Source System Concepts to DataHub Concepts:

<!-- Remove all unnecessary/irrelevant DataHub Concepts -->

| Source Concept | DataHub Concept | Notes |
| -- | -- | -- |
| `iceberg` | [Data Platform](docs/generated/metamodel/entities/dataPlatform.md) | |
| Table | [Dataset](docs/generated/metamodel/entities/dataset.md) | Each Iceberg table maps to a Dataset named using the parent folders. If a table is stored under `my/namespace/table`, the dataset name will be `my.namespace.table`. If a [Platform Instance](https://datahubproject.io/docs/platform-instances/) is configured, it will be used as a prefix: `<platform_instance>.my.namespace.table`. |
| [Table property](https://iceberg.apache.org/docs/latest/configuration/#table-properties) | [User (a.k.a CorpUser)](docs/generated/metamodel/entities/corpuser.md) | The value of a table property can be used as the name of a CorpUser owner. This table property name can be configured with the source option `user_ownership_property`. |
| [Table property](https://iceberg.apache.org/docs/latest/configuration/#table-properties) | CorpGroup | The value of a table property can be used as the name of a CorpGroup owner. This table property name can be configured with the source option `group_ownership_property`. |
| Table parent folders (excluding [warehouse catalog location](https://iceberg.apache.org/docs/latest/configuration/#catalog-properties)) | Container | Available in a future release |
| [Table schema](https://iceberg.apache.org/spec/#schemas-and-data-types) | SchemaField | Maps to the fields defined within the Iceberg table schema definition. |
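The dataset naming rule in the table above (parent folders joined with dots, optionally prefixed by a platform instance) can be illustrated with a short sketch. `dataset_name_for_table` is a hypothetical helper written for this example, not a function from this PR:

from typing import Optional

def dataset_name_for_table(table_path: str, platform_instance: Optional[str] = None) -> str:
    # "my/namespace/table" -> "my.namespace.table"
    name = table_path.strip("/").replace("/", ".")
    # A configured Platform Instance is prepended as a prefix.
    if platform_instance:
        name = f"{platform_instance}.{name}"
    return name

assert dataset_name_for_table("my/namespace/table") == "my.namespace.table"
assert (
    dataset_name_for_table("my/namespace/table", "my_iceberg_catalog")
    == "my_iceberg_catalog.my.namespace.table"
)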

## Troubleshooting

### [Common Issue]

[Provide description of common issues with this integration and steps to resolve]
22 changes: 22 additions & 0 deletions metadata-ingestion/docs/sources/iceberg/iceberg_recipe.yml
@@ -0,0 +1,22 @@
source:
  type: "iceberg"
  config:
    env: PROD
    adls:
      # Will be translated to https://{account_name}.dfs.core.windows.net
      account_name: my_adls_account
      # Can use sas_token or account_key
      sas_token: "${SAS_TOKEN}"
      # account_key: "${ACCOUNT_KEY}"
      container_name: warehouse
      base_path: iceberg
    platform_instance: my_iceberg_catalog
    table_pattern:
      allow:
        - marketing.*
    profiling:
      enabled: true

sink:
  # sink configs
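The `table_pattern.allow` entries in the recipe above are regular expressions matched against the dataset name, so `marketing.*` keeps only tables whose names start with `marketing`; the `${SAS_TOKEN}` reference is expected to be resolved from the environment when the recipe is loaded. A rough sketch of the filtering behaviour; the `is_allowed` helper below is illustrative and is not DataHub's actual `AllowDenyPattern` implementation:

import re
from typing import List

def is_allowed(dataset_name: str, allow_patterns: List[str]) -> bool:
    # A table is ingested only if its name matches at least one allow pattern.
    return any(re.match(pattern, dataset_name) for pattern in allow_patterns)

assert is_allowed("marketing.clicks", ["marketing.*"])
assert not is_allowed("finance.invoices", ["marketing.*"])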

15 changes: 13 additions & 2 deletions metadata-ingestion/setup.py
@@ -21,7 +21,7 @@ def get_long_description():


base_requirements = {
    # Compatability.
    # Compatibility.
    "dataclasses>=0.6; python_version < '3.7'",
    # Typing extension should be >=3.10.0.2 ideally but we can't restrict due to Airflow 2.0.2 dependency conflict
    "typing_extensions>=3.7.4.3 ; python_version < '3.8'",
@@ -146,6 +146,12 @@ def get_long_description():
    "pyspark==3.0.3",
}

iceberg_common = {
    # Iceberg Python SDK
    "acryl-iceberg-legacy==0.0.4",
    "azure-identity==1.10.0",
}

s3_base = {
    *data_lake_base,
    "moto[s3]",
@@ -195,7 +201,7 @@ def get_long_description():
    "feast": {"feast==0.18.0", "flask-openid>=1.3.0"},
    "glue": aws_common,
    # hdbcli is supported officially by SAP, sqlalchemy-hana is built on top but not officially supported
    "hana": sql_common | {"sqlalchemy-hana>=0.5.0","hdbcli>=2.11.20"},
    "hana": sql_common | {"sqlalchemy-hana>=0.5.0", "hdbcli>=2.11.20"},
    "hive": sql_common
    | {
        # Acryl Data maintains a fork of PyHive
@@ -204,6 +210,7 @@ def get_long_description():
        # - 0.6.12 adds support for Spark Thrift Server
        "acryl-pyhive[hive]>=0.6.13"
    },
    "iceberg": iceberg_common,
    "kafka": {*kafka_common, *kafka_protobuf},
    "kafka-connect": sql_common | {"requests", "JPype1"},
    "ldap": {"python-ldap>=2.4"},
@@ -357,6 +364,7 @@ def get_long_description():
        dependency
        for plugin in [
            "feast",
            "iceberg",
            "lookml",
        ]
        for dependency in plugins[plugin]
@@ -368,6 +376,7 @@ def get_long_description():
    {
        dependency
        for plugin in [
            "iceberg",
            "lookml",
        ]
        for dependency in plugins[plugin]
@@ -421,6 +430,7 @@ def get_long_description():
        for plugin in [
            "athena",
            "feast",
            "iceberg",
        ]
        for dependency in plugins[plugin]
    }
@@ -476,6 +486,7 @@ def get_long_description():
        "starburst-trino-usage = datahub.ingestion.source.usage.starburst_trino_usage:TrinoUsageSource",
        "nifi = datahub.ingestion.source.nifi:NifiSource",
        "powerbi = datahub.ingestion.source.powerbi:PowerBiDashboardSource",
        "iceberg = datahub.ingestion.source.iceberg.iceberg:IcebergSource",
        "vertica = datahub.ingestion.source.sql.vertica:VerticaSource",
        "presto-on-hive = datahub.ingestion.source.sql.presto_on_hive:PrestoOnHiveSource",
        "pulsar = datahub.ingestion.source.pulsar:PulsarSource",
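The `setup.py` changes above do two things: they add an `iceberg` extra built from `iceberg_common` (so the source would typically be installed with `pip install 'acryl-datahub[iceberg]'`), and they register an entry point in the `datahub.ingestion.source.plugins` group mapping the source type `iceberg` to `IcebergSource`. A hedged sketch of how such an entry point can be resolved at runtime; the real lookup goes through DataHub's source registry rather than this snippet:

# Illustrative only: resolve the "iceberg" entry point declared in setup.py.
# Uses the Python 3.10+ selection API of importlib.metadata.
from importlib.metadata import entry_points

iceberg_ep = next(
    ep
    for ep in entry_points(group="datahub.ingestion.source.plugins")
    if ep.name == "iceberg"
)
IcebergSource = iceberg_ep.load()  # datahub.ingestion.source.iceberg.iceberg:IcebergSource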
72 changes: 53 additions & 19 deletions metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py
@@ -1,6 +1,6 @@
import json
import logging
from typing import Any, Callable, Dict, Generator, List, Optional, Union
from typing import Any, Callable, Dict, Generator, List, Optional, Type, Union

import avro.schema

@@ -70,7 +70,7 @@ class AvroToMceSchemaConverter:
    # FieldPath format version.
    version_string: str = "[version=2.0]"

    field_type_mapping: Dict[str, Any] = {
    field_type_mapping: Dict[str, Type] = {
        AVRO_TYPE_NULL: NullTypeClass,
        "bool": BooleanTypeClass,
        "boolean": BooleanTypeClass,
@@ -88,7 +88,7 @@ class AvroToMceSchemaConverter:
        "fixed": FixedTypeClass,
    }

    field_logical_type_mapping: Dict[str, Any] = {
    field_logical_type_mapping: Dict[str, Type] = {
        "date": DateTypeClass,
        "decimal": NumberTypeClass,
        "time-micros": TimeTypeClass,
@@ -128,17 +128,50 @@ def __init__(self, is_key_schema: bool, default_nullable: bool = False) -> None:
            avro.schema.LogicalSchema: self._gen_non_nested_to_mce_fields,
        }

    @staticmethod
    def _get_type_name(
        avro_schema: avro.schema.Schema, logical_if_present: bool = False
    ) -> str:
        logical_type_name: Optional[str] = None
        if logical_if_present:
            logical_type_name = getattr(
                avro_schema, "logical_type", None
            ) or avro_schema.props.get("logicalType")
        return logical_type_name or str(
            getattr(avro_schema.type, "type", avro_schema.type)
        )

    @staticmethod
    def _get_column_type(
        self, field_type: Union[str, dict], logical_type: Optional[str]
        avro_schema: avro.schema.Schema, logical_type: Optional[str]
    ) -> SchemaFieldDataType:
        tp = field_type
        if hasattr(tp, "type"):
            tp = tp.type # type: ignore
        tp = str(tp)
        TypeClass: Any = self.field_type_mapping.get(tp)
        type_name: str = AvroToMceSchemaConverter._get_type_name(avro_schema)
        TypeClass: Optional[Type] = AvroToMceSchemaConverter.field_type_mapping.get(
            type_name
        )
        if logical_type is not None:
            TypeClass = self.field_logical_type_mapping.get(logical_type, TypeClass)
            TypeClass = AvroToMceSchemaConverter.field_logical_type_mapping.get(
                logical_type, TypeClass
            )
        assert TypeClass is not None
        dt = SchemaFieldDataType(type=TypeClass())
        # Handle Arrays and Maps
        if isinstance(dt.type, ArrayTypeClass) and isinstance(
            avro_schema, avro.schema.ArraySchema
        ):
            dt.type.nestedType = [
                AvroToMceSchemaConverter._get_type_name(
                    avro_schema.items, logical_if_present=True
                )
            ]
        elif isinstance(dt.type, MapTypeClass) and isinstance(
            avro_schema, avro.schema.MapSchema
        ):
            # Avro map's key is always a string. See: https://avro.apache.org/docs/current/spec.html#Maps
            dt.type.keyType = "string"
            dt.type.valueType = AvroToMceSchemaConverter._get_type_name(
                avro_schema.values, logical_if_present=True
            )
        return dt

    def _is_nullable(self, schema: avro.schema.Schema) -> bool:
@@ -282,20 +315,21 @@ def emit(self) -> Generator[SchemaField, None, None]:
                tags=[TagAssociationClass(tag="urn:li:tag:Deprecated")]
            )

            logical_type_name: Optional[str] = (
                # logicalType nested inside type
                getattr(actual_schema, "logical_type", None)
                or actual_schema.props.get("logicalType")
                # bare logicalType
                or self._actual_schema.props.get("logicalType")
            )

            field = SchemaField(
                fieldPath=field_path,
                # Populate it with the simple native type for now.
                nativeDataType=native_data_type,
                type=self._converter._get_column_type(
                    actual_schema.type,
                    (
                        getattr(
                            actual_schema, "logical_type", None
                        ) # logicalType nested inside type
                        or self._actual_schema.props.get(
                            "logicalType"
                        ) # bare logicalType
                    ),
                    actual_schema,
                    logical_type_name,
                ),
                description=description,
                recursive=False,
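The `_get_column_type` changes above populate `nestedType` for Avro arrays and `keyType`/`valueType` for Avro maps, preferring a contained schema's logical type (for example `decimal`) over its physical type when one is present; map keys are always reported as `string` per the Avro spec. A standalone sketch of that mapping over plain Avro schema dicts; it mirrors the intent for illustration and is not the converter code itself:

from typing import Any, Dict

def contained_type_name(avro_type: Any) -> str:
    # Prefer the logical type ("decimal", "date", ...) over the physical type.
    if isinstance(avro_type, dict):
        return avro_type.get("logicalType") or str(avro_type.get("type"))
    return str(avro_type)

def describe_container(avro_schema: Dict[str, Any]) -> Dict[str, Any]:
    if avro_schema["type"] == "array":
        return {"nestedType": [contained_type_name(avro_schema["items"])]}
    if avro_schema["type"] == "map":
        # Avro map keys are always strings.
        return {"keyType": "string", "valueType": contained_type_name(avro_schema["values"])}
    return {}

assert describe_container(
    {"type": "array", "items": {"type": "bytes", "logicalType": "decimal"}}
) == {"nestedType": ["decimal"]}
assert describe_container({"type": "map", "values": "long"}) == {
    "keyType": "string",
    "valueType": "long",
}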
@@ -0,0 +1,83 @@
from typing import Dict, Optional, Union

from azure.identity import ClientSecretCredential
from azure.storage.filedatalake import DataLakeServiceClient, FileSystemClient
from pydantic import Field, root_validator

from datahub.configuration import ConfigModel
from datahub.configuration.common import ConfigurationError


class AdlsSourceConfig(ConfigModel):
    """
    Common Azure credentials config.

    https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-directory-file-acl-python
    """

    base_path: str = Field(
        default="/",
        description="Base folder in hierarchical namespaces to start from.",
    )
    container_name: str = Field(
        description="Azure storage account container name.",
    )
    account_name: str = Field(
        description="Name of the Azure storage account. See [Microsoft official documentation on how to create a storage account.](https://docs.microsoft.com/en-us/azure/storage/blobs/create-data-lake-storage-account)",
    )
    account_key: Optional[str] = Field(
        description="Azure storage account access key that can be used as a credential. **An account key, a SAS token or a client secret is required for authentication.**",
    )
    sas_token: Optional[str] = Field(
        description="Azure storage account Shared Access Signature (SAS) token that can be used as a credential. **An account key, a SAS token or a client secret is required for authentication.**",
    )
    client_secret: Optional[str] = Field(
        description="Azure client secret that can be used as a credential. **An account key, a SAS token or a client secret is required for authentication.**",
    )
    client_id: Optional[str] = Field(
        description="Azure client (Application) ID required when a `client_secret` is used as a credential.",
    )
    tenant_id: Optional[str] = Field(
        description="Azure tenant (Directory) ID required when a `client_secret` is used as a credential.",
    )

    def get_abfss_url(self, folder_path: str = "") -> str:
        if not folder_path.startswith("/"):
            folder_path = f"/{folder_path}"
        return f"abfss://{self.container_name}@{self.account_name}.dfs.core.windows.net{folder_path}"

    def get_filesystem_client(self) -> FileSystemClient:
        return self.get_service_client().get_file_system_client(self.container_name)

    def get_service_client(self) -> DataLakeServiceClient:
        return DataLakeServiceClient(
            account_url=f"https://{self.account_name}.dfs.core.windows.net",
            credential=self.get_credentials(),
        )

    def get_credentials(
        self,
    ) -> Union[Optional[str], ClientSecretCredential]:
        if self.client_id and self.client_secret and self.tenant_id:
            return ClientSecretCredential(
                tenant_id=self.tenant_id,
                client_id=self.client_id,
                client_secret=self.client_secret,
            )
        return self.sas_token if self.sas_token is not None else self.account_key

    @root_validator()
    def _check_credential_values(cls, values: Dict) -> Dict:
        if (
            values.get("account_key")
            or values.get("sas_token")
            or (
                values.get("client_id")
                and values.get("client_secret")
                and values.get("tenant_id")
            )
        ):
            return values
        raise ConfigurationError(
            "credentials missing, requires one combination of account_key or sas_token or (client_id and client_secret and tenant_id)"
        )
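A minimal usage sketch for `AdlsSourceConfig` as defined above, assuming the `azure-identity` and `azure-storage-file-datalake` packages are installed; all values are placeholders:

# Illustrative: SAS-token authentication satisfies the credential validator.
config = AdlsSourceConfig(
    account_name="my_adls_account",
    container_name="warehouse",
    base_path="iceberg",
    sas_token="<sas-token>",
)

assert (
    config.get_abfss_url("iceberg")
    == "abfss://warehouse@my_adls_account.dfs.core.windows.net/iceberg"
)
# get_credentials() returns the SAS token here; with client_id, client_secret and
# tenant_id set instead, it would return a ClientSecretCredential.
filesystem_client = config.get_filesystem_client()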