Merge branch 'master' into power-bi-plugin
shirshanka authored Feb 25, 2022
2 parents ef0831a + a113e43 commit f653fa5
Showing 27 changed files with 1,932 additions and 534 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/metadata-ingestion.yml
@@ -19,6 +19,8 @@ jobs:

metadata-ingestion-general:
runs-on: ubuntu-latest
env:
SPARK_VERSION: 3.0.3
strategy:
matrix:
python-version: ["3.6", "3.9.9"]
@@ -46,6 +48,8 @@ jobs:
metadata-ingestion-by-version:
runs-on: ubuntu-latest
env:
SPARK_VERSION: 3.0.3
strategy:
matrix:
python-version: ["3.6", "3.9.9"]
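Both CI jobs now export `SPARK_VERSION`, presumably so that pydeequ (which reads this variable to resolve a compatible Deequ artifact) matches the pinned Spark release. A minimal, hypothetical sketch of code picking it up; the default value here is an assumption mirroring the workflow:

```python
import os

# The workflow above pins SPARK_VERSION to 3.0.3 so profiling tests run
# against the supported Spark release; fall back to the same value locally.
spark_version = os.environ.get("SPARK_VERSION", "3.0.3")
print(f"Profiling tests will run against Spark {spark_version}")
```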
3 changes: 3 additions & 0 deletions .gitignore
@@ -57,3 +57,6 @@ smoke-test/spark-smoke-test/__pycache__/

# Metadata Ingestion Generated
metadata-ingestion/generated/**

# docs
docs/generated/
3 changes: 3 additions & 0 deletions datahub-web-react/src/graphql/relationships.graphql
@@ -43,6 +43,9 @@ fragment relationshipFields on Entity {
... on Chart {
tool
chartId
platform {
...platformFields
}
properties {
name
description
14 changes: 7 additions & 7 deletions docs-website/sidebars.js
@@ -59,12 +59,7 @@ module.exports = {
"docs/saas",
"releases",
],
"Getting Started": [
"docs/quickstart",
"docs/cli",
"docs/debugging",
"docs/how/search",
],
"Getting Started": ["docs/quickstart", "docs/cli", "docs/debugging"],
"Metadata Ingestion": [
// add a custom label since the default is 'Metadata Ingestion'
// note that we also have to add the path to this file in sidebarsjs_hardcoded_titles in generateDocsDir.ts
@@ -200,7 +195,12 @@ module.exports = {
],
},
],
"Usage Guides": ["docs/policies", "docs/domains", "docs/ui-ingestion"],
"Usage Guides": [
"docs/policies",
"docs/domains",
"docs/ui-ingestion",
"docs/how/search",
],
"Developer Guides": [
// TODO: the titles of these should not be in question form in the sidebar
"docs/developers",
4 changes: 2 additions & 2 deletions docs/how/search.md
@@ -1,10 +1,10 @@
# How to Search for Information in Datahub
# Search Guide

## Introduction

The search bar is one of the means of finding data in Datahub. In this document, we discuss more effective ways of finding information beyond doing a standard keyword search. This is because keyword searches can return results from almost any part of an entity.

### Search in Specific Fields:
### Search in Specific Fields

The following examples are in the format of
X: *typical question* :
19 changes: 18 additions & 1 deletion metadata-ingestion/setup.py
@@ -102,6 +102,22 @@ def get_long_description():
microsoft_common = {
"msal==1.16.0"
}

data_lake_base = {
*aws_common,
"parse>=1.19.0",
"pyarrow>=6.0.1",
"tableschema>=1.20.2",
"ujson>=4.3.0",
"types-ujson>=4.2.1",
"smart-open[s3]>=5.2.1",
}

data_lake_profiling = {
"pydeequ==1.0.1",
"pyspark==3.0.3",
}

# Note: for all of these, framework_common will be added.
plugins: Dict[str, Set[str]] = {
# Sink plugins.
@@ -121,7 +137,7 @@ def get_long_description():
"clickhouse-usage": sql_common | {"clickhouse-sqlalchemy==0.1.8"},
"datahub-lineage-file": set(),
"datahub-business-glossary": set(),
"data-lake": {*aws_common, "pydeequ==1.0.1", "pyspark==3.0.3", "parse==1.19.0"},
"data-lake": {*data_lake_base, *data_lake_profiling},
"dbt": {"requests"},
"druid": sql_common | {"pydruid>=0.6.2"},
# Starting with 7.14.0 python client is checking if it is connected to elasticsearch client. If its not it throws
@@ -200,6 +216,7 @@ def get_long_description():
*base_requirements,
*framework_common,
*mypy_stubs,
*data_lake_base,
"black>=21.12b0",
"coverage>=5.1",
"flake8>=3.8.3",
24 changes: 14 additions & 10 deletions metadata-ingestion/source_docs/data_lake.md
@@ -10,7 +10,7 @@ This source is in **Beta** and under active development. Not yet considered read

## Setup

To install this plugin, run `pip install 'acryl-datahub[data-lake]'`. Because the files are read using PySpark, we require Spark 3.0.3 with Hadoop 3.2 to be installed.
To install this plugin, run `pip install 'acryl-datahub[data-lake]'`. Note that because the profiling is run with PySpark, we require Spark 3.0.3 with Hadoop 3.2 to be installed (see [compatibility](#compatibility) for more details).

The data lake connector extracts schemas and profiles from a variety of file formats (see below for an exhaustive list).
Individual files are ingested as tables, and profiles are computed similar to the [SQL profiler](./sql_profiles.md).
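For reference, a hedged sketch of driving this source programmatically through DataHub's `Pipeline` API rather than the CLI. The `base_path` and `profiling` keys are illustrative placeholders, not confirmed option names; consult the config table later in this document for the authoritative fields:

```python
from datahub.ingestion.run.pipeline import Pipeline

# Hedged sketch: "base_path" and "profiling" are assumed option names used
# purely for illustration; see the data-lake config table for the real ones.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "data-lake",
            "config": {
                "env": "PROD",
                "base_path": "s3://my-bucket/landing/",  # assumption
                "profiling": {"enabled": True},  # assumption
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()
```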
@@ -37,7 +37,7 @@ If you would like to write a more complicated function for resolving file names,
Extracts:

- Row and column counts for each table
- For each column, if applicable:
- For each column, if profiling is enabled:
- null counts and proportions
- distinct counts and proportions
- minimum, maximum, mean, median, standard deviation, some quantile values
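
The connector computes these statistics with PyDeequ/PySpark; a minimal pandas sketch (not the connector's actual code) of the same per-column measures, just to make the list above concrete:

```python
import pandas as pd

df = pd.DataFrame({"price": [3.5, 4.0, None, 10.2, 4.0]})

col = df["price"]
profile = {
    "row_count": len(df),
    "null_count": int(col.isna().sum()),
    "null_proportion": float(col.isna().mean()),
    "distinct_count": int(col.nunique(dropna=True)),
    "min": col.min(),
    "max": col.max(),
    "mean": col.mean(),
    "median": col.median(),
    "stddev": col.std(),
    "quantiles": col.quantile([0.25, 0.5, 0.75]).to_dict(),
}
print(profile)
```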
@@ -47,20 +47,25 @@ This connector supports both local files as well as those stored on AWS S3 (whic

- CSV
- TSV
- Parquet
- JSON
- Parquet
- Apache Avro

Schemas for Parquet and Avro files are extracted as provided.

Schemas for schemaless formats (CSV, TSV, JSON) are inferred. For CSV and TSV files, we consider the first 100 rows by default, which can be controlled via the `max_rows` recipe parameter (see [below](#config-details))
JSON file schemas are inferred on the basis of the entire file (given the difficulty in extracting only the first few objects of the file), which may impact performance.
We are working on using iterator-based JSON parsers to avoid reading in the entire JSON object.
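
As an illustration of row-limited inference (not the connector's implementation), a small sketch that guesses column types from the first `max_rows` rows of a CSV:

```python
import csv
from typing import Dict


def infer_csv_schema(path: str, max_rows: int = 100) -> Dict[str, str]:
    """Guess a type for each column by inspecting at most max_rows rows."""

    def guess(value: str) -> str:
        for caster, name in ((int, "integer"), (float, "float")):
            try:
                caster(value)
                return name
            except ValueError:
                pass
        return "string"

    schema: Dict[str, str] = {}
    with open(path, newline="") as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            if i >= max_rows:
                break
            for column, value in row.items():
                guessed = guess(value)
                # Fall back to string if rows disagree on the type.
                if schema.get(column, guessed) != guessed:
                    guessed = "string"
                schema[column] = guessed
    return schema
```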

:::caution

If you are ingesting datasets from AWS S3, we recommend running the ingestion on a server in the same region to avoid high egress costs.

:::

| Capability | Status | Details |
| -----------| ------ | ---- |
| Platform Instance | 🛑 | [link](../../docs/platform-instances.md) |

| Capability | Status | Details |
| ----------------- | ------ | ---------------------------------------- |
| Platform Instance | 🛑 | [link](../../docs/platform-instances.md) |

## Quickstart recipe

@@ -99,6 +104,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
| `aws_config.aws_access_key_id` | | Autodetected | See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html |
| `aws_config.aws_secret_access_key` | | Autodetected | See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html |
| `aws_config.aws_session_token` | | Autodetected | See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html |
| `max_rows` | | `100` | Maximum number of rows to use when inferring schemas for TSV and CSV files. |
| `schema_patterns.allow` | | `*` | List of regex patterns for tables to ingest. Defaults to all. |
| `schema_patterns.deny` | | | List of regex patterns for tables to not ingest. Defaults to none. |
| `schema_patterns.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching of tables to ingest. |
@@ -121,9 +127,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe.

## Compatibility

Files are read using PySpark and profiles are computed with PyDeequ.
We currently require Spark 3.0.3 with Hadoop 3.2 to be installed and the `SPARK_HOME` environment variable to be set for PySpark.
The Spark+Hadoop binary can be downloaded [here](https://www.apache.org/dyn/closer.lua/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz).
Profiles are computed with PyDeequ, which relies on PySpark. Therefore, for computing profiles, we currently require Spark 3.0.3 with Hadoop 3.2 to be installed and the `SPARK_HOME` and `SPARK_VERSION` environment variables to be set. The Spark+Hadoop binary can be downloaded [here](https://www.apache.org/dyn/closer.lua/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz).

For an example guide on setting up PyDeequ on AWS, see [this guide](https://aws.amazon.com/blogs/big-data/testing-data-quality-at-scale-with-pydeequ/).
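
A small defensive sketch (an illustration, not part of the connector) of verifying those prerequisites before launching a profiling run:

```python
import os
import shutil


def check_spark_prerequisites() -> None:
    """Fail fast if the Spark environment expected by profiling is missing."""
    spark_home = os.environ.get("SPARK_HOME")
    spark_version = os.environ.get("SPARK_VERSION")

    if not spark_home or not os.path.isdir(spark_home):
        raise RuntimeError("SPARK_HOME is unset or does not point to a Spark install")
    if spark_version != "3.0.3":
        raise RuntimeError(f"Expected Spark 3.0.3 for profiling, got {spark_version!r}")
    if shutil.which("java") is None:
        raise RuntimeError("A Java runtime is required by PySpark/PyDeequ")


check_spark_prerequisites()
```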

13 changes: 12 additions & 1 deletion metadata-ingestion/source_docs/openapi.md
100644 → 100755
@@ -30,7 +30,9 @@ source:
name: test_endpoint # this name will appear in DataHub
url: https://test_endpoint.com/
swagger_file: classicapi/doc/swagger.json # where to search for the OpenApi definitions
get_token: True # optional, if you need to get an authentication token beforehand
get_token: # optional, if you need to get an authentication token beforehand
request_type: get
url: api/authentication/login?username={username}&password={password}
username: your_username # optional
password: your_password # optional
forced_examples: # optionals
@@ -137,6 +139,15 @@ and this URL will be called to get back the needed metadata.

## Config details

### Token authentication

If the tool needs an access token before it can query the endpoints, it can request one first. Two methods are available at the moment:

* 'get': the username/password combination must be present in the URL. Note that {username} and {password} are mandatory placeholders; they are replaced with the real credentials at runtime. Because the credentials are sent as part of the request address, this method is insecure; prefer 'post' if your provider supports it.
* 'post': the username and password are sent in the body of the POST request.

In both cases, username and password are the ones defined in the configuration file.
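
A hedged sketch of what the two styles amount to in plain `requests` calls; the endpoint path comes from the example recipe above, and the JSON field names are illustrative rather than the plugin's actual implementation:

```python
import requests

username, password = "your_username", "your_password"
base_url = "https://test_endpoint.com/"

# 'get' style: credentials are substituted into the URL placeholders,
# so they travel in the request address (hence the warning above).
login_url = base_url + "api/authentication/login?username={username}&password={password}"
token_get = requests.get(
    login_url.format(username=username, password=password)
).json()  # assumption: the token comes back in a JSON body

# 'post' style: credentials travel in the request body instead.
token_post = requests.post(
    base_url + "api/authentication/login",  # illustrative endpoint
    json={"username": username, "password": password},  # assumed field names
).json()
```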

### Getting dataset metadata from `forced_example`

Suppose you have an endpoint defined in the swagger file, but without example given, and the tool is
3 changes: 1 addition & 2 deletions metadata-ingestion/src/datahub/ingestion/source/aws/glue.py
@@ -256,9 +256,8 @@ def process_dataflow_node(
# append S3 format if different ones exist
if len(s3_formats[s3_uri]) > 1:
node_urn = make_s3_urn(
s3_uri,
f"{s3_uri}.{node_args.get('format')}",
self.env,
suffix=node_args.get("format"),
)

else:
32 changes: 25 additions & 7 deletions metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py
@@ -1,17 +1,35 @@
from typing import Optional
import os

S3_PREFIXES = ["s3://", "s3n://", "s3a://"]

def make_s3_urn(s3_uri: str, env: str, suffix: Optional[str] = None) -> str:

if not s3_uri.startswith("s3://"):
raise ValueError("S3 URIs should begin with 's3://'")
def is_s3_uri(uri: str) -> bool:
return any(uri.startswith(prefix) for prefix in S3_PREFIXES)


def strip_s3_prefix(s3_uri: str) -> str:
# remove S3 prefix (s3://)
s3_name = s3_uri[5:]
for s3_prefix in S3_PREFIXES:
if s3_uri.startswith(s3_prefix):
plain_base_path = s3_uri[len(s3_prefix) :]
return plain_base_path

raise ValueError(
f"Not an S3 URI. Must start with one of the following prefixes: {str(S3_PREFIXES)}"
)


def make_s3_urn(s3_uri: str, env: str) -> str:

s3_name = strip_s3_prefix(s3_uri)

if s3_name.endswith("/"):
s3_name = s3_name[:-1]

if suffix is not None:
return f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name}_{suffix},{env})"
name, extension = os.path.splitext(s3_name)

if extension != "":
extension = extension[1:] # remove the dot
return f"urn:li:dataset:(urn:li:dataPlatform:s3,{name}_{extension},{env})"

return f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name},{env})"