From fc7b6853b93649ffbb6d2df38f0ac1348220a2ec Mon Sep 17 00:00:00 2001
From: "jaegwon.seo" <162448493+wornjs@users.noreply.github.com>
Date: Thu, 1 Aug 2024 05:05:55 +0900
Subject: [PATCH 1/4] change generate kafka connect properties from env (#10545)

Co-authored-by: david-leifker <114954101+david-leifker@users.noreply.github.com>
---
 docker/kafka-setup/env_to_properties.py | 24 +++++++++++++++
 docker/kafka-setup/kafka-setup.sh       | 40 +------------------------
 2 files changed, 25 insertions(+), 39 deletions(-)
 create mode 100644 docker/kafka-setup/env_to_properties.py

diff --git a/docker/kafka-setup/env_to_properties.py b/docker/kafka-setup/env_to_properties.py
new file mode 100644
index 0000000000000..8d8b8c3cc7b59
--- /dev/null
+++ b/docker/kafka-setup/env_to_properties.py
@@ -0,0 +1,24 @@
+import os
+import re
+import sys
+
+
+def env_to_properties(env_prefix: str, properties_file: str):
+    pattern = re.compile('(?<=[^_])_(?=[^_])')
+    props = {}
+
+    for (env_name, val) in os.environ.items():
+        if env_name.startswith(env_prefix):
+            raw_name = env_name[len(env_prefix):].lower()
+            prop_dot = '.'.join(pattern.split(raw_name))
+            props[prop_dot] = val
+
+    with open(properties_file, 'a') as f:
+        for k, v in props.items():
+            f.writelines(f'{k}={v}\n')
+
+
+if __name__ == '__main__':
+    env_prefix = sys.argv[1]
+    properties_file = sys.argv[2]
+    env_to_properties(env_prefix, properties_file)
diff --git a/docker/kafka-setup/kafka-setup.sh b/docker/kafka-setup/kafka-setup.sh
index 439ffb4d4d829..392cca9466641 100755
--- a/docker/kafka-setup/kafka-setup.sh
+++ b/docker/kafka-setup/kafka-setup.sh
@@ -10,46 +10,8 @@ fi
 . kafka-config.sh
 
 echo "bootstrap.servers=$KAFKA_BOOTSTRAP_SERVER" > $CONNECTION_PROPERTIES_PATH
-echo "security.protocol=$KAFKA_PROPERTIES_SECURITY_PROTOCOL" >> $CONNECTION_PROPERTIES_PATH
-
-## Add support for SASL_PLAINTEXT
-if [[ $KAFKA_PROPERTIES_SECURITY_PROTOCOL == "SASL_PLAINTEXT" ]]; then
-    echo "sasl.mechanism=$KAFKA_PROPERTIES_SASL_MECHANISM" >> $CONNECTION_PROPERTIES_PATH
-    echo "sasl.jaas.config=$KAFKA_PROPERTIES_SASL_JAAS_CONFIG" >> $CONNECTION_PROPERTIES_PATH
-    echo "sasl.kerberos.service.name=$KAFKA_PROPERTIES_SASL_KERBEROS_SERVICE_NAME" >> $CONNECTION_PROPERTIES_PATH
-fi
-
-## Add support for SASL_SSL
-if [[ $KAFKA_PROPERTIES_SECURITY_PROTOCOL == "SASL_SSL" ]]; then
-    echo "sasl.jaas.config=$KAFKA_PROPERTIES_SASL_JAAS_CONFIG" >> $CONNECTION_PROPERTIES_PATH
-    echo "sasl.mechanism=$KAFKA_PROPERTIES_SASL_MECHANISM" >> $CONNECTION_PROPERTIES_PATH
-fi
-
-if [[ $KAFKA_PROPERTIES_SECURITY_PROTOCOL == "SSL" ]]; then
-    if [[ -n $KAFKA_PROPERTIES_SSL_KEYSTORE_LOCATION ]]; then
-        echo "ssl.keystore.location=$KAFKA_PROPERTIES_SSL_KEYSTORE_LOCATION" >> $CONNECTION_PROPERTIES_PATH
-        echo "ssl.keystore.password=$KAFKA_PROPERTIES_SSL_KEYSTORE_PASSWORD" >> $CONNECTION_PROPERTIES_PATH
-        echo "ssl.key.password=$KAFKA_PROPERTIES_SSL_KEY_PASSWORD" >> $CONNECTION_PROPERTIES_PATH
-        if [[ -n $KAFKA_PROPERTIES_SSL_KEYSTORE_TYPE ]]; then
-            echo "ssl.keystore.type=$KAFKA_PROPERTIES_SSL_KEYSTORE_TYPE" >> $CONNECTION_PROPERTIES_PATH
-        fi
-    fi
-    if [[ -n $KAFKA_PROPERTIES_SSL_TRUSTSTORE_LOCATION ]]; then
-        echo "ssl.truststore.location=$KAFKA_PROPERTIES_SSL_TRUSTSTORE_LOCATION" >> $CONNECTION_PROPERTIES_PATH
-        if [[ $KAFKA_PROPERTIES_SSL_TRUSTSTORE_TYPE != "PEM" ]]; then
-            echo "ssl.truststore.password=$KAFKA_PROPERTIES_SSL_TRUSTSTORE_PASSWORD" >> $CONNECTION_PROPERTIES_PATH
-        fi
-        if [[ -n $KAFKA_PROPERTIES_SSL_TRUSTSTORE_TYPE ]]; then
-            echo "ssl.truststore.type=$KAFKA_PROPERTIES_SSL_TRUSTSTORE_TYPE" >> $CONNECTION_PROPERTIES_PATH
-        fi
-    fi
-    echo "ssl.endpoint.identification.algorithm=$KAFKA_PROPERTIES_SSL_ENDPOINT_IDENTIFICATION_ALGORITHM" >> $CONNECTION_PROPERTIES_PATH
-fi
-
-# Add support for SASL_CLIENT_CALLBACK_HANDLER_CLASS
-if [[ -n "$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" ]]; then
-    echo "sasl.client.callback.handler.class=$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" >> $CONNECTION_PROPERTIES_PATH
-fi
+python env_to_properties.py KAFKA_PROPERTIES_ $CONNECTION_PROPERTIES_PATH
 
 # cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180
 . kafka-ready.sh
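For reference, the new env_to_properties.py derives each property key by stripping the KAFKA_PROPERTIES_ prefix, lower-casing the remainder, and joining on single underscores with dots; the lookaround regex only splits an underscore flanked by non-underscore characters, so runs of underscores pass through unchanged. This replaces the hand-enumerated SSL/SASL blocks removed above: any KAFKA_PROPERTIES_* variable is now forwarded, not just the keys the script knew about. A minimal sketch of the mapping, with illustrative variable names (a sketch, not part of the patch):

    import re

    # Same pattern as env_to_properties.py: split on "_" only when it is
    # surrounded by non-underscore characters on both sides.
    pattern = re.compile('(?<=[^_])_(?=[^_])')

    def to_property_key(env_name: str, prefix: str = 'KAFKA_PROPERTIES_') -> str:
        raw_name = env_name[len(prefix):].lower()
        return '.'.join(pattern.split(raw_name))

    assert to_property_key('KAFKA_PROPERTIES_SECURITY_PROTOCOL') == 'security.protocol'
    assert to_property_key('KAFKA_PROPERTIES_SASL_KERBEROS_SERVICE_NAME') == 'sasl.kerberos.service.name'
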
From 2336207a2a6a22de582b2dc4488407668276da2b Mon Sep 17 00:00:00 2001
From: "Renan F. Lima" <51028757+lima-renan@users.noreply.github.com>
Date: Wed, 31 Jul 2024 17:06:19 -0300
Subject: [PATCH 2/4] fix(ingest): fix oracle cronjob ingestion (#11001)

Co-authored-by: david-leifker <114954101+david-leifker@users.noreply.github.com>
---
 docker/datahub-ingestion-base/Dockerfile | 16 ++++++++--------
 docker/datahub-ingestion/Dockerfile      |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile
index 383478b675640..8a238c32704bb 100644
--- a/docker/datahub-ingestion-base/Dockerfile
+++ b/docker/datahub-ingestion-base/Dockerfile
@@ -85,18 +85,18 @@ RUN apt-get update && apt-get install -y -qq \
 RUN if [ $(arch) = "x86_64" ]; then \
         mkdir /opt/oracle && \
         cd /opt/oracle && \
-        wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/216000/instantclient-basic-linux.x64-21.6.0.0.0dbru.zip && \
-        unzip instantclient-basic-linux.x64-21.6.0.0.0dbru.zip && \
-        rm instantclient-basic-linux.x64-21.6.0.0.0dbru.zip && \
-        sh -c "echo /opt/oracle/instantclient_21_6 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \
+        wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/2115000/instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \
+        unzip instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \
+        rm instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \
+        sh -c "echo /opt/oracle/instantclient_21_15 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \
         ldconfig; \
     else \
         mkdir /opt/oracle && \
         cd /opt/oracle && \
-        wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/191000/instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip && \
-        unzip instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip && \
-        rm instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip && \
-        sh -c "echo /opt/oracle/instantclient_19_10 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \
+        wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/1923000/instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \
+        unzip instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \
+        rm instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \
+        sh -c "echo /opt/oracle/instantclient_19_23 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \
         ldconfig; \
     fi;
diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile
index 068911695811f..b8eda54849122 100644
--- a/docker/datahub-ingestion/Dockerfile
+++ b/docker/datahub-ingestion/Dockerfile
@@ -1,7 +1,7 @@
 # Defining environment
 ARG APP_ENV=full
 ARG BASE_IMAGE=acryldata/datahub-ingestion-base
-ARG DOCKER_VERSION=head
+ARG DOCKER_VERSION=head-full
 ARG DEBIAN_REPO_URL=https://deb.debian.org/debian
 ARG PIP_MIRROR_URL=https://pypi.python.org/simple
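Alongside the Instant Client bump (21.15 on x86_64 and 19.23 on arm64, replacing 21.6 and 19.10), the default DOCKER_VERSION build arg now points at the head-full base tag, in line with APP_ENV=full above. The relocated client directories are registered with the dynamic linker through /etc/ld.so.conf.d/oracle-instantclient.conf plus ldconfig. A hypothetical smoke check for an image built from this patch — illustrative only, not part of the change:

    import ctypes.util

    # After ldconfig has registered /opt/oracle/instantclient_21_15 (x86_64)
    # or /opt/oracle/instantclient_19_23 (arm64), the Oracle client library
    # should resolve by name.
    lib = ctypes.util.find_library('clntsh')
    print(lib or 'libclntsh not found; check /etc/ld.so.conf.d/oracle-instantclient.conf')
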
From b13d990f07d2a2d3c3042c79047958f8d288017b Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Wed, 31 Jul 2024 15:30:30 -0500
Subject: [PATCH 3/4] chore(ci): revert update deprecated github actions (#10977) (#11062)

---
 .github/workflows/airflow-plugin.yml     | 4 ++--
 .github/workflows/build-and-test.yml     | 4 ++--
 .github/workflows/dagster-plugin.yml     | 4 ++--
 .github/workflows/docker-unified.yml     | 6 +++---
 .github/workflows/metadata-ingestion.yml | 4 ++--
 .github/workflows/metadata-io.yml        | 4 ++--
 .github/workflows/spark-smoke-test.yml   | 4 ++--
 7 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml
index 114256ad825e5..d4f0a1369da25 100644
--- a/.github/workflows/airflow-plugin.yml
+++ b/.github/workflows/airflow-plugin.yml
@@ -74,7 +74,7 @@ jobs:
       - name: pip freeze show list installed
         if: always()
         run: source metadata-ingestion-modules/airflow-plugin/venv/bin/activate && pip freeze
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v3
        if: ${{ always() && matrix.python-version == '3.10' && matrix.extra_pip_requirements == 'apache-airflow>=2.7.0' }}
        with:
          name: Test Results (Airflow Plugin ${{ matrix.python-version}})
@@ -98,7 +98,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Upload
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
         with:
           name: Event File
           path: ${{ github.event_path }}
diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index d2116fc2fca78..c93267947b65a 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -99,7 +99,7 @@ jobs:
         if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }}
         run: |
           ./gradlew -PjavaClassVersionDefault=8 :metadata-integration:java:spark-lineage:compileJava
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: Test Results (build)
@@ -128,7 +128,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Upload
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
         with:
           name: Event File
           path: ${{ github.event_path }}
diff --git a/.github/workflows/dagster-plugin.yml b/.github/workflows/dagster-plugin.yml
index 381a01aca82c3..48f1b24196c9e 100644
--- a/.github/workflows/dagster-plugin.yml
+++ b/.github/workflows/dagster-plugin.yml
@@ -56,7 +56,7 @@ jobs:
       - name: pip freeze show list installed
         if: always()
         run: source metadata-ingestion-modules/dagster-plugin/venv/bin/activate && pip freeze
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v3
        if: ${{ always() && matrix.python-version == '3.10' && matrix.extraPythonRequirement == 'dagster>=1.3.3' }}
        with:
          name: Test Results (dagster Plugin ${{ matrix.python-version}})
@@ -79,7 +79,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Upload
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
         with:
           name: Event File
           path: ${{ github.event_path }}
diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml
index 216f51e8ce970..9487e71e8da3d 100644
--- a/.github/workflows/docker-unified.yml
+++ b/.github/workflows/docker-unified.yml
@@ -1024,18 +1024,18 @@ jobs:
           docker logs datahub-datahub-frontend-react-1 >& frontend-${{ matrix.test_strategy }}.log || true
           docker logs datahub-upgrade-1 >& upgrade-${{ matrix.test_strategy }}.log || true
       - name: Upload logs
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
         if: failure()
         with:
           name: docker logs
           path: "*.log"
       - name: Upload screenshots
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
         if: failure()
         with:
           name: cypress-snapshots-${{ matrix.test_strategy }}
           path: smoke-test/tests/cypress/cypress/screenshots/
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: Test Results (smoke tests) ${{ matrix.test_strategy }}
diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml
index ef84afd9c3779..51b97552eb150 100644
--- a/.github/workflows/metadata-ingestion.yml
+++ b/.github/workflows/metadata-ingestion.yml
@@ -83,7 +83,7 @@ jobs:
           df -hl
           docker image ls
           docker system df
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v3
         with:
           name: Test Results (metadata ingestion ${{ matrix.python-version }})
           path: |
@@ -106,7 +106,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Upload
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
         with:
           name: Event File
           path: ${{ github.event_path }}
diff --git a/.github/workflows/metadata-io.yml b/.github/workflows/metadata-io.yml
index 4b1e878ea2526..6797c7ad67c0b 100644
--- a/.github/workflows/metadata-io.yml
+++ b/.github/workflows/metadata-io.yml
@@ -62,7 +62,7 @@ jobs:
       - name: Gradle build (and test)
         run: |
           ./gradlew :metadata-io:test
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: Test Results (metadata-io)
@@ -78,7 +78,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Upload
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
         with:
           name: Event File
           path: ${{ github.event_path }}
diff --git a/.github/workflows/spark-smoke-test.yml b/.github/workflows/spark-smoke-test.yml
index 46f6e95454477..8ffc8420ba941 100644
--- a/.github/workflows/spark-smoke-test.yml
+++ b/.github/workflows/spark-smoke-test.yml
@@ -69,14 +69,14 @@ jobs:
           docker logs elasticsearch >& elasticsearch-${{ matrix.test_strategy }}.log || true
           docker logs datahub-frontend-react >& frontend-${{ matrix.test_strategy }}.log || true
       - name: Upload logs
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
         if: failure()
         with:
           name: docker logs
           path: |
             "**/build/container-logs/*.log"
             "*.log"
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: Test Results (smoke tests)
From 89933fee1e141068fac60c3639eb8b2fa5b43871 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Wed, 31 Jul 2024 14:16:18 -0700
Subject: [PATCH 4/4] feat(ingest/dbt-cloud): update metadata_endpoint inference (#11041)

---
 metadata-ingestion/src/datahub/cli/get_cli.py |  1 +
 .../datahub/ingestion/source/dbt/dbt_cloud.py | 37 ++++++++++++++++---
 .../tests/unit/test_dbt_source.py             | 25 +++++++++++--
 3 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/metadata-ingestion/src/datahub/cli/get_cli.py b/metadata-ingestion/src/datahub/cli/get_cli.py
index b6ff5f39a2c14..27fa987ac7977 100644
--- a/metadata-ingestion/src/datahub/cli/get_cli.py
+++ b/metadata-ingestion/src/datahub/cli/get_cli.py
@@ -56,6 +56,7 @@ def urn(ctx: Any, urn: Optional[str], aspect: List[str], details: bool) -> None:
             entity_urn=urn,
             aspects=aspect,
             typed=False,
+            details=details,
         ),
         sort_keys=True,
         indent=2,
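The get_cli.py change is a one-line plumbing fix: the urn command already accepted a details parameter (visible in the hunk header above) but did not forward it to the underlying aspect lookup, so the option appears to have had no effect. A minimal sketch of the bug pattern and the fix — names here are illustrative, not DataHub's actual API:

    from typing import Any, Dict, List

    def fetch_aspects(entity_urn: str, aspects: List[str], typed: bool,
                      details: bool = False) -> Dict[str, Any]:
        # Stand-in for the real lookup; returns richer output when details=True.
        return {"urn": entity_urn, "details_requested": details}

    def cli_urn(urn: str, aspect: List[str], details: bool) -> Dict[str, Any]:
        # Before the patch, `details` stopped here and the flag was ignored;
        # the fix forwards it, mirroring the added `details=details` above.
        return fetch_aspects(entity_urn=urn, aspects=aspect, typed=False,
                             details=details)
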
diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py
index 8a99f096b5167..0672b9ce6f781 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py
@@ -40,8 +40,7 @@ class DBTCloudConfig(DBTCommonConfig):
 
     metadata_endpoint: str = Field(
         default="https://metadata.cloud.getdbt.com/graphql",
-        description="The dbt Cloud metadata API endpoint. This is deprecated, and will be removed in a future release. Please use access_url instead.",
-        deprecated=True,
+        description="The dbt Cloud metadata API endpoint. If not provided, we will try to infer it from the access_url.",
     )
 
     token: str = Field(
@@ -66,13 +65,39 @@ class DBTCloudConfig(DBTCommonConfig):
     @root_validator(pre=True)
     def set_metadata_endpoint(cls, values: dict) -> dict:
         if values.get("access_url") and not values.get("metadata_endpoint"):
-            parsed_uri = urlparse(values["access_url"])
-            values[
-                "metadata_endpoint"
-            ] = f"{parsed_uri.scheme}://metadata.{parsed_uri.netloc}/graphql"
+            metadata_endpoint = infer_metadata_endpoint(values["access_url"])
+            if metadata_endpoint is None:
+                raise ValueError(
+                    "Unable to infer the metadata endpoint from the access URL. Please provide a metadata endpoint."
+                )
+            values["metadata_endpoint"] = metadata_endpoint
         return values
 
 
+def infer_metadata_endpoint(access_url: str) -> Optional[str]:
+    # See https://docs.getdbt.com/docs/cloud/about-cloud/access-regions-ip-addresses#api-access-urls
+    # and https://docs.getdbt.com/docs/dbt-cloud-apis/discovery-querying#discovery-api-endpoints
+
+    try:
+        parsed_uri = urlparse(access_url)
+        assert parsed_uri.scheme is not None
+        assert parsed_uri.hostname is not None
+    except Exception as e:
+        logger.debug(f"Unable to parse access URL {access_url}: {e}", exc_info=e)
+        return None
+
+    if parsed_uri.hostname.endswith(".dbt.com"):
+        # For cell-based deployments.
+        # prefix.region.dbt.com -> prefix.metadata.region.dbt.com
+        hostname_parts = parsed_uri.hostname.split(".", maxsplit=1)
+        return f"{parsed_uri.scheme}://{hostname_parts[0]}.metadata.{hostname_parts[1]}/graphql"
+    elif parsed_uri.hostname.endswith(".getdbt.com"):
+        return f"{parsed_uri.scheme}://metadata.{parsed_uri.netloc}/graphql"
+    else:
+        # The self-hosted variants also have the metadata. prefix.
+        return f"{parsed_uri.scheme}://metadata.{parsed_uri.netloc}/graphql"
+
+
 _DBT_GRAPHQL_COMMON_FIELDS = """
   runId
   accountId
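infer_metadata_endpoint covers three URL shapes, per the dbt docs linked in the code: cell-based prefix.region.dbt.com hosts get the metadata label spliced in after the account prefix, while *.getdbt.com and self-hosted hosts get a plain metadata. prefix; every result ends in /graphql. Note that urlparse itself rarely raises, so it is the hostname assertion (caught by the broad except) that actually rejects unusable input. A worked trace of the cell-based branch, using acme as an illustrative account prefix:

    from urllib.parse import urlparse

    access_url = "https://acme.us1.dbt.com"  # illustrative cell-based URL
    parsed = urlparse(access_url)
    prefix, rest = parsed.hostname.split(".", maxsplit=1)  # "acme", "us1.dbt.com"
    endpoint = f"{parsed.scheme}://{prefix}.metadata.{rest}/graphql"
    assert endpoint == "https://acme.metadata.us1.dbt.com/graphql"
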
diff --git a/metadata-ingestion/tests/unit/test_dbt_source.py b/metadata-ingestion/tests/unit/test_dbt_source.py
index 48a6fd0f65068..01d7a4809b01b 100644
--- a/metadata-ingestion/tests/unit/test_dbt_source.py
+++ b/metadata-ingestion/tests/unit/test_dbt_source.py
@@ -7,7 +7,10 @@
 
 from datahub.emitter import mce_builder
 from datahub.ingestion.api.common import PipelineContext
-from datahub.ingestion.source.dbt.dbt_cloud import DBTCloudConfig
+from datahub.ingestion.source.dbt.dbt_cloud import (
+    DBTCloudConfig,
+    infer_metadata_endpoint,
+)
 from datahub.ingestion.source.dbt.dbt_core import (
     DBTCoreConfig,
     DBTCoreSource,
@@ -366,7 +369,7 @@ def test_dbt_entity_emission_configuration_helpers():
 
 def test_dbt_cloud_config_access_url():
     config_dict = {
-        "access_url": "https://my-dbt-cloud.dbt.com",
+        "access_url": "https://emea.getdbt.com",
         "token": "dummy_token",
         "account_id": "123456",
         "project_id": "1234567",
@@ -375,8 +378,8 @@ def test_dbt_cloud_config_access_url():
         "target_platform": "dummy_platform",
     }
     config = DBTCloudConfig.parse_obj(config_dict)
-    assert config.access_url == "https://my-dbt-cloud.dbt.com"
-    assert config.metadata_endpoint == "https://metadata.my-dbt-cloud.dbt.com/graphql"
+    assert config.access_url == "https://emea.getdbt.com"
+    assert config.metadata_endpoint == "https://metadata.emea.getdbt.com/graphql"
 
 
 def test_dbt_cloud_config_with_defined_metadata_endpoint():
@@ -398,6 +401,20 @@ def test_dbt_cloud_config_with_defined_metadata_endpoint():
     )
 
 
+def test_infer_metadata_endpoint() -> None:
+    assert (
+        infer_metadata_endpoint("https://cloud.getdbt.com")
+        == "https://metadata.cloud.getdbt.com/graphql"
+    )
+    assert (
+        infer_metadata_endpoint("https://prefix.us1.dbt.com")
+        == "https://prefix.metadata.us1.dbt.com/graphql"
+    )
+    assert (
+        infer_metadata_endpoint("http://dbt.corp.internal")
+    ) == "http://metadata.dbt.corp.internal/graphql"
+
+
 def test_dbt_time_parsing() -> None:
     time_formats = [
         "2024-03-28T05:56:15.236210Z",