From fbd8a92c01b2419c9da21366f0e0a5e455a52dc6 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Sun, 17 Nov 2024 21:35:04 +0100 Subject: [PATCH 01/18] Add initial data-engineering template --- contrib/README.md | 43 ++++++++++ contrib/templates/README.md | 10 +++ contrib/templates/data-engineering/README.md | 10 +++ .../databricks_template_schema.json | 46 +++++++++++ .../{{.pipeline_name}}/explorations/README.md | 4 + .../explorations/exploration.ipynb.tmpl | 51 ++++++++++++ .../assets/{{.pipeline_name}}/main.py | 3 + .../{{.pipeline_name}}/sources/dev/taxis.py | 10 +++ .../{{.pipeline_name}}/sources/prod/taxis.py | 8 ++ .../tests/taxi_stats_test.py | 7 ++ .../transformations/__init__.py | 9 +++ .../transformations/taxi_stats.py | 23 ++++++ .../{{.pipeline_name}}.job.yml.tmpl | 24 ++++++ .../{{.pipeline_name}}.pipeline.yml.tmpl | 17 ++++ .../databricks_template_schema.json | 10 +++ .../job/databricks_template_schema.json | 10 +++ .../base/databricks_template_schema.json | 46 +++++++++++ .../.vscode/__builtins__.pyi | 3 + .../{{.project_name}}/.vscode/extensions.json | 7 ++ .../.vscode/settings.json.tmpl | 22 ++++++ .../template/{{.project_name}}/README.md.tmpl | 79 +++++++++++++++++++ .../{{.project_name}}/assets/README.md | 4 + .../template/{{.project_name}}/conftest.py | 36 +++++++++ .../{{.project_name}}/databricks.yml.tmpl | 50 ++++++++++++ .../template/{{.project_name}}/pyproject.toml | 22 ++++++ .../{{.project_name}}/scripts/add_asset.py | 46 +++++++++++ .../{{.project_name}}/scripts/test.py | 15 ++++ 27 files changed, 615 insertions(+) create mode 100644 contrib/README.md create mode 100644 contrib/templates/README.md create mode 100644 contrib/templates/data-engineering/README.md create mode 100644 contrib/templates/data-engineering/assets/etl-pipeline/databricks_template_schema.json create mode 100644 contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/README.md create mode 100644 
contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/exploration.ipynb.tmpl create mode 100644 contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/main.py create mode 100644 contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/dev/taxis.py create mode 100644 contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/prod/taxis.py create mode 100644 contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/tests/taxi_stats_test.py create mode 100644 contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/__init__.py create mode 100644 contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/taxi_stats.py create mode 100644 contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.job.yml.tmpl create mode 100644 contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl create mode 100644 contrib/templates/data-engineering/assets/ingest-pipeline/databricks_template_schema.json create mode 100644 contrib/templates/data-engineering/assets/job/databricks_template_schema.json create mode 100644 contrib/templates/data-engineering/base/databricks_template_schema.json create mode 100644 contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/__builtins__.pyi create mode 100644 contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/extensions.json create mode 100644 contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/settings.json.tmpl create mode 100644 contrib/templates/data-engineering/base/template/{{.project_name}}/README.md.tmpl create mode 100644 
contrib/templates/data-engineering/base/template/{{.project_name}}/assets/README.md create mode 100644 contrib/templates/data-engineering/base/template/{{.project_name}}/conftest.py create mode 100644 contrib/templates/data-engineering/base/template/{{.project_name}}/databricks.yml.tmpl create mode 100644 contrib/templates/data-engineering/base/template/{{.project_name}}/pyproject.toml create mode 100644 contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/add_asset.py create mode 100644 contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/test.py diff --git a/contrib/README.md b/contrib/README.md new file mode 100644 index 0000000..252d132 --- /dev/null +++ b/contrib/README.md @@ -0,0 +1,43 @@ +# Contrib Directory + +The `contrib` directory contains additional community-contributed examples and resources for Databricks Asset Bundles. These examples may include: + +- Custom configurations and extensions +- Advanced usage patterns +- Tools or utilities for enhancing Databricks Asset Bundles workflows + +## Structure + +Each contribution should be organized into its own subdirectory within `contrib/`. +Templates should go under `contrib/templates/`. For example: + +``` +contrib/ +├── awesome-bundle/ +│ ├── README.md +│ ├── databricks.yml +│ └── ... +└── templates/ + └── awesome-template/ + ├── README.md + ├── databricks_template_schema.json + ├── library/ + │ └── ... + └── template/ + └── ... +``` + +## How to Use Contributions + +To use or explore a contributed example, navigate to its subdirectory and follow the instructions in its `README.md` file. Each example should provide details on setup, configuration, and usage. + +## Contributing + +If you would like to add your own examples or resources, please: +1. Create a new directory under `contrib/` with a descriptive name. +2. Include a `README.md` file explaining the contribution. +3. Ensure that any necessary configuration files, scripts, or dependencies are included. 
+ +For more information on Databricks Asset Bundles, see: +- [Public Preview Announcement](https://www.databricks.com/blog/announcing-public-preview-databricks-asset-bundles-apply-software-development-best-practices) +- [Databricks Asset Bundles Documentation](https://docs.databricks.com/dev-tools/bundles/index.html) \ No newline at end of file diff --git a/contrib/templates/README.md b/contrib/templates/README.md new file mode 100644 index 0000000..38d1654 --- /dev/null +++ b/contrib/templates/README.md @@ -0,0 +1,10 @@ +# Contrib/Templates directory + +This directory contains community-contributed templates. + +See https://github.com/databricks/bundle-examples/blob/main/contrib/README.md for more +information about community contributions. + +Looking to contribute? See https://github.com/databricks/cli/tree/main/libs/template/templates +for inspiration. These are the standard templates that are included with the +Databricks CLI. diff --git a/contrib/templates/data-engineering/README.md b/contrib/templates/data-engineering/README.md new file mode 100644 index 0000000..e1f892b --- /dev/null +++ b/contrib/templates/data-engineering/README.md @@ -0,0 +1,10 @@ +# data-engineering template + +This template introduces a new structure for organizing data-engineering +assets in DABs. 
+ +Install it using + +``` +databricks bundle init https://github.com/databricks/bundle-examples/tree/main/contrib/templates/data-engineering/base +``` diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/databricks_template_schema.json b/contrib/templates/data-engineering/assets/etl-pipeline/databricks_template_schema.json new file mode 100644 index 0000000..774c5cb --- /dev/null +++ b/contrib/templates/data-engineering/assets/etl-pipeline/databricks_template_schema.json @@ -0,0 +1,46 @@ +{ + "welcome_message": "\nWelcome to the data-engineering pipeline template!", + "properties": { + "pipeline_name": { + "type": "string", + "description": "\nPlease provide the name of the pipeline to generate.\npipeline_name", + "default": "etl_pipeline", + "order": 1 + }, + "format": { + "type": "string", + "description": "\nPlease select the format to use to define this pipeline.\nformat", + "order": 2, + "enum": [ + "python files", + "sql files", + "notebooks" + ], + "default": "python files" + }, + "only_python_files_supported": { + "skip_prompt_if": { + "properties": { + "format": { + "pattern": "python files" + } + } + }, + "default": "ignored", + "type": "string", + "description": "{{fail Only Python files are supported in this template at this time.}}", + "order": 3 + }, + "include_job": { + "type": "string", + "description": "\nWould you like to include a job that automatically triggers this pipeline?\nThis trigger will only be enabled for production deployments.\ninclude_job", + "order": 4, + "enum": [ + "yes", + "no" + ], + "default": "yes" + } + }, + "success_message": "\n\n🪠 New pipeline definition generated under 'assets/{{.pipeline_name}}'!" 
+} \ No newline at end of file diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/README.md b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/README.md new file mode 100644 index 0000000..e6cfb81 --- /dev/null +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/README.md @@ -0,0 +1,4 @@ +# scratch + +This folder is reserved for personal, exploratory notebooks. +By default these are not committed to Git, as 'scratch' is listed in .gitignore. diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/exploration.ipynb.tmpl b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/exploration.ipynb.tmpl new file mode 100644 index 0000000..560703c --- /dev/null +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/exploration.ipynb.tmpl @@ -0,0 +1,51 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('../transformations')\n", + "\n", + "\n", + "spark = SparkSession.builder.getOrCreate()\n", + "spark.sql('SELECT * FROM taxi_stats').show()" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "ipynb-notebook", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": 
"3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/main.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/main.py new file mode 100644 index 0000000..72d3f5c --- /dev/null +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/main.py @@ -0,0 +1,3 @@ +# This is the entry point for the {{.pipeline_name}} pipeline. +# It makes sure all transformations in the transformations directory are included. +import transformations \ No newline at end of file diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/dev/taxis.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/dev/taxis.py new file mode 100644 index 0000000..ed2a28c --- /dev/null +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/dev/taxis.py @@ -0,0 +1,10 @@ +import dlt +from pyspark.sql import SparkSession, DataFrame + + +@dlt.view( + comment="Small set of taxis for development (uses LIMIT 10)" +) +def taxis() -> DataFrame: + spark = SparkSession.builder.getOrCreate() + return spark.sql("SELECT * FROM samples.nyctaxi.trips LIMIT 10") \ No newline at end of file diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/prod/taxis.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/prod/taxis.py new file mode 100644 index 0000000..6c22c80 --- /dev/null +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/prod/taxis.py @@ -0,0 +1,8 @@ +import dlt +from pyspark.sql import SparkSession, DataFrame + + +@dlt.view +def taxis() -> DataFrame: + spark = SparkSession.builder.getOrCreate() + return spark.sql("SELECT * 
FROM samples.nyctaxi.trips") \ No newline at end of file diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/tests/taxi_stats_test.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/tests/taxi_stats_test.py new file mode 100644 index 0000000..35daa25 --- /dev/null +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/tests/taxi_stats_test.py @@ -0,0 +1,7 @@ +from sources.dev.taxis import taxis +from transformations import taxi_stats + + +def test_taxi_stats(): + result = taxi_stats.filter_taxis(taxis()) + assert len(result.collect()) > 5 diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/__init__.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/__init__.py new file mode 100644 index 0000000..26b7072 --- /dev/null +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/__init__.py @@ -0,0 +1,9 @@ +# __init__.py defines the 'transformations' Python package +import importlib +import pkgutil + + +# Import all modules in the package except those starting with '_', like '__init__.py' +for _, module_name, _ in pkgutil.iter_modules(__path__): + if not module_name.startswith("_"): + importlib.import_module(f"{__name__}.{module_name}") \ No newline at end of file diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/taxi_stats.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/taxi_stats.py new file mode 100644 index 0000000..7c979fb --- /dev/null +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/taxi_stats.py @@ -0,0 +1,23 @@ +import dlt +from pyspark.sql.functions import 
to_date, count +from pyspark.sql import DataFrame + + +@dlt.table( + comment="Daily statistics of NYC Taxi trips" +) +def taxi_stats() -> DataFrame: + """ Read from the 'taxis' view from etl_pipeline/sources. """ + taxis = dlt.read("taxis") + + return filter_taxis(taxis) + + +def filter_taxis(taxis: DataFrame) -> DataFrame: + """ Group by date and calculate the number of trips. """ + return ( + taxis + .withColumn("pickup_date", to_date("tpep_pickup_datetime")) + .groupBy("pickup_date") + .agg(count("*").alias("number_of_trips")) + ) \ No newline at end of file diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.job.yml.tmpl b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.job.yml.tmpl new file mode 100644 index 0000000..a7af118 --- /dev/null +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.job.yml.tmpl @@ -0,0 +1,24 @@ +# The job that triggers {{.pipeline_name}}. 
+resources: + jobs: + {{.pipeline_name}}_job: + name: {{.pipeline_name}}_job + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + {{- if not is_service_principal}} + + email_notifications: + on_failure: + - {{user_name}} + + {{- end}} + + tasks: + - task_key: refresh_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.{{.pipeline_name}}.id} \ No newline at end of file diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl new file mode 100644 index 0000000..a1fba4b --- /dev/null +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl @@ -0,0 +1,17 @@ +resources: + pipelines: + {{.pipeline_name}}: + name: {{.pipeline_name}} + serverless: true + {{- if or (eq default_catalog "") (eq default_catalog "hive_metastore")}} + ## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog: + # catalog: catalog_name + {{- else}} + catalog: {{default_catalog}} + {{- end}} + target: {{.pipeline_name}}_${bundle.environment} + libraries: + - file: + path: sources/${bundle.target}/*.py + - file: + path: main.py \ No newline at end of file diff --git a/contrib/templates/data-engineering/assets/ingest-pipeline/databricks_template_schema.json b/contrib/templates/data-engineering/assets/ingest-pipeline/databricks_template_schema.json new file mode 100644 index 0000000..df21996 --- /dev/null +++ b/contrib/templates/data-engineering/assets/ingest-pipeline/databricks_template_schema.json @@ -0,0 +1,10 @@ +{ + "welcome_message": "\nWelcome to the data-engineering ingest-pipeline template!", + "properties": { + "pipeline_name": { + "type": 
"string", + "description": "\n{{fail \"The ingest-pipeline is not yet implemented.\"}}", + "order": 3 + } + } +} diff --git a/contrib/templates/data-engineering/assets/job/databricks_template_schema.json b/contrib/templates/data-engineering/assets/job/databricks_template_schema.json new file mode 100644 index 0000000..afcf5b6 --- /dev/null +++ b/contrib/templates/data-engineering/assets/job/databricks_template_schema.json @@ -0,0 +1,10 @@ +{ + "welcome_message": "\nWelcome to the data-engineering job resource template!", + "properties": { + "pipeline_name": { + "type": "string", + "description": "\n{{fail \"The ingest-pipeline is not yet implemented.\"}}", + "order": 3 + } + } +} diff --git a/contrib/templates/data-engineering/base/databricks_template_schema.json b/contrib/templates/data-engineering/base/databricks_template_schema.json new file mode 100644 index 0000000..debf4e1 --- /dev/null +++ b/contrib/templates/data-engineering/base/databricks_template_schema.json @@ -0,0 +1,46 @@ +{ + "welcome_message": "\nWelcome to the pipeline-folders template for Databricks Asset Bundles!", + "properties": { + "project_name": { + "type": "string", + "default": "my_data_project", + "description": "Please provide the following details to tailor the template to your preferences.\n\nUnique name for this project\nproject_name", + "order": 1, + "pattern": "^[A-Za-z0-9_]+$", + "pattern_match_failure_message": "Name must consist of letters, numbers, and underscores." + }, + "default_catalog": { + "type": "string", + "default": "{{default_catalog}}", + "pattern": "^\\w*$", + "pattern_match_failure_message": "Invalid catalog name.", + "description": "\nPlease provide an initial catalog{{if eq (default_catalog) \"\"}} (leave blank when not using Unity Catalog){{end}}.\ndefault_catalog", + "order": 2 + }, + "personal_schemas": { + "type": "string", + "description": "\nWould you like to use a personal schema for each user working on this project? 
(e.g., 'catalog.{{short_name}}')\npersonal_schemas", + "enum": [ + "yes, use a schema based on the current user name during development", + "no, use a shared schema during development" + ], + "order": 3 + }, + "shared_schema": { + "skip_prompt_if": { + "properties": { + "personal_schemas": { + "const": "yes, use a schema based on the current user name during development" + } + } + }, + "type": "string", + "default": "default", + "pattern": "^\\w+$", + "pattern_match_failure_message": "Invalid schema name.", + "description": "\nPlease provide an initial schema during development.\ndefault_schema", + "order": 4 + } + }, + "success_message": "\n\nYour new project has been created in the '{{.project_name}}' directory!\n\nTo add an example asset to your project, use\n\n $ databricks bundle init https://github.com/databricks/bundle-examples/tree/main/contrib/templates/data-engineering/assets/etl-pipeline\n\nRefer to the README.md file for full \"getting started\" instructions!" +} \ No newline at end of file diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/__builtins__.pyi b/contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/__builtins__.pyi new file mode 100644 index 0000000..0edd518 --- /dev/null +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/__builtins__.pyi @@ -0,0 +1,3 @@ +# Typings for Pylance in Visual Studio Code +# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md +from databricks.sdk.runtime import * diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/extensions.json b/contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/extensions.json new file mode 100644 index 0000000..5d15eba --- /dev/null +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "databricks.databricks", + "ms-python.vscode-pylance", + 
"redhat.vscode-yaml" + ] +} diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/settings.json.tmpl b/contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/settings.json.tmpl new file mode 100644 index 0000000..380587b --- /dev/null +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/settings.json.tmpl @@ -0,0 +1,22 @@ +{ + "python.analysis.stubPath": ".vscode", + "databricks.python.envFile": "${workspaceFolder}/.env", + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + {{- /* Unfortunately extraPaths doesn't support globs!! See: https://github.com/microsoft/pylance-release/issues/973 */ -}} + "python.analysis.extraPaths": ["assets/etl_pipeline"], + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + }, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + }, +} diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/README.md.tmpl b/contrib/templates/data-engineering/base/template/{{.project_name}}/README.md.tmpl new file mode 100644 index 0000000..e83e65c --- /dev/null +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/README.md.tmpl @@ -0,0 +1,79 @@ +# {{.project_name}} + +The '{{.project_name}}' project was generated by using the data-engineering template. + +## Setup + +1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +2. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks auth login + ``` + +3. 
We recommend the UV package manager to install project dependencies. It's a drop-in replacement for `pip`. + See https://docs.astral.sh/uv/getting-started/installation/ for full installation instructions, + or run: + ``` + $ pip install uv + ``` + +4. Install all project dependencies: + ``` + $ uv sync + ``` + + See the "Running unit tests" section below for more on testing. + +5. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from + https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from + https://www.databricks.com/blog/announcing-pycharm-integration-databricks. + +## Adding assets such as pipelines and jobs + +By default, the data-engineering template does not include any assets. + +1. To add an asset, run the `add-asset` script: + ``` + $ uv run add-asset + ``` + +2. Optionally, run all tests on serverless compute after adding an asset: + ``` + $ uv run test + ``` + +## Deploying assets + +1. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + +2. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + +3. Use the "summary" command to review everything that was deployed: + ``` + $ databricks bundle summary + ``` + +4. To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` + +## Running unit tests + +1. Run tests on a serverless environment using: + ``` + $ uv run test + ``` + +2. 
Optionally, to run unit tests in a different environment, such as on a cluster, + please refer to the documentation of DB connect at + https://docs.databricks.com/en/dev-tools/databricks-connect/python/install.html diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/assets/README.md b/contrib/templates/data-engineering/base/template/{{.project_name}}/assets/README.md new file mode 100644 index 0000000..f6c8907 --- /dev/null +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/assets/README.md @@ -0,0 +1,4 @@ +This folder is reserved for Databricks Asset Bundles definitions. + +New jobs and pipelines should follow conventions from the 'data-engineering' template. +See https://github.com/databricks/bundle-examples/blob/main/contrib/templates/data-engineering/README.md. diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/conftest.py b/contrib/templates/data-engineering/base/template/{{.project_name}}/conftest.py new file mode 100644 index 0000000..4a49bb1 --- /dev/null +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/conftest.py @@ -0,0 +1,36 @@ +# conftest.py is used to configure pytest +import os +import sys +import dlt +import pathlib +import pytest +import warnings +from pyspark.sql import SparkSession +from databricks.connect import DatabricksSession + +# Dynamically find and add all `assets/*` directories to `sys.path` +for path in pathlib.Path("assets").glob("*"): + resolved_path = str(path.resolve()) + if resolved_path not in sys.path: + sys.path.append(resolved_path) + +# Work around issues in older databricks-connect +SparkSession.builder = DatabricksSession.builder +os.environ.pop("SPARK_REMOTE", None) + +# Make dlt.views in 'sources/dev' available for tests +warnings.filterwarnings( + "ignore", + message="This is a stub that only contains the interfaces to Delta Live Tables.*", + category=UserWarning, +) +dlt.enable_local_execution() +dlt.view = lambda func=None, 
*args, **kwargs: func or (lambda f: f) + + +# Provide a 'spark' fixture for tests and make sure the session is eagerly initialized +@pytest.fixture(scope="session", autouse=True) +def spark() -> SparkSession: + if hasattr(DatabricksSession.builder, "validateSession"): + return DatabricksSession.builder.validateSession().getOrCreate() + return DatabricksSession.builder.getOrCreate() diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/databricks.yml.tmpl b/contrib/templates/data-engineering/base/template/{{.project_name}}/databricks.yml.tmpl new file mode 100644 index 0000000..420dded --- /dev/null +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/databricks.yml.tmpl @@ -0,0 +1,50 @@ +# This is a Databricks asset bundle definition for {{.project_name}}. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: {{.project_name}} + +include: + - assets/*.yml + - assets/*/*.yml + +variables: + catalog: + description: The catalog to use + schema: + description: The schema to use + +{{- $dev_schema := .shared_schema }} +{{- $prod_schema := .shared_schema }} +{{- if (regexp "^yes").MatchString .personal_schemas}} + {{- $dev_schema = "${workspace.current_user.short_name}"}} + {{- $prod_schema = "default"}} +{{- end}} + +targets: + dev: + # The default target uses 'mode: development' to create a development copy. + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. + mode: development + default: true + workspace: + host: {{workspace_host}} + variables: + catalog: {{.default_catalog}} + schema: {{$dev_schema}} + + prod: + mode: production + workspace: + host: {{workspace_host}} + # We explicitly specify /Workspace/Users/{{user_name}} to make sure we only have a single copy. 
+ root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target} + permissions: + - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}} + level: CAN_MANAGE + run_as: + {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}} + variables: + catalog: {{.default_catalog}} + schema: {{$prod_schema}} diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/pyproject.toml b/contrib/templates/data-engineering/base/template/{{.project_name}}/pyproject.toml new file mode 100644 index 0000000..b04668c --- /dev/null +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/pyproject.toml @@ -0,0 +1,22 @@ +[project] +name = "my_data_project" +version = "0.1.0" +description = "Databricks ETL pipeline project" +requires-python = ">=3.10" +dependencies = [ + "databricks-dlt", + "pytest", + "setuptools", + "wheel", + "databricks-connect==15.1.*", +] + +[project.scripts] +add-asset = "scripts.add_asset:main" +test = "scripts.test:main" + +[tool.uv] +package = true + +[tool.setuptools.packages.find] +include = ["scripts"] \ No newline at end of file diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/add_asset.py b/contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/add_asset.py new file mode 100644 index 0000000..34c0f13 --- /dev/null +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/add_asset.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +# +# add_asset.py is used to initialize a new asset from the data-engineering template. 
+# +import sys +import subprocess +from typing import Literal + +VALID_ASSETS = ["etl-pipeline", "job", "ingest-pipeline"] +AssetType = Literal["etl-pipeline", "job", "ingest-pipeline"] + + +def init_bundle(asset_type: AssetType) -> None: + cmd = f"databricks bundle init ~/projects/bundle-examples/contrib/templates/data-engineering/assets/{asset_type}" + subprocess.run(cmd, shell=True) + + +def show_menu() -> AssetType: + print("\nSelect asset type to initialize:") + for i, asset in enumerate(VALID_ASSETS, 1): + print(f"{i}. {asset}") + + while True: + try: + choice = int(input("\nEnter number (1-3): ")) + if 1 <= choice <= len(VALID_ASSETS): + return VALID_ASSETS[choice - 1] + print("Invalid choice. Please try again.") + except ValueError: + print("Please enter a number.") + + +def main(): + if len(sys.argv) > 1: + asset_type = sys.argv[1] + if asset_type not in VALID_ASSETS: + print(f"Error: Asset type must be one of {VALID_ASSETS}") + sys.exit(1) + else: + asset_type = show_menu() + + init_bundle(asset_type) + + +if __name__ == "__main__": + main() diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/test.py b/contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/test.py new file mode 100644 index 0000000..62d8dd3 --- /dev/null +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/test.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +# +# test.py runs the unit tests for this project using pytest. 
+# +import os +import subprocess + + +def main(): + os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" + subprocess.run(["pytest"], check=True) + + +if __name__ == "__main__": + main() From 54762a7b13e5d8c47c6061f3216e636b6dc371ad Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Mon, 18 Nov 2024 09:27:20 +0100 Subject: [PATCH 02/18] Fix init calls --- contrib/templates/data-engineering/README.md | 2 +- .../data-engineering/base/databricks_template_schema.json | 2 +- .../base/template/{{.project_name}}/README.md.tmpl | 7 +++++++ .../base/template/{{.project_name}}/scripts/add_asset.py | 2 +- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/contrib/templates/data-engineering/README.md b/contrib/templates/data-engineering/README.md index e1f892b..c59d450 100644 --- a/contrib/templates/data-engineering/README.md +++ b/contrib/templates/data-engineering/README.md @@ -6,5 +6,5 @@ assets in DABs. Install it using ``` -databricks bundle init https://github.com/databricks/bundle-examples/tree/main/contrib/templates/data-engineering/base +databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/base ``` diff --git a/contrib/templates/data-engineering/base/databricks_template_schema.json b/contrib/templates/data-engineering/base/databricks_template_schema.json index debf4e1..018f5ad 100644 --- a/contrib/templates/data-engineering/base/databricks_template_schema.json +++ b/contrib/templates/data-engineering/base/databricks_template_schema.json @@ -42,5 +42,5 @@ "order": 4 } }, - "success_message": "\n\nYour new project has been created in the '{{.project_name}}' directory!\n\nTo add an example asset to your project, use\n\n $ databricks bundle init https://github.com/databricks/bundle-examples/tree/main/contrib/templates/data-engineering/assets/etl-pipeline\n\nRefer to the README.md file for full \"getting started\" instructions!" 
+ "success_message": "\n\nYour new project has been created in the '{{.project_name}}' directory!\n\nTo add an example asset to your project, use\n\n $ cd ${{.project_name}}\n $ databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/etl-pipeline\n\nRefer to the README.md file for full \"getting started\" instructions!" } \ No newline at end of file diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/README.md.tmpl b/contrib/templates/data-engineering/base/template/{{.project_name}}/README.md.tmpl index e83e65c..cadd25c 100644 --- a/contrib/templates/data-engineering/base/template/{{.project_name}}/README.md.tmpl +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/README.md.tmpl @@ -38,6 +38,13 @@ By default, the data-engineering template does not include any assets. $ uv run add-asset ``` + or, if you don't use UV, use + + ``` + $ export TYPE=etl-pipeline + $ databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/$TYPE + ``` + 2. 
Optionally, run all tests on serverless compute after adding an asset: ``` $ uv run test diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/add_asset.py b/contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/add_asset.py index 34c0f13..931db61 100644 --- a/contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/add_asset.py +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/add_asset.py @@ -11,7 +11,7 @@ def init_bundle(asset_type: AssetType) -> None: - cmd = f"databricks bundle init ~/projects/bundle-examples/contrib/templates/data-engineering/assets/{asset_type}" + cmd = f"databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/{asset_type}" subprocess.run(cmd, shell=True) From 596ece0174b405e4276d11e97ca9580a47b6b11c Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Mon, 18 Nov 2024 11:50:45 +0100 Subject: [PATCH 03/18] Add --branch parameters for now --- contrib/templates/data-engineering/README.md | 2 +- .../data-engineering/base/databricks_template_schema.json | 2 +- .../base/template/{{.project_name}}/scripts/add_asset.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/templates/data-engineering/README.md b/contrib/templates/data-engineering/README.md index c59d450..ec5206c 100644 --- a/contrib/templates/data-engineering/README.md +++ b/contrib/templates/data-engineering/README.md @@ -6,5 +6,5 @@ assets in DABs. 
Install it using ``` -databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/base +databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/base --branch data-engineering ``` diff --git a/contrib/templates/data-engineering/base/databricks_template_schema.json b/contrib/templates/data-engineering/base/databricks_template_schema.json index 018f5ad..c94c0fb 100644 --- a/contrib/templates/data-engineering/base/databricks_template_schema.json +++ b/contrib/templates/data-engineering/base/databricks_template_schema.json @@ -42,5 +42,5 @@ "order": 4 } }, - "success_message": "\n\nYour new project has been created in the '{{.project_name}}' directory!\n\nTo add an example asset to your project, use\n\n $ cd ${{.project_name}}\n $ databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/etl-pipeline\n\nRefer to the README.md file for full \"getting started\" instructions!" + "success_message": "\n\nYour new project has been created in the '{{.project_name}}' directory!\n\nTo add an example asset to your project, use\n\n $ cd ${{.project_name}}\n $ databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/etl-pipeline --branch data-engineering\n\nRefer to the README.md file for full \"getting started\" instructions!" 
} \ No newline at end of file diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/add_asset.py b/contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/add_asset.py index 931db61..80cac32 100644 --- a/contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/add_asset.py +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/add_asset.py @@ -11,7 +11,7 @@ def init_bundle(asset_type: AssetType) -> None: - cmd = f"databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/{asset_type}" + cmd = f"databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/{asset_type} --branch data-engineering" subprocess.run(cmd, shell=True) From 448c4f3b6319eb23ab4fb78cd7b0c093023a4863 Mon Sep 17 00:00:00 2001 From: "Lennart Kats (databricks)" Date: Mon, 18 Nov 2024 13:42:53 +0100 Subject: [PATCH 04/18] Update contrib/templates/data-engineering/base/databricks_template_schema.json Co-authored-by: Pieter Noordhuis --- .../data-engineering/base/databricks_template_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/templates/data-engineering/base/databricks_template_schema.json b/contrib/templates/data-engineering/base/databricks_template_schema.json index c94c0fb..6769871 100644 --- a/contrib/templates/data-engineering/base/databricks_template_schema.json +++ b/contrib/templates/data-engineering/base/databricks_template_schema.json @@ -42,5 +42,5 @@ "order": 4 } }, - "success_message": "\n\nYour new project has been created in the '{{.project_name}}' directory!\n\nTo add an example asset to your project, use\n\n $ cd ${{.project_name}}\n $ databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/etl-pipeline --branch data-engineering\n\nRefer to the 
README.md file for full \"getting started\" instructions!" + "success_message": "\n\nYour new project has been created in the '{{.project_name}}' directory!\n\nTo add an example asset to your project, use\n\n $ cd {{.project_name}}\n $ databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/etl-pipeline --branch data-engineering\n\nRefer to the README.md file for full \"getting started\" instructions!" } \ No newline at end of file From 5493ac42568015caa5371ec5dc07a4a09a2cdf8e Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Thu, 21 Nov 2024 10:15:49 +0100 Subject: [PATCH 05/18] Use vars for catalogs/schemas --- .../{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl index a1fba4b..eeedcbe 100644 --- a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl @@ -5,11 +5,11 @@ resources: serverless: true {{- if or (eq default_catalog "") (eq default_catalog "hive_metastore")}} ## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog: - # catalog: catalog_name + # catalog: ${var.catalog} {{- else}} - catalog: {{default_catalog}} + catalog: ${var.catalog} {{- end}} - target: {{.pipeline_name}}_${bundle.environment} + target: ${var.schema} libraries: - file: path: sources/${bundle.target}/*.py From e15ceb1c6b30ffda859038af4892386537c6b46f Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 22 Nov 2024 10:23:30 +0100 Subject: [PATCH 06/18] 
Update based on feedback --- contrib/README.md | 4 ++-- .../assets/etl-pipeline/databricks_template_schema.json | 2 +- .../assets/{{.pipeline_name}}/explorations/README.md | 4 ++-- .../etl-pipeline/template/assets/{{.pipeline_name}}/main.py | 3 ++- .../{{.pipeline_name}}/{{.pipeline_name}}.job.yml.tmpl | 3 +-- .../base/template/{{.project_name}}/conftest.py | 5 +++-- .../base/template/{{.project_name}}/databricks.yml.tmpl | 5 ++++- .../base/template/{{.project_name}}/pyproject.toml | 2 +- 8 files changed, 16 insertions(+), 12 deletions(-) diff --git a/contrib/README.md b/contrib/README.md index 252d132..31f7d01 100644 --- a/contrib/README.md +++ b/contrib/README.md @@ -39,5 +39,5 @@ If you would like to add your own examples or resources, please: 3. Ensure that any necessary configuration files, scripts, or dependencies are included. For more information on Databricks Asset Bundles, see: -- [Public Preview Announcement](https://www.databricks.com/blog/announcing-public-preview-databricks-asset-bundles-apply-software-development-best-practices) -- [Databricks Asset Bundles Documentation](https://docs.databricks.com/dev-tools/bundles/index.html) \ No newline at end of file +- The launch blog post at https://www.databricks.com/blog/announcing-general-availability-databricks-asset-bundles +- The docs at https://docs.databricks.com/dev-tools/bundles/index.html diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/databricks_template_schema.json b/contrib/templates/data-engineering/assets/etl-pipeline/databricks_template_schema.json index 774c5cb..a49171a 100644 --- a/contrib/templates/data-engineering/assets/etl-pipeline/databricks_template_schema.json +++ b/contrib/templates/data-engineering/assets/etl-pipeline/databricks_template_schema.json @@ -28,7 +28,7 @@ }, "default": "ignored", "type": "string", - "description": "{{fail Only Python files are supported in this template at this time.}}", + "description": "{{fail \"Only Python files are supported in 
this template at this time.\"}}", "order": 3 }, "include_job": { diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/README.md b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/README.md index e6cfb81..7292d7f 100644 --- a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/README.md +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/README.md @@ -1,4 +1,4 @@ -# scratch +# explorations This folder is reserved for personal, exploratory notebooks. -By default these are not committed to Git, as 'scratch' is listed in .gitignore. +By default these are not committed to Git, as 'explorations' is listed in .gitignore. diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/main.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/main.py index 72d3f5c..ad6b5f8 100644 --- a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/main.py +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/main.py @@ -1,3 +1,4 @@ # This is the entry point for the {{.pipeline_name}} pipeline. # It makes sure all transformations in the transformations directory are included. 
-import transformations \ No newline at end of file +import transformations +__all__ = ["transformations"] \ No newline at end of file diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.job.yml.tmpl b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.job.yml.tmpl index a7af118..a75b746 100644 --- a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.job.yml.tmpl +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.job.yml.tmpl @@ -13,8 +13,7 @@ resources: {{- if not is_service_principal}} email_notifications: - on_failure: - - {{user_name}} + on_failure: ${var.notifications} {{- end}} diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/conftest.py b/contrib/templates/data-engineering/base/template/{{.project_name}}/conftest.py index 4a49bb1..2602fb5 100644 --- a/contrib/templates/data-engineering/base/template/{{.project_name}}/conftest.py +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/conftest.py @@ -1,4 +1,5 @@ -# conftest.py is used to configure pytest +# conftest.py is used to configure pytest. +# This file is in the root since it affects all tests through this bundle. 
import os import sys import dlt @@ -9,7 +10,7 @@ from databricks.connect import DatabricksSession # Dynamically find and add all `assets/*` directories to `sys.path` -for path in pathlib.Path("assets").glob("*"): +for path in pathlib.Path(pathlib.Path(__file__).parent / "assets").glob("*"): resolved_path = str(path.resolve()) if resolved_path not in sys.path: sys.path.append(resolved_path) diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/databricks.yml.tmpl b/contrib/templates/data-engineering/base/template/{{.project_name}}/databricks.yml.tmpl index 420dded..d988fcc 100644 --- a/contrib/templates/data-engineering/base/template/{{.project_name}}/databricks.yml.tmpl +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/databricks.yml.tmpl @@ -12,6 +12,8 @@ variables: description: The catalog to use schema: description: The schema to use + notifications: + description: The email addresses to use for failure notifications {{- $dev_schema := .shared_schema }} {{- $prod_schema := .shared_schema }} @@ -33,7 +35,7 @@ targets: variables: catalog: {{.default_catalog}} schema: {{$dev_schema}} - + notifications: [] prod: mode: production workspace: @@ -48,3 +50,4 @@ targets: variables: catalog: {{.default_catalog}} schema: {{$prod_schema}} + notifications: [{{user_name}}] \ No newline at end of file diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/pyproject.toml b/contrib/templates/data-engineering/base/template/{{.project_name}}/pyproject.toml index b04668c..25cc305 100644 --- a/contrib/templates/data-engineering/base/template/{{.project_name}}/pyproject.toml +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/pyproject.toml @@ -2,7 +2,7 @@ name = "my_data_project" version = "0.1.0" description = "Databricks ETL pipeline project" -requires-python = ">=3.10" +requires-python = "==3.11.*" dependencies = [ "databricks-dlt", "pytest", From 
9b87093316fcdb07bb1e071efd6903100abeedf8 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 22 Nov 2024 10:27:33 +0100 Subject: [PATCH 07/18] Fix newline --- .../base/template/{{.project_name}}/.vscode/settings.json.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/settings.json.tmpl b/contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/settings.json.tmpl index 380587b..2f753e8 100644 --- a/contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/settings.json.tmpl +++ b/contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/settings.json.tmpl @@ -8,7 +8,7 @@ ], "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true, - {{- /* Unfortunately extraPaths doesn't support globs!! See: https://github.com/microsoft/pylance-release/issues/973 */ -}} + {{- /* Unfortunately extraPaths doesn't support globs!! See: https://github.com/microsoft/pylance-release/issues/973 */}} "python.analysis.extraPaths": ["assets/etl_pipeline"], "files.exclude": { "**/*.egg-info": true, From 3141102d6fe5e5ec4feb22c8a71241c74619c686 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Sun, 24 Nov 2024 17:21:09 +0100 Subject: [PATCH 08/18] Move template one folder up --- contrib/templates/data-engineering/README.md | 5 ++++- .../{base => }/databricks_template_schema.json | 0 .../template/{{.project_name}}/.vscode/__builtins__.pyi | 0 .../template/{{.project_name}}/.vscode/extensions.json | 0 .../template/{{.project_name}}/.vscode/settings.json.tmpl | 0 .../{base => }/template/{{.project_name}}/README.md.tmpl | 0 .../{base => }/template/{{.project_name}}/assets/README.md | 0 .../{base => }/template/{{.project_name}}/conftest.py | 0 .../template/{{.project_name}}/databricks.yml.tmpl | 0 .../{base => }/template/{{.project_name}}/pyproject.toml | 2 +- .../template/{{.project_name}}/scripts/add_asset.py | 0 .../{base => 
}/template/{{.project_name}}/scripts/test.py | 0 12 files changed, 5 insertions(+), 2 deletions(-) rename contrib/templates/data-engineering/{base => }/databricks_template_schema.json (100%) rename contrib/templates/data-engineering/{base => }/template/{{.project_name}}/.vscode/__builtins__.pyi (100%) rename contrib/templates/data-engineering/{base => }/template/{{.project_name}}/.vscode/extensions.json (100%) rename contrib/templates/data-engineering/{base => }/template/{{.project_name}}/.vscode/settings.json.tmpl (100%) rename contrib/templates/data-engineering/{base => }/template/{{.project_name}}/README.md.tmpl (100%) rename contrib/templates/data-engineering/{base => }/template/{{.project_name}}/assets/README.md (100%) rename contrib/templates/data-engineering/{base => }/template/{{.project_name}}/conftest.py (100%) rename contrib/templates/data-engineering/{base => }/template/{{.project_name}}/databricks.yml.tmpl (100%) rename contrib/templates/data-engineering/{base => }/template/{{.project_name}}/pyproject.toml (92%) rename contrib/templates/data-engineering/{base => }/template/{{.project_name}}/scripts/add_asset.py (100%) rename contrib/templates/data-engineering/{base => }/template/{{.project_name}}/scripts/test.py (100%) diff --git a/contrib/templates/data-engineering/README.md b/contrib/templates/data-engineering/README.md index ec5206c..19df04b 100644 --- a/contrib/templates/data-engineering/README.md +++ b/contrib/templates/data-engineering/README.md @@ -6,5 +6,8 @@ assets in DABs. Install it using ``` -databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/base --branch data-engineering +databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering --branch data-engineering ``` + +Note that by default this template doesn't come with any assets such as jobs or pipelines. 
+Follow the instructions in the template setup and README to add them. \ No newline at end of file diff --git a/contrib/templates/data-engineering/base/databricks_template_schema.json b/contrib/templates/data-engineering/databricks_template_schema.json similarity index 100% rename from contrib/templates/data-engineering/base/databricks_template_schema.json rename to contrib/templates/data-engineering/databricks_template_schema.json diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/__builtins__.pyi b/contrib/templates/data-engineering/template/{{.project_name}}/.vscode/__builtins__.pyi similarity index 100% rename from contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/__builtins__.pyi rename to contrib/templates/data-engineering/template/{{.project_name}}/.vscode/__builtins__.pyi diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/extensions.json b/contrib/templates/data-engineering/template/{{.project_name}}/.vscode/extensions.json similarity index 100% rename from contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/extensions.json rename to contrib/templates/data-engineering/template/{{.project_name}}/.vscode/extensions.json diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/settings.json.tmpl b/contrib/templates/data-engineering/template/{{.project_name}}/.vscode/settings.json.tmpl similarity index 100% rename from contrib/templates/data-engineering/base/template/{{.project_name}}/.vscode/settings.json.tmpl rename to contrib/templates/data-engineering/template/{{.project_name}}/.vscode/settings.json.tmpl diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/README.md.tmpl b/contrib/templates/data-engineering/template/{{.project_name}}/README.md.tmpl similarity index 100% rename from contrib/templates/data-engineering/base/template/{{.project_name}}/README.md.tmpl rename to 
contrib/templates/data-engineering/template/{{.project_name}}/README.md.tmpl diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/assets/README.md b/contrib/templates/data-engineering/template/{{.project_name}}/assets/README.md similarity index 100% rename from contrib/templates/data-engineering/base/template/{{.project_name}}/assets/README.md rename to contrib/templates/data-engineering/template/{{.project_name}}/assets/README.md diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/conftest.py b/contrib/templates/data-engineering/template/{{.project_name}}/conftest.py similarity index 100% rename from contrib/templates/data-engineering/base/template/{{.project_name}}/conftest.py rename to contrib/templates/data-engineering/template/{{.project_name}}/conftest.py diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/databricks.yml.tmpl b/contrib/templates/data-engineering/template/{{.project_name}}/databricks.yml.tmpl similarity index 100% rename from contrib/templates/data-engineering/base/template/{{.project_name}}/databricks.yml.tmpl rename to contrib/templates/data-engineering/template/{{.project_name}}/databricks.yml.tmpl diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/pyproject.toml b/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml similarity index 92% rename from contrib/templates/data-engineering/base/template/{{.project_name}}/pyproject.toml rename to contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml index 25cc305..909faaa 100644 --- a/contrib/templates/data-engineering/base/template/{{.project_name}}/pyproject.toml +++ b/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml @@ -2,7 +2,7 @@ name = "my_data_project" version = "0.1.0" description = "Databricks ETL pipeline project" -requires-python = "==3.11.*" +requires-python = "==3.10.*" dependencies = [ 
"databricks-dlt", "pytest", diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/add_asset.py b/contrib/templates/data-engineering/template/{{.project_name}}/scripts/add_asset.py similarity index 100% rename from contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/add_asset.py rename to contrib/templates/data-engineering/template/{{.project_name}}/scripts/add_asset.py diff --git a/contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/test.py b/contrib/templates/data-engineering/template/{{.project_name}}/scripts/test.py similarity index 100% rename from contrib/templates/data-engineering/base/template/{{.project_name}}/scripts/test.py rename to contrib/templates/data-engineering/template/{{.project_name}}/scripts/test.py From eca8f28f92caca0b72553ee3237a0b401990527e Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Mon, 25 Nov 2024 10:51:57 +0100 Subject: [PATCH 09/18] Add workaround for databricks-dlt conflict --- .../template/{{.project_name}}/pyproject.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml b/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml index 909faaa..a498f86 100644 --- a/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml +++ b/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml @@ -17,6 +17,11 @@ test = "scripts.test:main" [tool.uv] package = true +override-dependencies = [ + # work around conflict with older databricks-dlt libraries + # pyspark is already installed by databricks-connect + "pyspark; sys_platform == 'never'", +] [tool.setuptools.packages.find] include = ["scripts"] \ No newline at end of file From 80a2e025d77b70a95eb3c5cb9208a2af28bfb43e Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Thu, 28 Nov 2024 12:35:01 +0100 Subject: [PATCH 10/18] Update based on reviewer feedback --- 
.vscode/settings.json | 22 +++++++++++++++++++ .../explorations/exploration.ipynb.tmpl | 1 + .../assets/{{.pipeline_name}}/main.py | 3 ++- .../databricks_template_schema.json | 4 ++-- .../job/databricks_template_schema.json | 4 ++-- .../template/{{.project_name}}/conftest.py | 5 ++++- .../template/{{.project_name}}/pyproject.toml | 2 -- 7 files changed, 33 insertions(+), 8 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..02d2535 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,22 @@ +{ + "python.analysis.stubPath": ".vscode", + "databricks.python.envFile": "${workspaceFolder}/.env", + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + }, + "debugpy.debugJustMyCode": false, + "editor.formatOnSave": true, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + } +} \ No newline at end of file diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/exploration.ipynb.tmpl b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/exploration.ipynb.tmpl index 560703c..14b9859 100644 --- a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/exploration.ipynb.tmpl +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/exploration.ipynb.tmpl @@ -19,6 +19,7 @@ "source": [ "import sys\n", 
"sys.path.append('../transformations')\n", + "from transformations import taxi_stats\n", "\n", "\n", "spark = SparkSession.builder.getOrCreate()\n", diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/main.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/main.py index ad6b5f8..67f4c4c 100644 --- a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/main.py +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/main.py @@ -1,4 +1,5 @@ # This is the entry point for the {{.pipeline_name}} pipeline. # It makes sure all transformations in the transformations directory are included. import transformations -__all__ = ["transformations"] \ No newline at end of file + +__all__ = ["transformations"] diff --git a/contrib/templates/data-engineering/assets/ingest-pipeline/databricks_template_schema.json b/contrib/templates/data-engineering/assets/ingest-pipeline/databricks_template_schema.json index df21996..0f7dddd 100644 --- a/contrib/templates/data-engineering/assets/ingest-pipeline/databricks_template_schema.json +++ b/contrib/templates/data-engineering/assets/ingest-pipeline/databricks_template_schema.json @@ -3,8 +3,8 @@ "properties": { "pipeline_name": { "type": "string", - "description": "\n{{fail \"The ingest-pipeline is not yet implemented.\"}}", + "description": "\n{{fail \"The ingest-pipeline template is not yet implemented.\"}}", "order": 3 } } -} +} \ No newline at end of file diff --git a/contrib/templates/data-engineering/assets/job/databricks_template_schema.json b/contrib/templates/data-engineering/assets/job/databricks_template_schema.json index afcf5b6..5e0d4b9 100644 --- a/contrib/templates/data-engineering/assets/job/databricks_template_schema.json +++ b/contrib/templates/data-engineering/assets/job/databricks_template_schema.json @@ -3,8 +3,8 @@ "properties": { "pipeline_name": { 
"type": "string", - "description": "\n{{fail \"The ingest-pipeline is not yet implemented.\"}}", + "description": "\n{{fail \"The job template is not yet implemented.\"}}", "order": 3 } } -} +} \ No newline at end of file diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/conftest.py b/contrib/templates/data-engineering/template/{{.project_name}}/conftest.py index 2602fb5..2b7f5db 100644 --- a/contrib/templates/data-engineering/template/{{.project_name}}/conftest.py +++ b/contrib/templates/data-engineering/template/{{.project_name}}/conftest.py @@ -1,5 +1,7 @@ # conftest.py is used to configure pytest. # This file is in the root since it affects all tests through this bundle. +# It makes sure all 'assets/*' directories are added to `sys.path` so that +# tests can import them. import os import sys import dlt @@ -15,7 +17,8 @@ if resolved_path not in sys.path: sys.path.append(resolved_path) -# Work around issues in older databricks-connect +# For older databricks-connect, work around issues importing SparkSession +# and errors when SPARK_REMOTE is set. 
SparkSession.builder = DatabricksSession.builder os.environ.pop("SPARK_REMOTE", None) diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml b/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml index a498f86..8e98865 100644 --- a/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml +++ b/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml @@ -6,8 +6,6 @@ requires-python = "==3.10.*" dependencies = [ "databricks-dlt", "pytest", - "setuptools", - "wheel", "databricks-connect==15.1.*", ] From d2ea026ea0ae1b9afe7a39a53b56a7bb3f8e9497 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Thu, 28 Nov 2024 12:37:26 +0100 Subject: [PATCH 11/18] Remove workaround for older databricks-dlt --- .../template/{{.project_name}}/pyproject.toml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml b/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml index 8e98865..966b1ab 100644 --- a/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml +++ b/contrib/templates/data-engineering/template/{{.project_name}}/pyproject.toml @@ -15,11 +15,6 @@ test = "scripts.test:main" [tool.uv] package = true -override-dependencies = [ - # work around conflict with older databricks-dlt libraries - # pyspark is already installed by databricks-connect - "pyspark; sys_platform == 'never'", -] [tool.setuptools.packages.find] include = ["scripts"] \ No newline at end of file From 56cb31701789613aa241d89dda8630dc8594c169 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Thu, 28 Nov 2024 12:39:30 +0100 Subject: [PATCH 12/18] Extend description --- .../template/{{.project_name}}/scripts/test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/scripts/test.py 
b/contrib/templates/data-engineering/template/{{.project_name}}/scripts/test.py
index 62d8dd3..4748c81 100644
--- a/contrib/templates/data-engineering/template/{{.project_name}}/scripts/test.py
+++ b/contrib/templates/data-engineering/template/{{.project_name}}/scripts/test.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3
 #
-# test.py runs the unit tests for this project using pytest.
+# test.py runs the unit tests for this project using pytest and serverless compute.
+# To use a different form of compute, instead use 'uv run pytest' or
+# use your IDE's testing panel. When using VS Code, consider using the Databricks extension.
 #
 import os
 import subprocess

From 93cc57ab6865d03e465113a14014a2526bc0aa35 Mon Sep 17 00:00:00 2001
From: Lennart Kats
Date: Thu, 28 Nov 2024 13:54:06 +0100
Subject: [PATCH 13/18] Update .gitignore

---
 .../template/{{.project_name}}/.gitignore | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 contrib/templates/data-engineering/template/{{.project_name}}/.gitignore

diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/.gitignore b/contrib/templates/data-engineering/template/{{.project_name}}/.gitignore
new file mode 100644
index 0000000..f6a3b5f
--- /dev/null
+++ b/contrib/templates/data-engineering/template/{{.project_name}}/.gitignore
@@ -0,0 +1,8 @@
+.databricks/
+build/
+dist/
+__pycache__/
+*.egg-info
+.venv/
+**/explorations/**
+!**/explorations/README.md

From 6a615bb7fee69ae8545640382408a71c5030f81d Mon Sep 17 00:00:00 2001
From: Lennart Kats
Date: Sat, 30 Nov 2024 10:45:26 +0100
Subject: [PATCH 14/18] Rework package structure

---
 .../{{.pipeline_name}}/{main.py => __init__.py} | 0
 .../explorations/exploration.ipynb.tmpl | 4 ++--
 .../{{.pipeline_name}}/sources/dev/taxis.py | 10 ++++------
 .../{{.pipeline_name}}/sources/prod/taxis.py | 6 +++---
 .../{{.pipeline_name}}/tests/taxi_stats_test.py | 4 ++--
 .../transformations/__init__.py | 2 +-
 .../transformations/taxi_stats.py | 15 ++++++---------
.../{{.pipeline_name}}.pipeline.yml.tmpl | 2 +- 8 files changed, 19 insertions(+), 24 deletions(-) rename contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{main.py => __init__.py} (100%) diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/main.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/__init__.py similarity index 100% rename from contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/main.py rename to contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/__init__.py diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/exploration.ipynb.tmpl b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/exploration.ipynb.tmpl index 14b9859..ef1f017 100644 --- a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/exploration.ipynb.tmpl +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/explorations/exploration.ipynb.tmpl @@ -18,8 +18,8 @@ "outputs": [], "source": [ "import sys\n", - "sys.path.append('../transformations')\n", - "from transformations import taxi_stats\n", + "sys.path.append('..')\n", + "from {{.pipeline_name}}.transformations import taxi_stats\n", "\n", "\n", "spark = SparkSession.builder.getOrCreate()\n", diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/dev/taxis.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/dev/taxis.py index ed2a28c..1fba2e2 100644 --- a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/dev/taxis.py +++ 
b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/dev/taxis.py @@ -1,10 +1,8 @@ import dlt -from pyspark.sql import SparkSession, DataFrame +from pyspark.sql import DataFrame +from databricks.sdk.runtime import spark -@dlt.view( - comment="Small set of taxis for development (uses LIMIT 10)" -) +@dlt.view(comment="Small set of taxis for development (uses LIMIT 10)") def taxis() -> DataFrame: - spark = SparkSession.builder.getOrCreate() - return spark.sql("SELECT * FROM samples.nyctaxi.trips LIMIT 10") \ No newline at end of file + return spark.sql("SELECT * FROM samples.nyctaxi.trips LIMIT 10") diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/prod/taxis.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/prod/taxis.py index 6c22c80..15ce56a 100644 --- a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/prod/taxis.py +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/sources/prod/taxis.py @@ -1,8 +1,8 @@ import dlt -from pyspark.sql import SparkSession, DataFrame +from pyspark.sql import DataFrame +from databricks.sdk.runtime import spark @dlt.view def taxis() -> DataFrame: - spark = SparkSession.builder.getOrCreate() - return spark.sql("SELECT * FROM samples.nyctaxi.trips") \ No newline at end of file + return spark.sql("SELECT * FROM samples.nyctaxi.trips") diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/tests/taxi_stats_test.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/tests/taxi_stats_test.py index 35daa25..b0c4449 100644 --- a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/tests/taxi_stats_test.py +++ 
b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/tests/taxi_stats_test.py @@ -1,5 +1,5 @@ -from sources.dev.taxis import taxis -from transformations import taxi_stats +from ..sources.dev.taxis import taxis +from ..transformations import taxi_stats def test_taxi_stats(): diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/__init__.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/__init__.py index 26b7072..80577db 100644 --- a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/__init__.py +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/__init__.py @@ -6,4 +6,4 @@ # Import all modules in the package except those starting with '_', like '__init__.py' for _, module_name, _ in pkgutil.iter_modules(__path__): if not module_name.startswith("_"): - importlib.import_module(f"{__name__}.{module_name}") \ No newline at end of file + importlib.import_module(f"{__name__}.{module_name}") diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/taxi_stats.py b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/taxi_stats.py index 7c979fb..5c5dcd9 100644 --- a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/taxi_stats.py +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/transformations/taxi_stats.py @@ -3,21 +3,18 @@ from pyspark.sql import DataFrame -@dlt.table( - comment="Daily statistics of NYC Taxi trips" -) +@dlt.table(comment="Daily statistics of NYC Taxi trips") def taxi_stats() -> DataFrame: - """ Read from the 'taxis' view from etl_pipeline/sources. 
""" + """Read from the 'taxis' view from etl_pipeline/sources.""" taxis = dlt.read("taxis") - + return filter_taxis(taxis) def filter_taxis(taxis: DataFrame) -> DataFrame: - """ Group by date and calculate the number of trips. """ + """Group by date and calculate the number of trips.""" return ( - taxis - .withColumn("pickup_date", to_date("tpep_pickup_datetime")) + taxis.withColumn("pickup_date", to_date("tpep_pickup_datetime")) .groupBy("pickup_date") .agg(count("*").alias("number_of_trips")) - ) \ No newline at end of file + ) diff --git a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl index eeedcbe..86890fd 100644 --- a/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl +++ b/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/{{.pipeline_name}}/{{.pipeline_name}}.pipeline.yml.tmpl @@ -14,4 +14,4 @@ resources: - file: path: sources/${bundle.target}/*.py - file: - path: main.py \ No newline at end of file + path: __init__.py From c447107b79fde7d8b2ccad4f5ee2aebcf0c0c791 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Sat, 30 Nov 2024 10:57:36 +0100 Subject: [PATCH 15/18] Remove settings.json from the present PR --- .vscode/settings.json | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 02d2535..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", - "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", - 
"jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", - "python.testing.pytestArgs": [ - "." - ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "files.exclude": { - "**/*.egg-info": true, - "**/__pycache__": true, - ".pytest_cache": true, - }, - "debugpy.debugJustMyCode": false, - "editor.formatOnSave": true, - "[python]": { - "editor.defaultFormatter": "ms-python.black-formatter", - "editor.formatOnSave": true, - } -} \ No newline at end of file From bed1101e2cb84e7c94c0d0c8cc45ae05b3c15633 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Sun, 1 Dec 2024 14:59:02 +0100 Subject: [PATCH 16/18] Fix template name --- .vscode/settings.json | 22 +++++++++++++++++++ .../databricks_template_schema.json | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..02d2535 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,22 @@ +{ + "python.analysis.stubPath": ".vscode", + "databricks.python.envFile": "${workspaceFolder}/.env", + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." 
+ ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + }, + "debugpy.debugJustMyCode": false, + "editor.formatOnSave": true, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + } +} \ No newline at end of file diff --git a/contrib/templates/data-engineering/databricks_template_schema.json b/contrib/templates/data-engineering/databricks_template_schema.json index 6769871..2d3048e 100644 --- a/contrib/templates/data-engineering/databricks_template_schema.json +++ b/contrib/templates/data-engineering/databricks_template_schema.json @@ -1,5 +1,5 @@ { - "welcome_message": "\nWelcome to the pipeline-folders template for Databricks Asset Bundles!", + "welcome_message": "\nWelcome to the data-engineering template for Databricks Asset Bundles!", "properties": { "project_name": { "type": "string", From 56093bd79df94515cd572350591e3196d8d6da5b Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Mon, 2 Dec 2024 09:16:24 +0100 Subject: [PATCH 17/18] Remove settings.json from this PR --- .vscode/settings.json | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 02d2535..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", - "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", - "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", - "python.testing.pytestArgs": [ - "." 
- ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "files.exclude": { - "**/*.egg-info": true, - "**/__pycache__": true, - ".pytest_cache": true, - }, - "debugpy.debugJustMyCode": false, - "editor.formatOnSave": true, - "[python]": { - "editor.defaultFormatter": "ms-python.black-formatter", - "editor.formatOnSave": true, - } -} \ No newline at end of file From aab5cc5c52962acccdbc00f04fd07219dd14fac9 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Mon, 2 Dec 2024 09:23:47 +0100 Subject: [PATCH 18/18] Revert "Add --branch parameters for now" This reverts commit 596ece0174b405e4276d11e97ca9580a47b6b11c. --- contrib/templates/data-engineering/README.md | 2 +- .../templates/data-engineering/databricks_template_schema.json | 2 +- .../template/{{.project_name}}/scripts/add_asset.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/templates/data-engineering/README.md b/contrib/templates/data-engineering/README.md index 19df04b..c1ed1c6 100644 --- a/contrib/templates/data-engineering/README.md +++ b/contrib/templates/data-engineering/README.md @@ -6,7 +6,7 @@ assets in DABs. Install it using ``` -databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering --branch data-engineering +databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering ``` Note that by default this template doesn't come with any assets such as jobs or pipelines. 
diff --git a/contrib/templates/data-engineering/databricks_template_schema.json b/contrib/templates/data-engineering/databricks_template_schema.json index 2d3048e..575488f 100644 --- a/contrib/templates/data-engineering/databricks_template_schema.json +++ b/contrib/templates/data-engineering/databricks_template_schema.json @@ -42,5 +42,5 @@ "order": 4 } }, - "success_message": "\n\nYour new project has been created in the '{{.project_name}}' directory!\n\nTo add an example asset to your project, use\n\n $ cd {{.project_name}}\n $ databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/etl-pipeline --branch data-engineering\n\nRefer to the README.md file for full \"getting started\" instructions!" + "success_message": "\n\nYour new project has been created in the '{{.project_name}}' directory!\n\nTo add an example asset to your project, use\n\n $ cd {{.project_name}}\n $ databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/etl-pipeline\n\nRefer to the README.md file for full \"getting started\" instructions!" 
} \ No newline at end of file diff --git a/contrib/templates/data-engineering/template/{{.project_name}}/scripts/add_asset.py b/contrib/templates/data-engineering/template/{{.project_name}}/scripts/add_asset.py index 80cac32..931db61 100644 --- a/contrib/templates/data-engineering/template/{{.project_name}}/scripts/add_asset.py +++ b/contrib/templates/data-engineering/template/{{.project_name}}/scripts/add_asset.py @@ -11,7 +11,7 @@ def init_bundle(asset_type: AssetType) -> None: - cmd = f"databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/{asset_type} --branch data-engineering" + cmd = f"databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/{asset_type}" subprocess.run(cmd, shell=True)