From ebbd06e8375d29c11b842a4b7b0fa2b2c1ee3b7e Mon Sep 17 00:00:00 2001
From: dat-a-man <98139823+dat-a-man@users.noreply.github.com>
Date: Wed, 9 Oct 2024 04:54:07 +0000
Subject: [PATCH] Added the snippets

---
 .../deploy-with-modal-snippets.py             | 57 +++++++++++++++++++
 .../deploy-a-pipeline/deploy-with-modal.md    | 52 +-----------------
 2 files changed, 59 insertions(+), 50 deletions(-)
 create mode 100644 docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-modal-snippets.py

diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-modal-snippets.py b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-modal-snippets.py
new file mode 100644
index 0000000000..d6b8c0714f
--- /dev/null
+++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-modal-snippets.py
@@ -0,0 +1,57 @@
+import os
+from tests.pipeline.utils import assert_load_info
+
+
+def modal_snippet() -> None:
+    # @@@DLT_SNIPPET_START modal_image
+    import modal
+    import os
+
+    # Define the Modal Image
+    image = (
+        modal.Image.debian_slim()
+        .pip_install(
+            "dlt>=1.1.0",
+            "dlt[duckdb]",  # destination
+            "dlt[sql_database]",  # source (MySQL)
+            "pymysql",  # database driver for the MySQL source
+        )
+    )
+
+    app = modal.App("example-dlt", image=image)
+
+    # Modal Volume used to store the duckdb database file
+    vol = modal.Volume.from_name("duckdb-vol", create_if_missing=True)
+    # @@@DLT_SNIPPET_END modal_image
+
+    # @@@DLT_SNIPPET_START modal_function
+    @app.function(
+        volumes={"/data/": vol},
+        schedule=modal.Period(days=1),
+        secrets=[modal.Secret.from_name("sql-secret")],
+    )
+    def load_tables():
+        import dlt
+        from dlt.sources.sql_database import sql_database
+
+        # Define the source database credentials; in production, save these as a Modal Secret and reference them here as an environment variable
+        # os.environ['SOURCES__SQL_DATABASE__CREDENTIALS'] = "mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam"
+
+        # Load tables "family" and "genome"
+        source = sql_database().with_resources("family", "genome")
+
+        # Create dlt pipeline object
+        pipeline = dlt.pipeline(
+            pipeline_name="sql_to_duckdb_pipeline",
+            destination=dlt.destinations.duckdb("/data/rfam.duckdb"),  # write the duckdb database file to this location, which is mounted on the Modal Volume
+            dataset_name="sql_to_duckdb_pipeline_data",
+            progress="log",  # output progress of the pipeline
+        )
+
+        # Run the pipeline
+        load_info = pipeline.run(source)
+
+        # Print run statistics
+        print(load_info)
+        # @@@DLT_SNIPPET_END modal_function
+
+        assert_load_info(load_info)
\ No newline at end of file

diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-modal.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-modal.md
index dc05d11123..d650e0e1b7 100644
--- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-modal.md
+++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-with-modal.md
@@ -42,26 +42,7 @@ Here’s a dlt project setup to copy data from our MySQL into DuckDB:
    dlt init sql_database duckdb
    ```
 2. Open the file and define the Modal Image you want to run `dlt` in:
-   ```py
-   import modal
-   import os
-
-   # Define the Modal Image
-   image = (
-       modal.Image.debian_slim()
-       .pip_install(
-           "dlt>=1.1.0",
-           "dlt[duckdb]", # destination
-           "dlt[sql_database]", # source (postgres)
-           "pymysql" # database driver for MySQL source
-       )
-   )
-
-   app = modal.App("example-dlt", image=image)
-
-   # Modal Volume used to store the duckdb database file
-   vol = modal.Volume.from_name("duckdb-vol", create_if_missing=True)
-   ```
+   <!--@@@DLT_SNIPPET ./deploy-with-modal-snippets.py::modal_image-->
 
 3. Define a Modal Function. A Modal Function is a containerized environment that runs tasks.
    It can be scheduled (e.g., daily or on a Cron schedule), request more CPU/memory, and scale across
@@ -69,36 +50,7 @@ Here’s a dlt project setup to copy data from our MySQL into DuckDB:
 
    Here’s how to include your SQL pipeline in the Modal Function:
 
-   ```py
-   @app.function(
-       volumes={"/data/": vol},
-       schedule=modal.Period(days=1),
-       secrets=[modal.Secret.from_name("sql-secret")],
-   )
-   def load_tables():
-       import dlt
-       from dlt.sources.sql_database import sql_database
-
-       # Define the source database credentials; in production, you would save this as a Modal Secret which can be referenced here as an environment variable
-       # os.environ['SOURCES__SQL_DATABASE__CREDENTIALS']="mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam"
-
-       # Load tables "family" and "genome"
-       source = sql_database().with_resources("family", "genome")
-
-       # Create dlt pipeline object
-       pipeline = dlt.pipeline(
-           pipeline_name="sql_to_duckdb_pipeline",
-           destination=dlt.destinations.duckdb("/data/rfam.duckdb"), # write the duckdb database file to this file location,, which will get mounted to the Modal Volume
-           dataset_name="sql_to_duckdb_pipeline_data",
-           progress="log", # output progress of the pipeline
-       )
-
-       # Run the pipeline
-       load_info = pipeline.run(source)
-
-       # Print run statistics
-       print(load_info)
-   ```
+   <!--@@@DLT_SNIPPET ./deploy-with-modal-snippets.py::modal_function-->
 
 4. You can securely store your credentials using Modal secrets. When you reference secrets within a Modal script,
    the defined secret is automatically set as an environment variable. dlt natively supports environment variables,
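
A note for reviewers trying the snippet: once deployed, `load_tables` only fires on its daily `modal.Period(days=1)` schedule. Below is a minimal, hypothetical sketch of how one might trigger it immediately for a smoke test. It assumes the docs version of the script defines `app` and `load_tables` at module top level (in the snippets file above they are nested inside `modal_snippet()` for the docs test harness); nothing here is part of the patch itself.

```py
# Hypothetical addition to the walkthrough script, not part of this patch.
# Assumes `app` (a modal.App) and `load_tables` exist at module top level.
@app.local_entrypoint()
def main() -> None:
    # Kick off one remote execution of load_tables right away instead of
    # waiting for the daily schedule; invoked via: modal run <script>.py
    load_tables.remote()
```

Once that run succeeds, `modal deploy <script>.py` publishes the app so the daily schedule takes over.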