Skip to content

Commit

Permalink
Added the snippets
Browse files Browse the repository at this point in the history
  • Loading branch information
dat-a-man committed Oct 9, 2024
1 parent fd225f9 commit ebbd06e
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 50 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import os
from tests.pipeline.utils import assert_load_info

def modal_snippet() -> None:
    """Define a Modal app that runs a dlt SQL-to-DuckDB pipeline on a schedule.

    Builds a Modal Image with dlt and its MySQL/DuckDB extras, creates a
    persistent Modal Volume for the DuckDB file, and declares a daily-scheduled
    Modal Function that loads the "family" and "genome" tables from a MySQL
    source (Rfam public database) into DuckDB.

    The @@@DLT_SNIPPET markers delimit the regions embedded in the docs page.
    """
    # @@@DLT_SNIPPET_START modal_image
    import modal
    import os

    # Define the Modal Image
    image = (
        modal.Image.debian_slim()
        .pip_install(
            "dlt>=1.1.0",
            "dlt[duckdb]",  # destination
            "dlt[sql_database]",  # source (SQL databases; MySQL in this example)
            "pymysql",  # database driver for the MySQL source
        )
    )

    app = modal.App("example-dlt", image=image)

    # Modal Volume used to store the duckdb database file
    vol = modal.Volume.from_name("duckdb-vol", create_if_missing=True)
    # @@@DLT_SNIPPET_END modal_image

    # @@@DLT_SNIPPET_START modal_function
    @app.function(
        volumes={"/data/": vol},  # mount the Volume so the DuckDB file persists across runs
        schedule=modal.Period(days=1),  # run the load once per day
        secrets=[modal.Secret.from_name("sql-secret")],  # injects credentials as env vars
    )
    def load_tables() -> None:
        import dlt
        from dlt.sources.sql_database import sql_database

        # Define the source database credentials; in production, you would save this as a Modal Secret which can be referenced here as an environment variable
        # os.environ['SOURCES__SQL_DATABASE__CREDENTIALS']="mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam"

        # Load tables "family" and "genome"
        source = sql_database().with_resources("family", "genome")

        # Create dlt pipeline object
        pipeline = dlt.pipeline(
            pipeline_name="sql_to_duckdb_pipeline",
            destination=dlt.destinations.duckdb("/data/rfam.duckdb"),  # write the duckdb database file to this location, which will get mounted to the Modal Volume
            dataset_name="sql_to_duckdb_pipeline_data",
            progress="log",  # output progress of the pipeline
        )

        # Run the pipeline
        load_info = pipeline.run(source)

        # Print run statistics
        print(load_info)
        # @@@DLT_SNIPPET_END modal_function

        # Test-only check, placed after the END marker so it is not rendered
        # in the docs snippet; load_info is only in scope inside load_tables.
        assert_load_info(load_info)
Original file line number Diff line number Diff line change
Expand Up @@ -42,63 +42,15 @@ Here’s a dlt project setup to copy data from our MySQL into DuckDB:
dlt init sql_database duckdb
```
2. Open the file and define the Modal Image you want to run `dlt` in:
```py
import modal
import os
# Define the Modal Image
image = (
modal.Image.debian_slim()
.pip_install(
"dlt>=1.1.0",
"dlt[duckdb]", # destination
"dlt[sql_database]", # source (postgres)
"pymysql" # database driver for MySQL source
)
)
app = modal.App("example-dlt", image=image)
# Modal Volume used to store the duckdb database file
vol = modal.Volume.from_name("duckdb-vol", create_if_missing=True)
```
<!--@@@DLT_SNIPPET modal_image-->

3. Define a Modal Function. A Modal Function is a containerized environment that runs tasks.
It can be scheduled (e.g., daily or on a Cron schedule), request more CPU/memory, and scale across
multiple containers.

Here’s how to include your SQL pipeline in the Modal Function:

```py
@app.function(
volumes={"/data/": vol},
schedule=modal.Period(days=1),
secrets=[modal.Secret.from_name("sql-secret")],
)
def load_tables():
import dlt
from dlt.sources.sql_database import sql_database
# Define the source database credentials; in production, you would save this as a Modal Secret which can be referenced here as an environment variable
# os.environ['SOURCES__SQL_DATABASE__CREDENTIALS']="mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam"
# Load tables "family" and "genome"
source = sql_database().with_resources("family", "genome")
# Create dlt pipeline object
pipeline = dlt.pipeline(
pipeline_name="sql_to_duckdb_pipeline",
destination=dlt.destinations.duckdb("/data/rfam.duckdb"), # write the duckdb database file to this file location,, which will get mounted to the Modal Volume
dataset_name="sql_to_duckdb_pipeline_data",
progress="log", # output progress of the pipeline
)
# Run the pipeline
load_info = pipeline.run(source)
# Print run statistics
print(load_info)
```
<!--@@@DLT_SNIPPET modal_function-->

4. You can securely store your credentials using Modal secrets. When you reference secrets within a Modal script,
the defined secret is automatically set as an environment variable. dlt natively supports environment variables,
Expand Down

0 comments on commit ebbd06e

Please sign in to comment.