Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update master with latest release #18

Merged
merged 13 commits into from
Apr 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
16 changes: 6 additions & 10 deletions dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
ARG BASE_CONTAINER=meltano/meltano:v2.10.0-python3.9
ARG BASE_CONTAINER=meltano/meltano:v2.17.1-python3.9
# TODO: consider meltano/meltano:v2-python3.9
# metano tap-rest-api-msdk requires python ~3.9
# meltano tap-rest-api-msdk requires python ~3.9


FROM $BASE_CONTAINER as basepython

Expand All @@ -12,8 +13,8 @@ LABEL org.opencontainers.image.description="Mimodast. A minimal modern data stac
LABEL org.opencontainers.image.licenses=MIT

COPY requirements.txt .
#RUN pip install --upgrade pip
RUN pip install -r requirements.txt
#duckdb --> numpy-1.23.4

# For the healthcheck
RUN apt update \
Expand Down Expand Up @@ -51,20 +52,15 @@ RUN mkdir -p /${MELTANO_PROJ_ROOT}/data/dev/ \
&& mkdir -p /${MELTANO_PROJ_ROOT}/data/prod/ \
&& /${MELTANO_PROJ_ROOT}/${DUCKDB_CLI_FOLDER}/duckdb /${MELTANO_PROJ_ROOT}/data/dev/data.duckdb "select * from pg_tables;" \
&& /${MELTANO_PROJ_ROOT}/${DUCKDB_CLI_FOLDER}/duckdb /${MELTANO_PROJ_ROOT}/data/test/data.duckdb "select * from pg_tables;" \
&& /${MELTANO_PROJ_ROOT}/${DUCKDB_CLI_FOLDER}/duckdb /${MELTANO_PROJ_ROOT}/data/prod/data.duckdb "select * from pg_tables;"


###RUN chmod -R u+x /project/data/
###RUN /project/duckdb_cli/duckdb /project/data/dev/data.duckdb "select * from pg_tables;"
&& /${MELTANO_PROJ_ROOT}/${DUCKDB_CLI_FOLDER}/duckdb /${MELTANO_PROJ_ROOT}/data/prod/data.duckdb "select * from pg_tables;" \
&& meltano invoke dbt-duckdb:deps

RUN meltano invoke airflow dags pause stage_gie_dag \
&& meltano invoke airflow dags pause stage_gie_backfill_dag

COPY ./standup/. .
RUN meltano invoke airflow variables import airflowvariables.json \
&& meltano invoke superset import-dashboards -p dashboards.zip
### \
### && meltano invoke superset import_datasources -p database.zip

COPY ./meltano_transform/. /${MELTANO_PROJ_ROOT}/${PROJECT}/transform/

Expand Down
37 changes: 19 additions & 18 deletions meltano.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,18 @@ plugins:
params:
date: $ENV_DATE_GIE
#date: '2023-02-15'
stream_maps:
stg_gie_storage:
key_hash: md5(config['hash_seed'] + (gasDayStart + code))
#__alias__: stg_gie_storage_vX
stg_gie_company:
key_hash: md5(config['hash_seed'] + (gasDayStart + code))
stg_gie_country:
key_hash: md5(config['hash_seed'] + (gasDayStart + code))
stg_gie_region:
key_hash: md5(config['hash_seed'] + (gasDayStart + code))
stream_map_config:
hash_seed: 01AWZh7A6DzGm6iJZZ2T
streams:
- name: stg_gie_storage
path: /api
Expand Down Expand Up @@ -132,15 +144,13 @@ plugins:
pip_url: target-duckdb~=0.4
config:
add_metadata_columns: true
#default_target_schema: gie_stage
default_target_schema: main
filepath: $DB_LOCATION
data_flattening_max_level: 10
- name: target-duckdb-usgs
inherit_from: target-duckdb
config:
add_metadata_columns: true
#default_target_schema: usgs_stage
default_target_schema: main
filepath: $DB_LOCATION
data_flattening_max_level: 10
Expand All @@ -149,7 +159,6 @@ plugins:
inherit_from: target-duckdb
config:
add_metadata_columns: true
#default_target_schema: gie
default_target_schema: main
filepath: $DB_LOCATION
data_flattening_max_level: 10
Expand All @@ -158,42 +167,34 @@ plugins:
- name: airflow
variant: apache
pip_url: apache-airflow==2.1.2 --constraint https://raw.githubusercontent.com/apache/airflow/constraints-2.1.2/constraints-${MELTANO__PYTHON_VERSION}.txt
# transformers:
# - name: dbt-duckdb
# variant: jwills
# pip_url: dbt-core~=1.2.0 dbt-duckdb~=1.2.0
files:
- name: files-airflow
variant: meltano
pip_url: git+https://github.com/meltano/files-airflow.git --constraint https://raw.githubusercontent.com/apache/airflow/constraints-2.1.2/constraints-${MELTANO__PYTHON_VERSION}.txt
utilities:
- name: superset
variant: apache
#pip_url: apache-superset==1.5.0 markupsafe==2.0.1 duckdb-engine==0.6.4
pip_url: apache-superset==2.0.0 flask==2.0.3 werkzeug==2.0.3 jinja2==3.0.1 wtforms==2.3.3
git+https://github.com/meltano/superset-ext.git@main cryptography==3.4.7 markupsafe==2.0.1
cryptography==3.4.7
markupsafe==2.0.1
duckdb-engine==0.7.0
# git+https://github.com/meltano/superset-ext.git@main
- name: dbt-duckdb
variant: jwills
#pip_url: dbt-core~=1.3.0 dbt-duckdb~=1.3.0 git+https://github.com/meltano/dbt-ext.git@main
pip_url: dbt-core~=1.4.0 dbt-duckdb~=1.4.0 git+https://github.com/meltano/dbt-ext.git@main
pip_url: dbt-core~=1.4.0 dbt-duckdb~=1.4.0
git+https://github.com/meltano/dbt-ext.git@main
commands:
usgs:
args: run --select tag:usgs
description: Runsd dbt USGS (earthquake data) jobs instead of all dbt jobs.
description: Runs dbt USGS (earthquake data) jobs instead of all dbt jobs.
gie:
args: run --select tag:gie
description: Runs dbt GIE (gas inventory data) jobs instead of all dbt jobs.
jobs:
- name: usgs-to-duckdb-rpt
tasks:
- stg_usgs target-duckdb-usgs dbt-duckdb:usgs
schedules:
#- name: USGS-Earthquake
# interval: 35 */1 * * *
# extractor: stg_usgs
# loader: target-duckdb-usgs
# transform: skip
# start_date: 2023-01-01 15:40:21.295936
- name: USGS-Earthquake
interval: 35 */1 * * *
job: usgs-to-duckdb-rpt
Expand Down
2 changes: 2 additions & 0 deletions meltano_transform/models/gie_rpt/rpt_gie_storage.sql
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ SSO AS
gasdaystart::DATE gasdaystart,
split_part(url, '/', 2) as country,
split_part(url, '/', 3) as company_eic,
key_hash,
code as sso_eic,
name as sso_name,
status,
Expand Down Expand Up @@ -36,6 +37,7 @@ FROM
select
_sdc_batched_at,
_sdc_extracted_at,
key_hash,
sso.gasdaystart,
country,
SSO.company_eic,
Expand Down
12 changes: 12 additions & 0 deletions meltano_transform/models/gie_rpt/rpt_gie_storage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,25 @@ models:
+tags:
- gie

tests:
- dbt_utils.unique_combination_of_columns:
combination_of_columns:
- sso_eic
- gasdaystart

columns:
- name: _sdc_batched_at
description: Timestamp when the data was captured in the database.

- name: _sdc_extracted_at
description: Timestamp when the data was retrieved from the REST API.

- name: key_hash
description: Hash of sso_eic and gasdaystart.
tests:
- unique
- not_null

- name: gasdaystart
description: Date of the observation. E.g. the injection field refers to the injection on this date; gasinstorage is as of the end of the gasdaystart.
tests:
Expand Down
4 changes: 4 additions & 0 deletions meltano_transform/models/overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

![mimodast Logo](https://github.com/EJOOSTEROP/mimodast/blob/master/assets/hatchful/logo_transparent.png)

<a href="https://github.com/EJOOSTEROP/mimodast">
<img src="assets/hatchful/logo_transparent.png" alt="Logo" width="180" height="180">
</a>

# Mimodast dbt Project
Mimodast is a minimal modern data stack with working data pipelines in a single Docker container.

Expand Down
3 changes: 3 additions & 0 deletions meltano_transform/packages.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
packages:
- package: dbt-labs/dbt_utils
version: 1.0.0
3 changes: 3 additions & 0 deletions meltano_transform/tests/richter_max.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
SELECT *
FROM {{ ref('rpt_usgs_events')}}
WHERE magnitude > 10
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SELECT *
FROM {{ ref('rpt_gie_storage')}}
WHERE _sdc_batched_at < _sdc_extracted_at
WHERE gasinstorage - workinggasvolume > 1
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ version: 2

# NOTE: THIS DOES NOT WORK. UNCLEAR IF YOU CAN DOCUMENT A TEST AT THIS STAGE.
tests:
- name: stage_timing
- name: storage_max
description: Gas in storage must not exceed the working gas volume by more than 1.
Loading