diff --git a/docs/credible_set_qc/README.md b/docs/credible_set_qc/README.md new file mode 100644 index 0000000..2863927 --- /dev/null +++ b/docs/credible_set_qc/README.md @@ -0,0 +1,17 @@ +## Credible set qc dag + +Credible set qc is a set of operations performed on the `StudyLocus` datasets originally finemapped by OpenTargets to: + +- Ensure the pValue of each locus meets the pre-defined threshold. +- Perform repartitioning of the credible sets, as the output from the batch job contains files per locus, resulting in slow queries. +- Ensure no duplicated loci exist in the clean credible sets. + +![credible_set_qc](credible_set_qc.svg) + +The dag contains the following steps: + +- qc of credible sets coming from `gwas_catalog_sumstats_susie` bucket +- qc of credible sets coming from `ukb_ppp_eur_data` bucket + +> [!NOTE] +> The outputs of the steps are contained in the target bucket with prefix _credible_set_clean_. diff --git a/docs/credible_set_qc/credible_set_qc.svg b/docs/credible_set_qc/credible_set_qc.svg new file mode 100644 index 0000000..2af33a7 --- /dev/null +++ b/docs/credible_set_qc/credible_set_qc.svg @@ -0,0 +1,62 @@ + + + + + + +credible_set_qc + +credible_set_qc + + +create_cluster + +create_cluster + + + +gwas_catalog_sumstats_susie_credible_set_qc + +gwas_catalog_sumstats_susie_credible_set_qc + + + +create_cluster->gwas_catalog_sumstats_susie_credible_set_qc + + + + + +ukb_ppp_eur_data_credible_set_qc + +ukb_ppp_eur_data_credible_set_qc + + + +create_cluster->ukb_ppp_eur_data_credible_set_qc + + + + + +delete_cluster + +delete_cluster + + + +gwas_catalog_sumstats_susie_credible_set_qc->delete_cluster + + + + + +ukb_ppp_eur_data_credible_set_qc->delete_cluster + + + + + diff --git a/docs/datasources/gwas_catalog_data/README.md b/docs/datasources/gwas_catalog_data/README.md index 4e3b2da..9ac7e70 100644 --- a/docs/datasources/gwas_catalog_data/README.md +++ b/docs/datasources/gwas_catalog_data/README.md @@ -260,6 +260,7 @@ Bucket 
`gs://gwas_catalog_sumstats_susie` contains: ``` gs://gwas_catalog_sumstats_susie/credible_set_datasets/ +gs://gwas_catalog_sumstats_susie/credible_set_clean/ gs://gwas_catalog_sumstats_susie/finemapping_logs/ gs://gwas_catalog_sumstats_susie/finemapping_manifests/ gs://gwas_catalog_sumstats_susie/study_index/ @@ -324,6 +325,11 @@ The output of finemapping can be found under the: - `gs://gwas_catalog_sumstats_susie/finemapping_manifests/` - manifests used during the fine mapping job - `gs://gwas_catalog_sumstats_susie/finemapping_logs/` - logs from the individual finemapping tasks +### Credible set qc + +After the finemapping is performed, the qc dag is run. For more detail see [credible set qc dag](../../credible_set_qc/README.md). +The final credible sets are collected in the `gs://gwas_catalog_sumstats_susie/credible_set_clean/`. + #### Parametrization of google batch finemapping job The configuration of the google batch infrastructure and individual step parameters can be found in `gwas_catalog_sumstats_susie_finemapping.yaml` file. diff --git a/docs/datasources/ukb_ppp_eur_data/README.md b/docs/datasources/ukb_ppp_eur_data/README.md index 375d4d8..f7eaf57 100644 --- a/docs/datasources/ukb_ppp_eur_data/README.md +++ b/docs/datasources/ukb_ppp_eur_data/README.md @@ -8,6 +8,7 @@ Data stored under `gs://ukb_ppp_eur_data` bucket comes with following structure ``` gs://ukb_ppp_eur_data/credible_set_datasets/susie +gs://ukb_ppp_eur_data/credible_set_clean/ gs://ukb_ppp_eur_data/docs/ gs://ukb_ppp_eur_data/finemapping_logs/ gs://ukb_ppp_eur_data/finemapping_manifests/ @@ -109,6 +110,11 @@ The output of finemapping can be found under the: - `gs://ukb_ppp_eur_data/finemapping_manifests/` - manifests used during the fine mapping job - `gs://ukb_ppp_eur_data/finemapping_logs/` - logs from the individual finemapping tasks +### Credible set qc + +After the finemapping is performed, the qc dag is run. For more detail see [credible set qc dag](../../credible_set_qc/README.md). 
+The final credible sets are collected in the `gs://ukb_ppp_eur_data/credible_set_clean/`. + #### Parametrization of google batch finemapping job The configuration of the google batch infrastructure and individual step parameters can be found in `ukb_ppp_eur_finemapping.yaml` file. diff --git a/src/ot_orchestration/dags/config/credible_set_qc.yaml b/src/ot_orchestration/dags/config/credible_set_qc.yaml new file mode 100644 index 0000000..d1a4810 --- /dev/null +++ b/src/ot_orchestration/dags/config/credible_set_qc.yaml @@ -0,0 +1,34 @@ +dataproc: + python_main_module: gs://genetics_etl_python_playground/initialisation/gentropy/dev/cli.py + cluster_metadata: + PACKAGE: gs://genetics_etl_python_playground/initialisation/gentropy/dev/gentropy-0.0.0-py3-none-any.whl + cluster_init_script: gs://genetics_etl_python_playground/initialisation/gentropy/dev/install_dependencies_on_cluster.sh + cluster_name: otg-credible-set-qc + autoscaling_policy: otg-etl + +nodes: + - id: gwas_catalog_sumstats_susie_credible_set_qc + kind: Task + prerequisites: [] + params: + step: credible_set_qc + step.credible_sets_path: gs://gwas_catalog_sumstats_susie/credible_set_datasets + step.output_path: gs://gwas_catalog_sumstats_susie/credible_set_clean + step.p_value_threshold: 1.0e-5 + step.purity_min_r2: 0.01 + step.n_partitions: 200 + step.session.write_mode: overwrite + step.session.start_hail: true + + - id: ukb_ppp_eur_data_credible_set_qc + kind: Task + prerequisites: [] + params: + step: credible_set_qc + step.credible_sets_path: gs://ukb_ppp_eur_data/credible_set_datasets/susie + step.output_path: gs://ukb_ppp_eur_data/credible_set_clean + step.p_value_threshold: 1.0e-5 + step.purity_min_r2: 0.01 + step.n_partitions: 50 + step.session.write_mode: overwrite + step.session.start_hail: true diff --git a/src/ot_orchestration/dags/credible_set_qc.py b/src/ot_orchestration/dags/credible_set_qc.py new file mode 100644 index 0000000..6f31f31 --- /dev/null +++ 
b/src/ot_orchestration/dags/credible_set_qc.py @@ -0,0 +1,51 @@
+"""Airflow DAG for the credible set qc.
+
+Every node listed in ``config/credible_set_qc.yaml`` is submitted as a gentropy
+step on the Dataproc cluster named in the ``dataproc`` section of that file.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from airflow.models.dag import DAG
+
+from ot_orchestration.utils import chain_dependencies, read_yaml_config
+from ot_orchestration.utils.common import shared_dag_args, shared_dag_kwargs
+from ot_orchestration.utils.dataproc import (
+    generate_dataproc_task_chain,
+    submit_gentropy_step,
+)
+
+# The cluster settings and the step nodes are read from the YAML file that
+# lives in the ``config`` directory next to this module.
+CONFIG_FILE_PATH = Path(__file__).parent / "config" / "credible_set_qc.yaml"
+config = read_yaml_config(CONFIG_FILE_PATH)
+
+with DAG(
+    dag_id=Path(__file__).stem,
+    description="Open Targets Genetics — CredibleSet QC ",
+    default_args=shared_dag_args,
+    **shared_dag_kwargs,
+):
+    # One gentropy step per configured node, keyed by the node id.
+    tasks = {}
+    for step in config["nodes"]:
+        task = submit_gentropy_step(
+            cluster_name=config["dataproc"]["cluster_name"],
+            step_name=step["id"],
+            python_main_module=config["dataproc"]["python_main_module"],
+            params=step["params"],
+        )
+        tasks[step["id"]] = task
+
+    # NOTE(review): presumably applies each node's `prerequisites` from the
+    # config to the matching tasks - verify in `chain_dependencies`.
+    chain_dependencies(nodes=config["nodes"], tasks_or_task_groups=tasks)
+    # NOTE(review): presumably wraps the steps with cluster creation and
+    # deletion, as drawn in docs/credible_set_qc/credible_set_qc.svg - confirm.
+    dag = generate_dataproc_task_chain(
+        cluster_name=config["dataproc"]["cluster_name"],
+        cluster_init_script=config["dataproc"]["cluster_init_script"],
+        cluster_metadata=config["dataproc"]["cluster_metadata"],
+        tasks=list(tasks.values()),
+    )