feat: add CI for a scaled down version of a cancer WES pipeline (#499)

bihealth · Apr 18, 2024 · b971088 · b971088
1 parent 79499d9
commit b971088
Show file tree

Hide file tree

Showing 40 changed files with 12,808 additions and 0 deletions.
diff --git a/.github/workflows/ci-e2e.yml b/.github/workflows/ci-e2e.yml
@@ -0,0 +1,80 @@
+name: Tests
+
+# Triggers for the end-to-end test workflow.
+on:
+  # always run tests when pushing to main
+  push:
+    branches: [main]
+
+  # only run tests on pull requests which actually modify files that affect the tests
+  #
+  # No branch filter is given on purpose, so pull requests against any base
+  # branch are considered. (GitHub Actions only knows the filter keys
+  # `branches` and `branches-ignore`; an underscore spelling is not a
+  # supported key.)
+  pull_request:
+    # for now, the tests defined in this file are only run when any of the following paths are modified:
+    paths:
+      # workflow definitions
+      - '.tests/**'
+
+      # github workflow configurations
+      - '.github/**'
+
+      # 'base' files of snappy-pipeline
+      - 'snappy_pipeline/*'
+      - 'snappy_pipeline/workflows/abstract/**'
+
+      # steps used in the test workflows
+      - 'snappy_pipeline/workflows/ngs_mapping/**'
+      - 'snappy_pipeline/workflows/somatic_variant_calling/**'
+      - 'snappy_pipeline/workflows/somatic_variant_annotation/**'
+      - 'snappy_pipeline/workflows/somatic_variant_filtration/**'
+
+      # 'base' files of snappy_wrappers
+      - 'snappy_wrappers/*'
+
+      # wrappers used in the test workflows
+      - 'snappy_wrappers/wrappers/alfred/**'
+      - 'snappy_wrappers/wrappers/bcftools/**'
+      - 'snappy_wrappers/wrappers/bwa/**'
+      - 'snappy_wrappers/wrappers/link_in_bam/**'
+      - 'snappy_wrappers/wrappers/mutect/**'
+      - 'snappy_wrappers/wrappers/mutect_par/**'
+      - 'snappy_wrappers/wrappers/mutect2/**'
+      - 'snappy_wrappers/wrappers/mutect2_par/**'
+      - 'snappy_wrappers/wrappers/ngs_chew/**'
+      - 'snappy_wrappers/wrappers/somatic_variant_filtration/**'
+      - 'snappy_wrappers/wrappers/vep/**'
+
+
+jobs:
+
+  # Fast sanity check: let Snakemake resolve the test workflow with --dryrun
+  # before the full run is attempted.
+  Dryrun_Tests:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        lfs: 'true'
+    # Labelled differently from the step of the full run so that the two are
+    # distinguishable in the checks UI and in the logs.
+    - name: Test workflow (local FASTQs, dry run)
+      uses: snakemake/snakemake-github-action@v1
+      with:
+        directory: .tests/test-workflow
+        snakefile: .tests/test-workflow/workflow/Snakefile
+        # Folded scalar: the lines below are joined with single spaces into
+        # one argument string.
+        args: >-
+          --configfile .tests/test-workflow/config/config.yaml
+          --use-conda --show-failed-logs -j 2
+          --conda-cleanup-pkgs cache --dryrun
+        show-disk-usage-on-error: true
+
+
+  # Full run of the test workflow; only started once the dry run succeeded.
+  Tests:
+    runs-on: ubuntu-latest
+    needs: [Dryrun_Tests]
+    steps:
+    - name: update apt
+      run: sudo apt-get update
+    # The checkout requests Git LFS objects as well (lfs is enabled).
+    - uses: actions/checkout@v4
+      with:
+        lfs: "true"
+    - name: Test workflow (local FASTQs)
+      uses: snakemake/snakemake-github-action@v1
+      with:
+        directory: .tests/test-workflow
+        snakefile: .tests/test-workflow/workflow/Snakefile
+        # Folded scalar: the lines below are joined with single spaces into
+        # one argument string.
+        args: >-
+          --configfile .tests/test-workflow/config/config.yaml
+          --use-conda --show-failed-logs -j 2
+          --conda-cleanup-pkgs cache
+        show-disk-usage-on-error: true
+
diff --git a/.tests/test-workflow/.gitattributes b/.tests/test-workflow/.gitattributes
diff --git a/.tests/test-workflow/.gitignore b/.tests/test-workflow/.gitignore
@@ -0,0 +1,8 @@
+# Caches and lock files
+**/.snappy_path_cache
+**/.*.lock
+**/__pycache__
+
+# Directories written while running the pipeline steps
+**/work
+**/output
+logs
+
+# Local checkout / link of the pipeline itself
+snappy-pipeline
+
+# Per-pipeline configuration file
+# NOTE(review): presumably rendered from the *.jinja2 template -- confirm
+pipelines/*/.snappy_pipeline/config.yaml
diff --git a/.tests/test-workflow/config/cancer_wes/config.yaml.jinja2 b/.tests/test-workflow/config/cancer_wes/config.yaml.jinja2
@@ -0,0 +1,61 @@
+# Static data shared by all steps. The value is filled in by Jinja2 from
+# params["reference"]; it is quoted so that the rendered YAML is always a
+# string, even for an empty expansion or a value starting with a YAML indicator.
+static_data_config:
+  reference:
+    path: '{{ params["reference"] }}'
+
+# Per-step settings of the scaled-down cancer WES test pipeline.
+# Templated values ({{ ... }}) are substituted by Jinja2 before the file is
+# parsed as YAML; they are quoted so that the rendered value is always a string.
+step_config:
+  ngs_mapping:
+    tools:
+      dna: [bwa]    # Required if DNA analysis; otherwise, leave empty. Example: 'bwa'.
+    bwa:
+      path_index: '{{ params["bwa_index"] }}'
+      mask_duplicates: false
+      memory_bam_sort: 2G
+      num_threads_bam_sort: 2
+      num_threads_bam_view: 2
+      num_threads_align: 2
+    ngs_chew_fingerprint:
+      enabled: false
+    target_coverage_report:
+      path_target_interval_list_mapping:
+      - name: MedExome_hg19_empirical_targets
+        pattern: MedExome
+        path: ../resources/Exome-MedExome.chr12.bed
+
+  somatic_variant_calling:
+    tools: [mutect2]
+    mutect2:
+      ngs_mapping: ../ngs_mapping
+      extra_arguments: []
+      window_length: 300000000
+      keep_tmpdir: onerror
+      job_mult_time: 5
+
+  somatic_variant_annotation:
+    path_somatic_variant_calling: ../somatic_variant_calling  # REQUIRED
+    tools: ["vep"]
+    vep:
+      cache_dir: '{{ params["vep_cache"] }}'
+      assembly: GRCh37
+
+  somatic_variant_filtration:
+    path_somatic_variant: ../somatic_variant_annotation
+    path_ngs_mapping: ../ngs_mapping
+    filter_list:
+    - dkfz: {}
+    - bcftools:
+        # Quoted: the expression contains '[', ':' and '|', which are YAML
+        # indicator characters; quoting keeps it an unambiguous string.
+        exclude: 'FORMAT/DP[1]<=50 | AD[1:1]<5 | AD[1:1]/(AD[1:0]+AD[1:1])<0.025'
+
+
+
+# Input data sets: a single matched tumor/normal set described by
+# samplesheet.tsv, with paired FASTQ files searched for below ../raw.
+data_sets:
+  trbc:
+    type: matched_cancer
+    naming_scheme: only_secondary_id
+    file: samplesheet.tsv
+    search_paths:
+    - ../raw
+    search_patterns:
+    - left: '*.R1.fastq.gz'
+      right: '*.R2.fastq.gz'
diff --git a/.tests/test-workflow/config/config.yaml b/.tests/test-workflow/config/config.yaml
@@ -0,0 +1,12 @@
+# Description of the reference used by the test workflow.
+# NOTE(review): the fields look like Ensembl download parameters -- confirm
+# against the Snakefile that consumes them.
+reference:
+  species: 'homo_sapiens'
+  datatype: 'dna'
+  build: 'GRCh37'
+  release: 111
+  # Kept as a quoted string so that the chromosome name is not read as an integer.
+  chromosome: '12'
+
+# One entry per tested pipeline: its working directory, its configuration
+# file and its sample sheet.
+pipeline-configuration:
+  cancer_wes:
+    workdir: 'pipelines/snappy-cancer_wes'
+    config: 'pipelines/snappy-cancer_wes/.snappy_pipeline/config.yaml'
+    samplesheet: 'pipelines/snappy-cancer_wes/.snappy_pipeline/samplesheet.tsv'
diff --git a/.tests/test-workflow/pipelines/snappy-cancer_wes/.gitattributes b/.tests/test-workflow/pipelines/snappy-cancer_wes/.gitattributes
@@ -0,0 +1 @@
+# Track everything below raw/ with Git LFS. gitattributes patterns that match
+# a directory do not apply to the paths inside it, hence "raw/**" rather than
+# the bare directory name.
+raw/** filter=lfs diff=lfs merge=lfs -text
diff --git a/.tests/test-workflow/pipelines/snappy-cancer_wes/.snappy_pipeline/samplesheet.tsv b/.tests/test-workflow/pipelines/snappy-cancer_wes/.snappy_pipeline/samplesheet.tsv
@@ -0,0 +1,15 @@
+[Metadata]
+schema	cancer_matched
+schema_version	v1
+title	TRCB
+description	Public cancer dataset https://www.nature.com/articles/sdata201610
+
+[Custom Fields]
+key	annotatedEntity	docs	type	minimum	maximum	unit	choices	pattern
+extractionType	bioSample	extraction type	string	0	0	0	0	0
+libraryKit	ngsLibrary	exome enrichment kit	string	0	0	0	0	0
+
+[Data]
+patientName	sampleName	isTumor	extractionType	libraryType	folderName	libraryKit
+case001subregion	N1	N	DNA	WES	case001subregion-N1-DNA1-WES1	MedExome
+case001subregion	T1	Y	DNA	WES	case001subregion-T1-DNA1-WES1	MedExome
diff --git a/.tests/test-workflow/pipelines/snappy-cancer_wes/ngs_mapping/config.yaml b/.tests/test-workflow/pipelines/snappy-cancer_wes/ngs_mapping/config.yaml
@@ -0,0 +1,5 @@
+# Identifies the pipeline step that is run from this directory.
+pipeline_step:
+  name: ngs_mapping
+  version: 1
+
+# Reference to the configuration file in the sibling .snappy_pipeline directory.
+# NOTE(review): presumably resolved by the pipeline's config loader -- confirm.
+$ref: "file://../.snappy_pipeline/config.yaml"
diff --git a/.tests/test-workflow/pipelines/snappy-cancer_wes/ngs_mapping/pipeline_job.sh b/.tests/test-workflow/pipelines/snappy-cancer_wes/ngs_mapping/pipeline_job.sh
@@ -0,0 +1,135 @@
+#!/bin/bash
+
+# SNAPPY best practice pipeline_job.sh
+#
+# Master job script for the ngs_mapping step: sets up logging, TMPDIR and
+# conda, then starts snappy-snake with the arguments given to this script.
+#
+# Version: 3
+# Date: 2017-02-02
+
+# The "#SBATCH" lines below are Slurm directives; lines starting with
+# "##SBATCH" are commented out and therefore inactive.
+
+# The medium project/queue is a sensible default.
+#SBATCH --partition medium
+# Set a required running time for the master job.
+#SBATCH --time 3-00
+# Reserve some resources
+#SBATCH --mem=6G
+# Keep current environment variables
+#SBATCH --export=all
+# Send a mail upon job completion and error
+##SBATCH --mail-type ALL
+##SBATCH --mail-user your.name@mdc-berlin.de
+# Logs should be written into "slurm_log" sub directory.
+#SBATCH --output slurm_log/%x-%J.log
+# Use more descriptive name in Slurm.
+#SBATCH --job-name ngs_mapping
+
+# Enable bash strict mode (fail early, fail often): abort on command errors
+# (-e), on use of unset variables (-u) and on failures inside pipelines
+# (-o pipefail).
+set -euo pipefail
+
+# Fix the umask: full permissions for user and group, none for others.
+umask ug=rwx,o=
+
+# Configuration variables ---------------------------------------------------
+
+# Maximal number of jobs to execute at the same time
+MAX_JOBS=500
+# Maximal number of jobs per second
+# NOTE(review): not referenced anywhere else in this script.
+MAX_JOBS_PER_SECOND=10
+# Number of times to restart jobs
+RESTART_TIMES=0
+
+# Check preconditions -------------------------------------------------------
+
+# Ensure slurm_log is a directory
+test -d slurm_log || { >&2 echo "${PWD}/slurm_log does not exist"; exit 1; }
+
+# Enforce existence of TMPDIR -----------------------------------------------
+
+export TMPDIR=${HOME}/scratch/tmp
+mkdir -p ${TMPDIR}
+
+# Create one log directory per Snakemake run --------------------------------
+
+test -z "${SLURM_JOB_ID-}" && SLURM_JOB_ID=$(date +%Y-%m-%d_%H-%M)
+LOGDIR=slurm_log/${SLURM_JOB_ID}
+mkdir -p ${LOGDIR}
+export SBATCH_DEFAULTS=" --output=${LOGDIR}/%x-%j.log"
+
+# Activate appropriate Miniconda3 installation ------------------------------
+
+# 1. If CONDA_PATH is set, use this.
+# 2. Look into parent directories for miniconda3 (owned by current user)
+# 3. Look whether there is a conda in $PATH and use it.
+# 4. Look for ~/miniconda3 and use it
+# 5. If all fails, bail out.
+
+conda-in-parent()
+{
+    current=$PWD
+    while [[ -n "$current" ]] && [[ "$current" != "/" ]]; do
+        if [[ -e "$current/miniconda3.$USER" ]] && \
+                [[ $(stat -c %u $current/miniconda3.$USER) == $UID ]]; then
+            echo "$current/miniconda3.$USER"
+            return 0
+        fi
+        if [[ -e "$current/miniconda3" ]] && \
+                [[ $(stat -c %u $current/miniconda3) == $UID ]]; then
+            echo "$current/miniconda3"
+            return 0
+        fi
+        current=$(dirname $current)
+    done
+
+    return 1
+}
+
+# Determine CONDA_PATH: an already set value wins, then a user-owned
+# installation in a parent directory, then a conda found in $PATH, then the
+# installations below $HOME. Bail out if none of them is available.
+if [[ -n "${CONDA_PATH-}" ]] || CONDA_PATH=$(conda-in-parent); then
+    :
+elif which conda >/dev/null; then
+    # <prefix>/bin/conda -> <prefix>; quoted so that a prefix containing
+    # whitespace survives the nested command substitutions.
+    CONDA_PATH=$(dirname "$(dirname "$(which conda)")")
+elif [[ -e "$HOME/miniconda3" ]]; then
+    CONDA_PATH="$HOME/miniconda3"
+elif [[ -e "$HOME/work/miniconda3" ]]; then
+    CONDA_PATH="$HOME/work/miniconda3"
+else
+    >&2 echo "Could not determine a suitable CONDA_PATH."
+    exit 1
+fi
+
+>&2 echo "Using conda installation in $CONDA_PATH"
+>&2 echo "+ conda activate "
+# Strict mode is switched off while conda's shell code runs and switched back
+# on afterwards.
+set +euo pipefail
+conda deactivate &>/dev/null || true  # disable any existing
+source "$CONDA_PATH/etc/profile.d/conda.sh"
+conda activate  # enable found
+set -euo pipefail
+
+# Activate bash cmd printing, debug info ------------------------------------
+
+set -x
+>&2 hostname
+>&2 date
+
+# Kick off Snakemake --------------------------------------------------------
+
+# Interpret array jobs.
+# Allow selection of batch
+if [[ ! -z "${SNAPPY_BATCH-}" ]]; then
+    SNAKEMAKE_BATCH_ARG="--batch ${SNAKEMAKE_BATCH_RULE-default}=${SNAPPY_BATCH}"
+else
+    SNAKEMAKE_BATCH_ARG=
+fi
+
+# Using the medium project/queue is a sensible default.
+snappy-snake --printshellcmds \
+    ${SNAKEMAKE_BATCH_ARG} \
+    --snappy-pipeline-use-profile "cubi-v1" \
+    --snappy-pipeline-jobs $MAX_JOBS \
+    --restart-times ${RESTART_TIMES} \
+    --default-partition="medium" \
+    --rerun-incomplete \
+    -- \
+    $*
+
+# Print date after finishing, for good measure ------------------------------
+
+>&2 date
+>&2 echo "All done. Have a nice day."
diff --git a/...py-cancer_wes/raw/case001subregion-N1-DNA1-WES1/case001subregion-N1-DNA1-WES1.R1.fastq.gz b/...py-cancer_wes/raw/case001subregion-N1-DNA1-WES1/case001subregion-N1-DNA1-WES1.R1.fastq.gz
diff --git a/...py-cancer_wes/raw/case001subregion-N1-DNA1-WES1/case001subregion-N1-DNA1-WES1.R2.fastq.gz b/...py-cancer_wes/raw/case001subregion-N1-DNA1-WES1/case001subregion-N1-DNA1-WES1.R2.fastq.gz
diff --git a/...py-cancer_wes/raw/case001subregion-T1-DNA1-WES1/case001subregion-T1-DNA1-WES1.R1.fastq.gz b/...py-cancer_wes/raw/case001subregion-T1-DNA1-WES1/case001subregion-T1-DNA1-WES1.R1.fastq.gz
diff --git a/...py-cancer_wes/raw/case001subregion-T1-DNA1-WES1/case001subregion-T1-DNA1-WES1.R2.fastq.gz b/...py-cancer_wes/raw/case001subregion-T1-DNA1-WES1/case001subregion-T1-DNA1-WES1.R2.fastq.gz