Skip to content

Commit

Permalink
feat: add CI for a scaled down version of a cancer WES pipeline (#499)
Browse files Browse the repository at this point in the history
  • Loading branch information
tedil authored Apr 18, 2024
1 parent 79499d9 commit b971088
Show file tree
Hide file tree
Showing 40 changed files with 12,808 additions and 0 deletions.
80 changes: 80 additions & 0 deletions .github/workflows/ci-e2e.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# End-to-end CI: first dry-run the scaled-down test workflow, then execute it.
name: Tests

on:
  # always run tests when pushing to main
  push:
    branches: [main]

  # only run tests on pull requests which actually modify files that affect the tests
  pull_request:
    # No branch filter is set, so pull requests against any branch qualify.
    # (A former `branches_ignore: []` entry was dropped here: the documented
    # filter key is spelled `branches-ignore`.)
    # for now, the tests defined in this file are only run when any of the
    # following paths are modified:
    paths:
      # workflow definitions
      - '.tests/**'

      # github workflow configurations
      - '.github/**'

      # 'base' files of snappy-pipeline
      - 'snappy_pipeline/*'
      - 'snappy_pipeline/workflows/abstract/**'

      # steps used in the test workflows
      - 'snappy_pipeline/workflows/ngs_mapping/**'
      - 'snappy_pipeline/workflows/somatic_variant_calling/**'
      - 'snappy_pipeline/workflows/somatic_variant_annotation/**'
      - 'snappy_pipeline/workflows/somatic_variant_filtration/**'

      # 'base' files of snappy_wrappers
      - 'snappy_wrappers/*'

      # wrappers used in the test workflows
      - 'snappy_wrappers/wrappers/alfred/**'
      - 'snappy_wrappers/wrappers/bcftools/**'
      - 'snappy_wrappers/wrappers/bwa/**'
      - 'snappy_wrappers/wrappers/link_in_bam/**'
      - 'snappy_wrappers/wrappers/mutect/**'
      - 'snappy_wrappers/wrappers/mutect_par/**'
      - 'snappy_wrappers/wrappers/mutect2/**'
      - 'snappy_wrappers/wrappers/mutect2_par/**'
      - 'snappy_wrappers/wrappers/ngs_chew/**'
      - 'snappy_wrappers/wrappers/somatic_variant_filtration/**'
      - 'snappy_wrappers/wrappers/vep/**'

jobs:

  # Dry run of the test workflow (same arguments as the real run plus --dryrun).
  Dryrun_Tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          # also fetch the files tracked with Git LFS
          lfs: 'true'
      - name: Test workflow (local FASTQs)
        uses: snakemake/snakemake-github-action@v1
        with:
          directory: .tests/test-workflow
          snakefile: .tests/test-workflow/workflow/Snakefile
          # folded scalar: the lines below are joined with single spaces
          args: >-
            --configfile .tests/test-workflow/config/config.yaml
            --use-conda --show-failed-logs -j 2
            --conda-cleanup-pkgs cache --dryrun
          show-disk-usage-on-error: true

  # Actual execution of the test workflow; only starts once the dry run passed.
  Tests:
    runs-on: ubuntu-latest
    needs:
      - Dryrun_Tests
    steps:
      - name: update apt
        run: sudo apt-get update
      - uses: actions/checkout@v4
        with:
          # also fetch the files tracked with Git LFS
          lfs: 'true'
      - name: Test workflow (local FASTQs)
        uses: snakemake/snakemake-github-action@v1
        with:
          directory: .tests/test-workflow
          snakefile: .tests/test-workflow/workflow/Snakefile
          # folded scalar: the lines below are joined with single spaces
          args: >-
            --configfile .tests/test-workflow/config/config.yaml
            --use-conda --show-failed-logs -j 2
            --conda-cleanup-pkgs cache
          show-disk-usage-on-error: true

Empty file.
8 changes: 8 additions & 0 deletions .tests/test-workflow/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Ignore rules for the test workflow directory.
# `**/` patterns match at any depth below this .gitignore.
# NOTE(review): these look like artefacts produced while running the test
# workflow (caches, lock files, results, logs) - confirm before relying on it.
**/.snappy_path_cache
**/.*.lock
**/output
**/work
logs
snappy-pipeline
# presumably rendered from config/cancer_wes/config.yaml.jinja2 - TODO confirm
pipelines/*/.snappy_pipeline/config.yaml
**/__pycache__
61 changes: 61 additions & 0 deletions .tests/test-workflow/config/cancer_wes/config.yaml.jinja2
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Template of the pipeline configuration for the scaled-down cancer WES test.
# The params[...] placeholders are substituted when the template is rendered;
# they are quoted so the rendered values are always read as YAML strings.
static_data_config:
  reference:
    path: '{{ params["reference"] }}'

step_config:
  ngs_mapping:
    tools:
      dna: [bwa]  # Required if DNA analysis; otherwise, leave empty. Example: 'bwa'.
    bwa:
      path_index: '{{ params["bwa_index"] }}'
      mask_duplicates: false
      memory_bam_sort: 2G
      num_threads_bam_sort: 2
      num_threads_bam_view: 2
      num_threads_align: 2
    ngs_chew_fingerprint:
      enabled: false
    target_coverage_report:
      path_target_interval_list_mapping:
        - name: MedExome_hg19_empirical_targets
          pattern: MedExome
          path: ../resources/Exome-MedExome.chr12.bed

  somatic_variant_calling:
    tools: [mutect2]
    mutect2:
      # NOTE(review): the somatic_variant_filtration step below names its
      # equivalent setting `path_ngs_mapping`; confirm this key and its
      # placement under `mutect2` are what the step expects.
      ngs_mapping: ../ngs_mapping
      extra_arguments: []
      window_length: 300000000
      keep_tmpdir: onerror
      job_mult_time: 5

  somatic_variant_annotation:
    path_somatic_variant_calling: ../somatic_variant_calling  # REQUIRED
    tools: ["vep"]
    vep:
      cache_dir: '{{ params["vep_cache"] }}'
      assembly: GRCh37

  somatic_variant_filtration:
    path_somatic_variant: ../somatic_variant_annotation
    path_ngs_mapping: ../ngs_mapping
    filter_list:
      - dkfz: {}
      - bcftools:
          # quoted: the expression contains YAML indicator characters ([ ] : |)
          exclude: 'FORMAT/DP[1]<=50 | AD[1:1]<5 | AD[1:1]/(AD[1:0]+AD[1:1])<0.025'

data_sets:
  trbc:
    file: samplesheet.tsv
    search_patterns:
      - {left: '*.R1.fastq.gz', right: '*.R2.fastq.gz'}
    search_paths:
      - ../raw
    type: matched_cancer
    naming_scheme: only_secondary_id
12 changes: 12 additions & 0 deletions .tests/test-workflow/config/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Configuration of the test workflow itself.

# Reference selection: species, data type, build, release and chromosome.
reference:
  species: "homo_sapiens"
  datatype: "dna"
  build: "GRCh37"
  release: 111
  chromosome: "12"

# One entry per pipeline under test: its working directory plus the
# configuration and sample sheet paths inside it.
pipeline-configuration:
  cancer_wes:
    workdir: "pipelines/snappy-cancer_wes"
    config: "pipelines/snappy-cancer_wes/.snappy_pipeline/config.yaml"
    samplesheet: "pipelines/snappy-cancer_wes/.snappy_pipeline/samplesheet.tsv"
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Route everything matched by `raw` through Git LFS (filter/diff/merge
# drivers) and unset the `text` attribute for it.
raw filter=lfs diff=lfs merge=lfs -text
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[Metadata]
schema cancer_matched
schema_version v1
title TRCB
description Public cancer dataset https://www.nature.com/articles/sdata201610

[Custom Fields]
key annotatedEntity docs type minimum maximum unit choices pattern
extractionType bioSample extraction type string 0 0 0 0 0
libraryKit ngsLibrary exome enrichment kit string 0 0 0 0 0

[Data]
patientName sampleName isTumor extractionType libraryType folderName libraryKit
case001subregion N1 N DNA WES case001subregion-N1-DNA1-WES1 MedExome
case001subregion T1 Y DNA WES case001subregion-T1-DNA1-WES1 MedExome
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Step-level configuration: names the pipeline step and pulls in the shared
# project configuration through the `$ref` below.
pipeline_step:
  name: ngs_mapping
  version: 1

$ref: 'file://../.snappy_pipeline/config.yaml'
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#!/bin/bash

# SNAPPY best practice pipeline_job.sh
#
# Version: 3
# Date: 2017-02-02

# The medium project/queue is a sensible default.
#SBATCH --partition medium
# Set a required running time for the master job.
#SBATCH --time 3-00
# Reserve some resources
#SBATCH --mem=6G
# Keep current environment variables
#SBATCH --export=all
# Send a mail upon job completion and error
##SBATCH --mail-type ALL
##SBATCH --mail-user your.name@mdc-berlin.de
# Logs should be written into "slurm_log" sub directory.
#SBATCH --output slurm_log/%x-%J.log
# Use more descriptive name in Slurm.
#SBATCH --job-name ngs_mapping

# Enable the official bash strict mode (fail early, fail often)
set -euo pipefail

# Fix the umask: full access for user and group, none for others.
umask ug=rwx,o=

# Configuration variables ---------------------------------------------------

# Maximal number of jobs to execute at the same time
MAX_JOBS=500
# Maximal number of jobs per second
# NOTE(review): not referenced anywhere else in the visible part of this script.
MAX_JOBS_PER_SECOND=10
# Number of times to restart jobs
RESTART_TIMES=0

# Check preconditions -------------------------------------------------------

# Ensure slurm_log is a directory
test -d slurm_log || { >&2 echo "${PWD}/slurm_log does not exist"; exit 1; }

# Enforce existence of TMPDIR -----------------------------------------------

# Quoted so that a home directory containing whitespace is not word-split.
export TMPDIR="${HOME}/scratch/tmp"
mkdir -p "${TMPDIR}"

# Create one log directory per Snakemake run --------------------------------

# When SLURM_JOB_ID is unset or empty, fall back to a timestamp.
if [[ -z "${SLURM_JOB_ID-}" ]]; then
    SLURM_JOB_ID=$(date +%Y-%m-%d_%H-%M)
fi
LOGDIR="slurm_log/${SLURM_JOB_ID}"
mkdir -p "${LOGDIR}"
export SBATCH_DEFAULTS=" --output=${LOGDIR}/%x-%j.log"

# Activate appropriate Miniconda3 installation ------------------------------

# 1. If CONDA_PATH is set, use this.
# 2. Look into parent directories for miniconda3 (owned by current user)
# 3. Look whether there is a conda in $PATH and use it.
# 4. Look for ~/miniconda3 and use it
# 5. If all fails, bail out.

# Print the path of the first miniconda3 installation owned by the current
# user that is found in $PWD or one of its parent directories.
#
# In each directory "miniconda3.$USER" is tried before "miniconda3"; the
# search walks upwards and stops before looking into "/".
#
# Output:  the installation path on stdout.
# Returns: 0 when an installation was found, 1 otherwise.
conda-in-parent()
{
    local current=$PWD
    local candidate

    while [[ -n "$current" ]] && [[ "$current" != "/" ]]; do
        for candidate in "$current/miniconda3.$USER" "$current/miniconda3"; do
            # Only accept installations whose owner is the current user.
            # All expansions are quoted so directories containing whitespace
            # are passed to stat/dirname as a single argument.
            if [[ -e "$candidate" ]] && \
               [[ "$(stat -c %u "$candidate")" == "$UID" ]]; then
                echo "$candidate"
                return 0
            fi
        done
        current=$(dirname "$current")
    done

    return 1
}

# Resolve CONDA_PATH: a preset value wins, then an installation found by
# conda-in-parent, then the conda on $PATH, then two locations below $HOME.
if [[ -n "${CONDA_PATH-}" ]] || CONDA_PATH=$(conda-in-parent); then
    :
elif which conda >/dev/null; then
    # Strip the last two path components from the conda executable's path.
    CONDA_PATH=$(dirname "$(dirname "$(which conda)")")
elif [[ -e "$HOME/miniconda3" ]]; then
    CONDA_PATH="$HOME/miniconda3"
elif [[ -e "$HOME/work/miniconda3" ]]; then
    CONDA_PATH="$HOME/work/miniconda3"
else
    >&2 echo "Could not determine a suitable CONDA_PATH."
    exit 1
fi

>&2 echo "Using conda installation in $CONDA_PATH"
>&2 echo "+ conda activate "
# Strict mode is switched off around the conda calls and restored afterwards
# (presumably because conda's shell hooks do not cope with `set -eu`).
set +euo pipefail
conda deactivate &>/dev/null || true  # disable any existing
source "$CONDA_PATH/etc/profile.d/conda.sh"
conda activate  # enable found
set -euo pipefail

# Activate bash cmd printing, debug info ------------------------------------

set -x
>&2 hostname
>&2 date

# Kick off Snakemake --------------------------------------------------------

# Interpret array jobs.
# Allow selection of batch: when SNAPPY_BATCH is set, pass
# "--batch <rule>=<batch>", with the rule taken from SNAKEMAKE_BATCH_RULE
# (falling back to "default").
SNAKEMAKE_BATCH_ARG=()
if [[ -n "${SNAPPY_BATCH-}" ]]; then
    SNAKEMAKE_BATCH_ARG=(--batch "${SNAKEMAKE_BATCH_RULE-default}=${SNAPPY_BATCH}")
fi

# Using the medium project/queue is a sensible default.
# - The ${arr[@]+...} form expands to nothing for an empty array without
#   tripping `set -u` on older bash versions.
# - "$@" forwards the script's arguments unchanged; an unquoted $* would
#   re-split them on whitespace and expand globs.
snappy-snake --printshellcmds \
    ${SNAKEMAKE_BATCH_ARG[@]+"${SNAKEMAKE_BATCH_ARG[@]}"} \
    --snappy-pipeline-use-profile "cubi-v1" \
    --snappy-pipeline-jobs "${MAX_JOBS}" \
    --restart-times "${RESTART_TIMES}" \
    --default-partition="medium" \
    --rerun-incomplete \
    -- \
    "$@"

# Print date after finishing, for good measure ------------------------------

>&2 date
>&2 echo "All done. Have a nice day."
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Loading

0 comments on commit b971088

Please sign in to comment.