feat: switch GATK 3 HC/UG to GNU parallel based multithreading (#301) #302

Merged

50 changes: 50 additions & 0 deletions README.md
@@ -47,3 +47,53 @@ See [user installation](docs/quickstart.rst) if you just want to use the pipeline

See [developer installation](docs/installation.rst) for getting started with working on the pipeline code and building the documentation.

## Using GATK3

Some wrappers rely on GATK 3.
GATK v3 is not free software and cannot be redistributed.
Previously, we distributed it through an internal CUBI conda server, but this limited use of the wrappers by the general public.
Now, GATK 3 has to be registered manually in the conda environments of the pipeline steps that use it, as described below.

If you are a member of CUBI, you can use the central GATK download.
Alternatively, you can download the tarball [from the Broad archive](https://storage.googleapis.com/gatk-software/package-archive/gatk/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef.tar.bz2).

```bash
$ ls -lh /fast/groups/cubi/work/projects/biotools/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef.tar.bz2
-rw-rw---- 1 holtgrem_c hpc-ag-cubi 14M Dec 19 2019 /fast/groups/cubi/work/projects/biotools/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef.tar.bz2
```
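
If you are not a member of CUBI, you can fetch the tarball yourself. A minimal sketch using `wget` (any download tool and target directory will do):

```bash
# Download the GATK 3.8-1 archive (~14 MB) from the Broad archive.
$ wget https://storage.googleapis.com/gatk-software/package-archive/gatk/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef.tar.bz2
```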

First, go to the directory of the pipeline step that you want to run:

```bash
$ cd variant_calling
```

Explicitly create any missing conda environments:

```bash
$ snappy-snake --conda-create-envs-only
[...]
12-27 17:18 snakemake.logging WARNING Downloading and installing remote packages.
[...]
```

Find out which conda environments use GATK v3:

```bash
$ grep 'gatk.*3' .snakemake/conda/*.yaml
.snakemake/conda/d76b719b718c942f8e49e55059e956a6.yaml: - gatk =3
```

Activate each of these conda environments and register the GATK 3 tarball:

```bash
$ for yaml in $(grep -l 'gatk.*3' .snakemake/conda/*.yaml); do
environ=${yaml%.yaml};
conda activate $environ
gatk3-register /fast/groups/cubi/work/projects/biotools/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef.tar.bz2
conda deactivate
done
Moving GenomeAnalysisTK-3.8-1-0-gf15c1c3ef.tar.bz2 to /home/holtgrem_c/miniconda3/envs/gatk3/opt/gatk-3.8
```

You are now ready to run GATK v3 from these environments.
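
As a quick sanity check, you can print the GATK version from one of the registered environments. This is only a sketch: the `gatk3` entry point and its `--version` flag come from the bioconda `gatk` package rather than from this pipeline, and the environment hash will differ on your system.

```bash
# Use the environment prefix reported by the grep step above (path without the .yaml suffix).
$ conda activate .snakemake/conda/d76b719b718c942f8e49e55059e956a6
$ gatk3 --version   # expected to report 3.8-1-0-gf15c1c3ef once registration succeeded
$ conda deactivate
```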
10 changes: 10 additions & 0 deletions snappy_pipeline/apps/snappy_snake.py
@@ -210,6 +210,9 @@ def run(wrapper_args): # noqa: C901
    snakemake_argv.append("--reason")
    if not wrapper_args.snappy_pipeline_use_profile:
        snakemake_argv += ["--cores", str(wrapper_args.cores or 1)]
    if wrapper_args.conda_create_envs_only:
        snakemake_argv.append("--conda-create-envs-only")
        wrapper_args.use_conda = True
    if wrapper_args.use_conda:
        snakemake_argv.append("--use-conda")
        if mamba_available and wrapper_args.use_mamba:
@@ -403,6 +406,13 @@ def main(argv=None):
        default=True,
        help="Disable usage of conda",
    )
    group.add_argument(
        "--conda-create-envs-only",
        dest="conda_create_envs_only",
        action="store_true",
        default=False,
        help="Prepare all conda environments",
    )

    args = parser.parse_args(argv)

20 changes: 20 additions & 0 deletions snappy_pipeline/workflows/abstract/common.py
@@ -0,0 +1,20 @@
"""Commonly used code and types"""

import typing

#: Type for a Snakemake key/path dict.
SnakemakeDict = typing.Dict[str, typing.Union[typing.List[str], str]]

#: Type for generating pairs with key/path(s) mapping for our workflows.
SnakemakeDictItemsGenerator = typing.Generator[
    typing.Tuple[str, typing.Union[typing.List[str], str]],
    None,
    None,
]

#: Type for generating path(s) mapping for our workflows.
SnakemakeListItemsGenerator = typing.Generator[
    str,
    None,
    None,
]
5 changes: 5 additions & 0 deletions snappy_pipeline/workflows/abstract/exceptions.py
@@ -0,0 +1,5 @@
"""Exceptions used in the SNAPPY pipeline"""


class InvalidConfigurationException(Exception):
"""Raised on invalid configuration"""
5 changes: 5 additions & 0 deletions snappy_pipeline/workflows/abstract/warnings.py
@@ -0,0 +1,5 @@
"""Warnings used in SNAPPY Pipeline"""


class InconsistentPedigreeWarning(UserWarning):
"""Raised on inconsistencies with pedigree."""
18 changes: 12 additions & 6 deletions snappy_pipeline/workflows/ngs_mapping/__init__.py
@@ -1,10 +1,8 @@
# -*- coding: utf-8 -*-
"""Implementation of the ``ngs_mapping`` step

The ngs_mapping step allows for the alignment of NGS data with standard read mappers, such as BWA
for DNA data and STAR for RNA data. Also, it provides functionality to compute post-alignment
statistics, such as the coverage of target (e.g., exome or panel) regions and genome-wide per base
pair coverage.
statistics, such as the coverage of target (e.g., exome or panel) regions.

There is a distinction made between "normal" DNA reads (short reads from Illumina) and "long"
DNA reads, such as PacBio/Oxford Nanopore. Again, the NGS mapping step will perform alignment
@@ -17,12 +15,20 @@
configuration).

==========
Stability
Properties
==========

The ngs_mapping pipeline step is considered stable for short Illumina reads (DNA and RNA).
overall stability

The long read mapping steps may not be stable as they are currently considered experimental.
**stable**

applicable to

germline and somatic read alignment

generally applicable to

short and long read DNA and RNA sequencing

.. _ngs_mapping_step_input:

32 changes: 16 additions & 16 deletions snappy_pipeline/workflows/roh_calling/__init__.py
@@ -45,22 +45,22 @@
::

output/
+-- bwa.gatk_ug.bcftools_roh.P001-N1-DNA1-WGS1
+-- bwa.gatk3_ug.bcftools_roh.P001-N1-DNA1-WGS1
|   `-- out
│   +-- bwa.gatk_ug.bcftools_roh.P001-N1-DNA1-WGS1.bed.gz
│   +-- bwa.gatk_ug.bcftools_roh.P001-N1-DNA1-WGS1.bed.gz.md5
│   +-- bwa.gatk_ug.bcftools_roh.P001-N1-DNA1-WGS1.bed.gz.tbi
│   +-- bwa.gatk_ug.bcftools_roh.P001-N1-DNA1-WGS1.bed.gz.tbi.md5
│   +-- bwa.gatk_ug.bcftools_roh.P001-N1-DNA1-WGS1.txt.gz
│   +-- bwa.gatk_ug.bcftools_roh.P001-N1-DNA1-WGS1.txt.gz.md5
│   +-- bwa.gatk_ug.bcftools_roh.P002-N1-DNA1-WGS1.bed.gz
│   +-- bwa.gatk_ug.bcftools_roh.P002-N1-DNA1-WGS1.bed.gz.md5
│   +-- bwa.gatk_ug.bcftools_roh.P002-N1-DNA1-WGS1.bed.gz.tbi
│   +-- bwa.gatk_ug.bcftools_roh.P002-N1-DNA1-WGS1.bed.gz.tbi.md5
│   +-- bwa.gatk_ug.bcftools_roh.P003-N1-DNA1-WGS1.bed.gz
│   +-- bwa.gatk_ug.bcftools_roh.P003-N1-DNA1-WGS1.bed.gz.md5
│   +-- bwa.gatk_ug.bcftools_roh.P003-N1-DNA1-WGS1.bed.gz.tbi
│   `-- bwa.gatk_ug.bcftools_roh.P003-N1-DNA1-WGS1.bed.gz.tbi.md5
│   +-- bwa.gatk3_ug.bcftools_roh.P001-N1-DNA1-WGS1.bed.gz
│   +-- bwa.gatk3_ug.bcftools_roh.P001-N1-DNA1-WGS1.bed.gz.md5
│   +-- bwa.gatk3_ug.bcftools_roh.P001-N1-DNA1-WGS1.bed.gz.tbi
│   +-- bwa.gatk3_ug.bcftools_roh.P001-N1-DNA1-WGS1.bed.gz.tbi.md5
│   +-- bwa.gatk3_ug.bcftools_roh.P001-N1-DNA1-WGS1.txt.gz
│   +-- bwa.gatk3_ug.bcftools_roh.P001-N1-DNA1-WGS1.txt.gz.md5
│   +-- bwa.gatk3_ug.bcftools_roh.P002-N1-DNA1-WGS1.bed.gz
│   +-- bwa.gatk3_ug.bcftools_roh.P002-N1-DNA1-WGS1.bed.gz.md5
│   +-- bwa.gatk3_ug.bcftools_roh.P002-N1-DNA1-WGS1.bed.gz.tbi
│   +-- bwa.gatk3_ug.bcftools_roh.P002-N1-DNA1-WGS1.bed.gz.tbi.md5
│   +-- bwa.gatk3_ug.bcftools_roh.P003-N1-DNA1-WGS1.bed.gz
│   +-- bwa.gatk3_ug.bcftools_roh.P003-N1-DNA1-WGS1.bed.gz.md5
│   +-- bwa.gatk3_ug.bcftools_roh.P003-N1-DNA1-WGS1.bed.gz.tbi
│   `-- bwa.gatk3_ug.bcftools_roh.P003-N1-DNA1-WGS1.bed.gz.tbi.md5
[...]

====================
@@ -119,7 +119,7 @@
tools_ngs_mapping:
- bwa
tools_variant_calling:
- gatk_hc
- gatk3_hc
tools: [bcftools_roh] # REQUIRED, available: 'bcftools_roh'.
bcftools_roh:
  path_targets: null
12 changes: 6 additions & 6 deletions snappy_pipeline/workflows/variant_annotation/__init__.py
@@ -35,17 +35,17 @@
- ``{mapper}.{var_caller}.{var_annotator}.{lib_name}.vcf.gz.tbi.md5``


For example, annotation of gatk_hc variant calls could look like this:
For example, annotation of gatk3_hc variant calls could look like this:

::

output/
+-- bwa.gatk_hc.jannovar_annotate_vcf.P001-N1-DNA1-WES1
+-- bwa.gatk3_hc.jannovar_annotate_vcf.P001-N1-DNA1-WES1
| `-- out
| |-- bwa.gatk_hc.jannovar_annotate_vcf.P001-N1-DNA1-WES1.vcf.gz
| |-- bwa.gatk_hc.jannovar_annotate_vcf.P001-N1-DNA1-WES1.vcf.gz.tbi
| |-- bwa.gatk_hc.jannovar_annotate_vcf.P001-N1-DNA1-WES1.vcf.gz.md5
| `-- bwa.gatk_hc.jannovar_annotate_vcf.P001-N1-DNA1-WES1.vcf.gz.tbi.md5
| |-- bwa.gatk3_hc.jannovar_annotate_vcf.P001-N1-DNA1-WES1.vcf.gz
| |-- bwa.gatk3_hc.jannovar_annotate_vcf.P001-N1-DNA1-WES1.vcf.gz.tbi
| |-- bwa.gatk3_hc.jannovar_annotate_vcf.P001-N1-DNA1-WES1.vcf.gz.md5
| `-- bwa.gatk3_hc.jannovar_annotate_vcf.P001-N1-DNA1-WES1.vcf.gz.tbi.md5
[...]

====================
80 changes: 37 additions & 43 deletions snappy_pipeline/workflows/variant_calling/Snakefile
@@ -51,45 +51,61 @@ rule variant_calling_write_pedigree_run:
        wf.substep_dispatch("write_pedigree", "run", wildcards, output)


# Run bcftools ----------------------------------------------------------------
# Run bcftools call -----------------------------------------------------------


rule variant_calling_bcftools_run:
rule variant_calling_bcftools_call_run:
    input:
        unpack(wf.get_input_files("bcftools", "run")),
        unpack(wf.get_input_files("bcftools_call", "run")),
    output:
        **wf.get_output_files("bcftools", "run"),
    threads: wf.get_resource("bcftools", "run", "threads")
        **wf.get_output_files("bcftools_call", "run"),
    threads: wf.get_resource("bcftools_call", "run", "threads")
    resources:
        time=wf.get_resource("bcftools", "run", "time"),
        memory=wf.get_resource("bcftools", "run", "memory"),
        partition=wf.get_resource("bcftools", "run", "partition"),
        time=wf.get_resource("bcftools_call", "run", "time"),
        memory=wf.get_resource("bcftools_call", "run", "memory"),
        partition=wf.get_resource("bcftools_call", "run", "partition"),
    log:
        **wf.get_log_file("bcftools", "run"),
        **wf.get_log_file("bcftools_call", "run"),
    wrapper:
        wf.wrapper_path("bcftools_call")


# Run GATK HaplotypeCaller (direct, pedigree/multi-sample) --------------------


rule variant_calling_gatk_hc_run:
rule variant_calling_gatk3_hc_run:
    input:
        unpack(wf.get_input_files("gatk_hc", "run")),
        unpack(wf.get_input_files("gatk3_hc", "run")),
    output:
        **wf.get_output_files("gatk_hc", "run"),
    threads: wf.get_resource("gatk_hc", "run", "threads")
        **wf.get_output_files("gatk3_hc", "run"),
    threads: wf.get_resource("gatk3_hc", "run", "threads")
    resources:
        time=wf.get_resource("gatk_hc", "run", "time"),
        memory=wf.get_resource("gatk_hc", "run", "memory"),
        partition=wf.get_resource("gatk_hc", "run", "partition"),
    params:
        step_key="variant_calling",
        caller_key="gatk_hc",
        time=wf.get_resource("gatk3_hc", "run", "time"),
        memory=wf.get_resource("gatk3_hc", "run", "memory"),
        partition=wf.get_resource("gatk3_hc", "run", "partition"),
    log:
        **wf.get_log_file("gatk3_hc", "run"),
    wrapper:
        wf.wrapper_path("gatk3_hc")


# Run GATK UnifiedGenotyper ---------------------------------------------------


rule variant_calling_gatk3_ug_run:
    input:
        unpack(wf.get_input_files("gatk3_ug", "run")),
    output:
        **wf.get_output_files("gatk3_ug", "run"),
    threads: wf.get_resource("gatk3_ug", "run", "threads")
    resources:
        time=wf.get_resource("gatk3_ug", "run", "time"),
        memory=wf.get_resource("gatk3_ug", "run", "memory"),
        partition=wf.get_resource("gatk3_ug", "run", "partition"),
    log:
        **wf.get_log_file("gatk_hc", "run"),
        **wf.get_log_file("gatk3_ug", "run"),
    wrapper:
        wf.wrapper_path("gatk_hc_par")
        wf.wrapper_path("gatk3_ug")


# Run GATK 4 HaplotypeCaller Joint (direct, pedigree/multi-sample) ------------
@@ -174,28 +190,6 @@ rule variant_calling_gatk4_hc_gvcf_genotype:
        wf.wrapper_path("gatk4_hc/genotype")


# Run GATK UnifiedGenotyper ---------------------------------------------------


rule variant_calling_gatk_ug_run:
    input:
        unpack(wf.get_input_files("gatk_ug", "run")),
    output:
        **wf.get_output_files("gatk_ug", "run"),
    threads: wf.get_resource("gatk_ug", "run", "threads")
    resources:
        time=wf.get_resource("gatk_ug", "run", "time"),
        memory=wf.get_resource("gatk_ug", "run", "memory"),
        partition=wf.get_resource("gatk_ug", "run", "partition"),
    params:
        step_key="variant_calling",
        caller_key="gatk_ug",
    log:
        **wf.get_log_file("gatk_ug", "run"),
    wrapper:
        wf.wrapper_path("gatk_ug_par")


# QC / Statistics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Generate BCFtools stats report ----------------------------------------------