From 3faf66e2b7971b789b3c8475019f9c0893d35967 Mon Sep 17 00:00:00 2001
From: Eric Blanc <eric@Euclide.local>
Date: Mon, 24 Jul 2023 23:16:30 +0200
Subject: [PATCH 1/9] refactor: segment output as bed files

---
 .../somatic_targeted_seq_cnv_calling/__init__.py  |  6 +++---
 .../wrappers/cnvkit/postprocess/environment.yaml  |  1 +
 .../wrappers/cnvkit/postprocess/wrapper.py        | 15 ++++++++++++++-
 ..._workflows_somatic_targeted_seq_cnv_calling.py |  8 ++++----
 4 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py
index ddea1c5f7..1de7934ac 100644
--- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py
+++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py
@@ -621,8 +621,8 @@ def _get_output_files_call():
     @staticmethod
     def _get_output_files_postprocess():
         name_pattern = "{mapper}.cnvkit.{library_name}"
-        tpl = os.path.join("work", name_pattern, "out", name_pattern + ".cns")
-        return {"final": tpl, "final_md5": tpl + ".md5"}
+        tpl = os.path.join("work", name_pattern, "out", name_pattern + ".bed.gz")
+        return {"final": tpl, "final_tbi": tpl + ".tbi", "final_md5": tpl + ".md5", "final_tbi_md5": tpl + ".tbi.md5"}
 
     @dictify
     def _get_output_files_plot(self):
@@ -652,7 +652,7 @@ def _get_output_files_plot(self):
 
     @staticmethod
     def _get_output_files_export():
-        exports = (("bed", "bed"), ("seg", "seg"), ("vcf", "vcf.gz"), ("tbi", "vcf.gz.tbi"))
+        exports = (("seg", "seg"), ("vcf", "vcf.gz"), ("tbi", "vcf.gz.tbi"))
         output_files = {}
         tpl = (
             "work/{{mapper}}.cnvkit.{{library_name}}/out/"
diff --git a/snappy_wrappers/wrappers/cnvkit/postprocess/environment.yaml b/snappy_wrappers/wrappers/cnvkit/postprocess/environment.yaml
index e936461b1..28ebc9a4e 100644
--- a/snappy_wrappers/wrappers/cnvkit/postprocess/environment.yaml
+++ b/snappy_wrappers/wrappers/cnvkit/postprocess/environment.yaml
@@ -5,3 +5,4 @@ channels:
 dependencies:
 - r-base
 - bioconductor-genomicranges
+- htslib
diff --git a/snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py b/snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py
index f79a46585..f3e4a0106 100644
--- a/snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py
@@ -30,6 +30,8 @@
 
 # -----------------------------------------------------------------------------
 
+uncompressed=$(echo "{snakemake.output.final}" | sed -e "s/\.gz$//")
+
 cat << _EOF | R --vanilla --slave
 segments <- read.table("{snakemake.input.segment}", sep="\t", header=1, stringsAsFactors=FALSE)
 stopifnot(all(c("chromosome", "start", "end") %in% colnames(segments)))
@@ -63,13 +65,24 @@
 segments[,"start"] <- segments[["start"]]-1
 segments[,"end"]   <- segments[["end"]]+1
 
-write.table(segments, file="{snakemake.output.final}", sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
+# Create bed file (abusing name, score & strand to make it bed-6 compatible)
+segments[,"start"]  <- segments[["start"]]-1     # Bed files are 0-based, semi-open intervals
+segments[,"name"]   <- sprintf("seg_%d_cn_%d", 1:nrow(segments), segments[["cn"]])
+segments[,"score"]  <- pmin(1000, abs(segments[["log2"]]))
+segments[,"strand"] <- "+"
+segments[segments[["log2"]]<0,"strand"] <- "-"
+
+write.table(segments[,c("chromosome", "start", "end", "name", "score", "strand")], file="$uncompressed", sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE)
 _EOF
 
+bgzip $uncompressed
+tabix {snakemake.output.final}
+
 d=$(dirname "{snakemake.output.final}")
 pushd $d
 fn=$(basename "{snakemake.output.final}")
 md5sum $fn > $fn.md5
+md5sum $fn.tbi > $fn.tbi.md5
 popd
 """
 )
diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py
index fd756d9a4..975ccd978 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py
@@ -472,8 +472,8 @@ def test_cnvkit_postprocess_step_part_get_input_files(somatic_targeted_seq_cnv_c
 
 
 def test_cnvkit_postprocess_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow):
-    base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cns"
-    expected = {"final": base_name_out, "final_md5": base_name_out + ".md5"}
+    base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.bed.gz"
+    expected = {"final": base_name_out, "final_tbi": base_name_out + ".tbi", "final_md5": base_name_out + ".md5", "final_tbi_md5": base_name_out + ".tbi.md5"}
     actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "postprocess")
     assert actual == expected
 
@@ -582,7 +582,7 @@ def test_cnvkit_export_step_part_get_output_files(somatic_targeted_seq_cnv_calli
     # Define expected
     expected = {}
     base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}"
-    for key, ext in (("bed", "bed"), ("seg", "seg"), ("vcf", "vcf.gz"), ("tbi", "vcf.gz.tbi")):
+    for key, ext in (("seg", "seg"), ("vcf", "vcf.gz"), ("tbi", "vcf.gz.tbi")):
         expected[key] = base_name_out + "." + ext
         expected[key + "_md5"] = expected[key] + ".md5"
     # Get actual
@@ -870,7 +870,7 @@ def test_somatic_targeted_seq_cnv_calling_workflow(somatic_targeted_seq_cnv_call
     expected += [
         tpl.format(i=i, t=t, ext=ext, md5=md5)
         for i, t in ((1, 1), (2, 1), (2, 2))
-        for ext in ("cnr", "cns", "bed", "seg", "vcf.gz", "vcf.gz.tbi")
+        for ext in ("cnr", "bed.gz", "bed.gz.tbi", "seg", "vcf.gz", "vcf.gz.tbi")
         for md5 in ("", ".md5")
     ]
     tpl = (

From b1ebeafbec3bf1dbcdcaf2d0e6e57d0aaf77b5ef Mon Sep 17 00:00:00 2001
From: Eric Blanc <eric@Euclide.local>
Date: Mon, 24 Jul 2023 23:19:14 +0200
Subject: [PATCH 2/9] feat: Add the CNV checking step (BAF of heterozygous SNPs
 vs CNV)

---
 snappy_pipeline/apps/snappy_snake.py          |   2 +
 .../workflows/somatic_cnv_checking/Snakefile  | 132 ++++++
 .../somatic_cnv_checking/__init__.py          | 395 ++++++++++++++++++
 .../heterozygous_variants/environment.yaml    |   1 +
 .../bcftools/heterozygous_variants/wrapper.py |  65 +++
 .../bcftools/pileups/environment.yaml         |   1 +
 .../wrappers/bcftools/pileups/wrapper.py      |  55 +++
 .../somatic_cnv_checking/cnv-check-plot.R     |  82 ++++
 .../somatic_cnv_checking/environment.yaml     |   8 +
 .../wrappers/somatic_cnv_checking/wrapper.py  |  83 ++++
 .../wrappers/vcfpy/add_bed/environment.yaml   |   7 +
 .../wrappers/vcfpy/add_bed/wrapper.py         |  86 ++++
 .../test_workflows_somatic_cnv_checking.py    | 277 ++++++++++++
 13 files changed, 1194 insertions(+)
 create mode 100644 snappy_pipeline/workflows/somatic_cnv_checking/Snakefile
 create mode 100644 snappy_pipeline/workflows/somatic_cnv_checking/__init__.py
 create mode 120000 snappy_wrappers/wrappers/bcftools/heterozygous_variants/environment.yaml
 create mode 100644 snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py
 create mode 120000 snappy_wrappers/wrappers/bcftools/pileups/environment.yaml
 create mode 100644 snappy_wrappers/wrappers/bcftools/pileups/wrapper.py
 create mode 100644 snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R
 create mode 100644 snappy_wrappers/wrappers/somatic_cnv_checking/environment.yaml
 create mode 100644 snappy_wrappers/wrappers/somatic_cnv_checking/wrapper.py
 create mode 100644 snappy_wrappers/wrappers/vcfpy/add_bed/environment.yaml
 create mode 100644 snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py
 create mode 100644 tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py

diff --git a/snappy_pipeline/apps/snappy_snake.py b/snappy_pipeline/apps/snappy_snake.py
index c47f50b94..6948b0682 100644
--- a/snappy_pipeline/apps/snappy_snake.py
+++ b/snappy_pipeline/apps/snappy_snake.py
@@ -31,6 +31,7 @@
     ngs_sanity_checking,
     panel_of_normals,
     repeat_expansion,
+    somatic_cnv_checking,
     somatic_gene_fusion_calling,
     somatic_hla_loh_calling,
     somatic_msi_calling,
@@ -86,6 +87,7 @@
     "panel_of_normals": panel_of_normals,
     "repeat_analysis": repeat_expansion,
     "ngs_sanity_checking": ngs_sanity_checking,
+    "somatic_cnv_checking": somatic_cnv_checking,
     "somatic_gene_fusion_calling": somatic_gene_fusion_calling,
     "somatic_hla_loh_calling": somatic_hla_loh_calling,
     "somatic_msi_calling": somatic_msi_calling,
diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile b/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile
new file mode 100644
index 000000000..258ca1a3c
--- /dev/null
+++ b/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile
@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+"""CUBI Pipeline somatic_cnv_checking step Snakefile"""
+
+import os
+
+from snappy_pipeline import expand_ref
+from snappy_pipeline.workflows.somatic_cnv_checking import SomaticCnvCheckingWorkflow
+
+__author__ = "Eric Blanc <eric.blanc@bih-charite.de>"
+
+# Configuration ===============================================================
+
+
+configfile: "config.yaml"
+
+
+# Expand "$ref" JSON pointers in configuration (also works for YAML)
+config, lookup_paths, config_paths = expand_ref("config.yaml", config)
+
+# WorkflowImpl Object Setup ===================================================
+
+wf = SomaticCnvCheckingWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd())
+
+# Rules =======================================================================
+
+
+localrules:
+    # Linking files from work/ to output/ should be done locally
+    somatic_cnv_checking_link_out_run,
+
+
+rule all:
+    input:
+        wf.get_result_files(),
+
+
+# House-Keeping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+# Generic linking out ---------------------------------------------------------
+
+
+rule somatic_cnv_checking_link_out_run:
+    input:
+        wf.get_input_files("link_out", "run"),
+    output:
+        wf.get_output_files("link_out", "run"),
+    run:
+        shell(wf.get_shell_cmd("link_out", "run", wildcards))
+
+
+# Somatic CNV Checking ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+# Run pileup to identify heterozygous variants in normal sample ---------------
+
+
+rule somatic_cnv_checking_pileup_normal:
+    input:
+        unpack(wf.get_input_files("pileup", "normal")),
+    output:
+        **wf.get_output_files("pileup", "normal"),
+    threads: wf.get_resource("pileup", "normal", "threads")
+    resources:
+        time=wf.get_resource("pileup", "normal", "time"),
+        memory=wf.get_resource("pileup", "normal", "memory"),
+        partition=wf.get_resource("pileup", "normal", "partition"),
+        tmpdir=wf.get_resource("pileup", "normal", "tmpdir"),
+    log:
+        **wf.get_log_file("pileup", "normal"),
+    wrapper:
+        wf.wrapper_path("bcftools/heterozygous")
+
+
+# Run pileup to add read counts in the tumor sample ---------------------------
+
+
+rule somatic_cnv_checking_pileup_tumor:
+    input:
+        unpack(wf.get_input_files("pileup", "tumor")),
+    output:
+        **wf.get_output_files("pileup", "tumor"),
+    threads: wf.get_resource("pileup", "tumor", "threads")
+    resources:
+        time=wf.get_resource("pileup", "tumor", "time"),
+        memory=wf.get_resource("pileup", "tumor", "memory"),
+        partition=wf.get_resource("pileup", "tumor", "partition"),
+        tmpdir=wf.get_resource("pileup", "tumor", "tmpdir"),
+    log:
+        **wf.get_log_file("pileup", "tumor"),
+    wrapper:
+        wf.wrapper_path("bcftools/pileups")
+
+
+# Add CNV status at the locii -------------------------------------------------
+
+
+rule somatic_cnv_checking_cnv_run:
+    input:
+        **wf.get_input_files("cnv", "run"),
+    output:
+        **wf.get_output_files("cnv", "run"),
+    threads: wf.get_resource("cnv", "run", "threads")
+    resources:
+        time=wf.get_resource("cnv", "run", "time"),
+        memory=wf.get_resource("cnv", "run", "memory"),
+        partition=wf.get_resource("cnv", "run", "partition"),
+        tmpdir=wf.get_resource("cnv", "run", "tmpdir"),
+    log:
+        **wf.get_log_file("cnv", "run"),
+    wrapper:
+        wf.wrapper_path("vcfpy/add_bed")
+
+
+# Generate report & plots -----------------------------------------------------
+
+
+rule somatic_cnv_checking_cnv_report:
+    input:
+        **wf.get_input_files("report", "run"),
+    output:
+        **wf.get_output_files("report", "run"),
+    params:
+        **{"args": wf.get_params("report", "run")}
+    threads: wf.get_resource("report", "run", "threads")
+    resources:
+        time=wf.get_resource("report", "run", "time"),
+        memory=wf.get_resource("report", "run", "memory"),
+        partition=wf.get_resource("report", "run", "partition"),
+        tmpdir=wf.get_resource("report", "run", "tmpdir"),
+    log:
+        **wf.get_log_file("report", "run"),
+    wrapper:
+        wf.wrapper_path("somatic_cnv_checking")
diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py
new file mode 100644
index 000000000..46a0a2729
--- /dev/null
+++ b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py
@@ -0,0 +1,395 @@
+# -*- coding: utf-8 -*-
+"""Implementation of the ``somatic_cnv_checking`` step
+
+The ``somatic_cnv_checking`` step takes as the input the results of the ``ngs_mapping`` step
+(aligned reads in BAM format) and performs pileups at germline locii to identify heterozygous
+variants, and their B-allele frequency in the paired tumor sample. If one somatic CNV calling step
+(either ``somatic_wgs_cnv_calling`` or ``somatic_targetd_seq_cnv_calling``) is present, the
+log2 convarage ratio and copy number call are added to the output vcf.
+
+==========
+Step Input
+==========
+
+The somatic CNV checking step uses Snakemake sub workflows for using the result of the
+``ngs_mapping`` step. Optionally, it can also use one Snakemake sub workflows for somatic CNV
+calling, either``somatic_wgs_cnv_calling`` or ``somatic_targetd_seq_cnv_calling``.
+
+===========
+Step Output
+===========
+
+For each tumor DNA NGS library with name ``lib_name``/key ``lib_pk`` and each read mapper
+``mapper`` that the library has been aligned with, and the CNV caller ``caller``, the
+pipeline step will create a directory ``output/{mapper}.{caller}.{lib_name}-{lib_pk}/out``
+with symlinks of the following names to the resulting VCF, TBI, and MD5 files.
+
+- ``{mapper}.{var_caller}.{lib_name}-{lib_pk}.vcf.gz``
+- ``{mapper}.{var_caller}.{lib_name}-{lib_pk}.vcf.gz.tbi``
+- ``{mapper}.{var_caller}.{lib_name}-{lib_pk}.vcf.gz.md5``
+- ``{mapper}.{var_caller}.{lib_name}-{lib_pk}.vcf.gz.tbi.md5``
+
+For example, it might look as follows for the example from above:
+
+::
+
+    output/
+    +-- bwa.cnvkit.P001-T1-DNA1-WES1-4
+    |   `-- out
+    |       |-- bwa.cnvkit.P001-T1-DNA1-WES1-4.vcf.gz
+    |       |-- bwa.cnvkit.P001-T1-DNA1-WES1-4.vcf.gz.tbi
+    |       |-- bwa.cnvkit.P001-T1-DNA1-WES1-4.vcf.gz.md5
+    |       `-- bwa.cnvkit.P001-T1-DNA1-WES1-4.vcf.gz.tbi.md5
+    [...]
+
+=====================
+Default Configuration
+=====================
+
+The default configuration is as follows.
+
+.. include:: DEFAULT_CONFIG_somatic_cnv_checking.rst
+
+=======
+Reports
+=======
+
+Currently, no reports are generated.
+"""
+
+from collections import OrderedDict
+import os
+import sys
+
+from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background
+from snakemake.io import expand
+
+from snappy_pipeline.utils import dictify, listify
+from snappy_pipeline.workflows.abstract import (
+    BaseStep,
+    BaseStepPart,
+    LinkOutStepPart,
+    ResourceUsage,
+)
+from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow
+
+__author__ = "Eric Blanc <eric.blanc@bih-charite.de>"
+
+#: Extensions of files to create as main payload
+EXT_VALUES = (".vcf.gz", ".vcf.gz.tbi", ".vcf.gz.md5", ".vcf.gz.tbi.md5")
+
+#: Names of the files to create for the extension
+EXT_NAMES = ("vcf", "vcf_tbi", "vcf_md5", "vcf_tbi_md5")
+
+#: Default configuration for the somatic_cnv_checking schema
+DEFAULT_CONFIG = r"""
+# Default configuration somatic_cnv_checking
+step_config:
+  somatic_cnv_checking:
+    path_ngs_mapping: ../ngs_mapping  # REQUIRED
+    path_cnv_calling: ""              # Can use for instance ../somatic_targeted_seq_cnv_calling
+    cnv_calling_tool: ""              # Can use "cnvkit"
+    excluded_regions: ""              # Bed file of regions to be excluded
+    max_depth: 10000                  # Max depth for pileups
+    min_cov: 20                       # Minimum depth for reference and alternative alleles to consider variant
+    min_baf: 0.4                      # Maximum BAF to consider variant as heterozygous (between 0 & 1/2)
+"""
+
+
+class SomaticCnvCheckingStepPart(BaseStepPart):
+    """Base class for CNV checking sub-steps"""
+        
+    def __init__(self, parent):
+        super().__init__(parent)
+        # Build shortcut from cancer bio sample name to matched cancer sample
+        self.tumor_ngs_library_to_sample_pair = OrderedDict()
+        for sheet in self.parent.shortcut_sheets:
+            self.tumor_ngs_library_to_sample_pair.update(
+                sheet.all_sample_pairs_by_tumor_dna_ngs_library
+            )
+        # Get shorcut to Snakemake sub workflow
+        self.ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
+
+    def get_params(self, action):
+        # Validate action
+        self._validate_action(action)
+
+        def input_function(wildcards):
+            pair = self.tumor_ngs_library_to_sample_pair[wildcards.library]
+            return {"library_name": pair.tumor_sample.dna_ngs_library.name}
+
+        return input_function
+
+    @dictify
+    def _get_log_file(self, prefix):
+        """Return dict of log files."""
+        # Validate action
+        key_ext = (
+            ("log", ".log"),
+            ("conda_info", ".conda_info.txt"),
+            ("conda_list", ".conda_list.txt"),
+        )
+        for key, ext in key_ext:
+            yield key, prefix + ext
+            yield key + "_md5", prefix + ext + ".md5"
+
+
+class SomaticCnvCheckingPileupStepPart(SomaticCnvCheckingStepPart):
+    """Perform pileups at a set of locations"""
+
+    name = "pileup"
+    actions = ("normal", "tumor")
+
+    def get_input_files(self, action):
+        # Validate action
+        self._validate_action(action)
+
+        def input_function_normal(wildcards):
+            pair = self.tumor_ngs_library_to_sample_pair[wildcards.library]
+            base_path = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format(
+                normal_library=pair.normal_sample.dna_ngs_library.name, **wildcards
+            )
+            return {
+                "bam": self.ngs_mapping(base_path + ".bam"),
+                "bai": self.ngs_mapping(base_path + ".bam.bai"),
+            }
+
+        def input_function_tumor(wildcards):
+            pair = self.tumor_ngs_library_to_sample_pair[wildcards.library]
+            base_path = "output/{mapper}.{tumor_library}/out/{mapper}.{tumor_library}".format(
+                tumor_library=pair.tumor_sample.dna_ngs_library.name, **wildcards
+            )
+            return {
+                "vcf": "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz".format(
+                    **wildcards
+                ),
+                "tbi": "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz.tbi".format(
+                    **wildcards
+                ),
+                "bam": self.ngs_mapping(base_path + ".bam"),
+                "bai": self.ngs_mapping(base_path + ".bam.bai"),
+            }
+
+        if action == "normal":
+            return input_function_normal
+        else:
+            return input_function_tumor
+
+    def get_output_files(self, action):
+        """Return output files that all somatic variant calling sub steps must
+        return (VCF + TBI file)
+        """
+        # Validate action
+        self._validate_action(action)
+        base_path_out = "work/{{mapper}}.{{library}}/out/{{mapper}}.{{library}}.{action}{ext}"
+        return dict(zip(EXT_NAMES, expand(base_path_out, action=action, ext=EXT_VALUES)))
+
+    def get_log_file(self, action):
+        # Validate action
+        self._validate_action(action)
+        return self._get_log_file("work/{mapper}.{library}/log/{mapper}.{library}." + action)
+
+    def get_resource_usage(self, action):
+        # Validate action
+        self._validate_action(action)
+        return ResourceUsage(
+            threads=2,
+            time="01:00:00" if action == "tumor" else "12:00:00",  # 1 hour
+            memory=f"{int(3.7 * 1024 * 2)}M",
+        )
+
+
+class SomaticCnvCheckingCnvStepPart(SomaticCnvCheckingStepPart):
+    """Merging heterozygous variants with CNV data"""
+
+    name = "cnv"
+    actions = ("run",)
+
+    def get_input_files(self, action):
+        # Validate action
+        self._validate_action(action)
+        filenames = {}
+        filenames["normal"] = "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz"
+        filenames["normal_tbi"] = filenames["normal"] + ".tbi"
+        filenames["tumor"] = "work/{mapper}.{library}/out/{mapper}.{library}.tumor.vcf.gz"
+        filenames["tumor_tbi"] = filenames["tumor"] + ".tbi"
+        if self.config["path_cnv_calling"]:
+            tpl = "{{mapper}}.{caller}.{{library}}".format(caller=self.config["cnv_calling_tool"])
+            filenames["cnv"] = os.path.join(
+                self.config["path_cnv_calling"], "output", tpl, "out", tpl + ".bed.gz"
+            )
+            filenames["cnv_tbi"] = filenames["cnv"] + ".tbi"
+        return filenames
+
+    def get_output_files(self, action):
+        # Validate action
+        self._validate_action(action)
+        if self.config["path_cnv_calling"] and self.config["cnv_calling_tool"]:
+            name_pattern = "{{mapper}}.{caller}.{{library}}".format(
+                caller=self.config["cnv_calling_tool"]
+            )
+        else:
+            name_pattern = "{mapper}.{library}"
+        base_path_out = "work/" + name_pattern + "/out/" + name_pattern
+        return dict(zip(EXT_NAMES, [base_path_out + ext for ext in EXT_VALUES]))
+
+    def get_log_file(self, action):
+        # Validate action
+        self._validate_action(action)
+        if self.config["path_cnv_calling"] and self.config["cnv_calling_tool"]:
+            name_pattern = "{{mapper}}.{caller}.{{library}}".format(
+                caller=self.config["cnv_calling_tool"]
+            )
+        else:
+            name_pattern = "{mapper}.{library}"
+        return self._get_log_file(os.path.join("work", name_pattern, "log", name_pattern))
+
+
+class SomaticCnvCheckingReportStepPart(SomaticCnvCheckingStepPart):
+    """Plots of BAF vs CNV"""
+
+    name = "report"
+    actions = ("run",)
+
+    def get_input_files(self, action):
+        # Validate action
+        self._validate_action(action)
+        name_pattern = "{{mapper}}.{caller}.{{library}}".format(
+            caller=self.config["cnv_calling_tool"]
+        )
+        base_path_out = "work/" + name_pattern + "/out/" + name_pattern
+        return {"vcf": base_path_out + ".vcf.gz"}
+
+    def get_output_files(self, action):
+        # Validate action
+        self._validate_action(action)
+        name_pattern = "{{mapper}}.{caller}.{{library}}".format(
+            caller=self.config["cnv_calling_tool"]
+        )
+        base_path_out = "work/" + name_pattern + "/report/" + name_pattern
+        return {
+            "cnv": base_path_out + ".cnv.pdf",
+            "cnv_md5": base_path_out + ".cnv.pdf.md5",
+            "locus": base_path_out + ".locus.pdf",
+            "locus_md5": base_path_out + ".locus.pdf.md5",
+        }
+
+    def get_log_file(self, action):
+        # Validate action
+        self._validate_action(action)
+        name_pattern = "{{mapper}}.{caller}.{{library}}".format(
+            caller=self.config["cnv_calling_tool"]
+        )
+        return self._get_log_file(os.path.join("work", name_pattern, "log", name_pattern + ".report"))
+
+
+class SomaticCnvCheckingWorkflow(BaseStep):
+    """Perform somatic cnv checking"""
+
+    #: Workflow name
+    name = "somatic_cnv_checking"
+
+    #: Default biomed sheet class
+    sheet_shortcut_class = CancerCaseSheet
+
+    sheet_shortcut_kwargs = {
+        "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True)
+    }
+
+    @classmethod
+    def default_config_yaml(cls):
+        """Return default config YAML, to be overwritten by project-specific one"""
+        return DEFAULT_CONFIG
+
+    def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir):
+        super().__init__(
+            workflow,
+            config,
+            config_lookup_paths,
+            config_paths,
+            workdir,
+            (NgsMappingWorkflow,),
+        )
+        self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"])
+        # Register sub step classes so the sub steps are available
+        self.register_sub_step_classes(
+            (
+                SomaticCnvCheckingPileupStepPart,
+                SomaticCnvCheckingCnvStepPart,
+                SomaticCnvCheckingReportStepPart,
+                LinkOutStepPart,
+            )
+        )
+        # Initialize sub-workflows
+
+    @listify
+    def get_result_files(self):
+        """Return list of result files for the NGS mapping workflow
+
+        We will process all NGS libraries of all bio samples in all sample sheets.
+        """
+        if self.config["path_cnv_calling"] and self.config["cnv_calling_tool"]:
+            name_pattern = "{{mapper}}.{caller}.{{tumor_library.name}}".format(
+                caller=self.config["cnv_calling_tool"]
+            )
+        else:
+            name_pattern = "{mapper}.{tumor_library.name}"
+        yield from self._yield_result_files_matched(
+            os.path.join("output", name_pattern, "out", name_pattern + "{ext}"),
+            mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
+            ext=EXT_VALUES,
+        )
+        yield from self._yield_result_files_matched(
+            os.path.join("output", name_pattern, "log", name_pattern + "{ext}"),
+            mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
+            ext=[
+                part + "." + ext + chksum
+                for part in ("", ".normal", ".tumor")
+                for ext in ("log", "conda_info.txt", "conda_list.txt")
+                for chksum in ("", ".md5")
+            ],
+        )
+        if self.config["path_cnv_calling"] and self.config["cnv_calling_tool"]:
+            yield from self._yield_result_files_matched(
+                os.path.join("output", name_pattern, "report", name_pattern + "{ext}"),
+                mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
+                ext=(".cnv.pdf", ".cnv.pdf.md5", ".locus.pdf", ".locus.pdf.md5"),
+            )
+            yield from self._yield_result_files_matched(
+                os.path.join("output", name_pattern, "log", name_pattern + ".report" + "{ext}"),
+                mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
+                ext=[
+                    "." + ext + chksum
+                    for ext in ("log", "conda_info.txt", "conda_list.txt")
+                    for chksum in ("", ".md5")
+                ],
+            )
+
+    def _yield_result_files_matched(self, tpl, **kwargs):
+        """Build output paths from path template and extension list.
+
+        This function returns the results from the matched somatic variant callers such as
+        Mutect.
+        """
+        for sheet in filter(is_not_background, self.shortcut_sheets):
+            for sample_pair in sheet.all_sample_pairs:
+                if (
+                    not sample_pair.tumor_sample.dna_ngs_library
+                    or not sample_pair.normal_sample.dna_ngs_library
+                ):
+                    msg = (
+                        "INFO: sample pair for cancer bio sample {} has is missing primary"
+                        "normal or primary cancer NGS library"
+                    )
+                    print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr)
+                    continue
+                yield from expand(
+                    tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs
+                )
+
+    def check_config(self):
+        """Check that the path to the NGS mapping is present"""
+        self.ensure_w_config(
+            ("step_config", "somatic_cnv_checking", "path_ngs_mapping"),
+            "Path to NGS mapping not configured but required for somatic variant calling",
+        )
diff --git a/snappy_wrappers/wrappers/bcftools/heterozygous_variants/environment.yaml b/snappy_wrappers/wrappers/bcftools/heterozygous_variants/environment.yaml
new file mode 120000
index 000000000..2e107ac86
--- /dev/null
+++ b/snappy_wrappers/wrappers/bcftools/heterozygous_variants/environment.yaml
@@ -0,0 +1 @@
+../environment.yaml
\ No newline at end of file
diff --git a/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py b/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py
new file mode 100644
index 000000000..43f4f51d6
--- /dev/null
+++ b/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+"""Wrapper for finding heterozygous variants with bcftools
+"""
+
+from snakemake.shell import shell
+
+step = snakemake.config["pipeline_step"]["name"]
+config = snakemake.config["step_config"][step]
+
+if "args" in snakemake.params and "intervals" in snakemake.params["args"]:
+    locii = "-r " + snakemake.params["args"]["intervals"]
+elif "locii" in snakemake.input:
+    locii = "-R " + snakemake.input.locii
+elif config["locii"]:
+    locii = "-R " + config["locii"]
+else:
+    locii = ""
+
+# Convert minimum B-allele fraction into ratio of alternative to reference alleles
+min_ratio = config["min_baf"] / (1 - config["min_baf"])
+max_ratio = 1 / min_ratio
+
+shell(
+    r"""
+# -----------------------------------------------------------------------------
+# Redirect stderr to log file by default and enable printing executed commands
+exec &> >(tee -a "{snakemake.log.log}")
+set -x
+# -----------------------------------------------------------------------------
+export TMPDIR=$(mktemp -d)
+trap "rm -rf $TMPDIR" EXIT
+
+# Write out information about conda installation
+conda list > {snakemake.log.conda_list}
+conda info > {snakemake.log.conda_info}
+
+only_one_variant="N_ALT=2 & FORMAT/AD[:2]=0"
+min_coverage="FORMAT/AD[:0]>{config[min_cov]} & FORMAT/AD[:1]>{config[min_cov]}"
+hetero="{min_ratio}*FORMAT/AD[:0]<=FORMAT/AD[:1] & FORMAT/AD[:1]<={max_ratio}*FORMAT/AD[:0]"
+
+bcftools mpileup \
+    {locii} \
+    --max-depth {config[max_depth]} \
+    -f {snakemake.config[static_data_config][reference][path]} \
+    -a "FORMAT/AD" \
+    {snakemake.input.bam} \
+    | bcftools filter \
+    --include "$only_one_variant & $min_coverage & $hetero" \
+    -O z -o {snakemake.output.vcf}
+tabix {snakemake.output.vcf}
+
+pushd $(dirname {snakemake.output.vcf})
+md5sum $(basename {snakemake.output.vcf}) > $(basename {snakemake.output.vcf_md5})
+md5sum $(basename {snakemake.output.vcf_tbi}) > $(basename {snakemake.output.vcf_tbi_md5})
+"""
+)
+
+# Compute MD5 sums of logs
+shell(
+    r"""
+md5sum {snakemake.log.log} > {snakemake.log.log_md5}
+md5sum {snakemake.log.conda_list} > {snakemake.log.conda_list_md5}
+md5sum {snakemake.log.conda_info} > {snakemake.log.conda_info_md5}
+"""
+)
diff --git a/snappy_wrappers/wrappers/bcftools/pileups/environment.yaml b/snappy_wrappers/wrappers/bcftools/pileups/environment.yaml
new file mode 120000
index 000000000..2e107ac86
--- /dev/null
+++ b/snappy_wrappers/wrappers/bcftools/pileups/environment.yaml
@@ -0,0 +1 @@
+../environment.yaml
\ No newline at end of file
diff --git a/snappy_wrappers/wrappers/bcftools/pileups/wrapper.py b/snappy_wrappers/wrappers/bcftools/pileups/wrapper.py
new file mode 100644
index 000000000..daf4c5dd1
--- /dev/null
+++ b/snappy_wrappers/wrappers/bcftools/pileups/wrapper.py
@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+"""Wrapper for running bcftools mpileup
+"""
+
+step = snakemake.config["pipeline_step"]["name"]
+config = snakemake.config["step_config"][step]
+
+if "args" in snakemake.params and "intervals" in snakemake.params["args"]:
+    locii = "-r " + snakemake.params["args"]["intervals"]
+elif "locii" in snakemake.input:
+    locii = "-R " + snakemake.input.locii
+elif config["locii"]:
+    locii = "-R " + config["locii"]
+else:
+    locii = ""
+
+# Actually run the script.
+shell(
+    r"""
+# -----------------------------------------------------------------------------
+# Redirect stderr to log file by default and enable printing executed commands
+exec &> >(tee -a "{snakemake.log.log}")
+set -x
+# -----------------------------------------------------------------------------
+export TMPDIR=$(mktemp -d)
+trap "rm -rf $TMPDIR" EXIT
+
+# Write out information about conda installation
+conda list > {snakemake.log.conda_list}
+conda info > {snakemake.log.conda_info}
+
+bcftools mpileup \
+    {locii} \
+    --max-depth {config[max_depth]} \
+    -f {snakemake.config[static_data_config][reference][path]} \
+    -a "FORMAT/AD" \
+    -O z -o {snakemake.output.vcf} \
+    {snakemake.input.bam}
+tabix {snakemake.output.vcf}
+
+pushd $(dirname {snakemake.output.vcf})
+md5sum $(basename {snakemake.output.vcf}) > $(basename {snakemake.output.vcf_md5})
+md5sum $(basename {snakemake.output.vcf_tbi}) > $(basename {snakemake.output.vcf_tbi_md5})
+popd
+"""
+)
+
+# Compute MD5 sums of logs
+shell(
+    r"""
+md5sum {snakemake.log.log} > {snakemake.log.log_md5}
+md5sum {snakemake.log.conda_list} > {snakemake.log.conda_list_md5}
+md5sum {snakemake.log.conda_info} > {snakemake.log.conda_info_md5}
+"""
+)
diff --git a/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R b/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R
new file mode 100644
index 000000000..949b2bdb7
--- /dev/null
+++ b/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R
@@ -0,0 +1,82 @@
+vcf_to_table <- function(vcf, sample, amplification=7) {
+     vcf <- VariantAnnotation::readVcf(vcf)
+     stopifnot(all(lengths(VariantAnnotation::alt(vcf)) == 2))
+     
+     x <- data.frame(
+         CHROM=GenomicRanges::seqnames(SummarizedExperiment::rowRanges(vcf)),
+         POS=GenomicRanges::start(SummarizedExperiment::rowRanges(vcf)),
+         REF=VariantAnnotation::ref(vcf),
+         ALT=unlist(lapply(VariantAnnotation::alt(vcf), function(a) a[1])),
+         t(sapply(VariantAnnotation::geno(vcf)[["AD"]][,sample], function(a) a[1:2])),
+         CN=VariantAnnotation::geno(vcf)[["CN"]][,sample],
+         LFC=VariantAnnotation::geno(vcf)[["LFC"]][,sample]
+     )
+     colnames(x)[5:6] <- c("t_ref", "t_alt")
+     x <- x |>
+         dplyr::filter(!is.na(LFC) & !is.na(CN)) |>
+         dplyr::mutate(BAF=pmin(t_ref, t_alt)/(t_ref+t_alt)) |>
+         dplyr::mutate(Call=factor("Neutral", levels=c("Deletion", "Loss", "Neutral", "Gain", "Amplification"))) |>
+         dplyr::mutate(Call=replace(Call, CN == 0, "Deletion")) |>
+         dplyr::mutate(Call=replace(Call, CN == 1, "Loss")) |>
+         dplyr::mutate(Call=replace(Call, 2 < CN & CN < amplification, "Gain")) |>
+         dplyr::mutate(Call=replace(Call, amplification <= CN, "Amplification"))
+    x
+}
+
+cnv_to_table <- function(cnv) {
+    y <- read.table(cnv, sep="\t", header=0)
+    colnames(y) <- c("CHROM", "start", "stop", "name", "LFC", "strand")
+    y <- y |>
+        dplyr::mutate(LFC=replace(LFC, strand == "-", LFC[strand=="-"])) |>
+        dplyr::mutate(start=start + 1)
+    y
+}
+
+chromosome_lengths <- function(genome, use_fai=TRUE, use_dict=TRUE) {
+    if (use_fai && file.exists(paste(genome, "fai", sep="."))) {
+        genome <- read.table(paste(genome, "fai", sep="."), sep="\t", header=0)
+        colnames(genome) <- c("CHROM", "Length", "index", "ncol", "nbyte")
+    } else {
+        if (use_dict && file.exists(paste(genome, "dict", sep="."))) {
+            pattern <- "^SQ\tSN:([^ \t]+)\tLN:([0-9]+)\t.+$"
+            genome <- grep(pattern, readLines(paste(genome, "dict", sep=".")), value=TRUE)
+            genome <- data.frame(CHROM=sub(pattern, "\\1", genome), Length=as.numeric(sub(pattern, "\\2", genome)))
+        } else {
+            genome <- Biostrings::readDNAStringSet(genome)
+            genome <- data.frame(CHROM=names(genome), Length=Biostrings::width(genome))
+        }
+    }
+    genome <- genome |>
+        dplyr::mutate(CHROM=sub(" .*", "", CHROM)) |>
+        dplyr::mutate(Offset=cumsum(as.numeric(dplyr::lag(Length, default=0)))) |>
+        dplyr::select(CHROM, Length, Offset)
+    genome
+}
+
+plot_cnv <- function(x, scale=c("log2", "sqrt")) {
+    scale <- match.arg(scale)
+    if (scale == "sqrt") x_label <- "Segment Fold Change"
+    else                 x_label <- "Segment Log2 Fold Change"
+    if (scale == "sqrt") x <- x |> dplyr::mutate(LFC=2^LFC)
+    p <- x |>
+        ggplot2::ggplot(ggplot2::aes(x=LFC, y=BAF, color=Call)) +
+        ggplot2::geom_point() +
+        ggplot2::labs(x=x_label, y="B-allele fraction") +
+        ggplot2::theme_bw()
+    if (scale == "sqrt") p <- p + ggplot2::scale_x_sqrt()
+    p
+}
+
+plot_locus <- function(x, genome_lengths) {
+    p <- genome_lengths |>
+        ggplot2::ggplot(ggplot2::aes(xmin=Offset+1, xmax=Offset+Length, ymin=0, ymax=0.5, fill=as.character(n%%2))) +
+        ggplot2::geom_rect() +
+        ggplot2::scale_fill_manual(values=c("white", "grey")) +
+        ggplot2::scale_x_continuous(breaks=genome_lengths$Offset+genome_lengths$Length/2, labels=genome_lengths$CHROM) +
+        ggplot2::geom_point(data=x, mapping=ggplot2::aes(x=x, y=BAF, color=Call), inherit.aes=FALSE) +
+        ggplot2::theme_bw() +
+        ggplot2::theme(axis.text.x=ggplot2::element_text(angle=90, vjust=0.5, hjust=1)) +
+        ggplot2::guides(fill="none") +
+        ggplot2::labs(x="Genomic position", y="B-allele fraction")
+    p
+}
diff --git a/snappy_wrappers/wrappers/somatic_cnv_checking/environment.yaml b/snappy_wrappers/wrappers/somatic_cnv_checking/environment.yaml
new file mode 100644
index 000000000..88eb63da5
--- /dev/null
+++ b/snappy_wrappers/wrappers/somatic_cnv_checking/environment.yaml
@@ -0,0 +1,8 @@
+channels:
+- conda-forge
+- bioconda
+dependencies:
+- r-base
+- r-dplyr
+- bioconductor-variantannotation
+- bioconductor-biostrings
diff --git a/snappy_wrappers/wrappers/somatic_cnv_checking/wrapper.py b/snappy_wrappers/wrappers/somatic_cnv_checking/wrapper.py
new file mode 100644
index 000000000..2fbdbda9c
--- /dev/null
+++ b/snappy_wrappers/wrappers/somatic_cnv_checking/wrapper.py
@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+"""Wrapper for running CopywriteR"""
+
+import os
+
+from snakemake import shell
+
+__author__ = "Eric Blanc <eric.blanc@bih-charite.de>"
+
+shell.executable("/bin/bash")
+
+rscript = os.path.join(os.path.dirname(os.path.realpath(__file__)), "cnv-check-plot.R")
+
+reference = snakemake.config["static_data_config"]["reference"]["path"]
+
+shell(
+    r"""
+set -x
+
+export TMPDIR=$(mktemp -d)
+# trap "rm -rf $TMPDIR" EXIT
+
+# Also pipe stderr to log file
+if [[ -n "{snakemake.log.log}" ]]; then
+    if [[ "$(set +e; tty; set -e)" != "" ]]; then
+        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
+        exec 2> >(tee -a "{snakemake.log.log}" >&2)
+    else
+        rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log})
+        echo "No tty, logging disabled" >"{snakemake.log.log}"
+    fi
+fi
+
+md5() {{
+    filename=$1
+    fn=$(basename $filename)
+    pushd $(dirname $filename) 1> /dev/null 2>&1
+    rslt=$(md5sum $fn)
+    popd 1> /dev/null 2>&1
+    echo "$rslt"
+}}
+
+conda list >{snakemake.log.conda_list}
+conda info >{snakemake.log.conda_info}
+md5 {snakemake.log.conda_list} >{snakemake.log.conda_list_md5}
+md5 {snakemake.log.conda_info} >{snakemake.log.conda_info_md5}
+
+# -------------------------------------------------------------------------------------------------
+# Write helper script and call R
+#
+R --vanilla --slave << __EOF
+source("{rscript}")
+x <- vcf_to_table("{snakemake.input.vcf}", sample="{snakemake.params[args][library_name]}")
+genome_lengths <- chromosome_lengths("{reference}")
+x <- x |> dplyr::left_join(genome_lengths, by="CHROM") |> dplyr::mutate(x=POS + Offset)
+pdf("{snakemake.output.cnv}", height=6.22, width=9.33)
+plot_cnv(x, scale="log2") + ggplot2::ggtitle("{snakemake.params[args][library_name]}")
+plot_cnv(x, scale="sqrt") + ggplot2::ggtitle("{snakemake.params[args][library_name]}")
+dev.off()
+pdf("{snakemake.output.locus}", height=6.22, width=12.44)
+plot_locus(x, genome_lengths |> dplyr::mutate(n=dplyr::row_number()) |> dplyr::filter(CHROM %in% x$CHROM)) +
+    ggplot2::ggtitle("{snakemake.params[args][library_name]}")
+dev.off()
+__EOF
+
+md5 {snakemake.output.cnv} > {snakemake.output.cnv_md5}
+md5 {snakemake.output.locus} > {snakemake.output.locus_md5}
+"""
+)
+
+shell(
+    r"""
+md5() {{
+    filename=$1
+    fn=$(basename $filename)
+    pushd $(dirname $filename) 1> /dev/null 2>&1
+    rslt=$(md5sum $fn)
+    popd 1> /dev/null 2>&1
+    echo "$rslt"
+}}
+md5 {snakemake.log.log} > {snakemake.log.log_md5}
+"""
+)
diff --git a/snappy_wrappers/wrappers/vcfpy/add_bed/environment.yaml b/snappy_wrappers/wrappers/vcfpy/add_bed/environment.yaml
new file mode 100644
index 000000000..388c1737a
--- /dev/null
+++ b/snappy_wrappers/wrappers/vcfpy/add_bed/environment.yaml
@@ -0,0 +1,7 @@
+channels:
+- conda-forge
+- bioconda
+dependencies:
+- vcfpy
+- pytabix
+- htslib
diff --git a/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py b/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py
new file mode 100644
index 000000000..fdf4c1317
--- /dev/null
+++ b/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py
@@ -0,0 +1,86 @@
+import os
+import re
+import subprocess
+
+import tabix
+import vcfpy
+
+# Can't add anything if there's no CNV results
+if "cnv" not in snakemake.input:
+    os.symlink(snakemake.input.vcf, snakemake.output.vcf)
+    os.symlink(snakemake.input.vcf_tbi, snakemake.output.vcf_tbi)
+    return
+
+NAME_PATTERN = re.compile("^seg_([0-9]+)_cn_([0-9]+)$")
+
+LFC = {"ID": "LFC", "Number": 1, "Type": "Float", "Description": "Log2 fold change from coverage"}
+CN = {"ID": "CN", "Number": 1, "Type": "Integer", "Description": "Number of allele called by the CNV caller"}
+
+step = snakemake.config["pipeline_step"]["name"]
+config = snakemake.config["step_config"][step]
+
+bed = tabix.open(snakemake.input.cnv)
+
+ps = []
+nProc = 0
+ps += [subprocess.Popen(["bcftools", "merge", snakemake.input.normal, snakemake.input.tumor], stdout=subprocess.PIPE)]
+nProc += 1
+ps += [subprocess.Popen(["bcftools", "filter", "--include", "N_ALT=3 & FORMAT/AD[:2]=0"], stdin=ps[nProc-1].stdout, stdout=subprocess.PIPE, text=True)]
+ps[nProc-1].stdout.close()  # Allow p1 to receive a SIGPIPE if p2 exits.
+nProc += 1
+if config["excluded_regions"]:
+    ps += [subprocess.Popen(["bcftools", "view", "--regions-file", "^" + config["excluded_regions"], stdin=ps[nProc-1].stdout, stdout=subprocess.PIPE, text=True)]
+    ps[nProc-1].stdout.close()
+    nProc += 1
+reader = vcfpy.Reader.from_stream(ps[nProc-1].stdout)
+
+sample = snakemake.params["args"]["library_name"]
+iSample = reader.header.samples.name_to_idx[sample]
+
+reader.header.add_format_line(LFC)
+reader.header.add_format_line(CN)
+
+writer = vcfpy.Writer.from_path(snakemake.output.vcf, reader.header)
+
+for variant in reader:
+    logFoldChange = None
+    cn = None
+
+    # Add default (missing) values for all samples
+    variant.add_format("LFC", logFoldChange)
+    variant.add_format("CN", cn)
+
+    try:
+        locii = list(bed.query(variant.CHROM, variant.affected_start, variant.affected_end+1))
+    except tabix.TabixError as e:
+        print("Variant {}:{}{}>{} not covered by segmentation of sample {}".format(variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample))
+        continue
+
+    if len(locii) == 0:
+        print("Variant {}:{}{}>{} not covered by segmentation of sample {}".format(variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample))
+    elif len(locii) == 1:
+        for locus in locii:
+            logFoldChange = float(locus[4])
+            if locus[5] == "-":
+                logFoldChange = -logFoldChange
+            m = NAME_PATTERN.match(locus[3])
+            assert m
+            cn = int(m.groups()[1])
+    else:
+        print("Variant {}:{}{}>{} crosses segments in sample {}".format(variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample))
+        for locus in locii:
+            m = NAME_PATTERN.match(locus[3])
+            assert m
+            if cn is None:
+                cn = int(m.groups()[1])
+            else:
+                if cn != int(m.groups()[1]):
+                    cn = None
+                    break
+
+    variant.calls[iSample].data["LFC"] = logFoldChange
+    variant.calls[iSample].data["CN"] = cn
+
+    writer.write_record(variant)
+
+writer.close()
diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py
new file mode 100644
index 000000000..393668fc1
--- /dev/null
+++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py
@@ -0,0 +1,277 @@
+# -*- coding: utf-8 -*-
+"""Tests for the somatic_cnv_checking workflow module code"""
+
+
+import textwrap
+
+import pytest
+import ruamel.yaml as ruamel_yaml
+from snakemake.io import Wildcards
+
+from snappy_pipeline.workflows.somatic_cnv_checking import (
+    SomaticCnvCheckingWorkflow,
+)
+
+from .common import get_expected_log_files_dict, get_expected_output_vcf_files_dict
+from .conftest import patch_module_fs
+
+__author__ = "Manuel Holtgrewe <manuel.holtgrewe@bih-charite.de>"
+
+
+@pytest.fixture(scope="module")  # otherwise: performance issues
+def minimal_config():
+    """Return YAML parsing result for (somatic) configuration"""
+    yaml = ruamel_yaml.YAML()
+    return yaml.load(
+        textwrap.dedent(
+            r"""
+        static_data_config:
+          reference:
+            path: /path/to/ref.fa
+          dbsnp:
+            path: /path/to/dbsnp.vcf.gz
+
+        step_config:
+          ngs_mapping:
+            tools:
+              dna: ['bwa']
+
+          somatic_targeted_seq_cnv_calling:
+            tools: ["cnvkit"]
+
+          somatic_cnv_checking:
+            path_ngs_mapping: ../ngs_mapping
+            path_cnv_calling: ../somatic_targeted_seq_cnv_calling
+            cnv_calling_tool: cnvkit
+
+        data_sets:
+          first_batch:
+            file: sheet.tsv
+            search_patterns:
+            - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'}
+            search_paths: ['/path']
+            type: matched_cancer
+            naming_scheme: only_secondary_id
+        """
+        ).lstrip()
+    )
+
+
+@pytest.fixture
+def somatic_cnv_checking_workflow(
+    dummy_workflow,
+    minimal_config,
+    config_lookup_paths,
+    work_dir,
+    config_paths,
+    cancer_sheet_fake_fs,
+    mocker,
+):
+    """Return SomaticTargetedSeqCnvCallingWorkflow object pre-configured with germline sheet"""
+    # Patch out file-system related things in abstract (the crawling link in step is defined there)
+    patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker)
+    # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we
+    # can obtain paths from the function as if we really had a NGSMappingPipelineStep here
+    dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x}
+    # Construct the workflow object
+    return SomaticCnvCheckingWorkflow(
+        dummy_workflow,
+        minimal_config,
+        config_lookup_paths,
+        config_paths,
+        work_dir,
+    )
+
+
+# Tests for CnvCheckingPileupStepPart --------------------------------------------------------------
+
+
+def test_pileup_normal_step_part_get_input_files(somatic_cnv_checking_workflow):
+    """Tests CnvCheckingPileupStepPart.get_input_files() - action normal"""
+    wildcards = Wildcards(fromdict={"mapper": "bwa", "library": "P001-T1-DNA1-WGS1"})
+    expected = {
+        "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam",
+        "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai",
+    }
+    actual = somatic_cnv_checking_workflow.get_input_files("pileup", "normal")(wildcards)
+    assert actual == expected
+
+
+def test_pileup_normal_step_part_get_output_files(somatic_cnv_checking_workflow):
+    """Tests CnvCheckingPileupStepPart.get_output_files() - action normal"""
+    base_name = "work/{mapper}.{library}/out/{mapper}.{library}.normal"
+    expected = get_expected_output_vcf_files_dict(base_out=base_name)
+    actual = somatic_cnv_checking_workflow.get_output_files("pileup", "normal")
+    assert actual == expected
+
+
+def test_pileup_normal_step_part_get_log_file(somatic_cnv_checking_workflow):
+    """Tests CnvCheckingPileupStepPart.get_log_file() - action normal"""
+    base_name = "work/{mapper}.{library}/log/{mapper}.{library}.normal"
+    expected = get_expected_log_files_dict(base_out=base_name)
+    actual = somatic_cnv_checking_workflow.get_log_file("pileup", "normal")
+    assert actual == expected
+
+
+def test_pileup_normal_step_part_get_resource(somatic_cnv_checking_workflow):
+    """Tests CnvCheckingPileupStepPart.get_resource() - action normal"""
+    expected_dict = {"threads": 2, "time": "12:00:00", "memory": "7577M", "partition": "medium"}
+    for resource, expected in expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}'."
+        actual = somatic_cnv_checking_workflow.get_resource("pileup", "normal", resource)
+        assert actual == expected, msg_error
+
+
+def test_pileup_tumor_step_part_get_input_files(somatic_cnv_checking_workflow):
+    """Tests CnvCheckingPileupStepPart.get_input_files() - action tumor"""
+    wildcards = Wildcards(fromdict={"mapper": "bwa", "library": "P001-T1-DNA1-WGS1"})
+    expected = {
+        "vcf": "work/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.normal.vcf.gz",
+        "tbi": "work/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.normal.vcf.gz.tbi",
+        "bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam",
+        "bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai",
+    }
+    actual = somatic_cnv_checking_workflow.get_input_files("pileup", "tumor")(wildcards)
+    assert actual == expected
+
+
+def test_pileup_tumor_step_part_get_output_files(somatic_cnv_checking_workflow):
+    """Tests CnvCheckingPileupStepPart.get_output_files() - action tumor"""
+    base_name = "work/{mapper}.{library}/out/{mapper}.{library}.tumor"
+    expected = get_expected_output_vcf_files_dict(base_out=base_name)
+    actual = somatic_cnv_checking_workflow.get_output_files("pileup", "tumor")
+    assert actual == expected
+
+
+def test_pileup_tumor_step_part_get_log_file(somatic_cnv_checking_workflow):
+    """Tests CnvCheckingPileupStepPart.get_log_file() - action tumor"""
+    base_name = "work/{mapper}.{library}/log/{mapper}.{library}.tumor"
+    expected = get_expected_log_files_dict(base_out=base_name)
+    actual = somatic_cnv_checking_workflow.get_log_file("pileup", "tumor")
+    assert actual == expected
+
+
+def test_pileup_tumor_step_part_get_resource(somatic_cnv_checking_workflow):
+    """Tests CnvCheckingPileupStepPart.get_resource() - action tumor"""
+    expected_dict = {"threads": 2, "time": "01:00:00", "memory": "7577M", "partition": "medium"}
+    for resource, expected in expected_dict.items():
+        msg_error = f"Assertion error for resource '{resource}'."
+        actual = somatic_cnv_checking_workflow.get_resource("pileup", "tumor", resource)
+        assert actual == expected, msg_error
+
+
+def test_cnv_run_step_part_get_input_files(somatic_cnv_checking_workflow):
+    """Tests SomaticCnvCheckingCnvStepPart.get_input_files()"""
+    expected = {
+        "cnv": "../somatic_targeted_seq_cnv_calling/output/{mapper}.cnvkit.{library}/out/{mapper}.cnvkit.{library}.bed.gz",
+        "cnv_tbi": "../somatic_targeted_seq_cnv_calling/output/{mapper}.cnvkit.{library}/out/{mapper}.cnvkit.{library}.bed.gz.tbi",
+        "normal": "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz",
+        "normal_tbi": "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz.tbi",
+        "tumor": "work/{mapper}.{library}/out/{mapper}.{library}.tumor.vcf.gz",
+        "tumor_tbi": "work/{mapper}.{library}/out/{mapper}.{library}.tumor.vcf.gz.tbi",
+    }
+    actual = somatic_cnv_checking_workflow.get_input_files("cnv", "run")
+    assert actual == expected
+
+
+def test_cnv_run_step_part_get_output_files(somatic_cnv_checking_workflow):
+    """Tests SomaticCnvCheckingCnvStepPart.get_output_files()"""
+    base_name = "work/{mapper}.cnvkit.{library}/out/{mapper}.cnvkit.{library}"
+    expected = get_expected_output_vcf_files_dict(base_out=base_name)
+    actual = somatic_cnv_checking_workflow.get_output_files("cnv", "run")
+    assert actual == expected
+
+
+def test_cnv_run_step_part_get_log_file(somatic_cnv_checking_workflow):
+    """Tests SomaticCnvCheckingCnvStepPart.get_log_file()"""
+    base_name = "work/{mapper}.cnvkit.{library}/log/{mapper}.cnvkit.{library}"
+    expected = get_expected_log_files_dict(base_out=base_name)
+    actual = somatic_cnv_checking_workflow.get_log_file("cnv", "run")
+    assert actual == expected
+
+
+def test_report_run_step_part_get_input_files(somatic_cnv_checking_workflow):
+    """Tests SomaticCnvCheckingReportStepPart.get_input_files()"""
+    expected = {"vcf": "work/{mapper}.cnvkit.{library}/out/{mapper}.cnvkit.{library}.vcf.gz"}
+    actual = somatic_cnv_checking_workflow.get_input_files("report", "run")
+    assert actual == expected
+
+
+def test_report_run_step_part_get_output_files(somatic_cnv_checking_workflow):
+    """Tests SomaticCnvCheckingReportStepPart.get_output_files()"""
+    base_name = "work/{mapper}.cnvkit.{library}/report/{mapper}.cnvkit.{library}"
+    expected = {
+        "cnv": base_name + ".cnv.pdf",
+        "cnv_md5": base_name + ".cnv.pdf.md5",
+        "locus": base_name + ".locus.pdf",
+        "locus_md5": base_name + ".locus.pdf.md5",
+    }
+    actual = somatic_cnv_checking_workflow.get_output_files("report", "run")
+    assert actual == expected
+
+
+def test_report_run_step_part_get_log_file(somatic_cnv_checking_workflow):
+    """Tests SomaticCnvCheckingReportStepPart.get_log_file()"""
+    base_name = "work/{mapper}.cnvkit.{library}/log/{mapper}.cnvkit.{library}.report"
+    expected = get_expected_log_files_dict(base_out=base_name)
+    actual = somatic_cnv_checking_workflow.get_log_file("report", "run")
+    assert actual == expected
+
+
+def test_report_run_step_part_get_params(somatic_cnv_checking_workflow):
+    """Tests SomaticCnvCheckingReportStepPart.get_params()"""
+    wildcards = Wildcards(fromdict={"mapper": "bwa", "library": "P001-T1-DNA1-WGS1"})
+    expected = {"library_name": "P001-T1-DNA1-WGS1"}
+    actual = somatic_cnv_checking_workflow.get_params("report", "run")(wildcards)
+    assert actual == expected
+
+
+# Tests for SomaticCnvCheckingWorkflow --------------------------------------------------
+
+
+def test_somatic_cnv_checking_workflow(somatic_cnv_checking_workflow):
+    """Test simple functionality of the workflow"""
+    # Check created sub steps
+    expected = ["cnv", "link_out", "pileup", "report"]
+    actual = list(sorted(somatic_cnv_checking_workflow.sub_steps.keys()))
+    assert actual == expected
+
+    # main output
+    tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/out/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}"
+    expected = [
+        tpl.format(i=i, t=t, ext=ext)
+        for i, t in ((1, 1), (2, 1), (2, 2))
+        for ext in (
+            "vcf.gz",
+            "vcf.gz.tbi",
+            "vcf.gz.md5",
+            "vcf.gz.tbi.md5",
+        )
+    ]
+
+    # report
+    tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}"
+    expected += [
+        tpl.format(i=i, t=t, ext=ext)
+        for i, t in ((1, 1), (2, 1), (2, 2))
+        for ext in (
+            "cnv.pdf",
+            "cnv.pdf.md5",
+            "locus.pdf",
+            "locus.pdf.md5",
+        )
+    ]
+
+    # logs
+    tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/log/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1{part}.{ext}{chksum}"
+    expected += [
+        tpl.format(i=i, t=t, part=part, ext=ext, chksum=chksum)
+        for i, t in ((1, 1), (2, 1), (2, 2))
+        for part in ("", ".normal", ".tumor", ".report")
+        for ext in ("log", "conda_info.txt", "conda_list.txt")
+        for chksum in ("", ".md5")
+    ]
+
+    expected = list(sorted(expected))
+    actual = list(sorted(somatic_cnv_checking_workflow.get_result_files()))
+    assert expected == actual

From 942f96f359e9c03ba95212bd165331f7c5afa8fa Mon Sep 17 00:00:00 2001
From: Eric Blanc <eric.blanc@bihealth.de>
Date: Fri, 28 Jul 2023 14:26:01 +0200
Subject: [PATCH 3/9] fix: CN must be integer numbers for cBioPortal export (&
 make black happy)

---
 .../workflows/somatic_targeted_seq_cnv_calling/__init__.py | 7 ++++++-
 snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py     | 2 +-
 .../test_workflows_somatic_targeted_seq_cnv_calling.py     | 7 ++++++-
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py
index 1de7934ac..4f80a78c5 100644
--- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py
+++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py
@@ -622,7 +622,12 @@ def _get_output_files_call():
     def _get_output_files_postprocess():
         name_pattern = "{mapper}.cnvkit.{library_name}"
         tpl = os.path.join("work", name_pattern, "out", name_pattern + ".bed.gz")
-        return {"final": tpl, "final_tbi": tpl + ".tbi", "final_md5": tpl + ".md5", "final_tbi_md5": tpl + ".tbi.md5"}
+        return {
+            "final": tpl,
+            "final_tbi": tpl + ".tbi",
+            "final_md5": tpl + ".md5",
+            "final_tbi_md5": tpl + ".tbi.md5",
+        }
 
     @dictify
     def _get_output_files_plot(self):
diff --git a/snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py b/snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py
index f3e4a0106..2b35d730d 100644
--- a/snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py
+++ b/snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py
@@ -59,7 +59,7 @@
 
 # Default value: diploid segment (number of copies = 2)
 segments[,"cn"] <- 2
-segments[S4Vectors::queryHits(i),"cn"] <- calls[S4Vectors::subjectHits(i),"cn"]
+segments[S4Vectors::queryHits(i),"cn"] <- round(calls[S4Vectors::subjectHits(i),"cn"])
 
 # Reset segment boundaries to original values
 segments[,"start"] <- segments[["start"]]-1
diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py
index 975ccd978..ff622624c 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py
@@ -473,7 +473,12 @@ def test_cnvkit_postprocess_step_part_get_input_files(somatic_targeted_seq_cnv_c
 
 def test_cnvkit_postprocess_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow):
     base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.bed.gz"
-    expected = {"final": base_name_out, "final_tbi": base_name_out + ".tbi", "final_md5": base_name_out + ".md5", "final_tbi_md5": base_name_out + ".tbi.md5"}
+    expected = {
+        "final": base_name_out,
+        "final_tbi": base_name_out + ".tbi",
+        "final_md5": base_name_out + ".md5",
+        "final_tbi_md5": base_name_out + ".tbi.md5",
+    }
     actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "postprocess")
     assert actual == expected
 

From 3d43f6021e056404ce785ebfd801b6dbb0ae22be Mon Sep 17 00:00:00 2001
From: Eric Blanc <eric.blanc@bihealth.de>
Date: Fri, 28 Jul 2023 14:31:33 +0200
Subject: [PATCH 4/9] feat: added segment summary table fix: added vcf
 indexation & md5 checksums feat: remove possibility of running script without
 CNV bed file (should be done in Snakefile/workflow) refactor: make black
 happy

---
 .../wrappers/vcfpy/add_bed/environment.yaml   |   1 +
 .../wrappers/vcfpy/add_bed/wrapper.py         | 175 ++++++++++++++++--
 2 files changed, 157 insertions(+), 19 deletions(-)

diff --git a/snappy_wrappers/wrappers/vcfpy/add_bed/environment.yaml b/snappy_wrappers/wrappers/vcfpy/add_bed/environment.yaml
index 388c1737a..8d0c6415c 100644
--- a/snappy_wrappers/wrappers/vcfpy/add_bed/environment.yaml
+++ b/snappy_wrappers/wrappers/vcfpy/add_bed/environment.yaml
@@ -2,6 +2,7 @@ channels:
 - conda-forge
 - bioconda
 dependencies:
+- bcftools
 - vcfpy
 - pytabix
 - htslib
diff --git a/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py b/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py
index fdf4c1317..a8d7f5929 100644
--- a/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py
+++ b/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py
@@ -1,20 +1,21 @@
-import os
+import math
+import numbers
 import re
 import subprocess
 
+from snakemake.shell import shell
 import tabix
 import vcfpy
 
-# Can't add anything if there's no CNV results
-if "cnv" not in snakemake.input:
-    os.symlink(snakemake.input.vcf, snakemake.output.vcf)
-    os.symlink(snakemake.input.vcf_tbi, snakemake.output.vcf_tbi)
-    return
-
 NAME_PATTERN = re.compile("^seg_([0-9]+)_cn_([0-9]+)$")
 
 LFC = {"ID": "LFC", "Number": 1, "Type": "Float", "Description": "Log2 fold change from coverage"}
-CN = {"ID": "CN", "Number": 1, "Type": "Integer", "Description": "Number of allele called by the CNV caller"}
+CN = {
+    "ID": "CN",
+    "Number": 1,
+    "Type": "Integer",
+    "Description": "Number of allele called by the CNV caller",
+}
 
 step = snakemake.config["pipeline_step"]["name"]
 config = snakemake.config["step_config"][step]
@@ -23,18 +24,36 @@
 
 ps = []
 nProc = 0
-ps += [subprocess.Popen(["bcftools", "merge", snakemake.input.normal, snakemake.input.tumor], stdout=subprocess.PIPE)]
+ps += [
+    subprocess.Popen(
+        ["bcftools", "merge", snakemake.input.normal, snakemake.input.tumor], stdout=subprocess.PIPE
+    )
+]
 nProc += 1
-ps += [subprocess.Popen(["bcftools", "filter", "--include", "N_ALT=3 & FORMAT/AD[:2]=0"], stdin=ps[nProc-1].stdout, stdout=subprocess.PIPE, text=True)]
-ps[nProc-1].stdout.close()  # Allow p1 to receive a SIGPIPE if p2 exits.
+ps += [
+    subprocess.Popen(
+        ["bcftools", "filter", "--include", "N_ALT=2 & FORMAT/AD[:2]=0"],
+        stdin=ps[nProc - 1].stdout,
+        stdout=subprocess.PIPE,
+        text=True,
+    )
+]
+ps[nProc - 1].stdout.close()  # Allow p1 to receive a SIGPIPE if p2 exits.
 nProc += 1
 if config["excluded_regions"]:
-    ps += [subprocess.Popen(["bcftools", "view", "--regions-file", "^" + config["excluded_regions"], stdin=ps[nProc-1].stdout, stdout=subprocess.PIPE, text=True)]
-    ps[nProc-1].stdout.close()
+    ps += [
+        subprocess.Popen(
+            ["bcftools", "view", "--targets-file", "^" + config["excluded_regions"]],
+            stdin=ps[nProc - 1].stdout,
+            stdout=subprocess.PIPE,
+            text=True,
+        )
+    ]
+    ps[nProc - 1].stdout.close()
     nProc += 1
-reader = vcfpy.Reader.from_stream(ps[nProc-1].stdout)
+reader = vcfpy.Reader.from_stream(ps[nProc - 1].stdout)
 
-sample = snakemake.params["args"]["library_name"]
+sample = snakemake.wildcards["library_name"]
 iSample = reader.header.samples.name_to_idx[sample]
 
 reader.header.add_format_line(LFC)
@@ -42,6 +61,7 @@
 
 writer = vcfpy.Writer.from_path(snakemake.output.vcf, reader.header)
 
+segments = {}
 for variant in reader:
     logFoldChange = None
     cn = None
@@ -51,13 +71,21 @@
     variant.add_format("CN", cn)
 
     try:
-        locii = list(bed.query(variant.CHROM, variant.affected_start, variant.affected_end+1))
+        locii = list(bed.query(variant.CHROM, variant.affected_start, variant.affected_end + 1))
     except tabix.TabixError as e:
-        print("Variant {}:{}{}>{} not covered by segmentation of sample {}".format(variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample))
+        print(
+            "Variant {}:{}{}>{} not covered by segmentation of sample {}".format(
+                variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample
+            )
+        )
         continue
 
     if len(locii) == 0:
-        print("Variant {}:{}{}>{} not covered by segmentation of sample {}".format(variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample))
+        print(
+            "Variant {}:{}{}>{} not covered by segmentation of sample {}".format(
+                variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample
+            )
+        )
     elif len(locii) == 1:
         for locus in locii:
             logFoldChange = float(locus[4])
@@ -67,7 +95,11 @@
             assert m
             cn = int(m.groups()[1])
     else:
-        print("Variant {}:{}{}>{} crosses segments in sample {}".format(variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample))
+        print(
+            "Variant {}:{}{}>{} crosses segments in sample {}".format(
+                variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample
+            )
+        )
         for locus in locii:
             m = NAME_PATTERN.match(locus[3])
             assert m
@@ -83,4 +115,109 @@
 
     writer.write_record(variant)
 
+    if len(locii) == 1:
+        locus = locii[0]
+        if locus[3] not in segments:
+            segments[locus[3]] = {
+                "logFoldChange": logFoldChange,
+                "CHROM": locus[0],
+                "start": locus[1],
+                "stop": locus[2],
+                "BAFs": [],
+            }
+        depth = variant.call_for_sample[sample].data["AD"]
+        baf = min(depth[0], depth[1]) / (depth[0] + depth[1])
+        segments[locus[3]]["BAFs"] += [baf]
+
 writer.close()
+
+# Implmentation of R's quantile function for continuous quantiles.
+def quantile(x, probs, na_rm=False, method=7):
+    if na_rm:
+        x = list(
+            filter(
+                lambda y: y is not None
+                and isinstance(y, Number)
+                and not math.isnan(y)
+                and not math.isinf(y),
+                x,
+            )
+        )
+    x.sort()
+    n = len(x)
+    if method == 4:
+        m = [0 for p in probs]
+    elif method == 5:
+        m = [0.5 for p in probs]
+    elif method == 6:
+        m = [p for p in probs]
+    elif method == 7:
+        m = [1 - p for p in probs]
+    elif method == 8:
+        m = [(p + 1) / 3 for p in probs]
+    elif method == 9:
+        m = [p / 4 + 3 / 8 for p in probs]
+    else:
+        raise NotImplementedError("Only continuous methods 4 to 9 (R) are implmented")
+    qs = [None] * len(probs)
+    for i in range(len(probs)):
+        j = math.floor(n * probs[i] + m[i])
+        if j < 1:
+            qs[i] = x[j]
+        elif j >= n:
+            qs[i] = x[n - 1]
+        else:
+            g = n * probs[i] + m[i] - j
+            qs[i] = (1 - g) * x[j - 1] + g * x[j]
+    return qs
+
+
+with open(snakemake.output.tsv, "wt") as f:
+    print(
+        "Number\tCHROM\tstart\tstop\tCN\tLFC\tNvariants\tMin\t1st quartile\tMedian\t3rd quartile\tMax",
+        file=f,
+    )
+    for segId, segment in segments.items():
+        m = NAME_PATTERN.match(segId)
+        n = int(m.groups()[0])
+        cn = int(m.groups()[1])
+
+        logFoldChange = segment["logFoldChange"]
+        nVar = len(segment["BAFs"])
+        bafs = sorted(segment["BAFs"])
+
+        qs = quantile(segment["BAFs"], (0.25, 0.5, 0.75), method=8)
+        values = [
+            n,
+            segment["CHROM"],
+            segment["start"],
+            segment["stop"],
+            cn,
+            segment["logFoldChange"],
+            nVar,
+            bafs[0],
+            qs[0],
+            qs[1],
+            qs[2],
+            bafs[nVar - 1],
+        ]
+        print("\t".join(map(str, values)), file=f)
+
+shell(
+    r"""
+tabix {snakemake.output.vcf}
+
+md5() {{
+    filename=$1
+    fn=$(basename $filename)
+    pushd $(dirname $filename) 1> /dev/null 2>&1
+    rslt=$(md5sum $fn)
+    popd 1> /dev/null 2>&1
+    echo "$rslt"
+}}
+
+md5 {snakemake.output.vcf} > {snakemake.output.vcf_md5}
+md5 {snakemake.output.tbi} > {snakemake.output.tbi_md5}
+md5 {snakemake.output.tsv} > {snakemake.output.tsv_md5}
+"""
+)

From d3021cd31d20ea3afd979b131ba361928884990d Mon Sep 17 00:00:00 2001
From: Eric Blanc <eric.blanc@bihealth.de>
Date: Fri, 28 Jul 2023 14:34:55 +0200
Subject: [PATCH 5/9] fix: corrected handling of regions

---
 .../wrappers/bcftools/heterozygous_variants/wrapper.py      | 4 ++--
 snappy_wrappers/wrappers/bcftools/pileups/wrapper.py        | 6 ++++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py b/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py
index 43f4f51d6..385dfbfa5 100644
--- a/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py
+++ b/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py
@@ -9,9 +9,9 @@
 
 if "args" in snakemake.params and "intervals" in snakemake.params["args"]:
     locii = "-r " + snakemake.params["args"]["intervals"]
-elif "locii" in snakemake.input:
+elif "locii" in snakemake.input.keys():
     locii = "-R " + snakemake.input.locii
-elif config["locii"]:
+elif "locii" in config and config["locii"]:
     locii = "-R " + config["locii"]
 else:
     locii = ""
diff --git a/snappy_wrappers/wrappers/bcftools/pileups/wrapper.py b/snappy_wrappers/wrappers/bcftools/pileups/wrapper.py
index daf4c5dd1..6c70452f2 100644
--- a/snappy_wrappers/wrappers/bcftools/pileups/wrapper.py
+++ b/snappy_wrappers/wrappers/bcftools/pileups/wrapper.py
@@ -2,14 +2,16 @@
 """Wrapper for running bcftools mpileup
 """
 
+from snakemake.shell import shell
+
 step = snakemake.config["pipeline_step"]["name"]
 config = snakemake.config["step_config"][step]
 
 if "args" in snakemake.params and "intervals" in snakemake.params["args"]:
     locii = "-r " + snakemake.params["args"]["intervals"]
-elif "locii" in snakemake.input:
+elif "locii" in snakemake.input.keys():
     locii = "-R " + snakemake.input.locii
-elif config["locii"]:
+elif "locii" in config and config["locii"]:
     locii = "-R " + config["locii"]
 else:
     locii = ""

From f10ba6190bb51ea445b0b0c8aeff7251f136a972 Mon Sep 17 00:00:00 2001
From: Eric Blanc <eric.blanc@bihealth.de>
Date: Fri, 28 Jul 2023 14:36:41 +0200
Subject: [PATCH 6/9] feat: added segment plot & default colors for calls

---
 .../somatic_cnv_checking/cnv-check-plot.R     | 54 +++++++++++++++----
 .../somatic_cnv_checking/environment.yaml     |  1 +
 .../wrappers/somatic_cnv_checking/wrapper.py  | 25 ++++++---
 3 files changed, 65 insertions(+), 15 deletions(-)

diff --git a/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R b/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R
index 949b2bdb7..5be6106f6 100644
--- a/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R
+++ b/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R
@@ -1,3 +1,13 @@
+cn_to_call <- function(cn, amplification=7) {
+    Call <- factor(rep("Neutral", length(cn)), levels=c("Deletion", "Loss", "Neutral", "Gain", "Amplification"))
+    Call[!is.na(cn) & cn==0] <- "Deletion"
+    Call[!is.na(cn) & cn==1] <- "Loss"
+    Call[!is.na(cn) & 2<cn & cn<amplification] <- "Gain"
+    Call[!is.na(cn) & amplification<=cn] <- "Amplification"
+    Call[is.na(cn)] <- NA
+    Call
+}
+
 vcf_to_table <- function(vcf, sample, amplification=7) {
      vcf <- VariantAnnotation::readVcf(vcf)
      stopifnot(all(lengths(VariantAnnotation::alt(vcf)) == 2))
@@ -14,12 +24,8 @@ vcf_to_table <- function(vcf, sample, amplification=7) {
      colnames(x)[5:6] <- c("t_ref", "t_alt")
      x <- x |>
          dplyr::filter(!is.na(LFC) & !is.na(CN)) |>
-         dplyr::mutate(BAF=pmin(t_ref, t_alt)/(t_ref+t_alt)) |>
-         dplyr::mutate(Call=factor("Neutral", levels=c("Deletion", "Loss", "Neutral", "Gain", "Amplification"))) |>
-         dplyr::mutate(Call=replace(Call, CN == 0, "Deletion")) |>
-         dplyr::mutate(Call=replace(Call, CN == 1, "Loss")) |>
-         dplyr::mutate(Call=replace(Call, 2 < CN & CN < amplification, "Gain")) |>
-         dplyr::mutate(Call=replace(Call, amplification <= CN, "Amplification"))
+         dplyr::mutate(BAF=t_alt/(t_ref+t_alt)) |>
+         dplyr::mutate(Call=cn_to_call(CN))
     x
 }
 
@@ -53,27 +59,57 @@ chromosome_lengths <- function(genome, use_fai=TRUE, use_dict=TRUE) {
     genome
 }
 
-plot_cnv <- function(x, scale=c("log2", "sqrt")) {
+default_call_colors <- c(
+    "Deletion"="#E41A1C",
+    "Loss"="#984EA3",
+    "Neutral"="#4DAF4A",
+    "Gain"="#80B1D3",
+    "Amplification"="#377EB8"
+)
+
+plot_cnv <- function(x, scale=c("log2", "sqrt"), call_colors=default_call_colors) {
     scale <- match.arg(scale)
     if (scale == "sqrt") x_label <- "Segment Fold Change"
     else                 x_label <- "Segment Log2 Fold Change"
     if (scale == "sqrt") x <- x |> dplyr::mutate(LFC=2^LFC)
     p <- x |>
+        dplyr::mutate(BAF=pmin(BAF, 1-BAF)) |>
         ggplot2::ggplot(ggplot2::aes(x=LFC, y=BAF, color=Call)) +
         ggplot2::geom_point() +
         ggplot2::labs(x=x_label, y="B-allele fraction") +
+        ggplot2::scale_color_manual(values=call_colors) +
         ggplot2::theme_bw()
     if (scale == "sqrt") p <- p + ggplot2::scale_x_sqrt()
     p
 }
 
-plot_locus <- function(x, genome_lengths) {
+plot_locus <- function(x, genome_lengths, call_colors=default_call_colors) {
     p <- genome_lengths |>
-        ggplot2::ggplot(ggplot2::aes(xmin=Offset+1, xmax=Offset+Length, ymin=0, ymax=0.5, fill=as.character(n%%2))) +
+        ggplot2::ggplot(ggplot2::aes(xmin=Offset+1, xmax=Offset+Length, ymin=0, ymax=1, fill=as.character(n%%2))) +
         ggplot2::geom_rect() +
         ggplot2::scale_fill_manual(values=c("white", "grey")) +
         ggplot2::scale_x_continuous(breaks=genome_lengths$Offset+genome_lengths$Length/2, labels=genome_lengths$CHROM) +
         ggplot2::geom_point(data=x, mapping=ggplot2::aes(x=x, y=BAF, color=Call), inherit.aes=FALSE) +
+        ggplot2::scale_color_manual(values=call_colors) +
+        ggplot2::theme_bw() +
+        ggplot2::theme(axis.text.x=ggplot2::element_text(angle=90, vjust=0.5, hjust=1)) +
+        ggplot2::guides(fill="none") +
+        ggplot2::labs(x="Genomic position", y="B-allele fraction")
+    p
+}
+
+plot_segment <- function(y, genome_lengths, call_colors=default_call_colors) {
+    p <- genome_lengths |>
+        ggplot2::ggplot(ggplot2::aes(xmin=Offset+1, xmax=Offset+Length, ymin=0, ymax=0.5, fill=as.character(n%%2))) +
+        ggplot2::geom_rect() +
+        ggplot2::scale_fill_manual(values=c("white", "grey")) +
+        ggplot2::scale_x_continuous(breaks=genome_lengths$Offset+genome_lengths$Length/2, labels=genome_lengths$CHROM)
+    p <- p +
+        ggplot2::geom_point(data=y, mapping=ggplot2::aes(x=(from+to)/2, y=Median, color=Call, size=Nvariants), inherit.aes=FALSE) +
+        ggplot2::geom_segment(data=y, mapping=ggplot2::aes(x=(from+to)/2, y=`1st quartile`, xend=(from+to)/2, yend=`3rd quartile`, color=Call), inherit.aes=FALSE) +
+        ggplot2::geom_segment(data=y, mapping=ggplot2::aes(x=from, y=Median, xend=to, yend=Median, color=Call), inherit.aes=FALSE) +
+        ggplot2::scale_color_manual(values=call_colors)
+    p <- p +
         ggplot2::theme_bw() +
         ggplot2::theme(axis.text.x=ggplot2::element_text(angle=90, vjust=0.5, hjust=1)) +
         ggplot2::guides(fill="none") +
diff --git a/snappy_wrappers/wrappers/somatic_cnv_checking/environment.yaml b/snappy_wrappers/wrappers/somatic_cnv_checking/environment.yaml
index 88eb63da5..6421a7e1b 100644
--- a/snappy_wrappers/wrappers/somatic_cnv_checking/environment.yaml
+++ b/snappy_wrappers/wrappers/somatic_cnv_checking/environment.yaml
@@ -4,5 +4,6 @@ channels:
 dependencies:
 - r-base
 - r-dplyr
+- r-ggplot2
 - bioconductor-variantannotation
 - bioconductor-biostrings
diff --git a/snappy_wrappers/wrappers/somatic_cnv_checking/wrapper.py b/snappy_wrappers/wrappers/somatic_cnv_checking/wrapper.py
index 2fbdbda9c..924c194e0 100644
--- a/snappy_wrappers/wrappers/somatic_cnv_checking/wrapper.py
+++ b/snappy_wrappers/wrappers/somatic_cnv_checking/wrapper.py
@@ -50,21 +50,34 @@
 #
 R --vanilla --slave << __EOF
 source("{rscript}")
-x <- vcf_to_table("{snakemake.input.vcf}", sample="{snakemake.params[args][library_name]}")
-genome_lengths <- chromosome_lengths("{reference}")
+
+genome_lengths <- chromosome_lengths("{reference}") |> dplyr::mutate(n=dplyr::row_number())
+
+x <- vcf_to_table("{snakemake.input.vcf}", sample="{snakemake.wildcards[library_name]}")
 x <- x |> dplyr::left_join(genome_lengths, by="CHROM") |> dplyr::mutate(x=POS + Offset)
+
+y <- read.table("{snakemake.input.tsv}", sep="\t", header=1, check.names=FALSE)
+y <- y |> dplyr::mutate(Call=cn_to_call(CN))
+y <- y |> dplyr::left_join(genome_lengths, by="CHROM") |> dplyr::mutate(from=start + Offset, to=stop + Offset)
+
 pdf("{snakemake.output.cnv}", height=6.22, width=9.33)
-plot_cnv(x, scale="log2") + ggplot2::ggtitle("{snakemake.params[args][library_name]}")
-plot_cnv(x, scale="sqrt") + ggplot2::ggtitle("{snakemake.params[args][library_name]}")
+plot_cnv(x, scale="log2") + ggplot2::ggtitle("{snakemake.wildcards[library_name]}")
+plot_cnv(x, scale="sqrt") + ggplot2::ggtitle("{snakemake.wildcards[library_name]}")
 dev.off()
+
 pdf("{snakemake.output.locus}", height=6.22, width=12.44)
-plot_locus(x, genome_lengths |> dplyr::mutate(n=dplyr::row_number()) |> dplyr::filter(CHROM %in% x$CHROM)) +
-    ggplot2::ggtitle("{snakemake.params[args][library_name]}")
+plot_locus(x, genome_lengths |> dplyr::filter(CHROM %in% x[["CHROM"]])) +
+    ggplot2::ggtitle("{snakemake.wildcards[library_name]}")
+dev.off()
+
+pdf("{snakemake.output.segment}", height=6.22, width=12.44)
+plot_segment(y, genome_lengths |> dplyr::filter(CHROM %in% y[["CHROM"]]))
 dev.off()
 __EOF
 
 md5 {snakemake.output.cnv} > {snakemake.output.cnv_md5}
 md5 {snakemake.output.locus} > {snakemake.output.locus_md5}
+md5 {snakemake.output.segment} > {snakemake.output.segment_md5}
 """
 )
 

From 51f594772234d96b06119ae72ad5ca27150026f7 Mon Sep 17 00:00:00 2001
From: Eric Blanc <eric.blanc@bihealth.de>
Date: Fri, 28 Jul 2023 14:39:42 +0200
Subject: [PATCH 7/9] refact: finding hetero variants in normal samples done
 only one in presence of multiple tumors

---
 .../workflows/somatic_cnv_checking/Snakefile  |  75 +++--
 .../somatic_cnv_checking/__init__.py          | 261 +++++++++---------
 .../test_workflows_somatic_cnv_checking.py    | 126 +++++----
 3 files changed, 246 insertions(+), 216 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile b/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile
index 258ca1a3c..f670a39a4 100644
--- a/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile
+++ b/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile
@@ -67,7 +67,7 @@ rule somatic_cnv_checking_pileup_normal:
     log:
         **wf.get_log_file("pileup", "normal"),
     wrapper:
-        wf.wrapper_path("bcftools/heterozygous")
+        wf.wrapper_path("bcftools/heterozygous_variants")
 
 
 # Run pileup to add read counts in the tumor sample ---------------------------
@@ -92,41 +92,38 @@ rule somatic_cnv_checking_pileup_tumor:
 
 # Add CNV status at the locii -------------------------------------------------
 
-
-rule somatic_cnv_checking_cnv_run:
-    input:
-        **wf.get_input_files("cnv", "run"),
-    output:
-        **wf.get_output_files("cnv", "run"),
-    threads: wf.get_resource("cnv", "run", "threads")
-    resources:
-        time=wf.get_resource("cnv", "run", "time"),
-        memory=wf.get_resource("cnv", "run", "memory"),
-        partition=wf.get_resource("cnv", "run", "partition"),
-        tmpdir=wf.get_resource("cnv", "run", "tmpdir"),
-    log:
-        **wf.get_log_file("cnv", "run"),
-    wrapper:
-        wf.wrapper_path("vcfpy/add_bed")
-
-
-# Generate report & plots -----------------------------------------------------
-
-
-rule somatic_cnv_checking_cnv_report:
-    input:
-        **wf.get_input_files("report", "run"),
-    output:
-        **wf.get_output_files("report", "run"),
-    params:
-        **{"args": wf.get_params("report", "run")}
-    threads: wf.get_resource("report", "run", "threads")
-    resources:
-        time=wf.get_resource("report", "run", "time"),
-        memory=wf.get_resource("report", "run", "memory"),
-        partition=wf.get_resource("report", "run", "partition"),
-        tmpdir=wf.get_resource("report", "run", "tmpdir"),
-    log:
-        **wf.get_log_file("report", "run"),
-    wrapper:
-        wf.wrapper_path("somatic_cnv_checking")
+if config["step_config"]["somatic_cnv_checking"]["path_cnv_calling"]:
+
+    rule somatic_cnv_checking_cnv_run:
+        input:
+            unpack(wf.get_input_files("cnv", "run")),
+        output:
+            **wf.get_output_files("cnv", "run"),
+        threads: wf.get_resource("cnv", "run", "threads")
+        resources:
+            time=wf.get_resource("cnv", "run", "time"),
+            memory=wf.get_resource("cnv", "run", "memory"),
+            partition=wf.get_resource("cnv", "run", "partition"),
+            tmpdir=wf.get_resource("cnv", "run", "tmpdir"),
+        log:
+            **wf.get_log_file("cnv", "run"),
+        wrapper:
+            wf.wrapper_path("vcfpy/add_bed")
+
+    # Generate report & plots -----------------------------------------------------
+
+    rule somatic_cnv_checking_cnv_report:
+        input:
+            unpack(wf.get_input_files("report", "run")),
+        output:
+            **wf.get_output_files("report", "run"),
+        threads: wf.get_resource("report", "run", "threads")
+        resources:
+            time=wf.get_resource("report", "run", "time"),
+            memory=wf.get_resource("report", "run", "memory"),
+            partition=wf.get_resource("report", "run", "partition"),
+            tmpdir=wf.get_resource("report", "run", "tmpdir"),
+        log:
+            **wf.get_log_file("report", "run"),
+        wrapper:
+            wf.wrapper_path("somatic_cnv_checking")
diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py
index 46a0a2729..3397a7c18 100644
--- a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py
+++ b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py
@@ -57,13 +57,13 @@
 Currently, no reports are generated.
 """
 
-from collections import OrderedDict
 import os
 import sys
 
 from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background
 from snakemake.io import expand
 
+from snappy_pipeline.base import InvalidConfiguration
 from snappy_pipeline.utils import dictify, listify
 from snappy_pipeline.workflows.abstract import (
     BaseStep,
@@ -72,6 +72,10 @@
     ResourceUsage,
 )
 from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow
+from snappy_pipeline.workflows.somatic_targeted_seq_cnv_calling import (
+    SomaticTargetedSeqCnvCallingWorkflow,
+)
+from snappy_pipeline.workflows.somatic_wgs_cnv_calling import SomaticWgsCnvCallingWorkflow
 
 __author__ = "Eric Blanc <eric.blanc@bih-charite.de>"
 
@@ -88,7 +92,7 @@
   somatic_cnv_checking:
     path_ngs_mapping: ../ngs_mapping  # REQUIRED
     path_cnv_calling: ""              # Can use for instance ../somatic_targeted_seq_cnv_calling
-    cnv_calling_tool: ""              # Can use "cnvkit"
+    cnv_assay_type: ""                # Empty: no CNV, WES for somatic_targeted_seq_snv_calling step, WGS for somatic_wgs_cnv_calling step
     excluded_regions: ""              # Bed file of regions to be excluded
     max_depth: 10000                  # Max depth for pileups
     min_cov: 20                       # Minimum depth for reference and alternative alleles to consider variant
@@ -98,32 +102,10 @@
 
 class SomaticCnvCheckingStepPart(BaseStepPart):
     """Base class for CNV checking sub-steps"""
-        
-    def __init__(self, parent):
-        super().__init__(parent)
-        # Build shortcut from cancer bio sample name to matched cancer sample
-        self.tumor_ngs_library_to_sample_pair = OrderedDict()
-        for sheet in self.parent.shortcut_sheets:
-            self.tumor_ngs_library_to_sample_pair.update(
-                sheet.all_sample_pairs_by_tumor_dna_ngs_library
-            )
-        # Get shorcut to Snakemake sub workflow
-        self.ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
-
-    def get_params(self, action):
-        # Validate action
-        self._validate_action(action)
-
-        def input_function(wildcards):
-            pair = self.tumor_ngs_library_to_sample_pair[wildcards.library]
-            return {"library_name": pair.tumor_sample.dna_ngs_library.name}
-
-        return input_function
 
     @dictify
     def _get_log_file(self, prefix):
         """Return dict of log files."""
-        # Validate action
         key_ext = (
             ("log", ".log"),
             ("conda_info", ".conda_info.txt"),
@@ -140,14 +122,17 @@ class SomaticCnvCheckingPileupStepPart(SomaticCnvCheckingStepPart):
     name = "pileup"
     actions = ("normal", "tumor")
 
+    def __init__(self, parent):
+        super().__init__(parent)
+        self.ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
+
     def get_input_files(self, action):
         # Validate action
         self._validate_action(action)
 
         def input_function_normal(wildcards):
-            pair = self.tumor_ngs_library_to_sample_pair[wildcards.library]
-            base_path = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format(
-                normal_library=pair.normal_sample.dna_ngs_library.name, **wildcards
+            base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(
+                **wildcards
             )
             return {
                 "bam": self.ngs_mapping(base_path + ".bam"),
@@ -155,16 +140,15 @@ def input_function_normal(wildcards):
             }
 
         def input_function_tumor(wildcards):
-            pair = self.tumor_ngs_library_to_sample_pair[wildcards.library]
-            base_path = "output/{mapper}.{tumor_library}/out/{mapper}.{tumor_library}".format(
-                tumor_library=pair.tumor_sample.dna_ngs_library.name, **wildcards
+            base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format(
+                **wildcards
             )
             return {
-                "vcf": "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz".format(
-                    **wildcards
+                "locii": "work/{mapper}.{normal_library}/out/{mapper}.{normal_library}.normal.vcf.gz".format(
+                    normal_library=self.parent.tumor_to_normal[wildcards.library_name], **wildcards
                 ),
-                "tbi": "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz.tbi".format(
-                    **wildcards
+                "locii_tbi": "work/{mapper}.{normal_library}/out/{mapper}.{normal_library}.normal.vcf.gz.tbi".format(
+                    normal_library=self.parent.tumor_to_normal[wildcards.library_name], **wildcards
                 ),
                 "bam": self.ngs_mapping(base_path + ".bam"),
                 "bai": self.ngs_mapping(base_path + ".bam.bai"),
@@ -181,13 +165,17 @@ def get_output_files(self, action):
         """
         # Validate action
         self._validate_action(action)
-        base_path_out = "work/{{mapper}}.{{library}}/out/{{mapper}}.{{library}}.{action}{ext}"
+        base_path_out = (
+            "work/{{mapper}}.{{library_name}}/out/{{mapper}}.{{library_name}}.{action}{ext}"
+        )
         return dict(zip(EXT_NAMES, expand(base_path_out, action=action, ext=EXT_VALUES)))
 
     def get_log_file(self, action):
         # Validate action
         self._validate_action(action)
-        return self._get_log_file("work/{mapper}.{library}/log/{mapper}.{library}." + action)
+        return self._get_log_file(
+            "work/{mapper}.{library_name}/log/{mapper}.{library_name}." + action
+        )
 
     def get_resource_usage(self, action):
         # Validate action
@@ -208,40 +196,44 @@ class SomaticCnvCheckingCnvStepPart(SomaticCnvCheckingStepPart):
     def get_input_files(self, action):
         # Validate action
         self._validate_action(action)
-        filenames = {}
-        filenames["normal"] = "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz"
-        filenames["normal_tbi"] = filenames["normal"] + ".tbi"
-        filenames["tumor"] = "work/{mapper}.{library}/out/{mapper}.{library}.tumor.vcf.gz"
-        filenames["tumor_tbi"] = filenames["tumor"] + ".tbi"
-        if self.config["path_cnv_calling"]:
-            tpl = "{{mapper}}.{caller}.{{library}}".format(caller=self.config["cnv_calling_tool"])
-            filenames["cnv"] = os.path.join(
-                self.config["path_cnv_calling"], "output", tpl, "out", tpl + ".bed.gz"
+
+        def input_function(wildcards):
+            normal_library = self.parent.tumor_to_normal[wildcards.library_name]
+            filenames = {}
+            name_pattern = "{mapper}.{normal_library}"
+            tpl = os.path.join("work", name_pattern, "out", name_pattern + ".normal.vcf.gz")
+            filenames["normal"] = tpl.format(normal_library=normal_library, **wildcards)
+            filenames["normal_tbi"] = filenames["normal"] + ".tbi"
+            name_pattern = "{mapper}.{library_name}"
+            tpl = os.path.join("work", name_pattern, "out", name_pattern + ".tumor.vcf.gz")
+            filenames["tumor"] = tpl.format(**wildcards)
+            filenames["tumor_tbi"] = filenames["tumor"] + ".tbi"
+            cnv_calling = self.parent.sub_workflows["cnv_calling"]
+            base_path = "output/{mapper}.{caller}.{library_name}/out/{mapper}.{caller}.{library_name}".format(
+                **wildcards
             )
+            filenames["cnv"] = cnv_calling(base_path + ".bed.gz")
             filenames["cnv_tbi"] = filenames["cnv"] + ".tbi"
-        return filenames
+            return filenames
+
+        return input_function
 
+    @dictify
     def get_output_files(self, action):
         # Validate action
         self._validate_action(action)
-        if self.config["path_cnv_calling"] and self.config["cnv_calling_tool"]:
-            name_pattern = "{{mapper}}.{caller}.{{library}}".format(
-                caller=self.config["cnv_calling_tool"]
-            )
-        else:
-            name_pattern = "{mapper}.{library}"
+        key_ext = {"vcf": ".vcf.gz", "tbi": ".vcf.gz.tbi"}
+        name_pattern = "{mapper}.{caller}.{library_name}"
+        key_ext["tsv"] = ".tsv"
         base_path_out = "work/" + name_pattern + "/out/" + name_pattern
-        return dict(zip(EXT_NAMES, [base_path_out + ext for ext in EXT_VALUES]))
+        for key, ext in key_ext.items():
+            yield (key, base_path_out + ext)
+            yield (key + "_md5", base_path_out + ext + ".md5")
 
     def get_log_file(self, action):
         # Validate action
         self._validate_action(action)
-        if self.config["path_cnv_calling"] and self.config["cnv_calling_tool"]:
-            name_pattern = "{{mapper}}.{caller}.{{library}}".format(
-                caller=self.config["cnv_calling_tool"]
-            )
-        else:
-            name_pattern = "{mapper}.{library}"
+        name_pattern = "{mapper}.{caller}.{library_name}"
         return self._get_log_file(os.path.join("work", name_pattern, "log", name_pattern))
 
 
@@ -254,33 +246,35 @@ class SomaticCnvCheckingReportStepPart(SomaticCnvCheckingStepPart):
     def get_input_files(self, action):
         # Validate action
         self._validate_action(action)
-        name_pattern = "{{mapper}}.{caller}.{{library}}".format(
-            caller=self.config["cnv_calling_tool"]
-        )
-        base_path_out = "work/" + name_pattern + "/out/" + name_pattern
-        return {"vcf": base_path_out + ".vcf.gz"}
+
+        def input_function(wildcards):
+            name_pattern = "{mapper}.{caller}.{library_name}".format(**wildcards)
+            base_path_out = "work/" + name_pattern + "/out/" + name_pattern
+            return {"vcf": base_path_out + ".vcf.gz", "tsv": base_path_out + ".tsv"}
+
+        return input_function
 
     def get_output_files(self, action):
         # Validate action
         self._validate_action(action)
-        name_pattern = "{{mapper}}.{caller}.{{library}}".format(
-            caller=self.config["cnv_calling_tool"]
-        )
+        name_pattern = "{mapper}.{caller}.{library_name}"
         base_path_out = "work/" + name_pattern + "/report/" + name_pattern
         return {
             "cnv": base_path_out + ".cnv.pdf",
             "cnv_md5": base_path_out + ".cnv.pdf.md5",
             "locus": base_path_out + ".locus.pdf",
             "locus_md5": base_path_out + ".locus.pdf.md5",
+            "segment": base_path_out + ".segment.pdf",
+            "segment_md5": base_path_out + ".segment.pdf.md5",
         }
 
     def get_log_file(self, action):
         # Validate action
         self._validate_action(action)
-        name_pattern = "{{mapper}}.{caller}.{{library}}".format(
-            caller=self.config["cnv_calling_tool"]
+        name_pattern = "{mapper}.{caller}.{library_name}"
+        return self._get_log_file(
+            os.path.join("work", name_pattern, "log", name_pattern + ".report")
         )
-        return self._get_log_file(os.path.join("work", name_pattern, "log", name_pattern + ".report"))
 
 
 class SomaticCnvCheckingWorkflow(BaseStep):
@@ -308,8 +302,24 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
             config_lookup_paths,
             config_paths,
             workdir,
-            (NgsMappingWorkflow,),
+            (
+                SomaticTargetedSeqCnvCallingWorkflow,
+                SomaticWgsCnvCallingWorkflow,
+                NgsMappingWorkflow,
+            ),
         )
+        if self.config["path_cnv_calling"] and self.config["cnv_assay_type"]:
+            if self.config["cnv_assay_type"] == "WES":
+                cnv_calling = "somatic_targeted_seq_cnv_calling"
+            elif self.config["cnv_assay_type"] == "WES":
+                cnv_calling = "somatic_wgs_cnv_calling"
+            else:
+                raise InvalidConfiguration(
+                    "Illegal cnv_assay_type {}, must be either WES or WGS".format(
+                        self.config["cnv_assay_type"]
+                    )
+                )
+            self.register_sub_workflow(cnv_calling, self.config["path_cnv_calling"], "cnv_calling")
         self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"])
         # Register sub step classes so the sub steps are available
         self.register_sub_step_classes(
@@ -320,7 +330,23 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
                 LinkOutStepPart,
             )
         )
-        # Initialize sub-workflows
+        # Assemble normal/tumor pairs
+        self.tumor_to_normal = {}
+        for sheet in filter(is_not_background, self.shortcut_sheets):
+            for sample_pair in sheet.all_sample_pairs:
+                if (
+                    not sample_pair.tumor_sample.dna_ngs_library
+                    or not sample_pair.normal_sample.dna_ngs_library
+                ):
+                    msg = (
+                        "INFO: sample pair for cancer bio sample {} has is missing primary"
+                        "normal or primary cancer NGS library"
+                    )
+                    print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr)
+                    continue
+                tumor_library = sample_pair.tumor_sample.dna_ngs_library.name
+                normal_library = sample_pair.normal_sample.dna_ngs_library.name
+                self.tumor_to_normal[tumor_library] = normal_library
 
     @listify
     def get_result_files(self):
@@ -328,68 +354,55 @@ def get_result_files(self):
 
         We will process all NGS libraries of all bio samples in all sample sheets.
         """
-        if self.config["path_cnv_calling"] and self.config["cnv_calling_tool"]:
-            name_pattern = "{{mapper}}.{caller}.{{tumor_library.name}}".format(
-                caller=self.config["cnv_calling_tool"]
-            )
-        else:
-            name_pattern = "{mapper}.{tumor_library.name}"
-        yield from self._yield_result_files_matched(
-            os.path.join("output", name_pattern, "out", name_pattern + "{ext}"),
+        # Log files from normal pileups
+        name_pattern = "{mapper}.{library_name}"
+        chksum = ("", ".md5")
+        ext = ("log", "conda_info.txt", "conda_list.txt")
+        yield from expand(
+            os.path.join("output", name_pattern, "log", name_pattern + ".normal.{ext}{chksum}"),
             mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
-            ext=EXT_VALUES,
+            library_name=set(self.tumor_to_normal.values()),
+            ext=ext,
+            chksum=chksum,
         )
-        yield from self._yield_result_files_matched(
-            os.path.join("output", name_pattern, "log", name_pattern + "{ext}"),
+        yield from expand(
+            os.path.join("output", name_pattern, "log", name_pattern + ".tumor.{ext}{chksum}"),
             mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
-            ext=[
-                part + "." + ext + chksum
-                for part in ("", ".normal", ".tumor")
-                for ext in ("log", "conda_info.txt", "conda_list.txt")
-                for chksum in ("", ".md5")
-            ],
+            library_name=self.tumor_to_normal.keys(),
+            ext=ext,
+            chksum=chksum,
         )
-        if self.config["path_cnv_calling"] and self.config["cnv_calling_tool"]:
-            yield from self._yield_result_files_matched(
-                os.path.join("output", name_pattern, "report", name_pattern + "{ext}"),
-                mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
-                ext=(".cnv.pdf", ".cnv.pdf.md5", ".locus.pdf", ".locus.pdf.md5"),
-            )
-            yield from self._yield_result_files_matched(
-                os.path.join("output", name_pattern, "log", name_pattern + ".report" + "{ext}"),
+        # Main result: vcf & optionally segment table if CNV available
+        ext = {"out": [".vcf.gz", ".vcf.gz.tbi"]}
+        if self.config["path_cnv_calling"]:
+            # CNV avaliable
+            name_pattern = "{mapper}.{caller}.{library_name}"
+            callers = self.w_config["step_config"]["somatic_targeted_seq_cnv_calling"]["tools"]
+            ext["out"] += [".tsv"]
+            ext["report"] = (".cnv.pdf", ".locus.pdf", ".segment.pdf")
+            ext["log"] = [
+                suffix + "." + e
+                for suffix in ("", ".report")
+                for e in ("log", "conda_info.txt", "conda_list.txt")
+            ]
+        else:
+            name_pattern = "{mapper}.{library_name}"
+            callers = []
+        for subdir, exts in ext.items():
+            yield from expand(
+                os.path.join("output", name_pattern, subdir, name_pattern + "{ext}{chksum}"),
                 mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"],
-                ext=[
-                    "." + ext + chksum
-                    for ext in ("log", "conda_info.txt", "conda_list.txt")
-                    for chksum in ("", ".md5")
-                ],
+                caller=callers,
+                library_name=self.tumor_to_normal.keys(),
+                ext=exts,
+                chksum=chksum,
             )
 
-    def _yield_result_files_matched(self, tpl, **kwargs):
-        """Build output paths from path template and extension list.
-
-        This function returns the results from the matched somatic variant callers such as
-        Mutect.
-        """
-        for sheet in filter(is_not_background, self.shortcut_sheets):
-            for sample_pair in sheet.all_sample_pairs:
-                if (
-                    not sample_pair.tumor_sample.dna_ngs_library
-                    or not sample_pair.normal_sample.dna_ngs_library
-                ):
-                    msg = (
-                        "INFO: sample pair for cancer bio sample {} has is missing primary"
-                        "normal or primary cancer NGS library"
-                    )
-                    print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr)
-                    continue
-                yield from expand(
-                    tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs
-                )
-
     def check_config(self):
         """Check that the path to the NGS mapping is present"""
         self.ensure_w_config(
             ("step_config", "somatic_cnv_checking", "path_ngs_mapping"),
             "Path to NGS mapping not configured but required for somatic variant calling",
         )
+        if self.config["path_cnv_calling"]:
+            assert self.config["cnv_assay_type"] in ("WES", "WGS")
diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py
index 393668fc1..2310f7676 100644
--- a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py
+++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py
@@ -8,9 +8,7 @@
 import ruamel.yaml as ruamel_yaml
 from snakemake.io import Wildcards
 
-from snappy_pipeline.workflows.somatic_cnv_checking import (
-    SomaticCnvCheckingWorkflow,
-)
+from snappy_pipeline.workflows.somatic_cnv_checking import SomaticCnvCheckingWorkflow
 
 from .common import get_expected_log_files_dict, get_expected_output_vcf_files_dict
 from .conftest import patch_module_fs
@@ -42,7 +40,7 @@ def minimal_config():
           somatic_cnv_checking:
             path_ngs_mapping: ../ngs_mapping
             path_cnv_calling: ../somatic_targeted_seq_cnv_calling
-            cnv_calling_tool: cnvkit
+            cnv_assay_type: WES
 
         data_sets:
           first_batch:
@@ -72,7 +70,10 @@ def somatic_cnv_checking_workflow(
     patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker)
     # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we
     # can obtain paths from the function as if we really had a NGSMappingPipelineStep here
-    dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x}
+    dummy_workflow.globals = {
+        "ngs_mapping": lambda x: "NGS_MAPPING/" + x,
+        "cnv_calling": lambda x: "CNV_CALLING/" + x,
+    }
     # Construct the workflow object
     return SomaticCnvCheckingWorkflow(
         dummy_workflow,
@@ -88,7 +89,7 @@ def somatic_cnv_checking_workflow(
 
 def test_pileup_normal_step_part_get_input_files(somatic_cnv_checking_workflow):
     """Tests CnvCheckingPileupStepPart.get_input_files() - action normal"""
-    wildcards = Wildcards(fromdict={"mapper": "bwa", "library": "P001-T1-DNA1-WGS1"})
+    wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-N1-DNA1-WGS1"})
     expected = {
         "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam",
         "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai",
@@ -99,7 +100,7 @@ def test_pileup_normal_step_part_get_input_files(somatic_cnv_checking_workflow):
 
 def test_pileup_normal_step_part_get_output_files(somatic_cnv_checking_workflow):
     """Tests CnvCheckingPileupStepPart.get_output_files() - action normal"""
-    base_name = "work/{mapper}.{library}/out/{mapper}.{library}.normal"
+    base_name = "work/{mapper}.{library_name}/out/{mapper}.{library_name}.normal"
     expected = get_expected_output_vcf_files_dict(base_out=base_name)
     actual = somatic_cnv_checking_workflow.get_output_files("pileup", "normal")
     assert actual == expected
@@ -107,7 +108,7 @@ def test_pileup_normal_step_part_get_output_files(somatic_cnv_checking_workflow)
 
 def test_pileup_normal_step_part_get_log_file(somatic_cnv_checking_workflow):
     """Tests CnvCheckingPileupStepPart.get_log_file() - action normal"""
-    base_name = "work/{mapper}.{library}/log/{mapper}.{library}.normal"
+    base_name = "work/{mapper}.{library_name}/log/{mapper}.{library_name}.normal"
     expected = get_expected_log_files_dict(base_out=base_name)
     actual = somatic_cnv_checking_workflow.get_log_file("pileup", "normal")
     assert actual == expected
@@ -124,10 +125,10 @@ def test_pileup_normal_step_part_get_resource(somatic_cnv_checking_workflow):
 
 def test_pileup_tumor_step_part_get_input_files(somatic_cnv_checking_workflow):
     """Tests CnvCheckingPileupStepPart.get_input_files() - action tumor"""
-    wildcards = Wildcards(fromdict={"mapper": "bwa", "library": "P001-T1-DNA1-WGS1"})
+    wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"})
     expected = {
-        "vcf": "work/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.normal.vcf.gz",
-        "tbi": "work/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.normal.vcf.gz.tbi",
+        "locii": "work/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.normal.vcf.gz",
+        "locii_tbi": "work/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.normal.vcf.gz.tbi",
         "bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam",
         "bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai",
     }
@@ -137,7 +138,7 @@ def test_pileup_tumor_step_part_get_input_files(somatic_cnv_checking_workflow):
 
 def test_pileup_tumor_step_part_get_output_files(somatic_cnv_checking_workflow):
     """Tests CnvCheckingPileupStepPart.get_output_files() - action tumor"""
-    base_name = "work/{mapper}.{library}/out/{mapper}.{library}.tumor"
+    base_name = "work/{mapper}.{library_name}/out/{mapper}.{library_name}.tumor"
     expected = get_expected_output_vcf_files_dict(base_out=base_name)
     actual = somatic_cnv_checking_workflow.get_output_files("pileup", "tumor")
     assert actual == expected
@@ -145,7 +146,7 @@ def test_pileup_tumor_step_part_get_output_files(somatic_cnv_checking_workflow):
 
 def test_pileup_tumor_step_part_get_log_file(somatic_cnv_checking_workflow):
     """Tests CnvCheckingPileupStepPart.get_log_file() - action tumor"""
-    base_name = "work/{mapper}.{library}/log/{mapper}.{library}.tumor"
+    base_name = "work/{mapper}.{library_name}/log/{mapper}.{library_name}.tumor"
     expected = get_expected_log_files_dict(base_out=base_name)
     actual = somatic_cnv_checking_workflow.get_log_file("pileup", "tumor")
     assert actual == expected
@@ -162,29 +163,39 @@ def test_pileup_tumor_step_part_get_resource(somatic_cnv_checking_workflow):
 
 def test_cnv_run_step_part_get_input_files(somatic_cnv_checking_workflow):
     """Tests SomaticCnvCheckingCnvStepPart.get_input_files()"""
+    wildcards = Wildcards(
+        fromdict={"mapper": "bwa", "caller": "cnvkit", "library_name": "P001-T1-DNA1-WGS1"}
+    )
     expected = {
-        "cnv": "../somatic_targeted_seq_cnv_calling/output/{mapper}.cnvkit.{library}/out/{mapper}.cnvkit.{library}.bed.gz",
-        "cnv_tbi": "../somatic_targeted_seq_cnv_calling/output/{mapper}.cnvkit.{library}/out/{mapper}.cnvkit.{library}.bed.gz.tbi",
-        "normal": "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz",
-        "normal_tbi": "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz.tbi",
-        "tumor": "work/{mapper}.{library}/out/{mapper}.{library}.tumor.vcf.gz",
-        "tumor_tbi": "work/{mapper}.{library}/out/{mapper}.{library}.tumor.vcf.gz.tbi",
+        "cnv": "CNV_CALLING/output/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.bed.gz",
+        "cnv_tbi": "CNV_CALLING/output/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.bed.gz.tbi",
+        "normal": "work/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.normal.vcf.gz",
+        "normal_tbi": "work/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.normal.vcf.gz.tbi",
+        "tumor": "work/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.tumor.vcf.gz",
+        "tumor_tbi": "work/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.tumor.vcf.gz.tbi",
     }
-    actual = somatic_cnv_checking_workflow.get_input_files("cnv", "run")
+    actual = somatic_cnv_checking_workflow.get_input_files("cnv", "run")(wildcards)
     assert actual == expected
 
 
 def test_cnv_run_step_part_get_output_files(somatic_cnv_checking_workflow):
     """Tests SomaticCnvCheckingCnvStepPart.get_output_files()"""
-    base_name = "work/{mapper}.cnvkit.{library}/out/{mapper}.cnvkit.{library}"
-    expected = get_expected_output_vcf_files_dict(base_out=base_name)
+    base_name = "work/{mapper}.{caller}.{library_name}/out/{mapper}.{caller}.{library_name}"
+    expected = {
+        "vcf": base_name + ".vcf.gz",
+        "vcf_md5": base_name + ".vcf.gz.md5",
+        "tbi": base_name + ".vcf.gz.tbi",
+        "tbi_md5": base_name + ".vcf.gz.tbi.md5",
+        "tsv": base_name + ".tsv",
+        "tsv_md5": base_name + ".tsv.md5",
+    }
     actual = somatic_cnv_checking_workflow.get_output_files("cnv", "run")
     assert actual == expected
 
 
 def test_cnv_run_step_part_get_log_file(somatic_cnv_checking_workflow):
     """Tests SomaticCnvCheckingCnvStepPart.get_log_file()"""
-    base_name = "work/{mapper}.cnvkit.{library}/log/{mapper}.cnvkit.{library}"
+    base_name = "work/{mapper}.{caller}.{library_name}/log/{mapper}.{caller}.{library_name}"
     expected = get_expected_log_files_dict(base_out=base_name)
     actual = somatic_cnv_checking_workflow.get_log_file("cnv", "run")
     assert actual == expected
@@ -192,19 +203,27 @@ def test_cnv_run_step_part_get_log_file(somatic_cnv_checking_workflow):
 
 def test_report_run_step_part_get_input_files(somatic_cnv_checking_workflow):
     """Tests SomaticCnvCheckingReportStepPart.get_input_files()"""
-    expected = {"vcf": "work/{mapper}.cnvkit.{library}/out/{mapper}.cnvkit.{library}.vcf.gz"}
-    actual = somatic_cnv_checking_workflow.get_input_files("report", "run")
+    wildcards = Wildcards(
+        fromdict={"mapper": "bwa", "caller": "cnvkit", "library_name": "P001-T1-DNA1-WGS1"}
+    )
+    expected = {
+        "vcf": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.vcf.gz",
+        "tsv": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.tsv",
+    }
+    actual = somatic_cnv_checking_workflow.get_input_files("report", "run")(wildcards)
     assert actual == expected
 
 
 def test_report_run_step_part_get_output_files(somatic_cnv_checking_workflow):
     """Tests SomaticCnvCheckingReportStepPart.get_output_files()"""
-    base_name = "work/{mapper}.cnvkit.{library}/report/{mapper}.cnvkit.{library}"
+    base_name = "work/{mapper}.{caller}.{library_name}/report/{mapper}.{caller}.{library_name}"
     expected = {
         "cnv": base_name + ".cnv.pdf",
         "cnv_md5": base_name + ".cnv.pdf.md5",
         "locus": base_name + ".locus.pdf",
         "locus_md5": base_name + ".locus.pdf.md5",
+        "segment": base_name + ".segment.pdf",
+        "segment_md5": base_name + ".segment.pdf.md5",
     }
     actual = somatic_cnv_checking_workflow.get_output_files("report", "run")
     assert actual == expected
@@ -212,20 +231,12 @@ def test_report_run_step_part_get_output_files(somatic_cnv_checking_workflow):
 
 def test_report_run_step_part_get_log_file(somatic_cnv_checking_workflow):
     """Tests SomaticCnvCheckingReportStepPart.get_log_file()"""
-    base_name = "work/{mapper}.cnvkit.{library}/log/{mapper}.cnvkit.{library}.report"
+    base_name = "work/{mapper}.{caller}.{library_name}/log/{mapper}.{caller}.{library_name}.report"
     expected = get_expected_log_files_dict(base_out=base_name)
     actual = somatic_cnv_checking_workflow.get_log_file("report", "run")
     assert actual == expected
 
 
-def test_report_run_step_part_get_params(somatic_cnv_checking_workflow):
-    """Tests SomaticCnvCheckingReportStepPart.get_params()"""
-    wildcards = Wildcards(fromdict={"mapper": "bwa", "library": "P001-T1-DNA1-WGS1"})
-    expected = {"library_name": "P001-T1-DNA1-WGS1"}
-    actual = somatic_cnv_checking_workflow.get_params("report", "run")(wildcards)
-    assert actual == expected
-
-
 # Tests for SomaticCnvCheckingWorkflow --------------------------------------------------
 
 
@@ -237,37 +248,46 @@ def test_somatic_cnv_checking_workflow(somatic_cnv_checking_workflow):
     assert actual == expected
 
     # main output
-    tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/out/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}"
+    tpl = (
+        "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/out/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}{chksum}"
+    )
     expected = [
-        tpl.format(i=i, t=t, ext=ext)
+        tpl.format(i=i, t=t, ext=ext, chksum=chksum)
         for i, t in ((1, 1), (2, 1), (2, 2))
-        for ext in (
-            "vcf.gz",
-            "vcf.gz.tbi",
-            "vcf.gz.md5",
-            "vcf.gz.tbi.md5",
-        )
+        for ext in ("vcf.gz", "vcf.gz.tbi", "tsv")
+        for chksum in ("", ".md5")
     ]
 
     # report
-    tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}"
+    tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}{chksum}"
     expected += [
-        tpl.format(i=i, t=t, ext=ext)
+        tpl.format(i=i, t=t, ext=ext, chksum=chksum)
         for i, t in ((1, 1), (2, 1), (2, 2))
-        for ext in (
-            "cnv.pdf",
-            "cnv.pdf.md5",
-            "locus.pdf",
-            "locus.pdf.md5",
-        )
+        for ext in ("cnv.pdf", "locus.pdf", "segment.pdf")
+        for chksum in ("", ".md5")
     ]
 
-    # logs
+    # logs (no caller)
+    tpl = "output/bwa.P00{i}-N1-DNA1-WGS1/log/bwa.P00{i}-N1-DNA1-WGS1.normal.{ext}{chksum}"
+    expected += [
+        tpl.format(i=i, ext=ext, chksum=chksum)
+        for i in (1, 2)
+        for ext in ("log", "conda_info.txt", "conda_list.txt")
+        for chksum in ("", ".md5")
+    ]
+    tpl = "output/bwa.P00{i}-T{t}-DNA1-WGS1/log/bwa.P00{i}-T{t}-DNA1-WGS1.tumor.{ext}{chksum}"
+    expected += [
+        tpl.format(i=i, t=t, ext=ext, chksum=chksum)
+        for i, t in ((1, 1), (2, 1), (2, 2))
+        for ext in ("log", "conda_info.txt", "conda_list.txt")
+        for chksum in ("", ".md5")
+    ]
+    # logs (with caller)
     tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/log/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1{part}.{ext}{chksum}"
     expected += [
         tpl.format(i=i, t=t, part=part, ext=ext, chksum=chksum)
         for i, t in ((1, 1), (2, 1), (2, 2))
-        for part in ("", ".normal", ".tumor", ".report")
+        for part in ("", ".report")
         for ext in ("log", "conda_info.txt", "conda_list.txt")
         for chksum in ("", ".md5")
     ]

From 4996d4da33e1b64e78bc789861b8d98ae80ddb19 Mon Sep 17 00:00:00 2001
From: Eric Blanc <eric.blanc@bihealth.de>
Date: Fri, 28 Jul 2023 18:59:36 +0200
Subject: [PATCH 8/9] fix: fix problems in no coverage corner cases

---
 .../wrappers/vcfpy/add_bed/wrapper.py         | 58 +++++++++++++------
 1 file changed, 39 insertions(+), 19 deletions(-)

diff --git a/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py b/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py
index a8d7f5929..d98068a87 100644
--- a/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py
+++ b/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py
@@ -40,7 +40,7 @@
 ]
 ps[nProc - 1].stdout.close()  # Allow p1 to receive a SIGPIPE if p2 exits.
 nProc += 1
-if config["excluded_regions"]:
+if "excluded_regions" in config and config["excluded_regions"]:
     ps += [
         subprocess.Popen(
             ["bcftools", "view", "--targets-file", "^" + config["excluded_regions"]],
@@ -124,10 +124,21 @@
                 "start": locus[1],
                 "stop": locus[2],
                 "BAFs": [],
+                "NoData": 0,
             }
         depth = variant.call_for_sample[sample].data["AD"]
-        baf = min(depth[0], depth[1]) / (depth[0] + depth[1])
-        segments[locus[3]]["BAFs"] += [baf]
+        if len(depth) == 0:
+            segments[locus[3]]["NoData"] += 1
+        else:
+            if depth[0] is None:
+                depth[0] = 0
+            if depth[1] is None:
+                depth[1] = 0
+            if depth[0] + depth[1] == 0:
+                segments[locus[3]]["NoData"] += 1
+            else:
+                baf = min(depth[0], depth[1]) / (depth[0] + depth[1])
+                segments[locus[3]]["BAFs"] += [baf]
 
 writer.close()
 
@@ -174,7 +185,7 @@ def quantile(x, probs, na_rm=False, method=7):
 
 with open(snakemake.output.tsv, "wt") as f:
     print(
-        "Number\tCHROM\tstart\tstop\tCN\tLFC\tNvariants\tMin\t1st quartile\tMedian\t3rd quartile\tMax",
+        "Number\tCHROM\tstart\tstop\tCN\tLFC\tNvariants\tMin\t1st quartile\tMedian\t3rd quartile\tMax\tNoData",
         file=f,
     )
     for segId, segment in segments.items():
@@ -186,21 +197,30 @@ def quantile(x, probs, na_rm=False, method=7):
         nVar = len(segment["BAFs"])
         bafs = sorted(segment["BAFs"])
 
-        qs = quantile(segment["BAFs"], (0.25, 0.5, 0.75), method=8)
-        values = [
-            n,
-            segment["CHROM"],
-            segment["start"],
-            segment["stop"],
-            cn,
-            segment["logFoldChange"],
-            nVar,
-            bafs[0],
-            qs[0],
-            qs[1],
-            qs[2],
-            bafs[nVar - 1],
-        ]
+        if nVar > 1:
+            qs = (
+                [bafs[0]]
+                + quantile(segment["BAFs"], (0.25, 0.5, 0.75), method=8)
+                + [bafs[nVar - 1]]
+            )
+        elif nVar == 1:
+            qs = [bafs[0]] * 5
+        else:
+            qs = [""] * 5
+
+        values = (
+            [
+                n,
+                segment["CHROM"],
+                segment["start"],
+                segment["stop"],
+                cn,
+                segment["logFoldChange"],
+                nVar,
+            ]
+            + qs
+            + [segment["NoData"]]
+        )
         print("\t".join(map(str, values)), file=f)
 
 shell(

From 882d3826d9833b076f7780e04a99cdb5e06bab4b Mon Sep 17 00:00:00 2001
From: Eric Blanc <eric.blanc@bihealth.de>
Date: Mon, 21 Aug 2023 12:31:03 +0200
Subject: [PATCH 9/9] fix: Use strand to detect negative log-fold-change values
 encoded in score.

---
 snappy_pipeline/workflows/somatic_cnv_checking/__init__.py    | 2 +-
 .../wrappers/bcftools/heterozygous_variants/wrapper.py        | 4 ++--
 .../wrappers/somatic_cnv_checking/cnv-check-plot.R            | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py
index 3397a7c18..f5781c417 100644
--- a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py
+++ b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py
@@ -95,7 +95,7 @@
     cnv_assay_type: ""                # Empty: no CNV, WES for somatic_targeted_seq_snv_calling step, WGS for somatic_wgs_cnv_calling step
     excluded_regions: ""              # Bed file of regions to be excluded
     max_depth: 10000                  # Max depth for pileups
-    min_cov: 20                       # Minimum depth for reference and alternative alleles to consider variant
+    min_depth: 20                     # Minimum depth for reference and alternative alleles to consider variant
     min_baf: 0.4                      # Maximum BAF to consider variant as heterozygous (between 0 & 1/2)
 """
 
diff --git a/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py b/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py
index 385dfbfa5..2d1770494 100644
--- a/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py
+++ b/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py
@@ -35,7 +35,7 @@
 conda info > {snakemake.log.conda_info}
 
 only_one_variant="N_ALT=2 & FORMAT/AD[:2]=0"
-min_coverage="FORMAT/AD[:0]>{config[min_cov]} & FORMAT/AD[:1]>{config[min_cov]}"
+min_depth="FORMAT/AD[:0]>{config[min_depth]} & FORMAT/AD[:1]>{config[min_depth]}"
 hetero="{min_ratio}*FORMAT/AD[:0]<=FORMAT/AD[:1] & FORMAT/AD[:1]<={max_ratio}*FORMAT/AD[:0]"
 
 bcftools mpileup \
@@ -45,7 +45,7 @@
     -a "FORMAT/AD" \
     {snakemake.input.bam} \
     | bcftools filter \
-    --include "$only_one_variant & $min_coverage & $hetero" \
+    --include "$only_one_variant & $min_depth & $hetero" \
     -O z -o {snakemake.output.vcf}
 tabix {snakemake.output.vcf}
 
diff --git a/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R b/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R
index 5be6106f6..f6d54977e 100644
--- a/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R
+++ b/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R
@@ -33,7 +33,7 @@ cnv_to_table <- function(cnv) {
     y <- read.table(cnv, sep="\t", header=0)
     colnames(y) <- c("CHROM", "start", "stop", "name", "LFC", "strand")
     y <- y |>
-        dplyr::mutate(LFC=replace(LFC, strand == "-", LFC[strand=="-"])) |>
+        dplyr::mutate(LFC=replace(LFC, strand == "-", -abs(LFC[strand=="-"]))) |>
         dplyr::mutate(start=start + 1)
     y
 }