Commit 4874074

feat: feature-complete end-to-end run of the Becnel dataset on hs37d5 & GRCh38.d1.vd1 (#460)

Co-authored-by: Manuela Benary <manuela.benary@bihealth.de>
ericblanc20 and Manuela Benary authored Oct 31, 2023
1 parent e394e8c commit 4874074
Showing 65 changed files with 1,009 additions and 525 deletions.
1 change: 1 addition & 0 deletions requirements/test.txt
@@ -5,6 +5,7 @@ pytest
coverage
pytest-cov
pytest-mock
pytest-subprocess

# Fake file system for testing
pyfakefs
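The newly added `pytest-subprocess` package lets tests fake external commands instead of running them. A minimal sketch of its `fp` fixture, assuming a test that exercises code shelling out to `samtools` (the command and output below are illustrative, not taken from this commit):

import subprocess

def test_samtools_version(fp):
    # Register the faked command before the code under test invokes it.
    fp.register(["samtools", "--version"], stdout="samtools 1.17")
    result = subprocess.run(["samtools", "--version"], capture_output=True, text=True)
    assert "samtools 1.17" in result.stdout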
50 changes: 31 additions & 19 deletions snappy_pipeline/workflows/cbioportal_export/__init__.py
@@ -77,6 +77,7 @@
cbioportal_export:
# Required for RNA expression
path_ngs_mapping: "" # When missing, no expression data is uploaded to cBioPortal
mapping_tool: "bwa"
expression_tool: "star"
# Required for somatic variants
path_somatic_variant: ../somatic_variant_filtration # REQUIRED (before or after filtration)
@@ -334,7 +335,7 @@ def __init__(self, parent):
name_pattern = "{mapper}.{caller}.{annotator}.{{library_name}}"
tpl = os.path.join("work/maf", name_pattern, "out", name_pattern + "{ext}")
self.input_tpl = tpl.format(
mapper="bwa",
mapper=self.config["mapping_tool"],
caller=self.config["somatic_variant_calling_tool"],
annotator=self.config["somatic_variant_annotation_tool"],
filter_set=self.config["filter_set"],
@@ -358,8 +359,12 @@ def get_input_files(self, action):
# Validate action
self._validate_action(action)
name_pattern = "{mapper}.{caller}.{tumor_library}"
yield "cns", os.path.join(
self.config["path_copy_number"], "output", name_pattern, "out", name_pattern + ".cns"
yield "DNAcopy", os.path.join(
self.config["path_copy_number"],
"output",
name_pattern,
"out",
name_pattern + "_dnacopy.seg",
)

@dictify
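The copy-number input switches from CNVkit's `.cns` files to `_dnacopy.seg` segmentation tables. A sketch of what consumers can expect, assuming the files follow the usual DNAcopy/SEG layout (the file name below is hypothetical):

import pandas as pd

# SEG files are tab-separated; DNAcopy conventionally writes the columns
# ID, chrom, loc.start, loc.end, num.mark, seg.mean (one row per segment).
seg = pd.read_csv("bwa.cnvkit.P001-T1-DNA1-WES1_dnacopy.seg", sep="\t")
print(seg.head())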
@@ -390,10 +395,8 @@ def get_args(self, action):
# Validate action
self._validate_action(action)
return {
"features": self.parent.w_config["step_config"]["ngs_mapping"][
self.config["expression_tool"]
]["path_features"],
"pipeline_id": "ENSEMBL",
"features": self.parent.w_config["static_data_config"]["features"]["path"],
}

def get_resource_usage(self, action):
@@ -427,7 +430,9 @@ class cbioportalCnaFilesStepPart(cbioportalExportStepPart):

def __init__(self, parent):
super().__init__(parent)
name_pattern = "bwa." + self.config["copy_number_tool"] + ".{library_name}"
name_pattern = (
self.config["mapping_tool"] + "." + self.config["copy_number_tool"] + ".{library_name}"
)
self.input_tpl = os.path.join("work/cna", name_pattern, "out", name_pattern + ".cna")

def get_args(self, action):
@@ -482,9 +487,15 @@ class cbioportalSegmentStepPart(cbioportalExportStepPart):

def __init__(self, parent):
super().__init__(parent)
name_pattern = "bwa." + self.config["copy_number_tool"] + ".{library_name}"
name_pattern = (
self.config["mapping_tool"] + "." + self.config["copy_number_tool"] + ".{library_name}"
)
self.input_tpl = os.path.join(
self.config["path_copy_number"], "output", name_pattern, "out", name_pattern + ".cns"
self.config["path_copy_number"],
"output",
name_pattern,
"out",
name_pattern + "_dnacopy.seg",
)

def get_resource_usage(self, action):
@@ -537,9 +548,7 @@ def get_args(self, action):
"action_type": "expression",
"extra_args": {
"pipeline_id": "ENSEMBL",
"tx_obj": self.parent.w_config["step_config"]["ngs_mapping"][
self.config["expression_tool"]
]["path_features"],
"tx_obj": self.parent.w_config["static_data_config"]["features"]["path"],
},
}
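Both this hunk and the `features` lookup above now take the gene model GTF from the global `static_data_config` section instead of a per-tool `path_features` option under `ngs_mapping`. A toy illustration of the new lookup (the GTF path is made up):

w_config = {
    "static_data_config": {"features": {"path": "/static_data/GRCh38/gencode.v33.gtf"}},
}
# Every step needing feature annotations reads the same shared GTF:
features = w_config["static_data_config"]["features"]["path"]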

@@ -813,13 +822,16 @@ def get_result_files(self):
def check_config(self):
"""Check config attributes for presence"""
msg = []
if self.config["path_somatic_variant"] and (
not self.config["somatic_variant_calling_tool"]
or not self.config["somatic_variant_annotation_tool"]
):
msg += [
"Somatic variant calling tool and somatic variant annotation tool must be defined"
]
if self.config["path_somatic_variant"]:
if not self.config["mapping_tool"]:
msg += ["DNA mapping tool must be defined"]
if (
not self.config["somatic_variant_calling_tool"]
or not self.config["somatic_variant_annotation_tool"]
):
msg += [
"Somatic variant calling tool and somatic variant annotation tool must be defined"
]
if self.config["path_copy_number"] and not self.config["copy_number_tool"]:
msg += [
"Somatic copy number calling tool must be defined when CNV results are available"
@@ -75,7 +75,7 @@ rule homologous_recombination_deficiency_scarHRD_install:
log:
**wf.get_log_file("scarHRD", "install"),
wrapper:
wf.wrapper_path("r")
wf.wrapper_path("scarHRD/install")


rule homologous_recombination_deficiency_scarHRD_run:
60 changes: 30 additions & 30 deletions snappy_pipeline/workflows/ngs_mapping/__init__.py
@@ -171,13 +171,18 @@
This removes the need to include gene models into the generation of indices, so that the user can select the
gene models (either from ENSEMBL or GENCODE, for example).
When the configuration option `path_features` is set, the step will output a table of expression counts
for all genes, in `output/star.{library_name}/out/star.{library_name}.GeneCounts.tab`.
The computation of gene counts relies on the features defined in the `static_data_config` section of the
configuration file. The other steps relying on feature annotations should use this too.
`STAR` outputs the counts for unstranded, forward and reverse strand protocols. When the user doesn't supply the
protocol code (0 for unstranded, 1 for forward & 2 for reverse), the step runs `infer_experiment`
(from the `rseqc` library) to infer the protocol strandedness. In both cases, the step generates a copy of
the `STAR` output containing only the relevant column. This final version of the gene counts is found in
`output/star.{library_name}/out/star.{library_name}.GeneCounts.tab`.
If the configuration option `transcriptome` is set to `true`, the step will output a bam file of reads
mapped to the transcriptome (`output/star.{library_name}/out/star.{library_name}.toTranscriptome.bam`).
`STAR` will rely on the `path_features` configuration option, or on the gene models embedded in
the indices to generate the mappings. If both are absent, the step will fail.
As with gene counts, `STAR` will rely on the static data configuration to generate the mappings.
Note that the mappings to the transcriptome will not be indexed using `samtools index`, because of the
absence of positional mappings.
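For reference, STAR's gene counts table carries one counts column per protocol (unstranded, forward, reverse). A minimal sketch of the column selection described above, assuming the decision file stores the protocol code under a `decision` key (the key name and function are illustrative, not the actual wrapper code):

import csv
import json

def select_counts(gene_counts_in: str, decision_json: str, gene_counts_out: str) -> None:
    # Protocol code: 0 = unstranded, 1 = forward, 2 = reverse (as described above).
    with open(decision_json) as f:
        protocol = json.load(f)["decision"]  # key name is an assumption
    column = {0: 1, 1: 2, 2: 3}[protocol]  # STAR column holding the relevant counts
    with open(gene_counts_in) as fin, open(gene_counts_out, "w", newline="") as fout:
        writer = csv.writer(fout, delimiter="\t")
        for row in csv.reader(fin, delimiter="\t"):
            writer.writerow([row[0], row[column]])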
@@ -387,7 +392,6 @@
# Configuration for STAR
star:
path_index: REQUIRED # Required if listed in ngs_mapping.tools.rna; otherwise, can be removed.
path_features: "" # Required for computing gene counts
num_threads_align: 16
num_threads_trimming: 8
num_threads_bam_view: 4
@@ -801,11 +805,10 @@ def check_config(self):
def _get_output_files_run_work(self):
"""Override base class' function to make Snakemake aware of extra files for STAR."""
output_files = super()._get_output_files_run_work()
if self.config[self.name]["path_features"]:
output_files["gene_counts"] = self.base_path_out.format(
mapper=self.name, ext=".GeneCounts.tab"
)
output_files["gene_counts_md5"] = output_files["gene_counts"] + ".md5"
output_files["gene_counts"] = self.base_path_out.format(
mapper=self.name, ext=".GeneCounts.tab"
)
output_files["gene_counts_md5"] = output_files["gene_counts"] + ".md5"
if self.config[self.name]["transcriptome"]:
output_files["transcriptome"] = self.base_path_out.format(
mapper=self.name, ext=".toTranscriptome.bam"
@@ -900,27 +903,24 @@ def get_output_files(self, action):

def get_result_files(self):
for mapper in self.config["tools"]["rna"]:
if self.config[mapper]["path_features"]:
tpl_out = (
"output/{mapper}.{library_name}/out/{mapper}.{library_name}.GeneCounts.tab"
)
tpl_strandedness = "output/{mapper}.{library_name}/strandedness/{mapper}.{library_name}.decision.json"
tpl_log = (
"output/{mapper}.{library_name}/log/{mapper}.{library_name}.strandedness.{ext}"
)
for library_name, extra_info in self.parent.ngs_library_to_extra_infos.items():
if extra_info["extractionType"] == "RNA":
yield tpl_out.format(mapper=mapper, library_name=library_name)
yield tpl_out.format(mapper=mapper, library_name=library_name) + ".md5"
yield tpl_strandedness.format(mapper=mapper, library_name=library_name)
yield tpl_strandedness.format(
mapper=mapper, library_name=library_name
tpl_out = "output/{mapper}.{library_name}/out/{mapper}.{library_name}.GeneCounts.tab"
tpl_strandedness = (
"output/{mapper}.{library_name}/strandedness/{mapper}.{library_name}.decision.json"
)
tpl_log = (
"output/{mapper}.{library_name}/log/{mapper}.{library_name}.strandedness.{ext}"
)
for library_name, extra_info in self.parent.ngs_library_to_extra_infos.items():
if extra_info["extractionType"] == "RNA":
yield tpl_out.format(mapper=mapper, library_name=library_name)
yield tpl_out.format(mapper=mapper, library_name=library_name) + ".md5"
yield tpl_strandedness.format(mapper=mapper, library_name=library_name)
yield tpl_strandedness.format(mapper=mapper, library_name=library_name) + ".md5"
for ext in ("log", "conda_info.txt", "conda_list.txt"):
yield tpl_log.format(mapper=mapper, library_name=library_name, ext=ext)
yield tpl_log.format(
mapper=mapper, library_name=library_name, ext=ext
) + ".md5"
for ext in ("log", "conda_info.txt", "conda_list.txt"):
yield tpl_log.format(mapper=mapper, library_name=library_name, ext=ext)
yield tpl_log.format(
mapper=mapper, library_name=library_name, ext=ext
) + ".md5"

@dictify
def get_log_file(self, action):
11 changes: 6 additions & 5 deletions snappy_pipeline/workflows/panel_of_normals/__init__.py
@@ -162,6 +162,7 @@
purecn:
path_normals_list: "" # Optional file listing libraries to include in panel
path_bait_regions: REQUIRED # Bed files of enrichment kit sequences (MergedProbes for Agilent SureSelect), recommended by PureCN author
path_genomicsDB: REQUIRED # Mutect2 genomicsDB created during panel_of_normals
genome_name: "unknown" # Must be one from hg18, hg19, hg38, mm9, mm10, rn4, rn5, rn6, canFam3
enrichment_kit_name: "unknown" # For filename only...
mappability: "" # GRCh38: /fast/work/groups/cubi/projects/biotools/static_data/app_support/PureCN/hg38/mappability.bw
@@ -259,19 +260,19 @@ def get_input_files(self, action):
@dictify
def _get_input_files_coverage(self, wildcards):
yield "container", "work/containers/out/purecn.simg"
tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam"
yield "intervals", "work/purecn/out/{}_{}.list".format(
self.config["purecn"]["enrichment_kit_name"],
self.config["purecn"]["genome_name"],
)
tpl = "output/{mapper}.{library_name}/out/{mapper}.{library_name}.bam"
yield "bam", self.ngs_mapping(tpl.format(**wildcards))

@dictify
def _get_input_files_create(self, wildcards):
yield "container", "work/containers/out/purecn.simg"
tpl = "work/{mapper}.purecn/out/{mapper}.{normal_library}_coverage_loess.txt.gz"
tpl = "work/{mapper}.purecn/out/{mapper}.purecn.{library_name}_coverage_loess.txt.gz"
yield "normals", [
tpl.format(mapper=wildcards.mapper, normal_library=lib) for lib in self.normal_libraries
tpl.format(mapper=wildcards.mapper, library_name=lib) for lib in self.normal_libraries
]

def get_output_files(self, action):
@@ -294,7 +295,7 @@ def get_output_files(self, action):
}
if action == "coverage":
return {
"coverage": "work/{mapper}.purecn/out/{mapper}.{normal_library,.+-DNA[0-9]+-WES[0-9]+}_coverage_loess.txt.gz"
"coverage": "work/{mapper}.purecn/out/{mapper}.purecn.{library_name,.+-DNA[0-9]+-WES[0-9]+}_coverage_loess.txt.gz"
}
if action == "create_panel":
return {
@@ -314,7 +315,7 @@ def get_log_file(self, action):
self.config["purecn"]["enrichment_kit_name"],
self.config["purecn"]["genome_name"],
),
"coverage": "work/{mapper}.purecn/log/{mapper}.{normal_library,.+-DNA[0-9]+-WES[0-9]+}",
"coverage": "work/{mapper}.purecn/log/{mapper}.purecn.{library_name,.+-DNA[0-9]+-WES[0-9]+}",
"create_panel": "work/{mapper}.purecn/log/{mapper}.purecn.panel_of_normals",
}
assert action in self.actions
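The `library_name` wildcard in the coverage patterns is constrained by the regex `.+-DNA[0-9]+-WES[0-9]+`, so only WES DNA libraries produce coverage files. A quick check of what the constraint accepts (the library names are made up):

import re

constraint = re.compile(r".+-DNA[0-9]+-WES[0-9]+")
assert constraint.fullmatch("P001-N1-DNA1-WES1")               # WES normal library: accepted
assert constraint.fullmatch("P001-T1-RNA1-mRNA_seq1") is None  # RNA library: rejected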
3 changes: 1 addition & 2 deletions snappy_pipeline/workflows/somatic_cnv_checking/__init__.py
@@ -212,8 +212,7 @@ def input_function(wildcards):
base_path = "output/{mapper}.{caller}.{library_name}/out/{mapper}.{caller}.{library_name}".format(
**wildcards
)
filenames["cnv"] = cnv_calling(base_path + ".bed.gz")
filenames["cnv_tbi"] = filenames["cnv"] + ".tbi"
filenames["cnv"] = cnv_calling(base_path + "_dnacopy.seg")
return filenames

return input_function
@@ -90,7 +90,6 @@
path_dataset_directory: REQUIRED
arriba:
path_index: REQUIRED # REQUIRED STAR path index (preferably 2.7.10 or later)
features: REQUIRED # REQUIRED Gene features (for ex. ENCODE or ENSEMBL) in gtf format
blacklist: "" # optional (provided in the arriba distribution, see /fast/work/groups/cubi/projects/biotools/static_data/app_support/arriba/v2.3.0)
known_fusions: "" # optional
tags: "" # optional (can be set to the same path as known_fusions)
@@ -423,10 +422,6 @@ def check_config(self):
config_keys=("step_config", "somatic_gene_fusion_calling", "arriba", "path_index"),
msg="Path to STAR indices is required",
)
self.parent.ensure_w_config(
config_keys=("step_config", "somatic_gene_fusion_calling", "arriba", "features"),
msg="Path to genomic features gtf file is required",
)

# Check that the path to the STAR index is valid.
for fn in ("Genome", "SA", "SAindex"):
@@ -244,6 +244,7 @@ rule somatic_targeted_seq_cnv_calling_sequenza_install:
params:
packages=[
{"name": "aroneklund/copynumber", "repo": "github"},
{"name": "sequenzatools/sequenza", "repo": "bitbucket"},
],
threads: wf.get_resource("sequenza", "install", "threads")
resources:
@@ -254,7 +255,7 @@ rule somatic_targeted_seq_cnv_calling_sequenza_install:
log:
**wf.get_log_file("sequenza", "install"),
wrapper:
wf.wrapper_path("r")
wf.wrapper_path("sequenza/install")


rule somatic_targeted_seq_cnv_calling_sequenza_gcreference:
@@ -272,6 +273,25 @@ rule somatic_targeted_seq_cnv_calling_sequenza_gcreference:
wf.wrapper_path("sequenza/gcreference")


rule somatic_targeted_seq_cnv_calling_sequenza_coverage:
input:
unpack(wf.get_input_files("sequenza", "coverage")),
output:
**wf.get_output_files("sequenza", "coverage"),
params:
sample_id=wf.get_params("sequenza", "coverage"),
threads: wf.get_resource("sequenza", "coverage", "threads")
resources:
time=wf.get_resource("sequenza", "coverage", "time"),
memory=wf.get_resource("sequenza", "coverage", "memory"),
partition=wf.get_resource("sequenza", "coverage", "partition"),
tmpdir=wf.get_resource("sequenza", "coverage", "tmpdir"),
log:
**wf.get_log_file("sequenza", "coverage"),
wrapper:
wf.wrapper_path("sequenza/coverage")


rule somatic_targeted_seq_cnv_calling_sequenza_run:
input:
unpack(wf.get_input_files("sequenza", "run")),
@@ -289,25 +309,6 @@ rule somatic_targeted_seq_cnv_calling_sequenza_run:
wf.wrapper_path("sequenza/run")


rule somatic_targeted_seq_cnv_calling_sequenza_report:
input:
**wf.get_input_files("sequenza", "report"),
output:
**wf.get_output_files("sequenza", "report"),
params:
sample_id=wf.get_params("sequenza", "report"),
threads: wf.get_resource("sequenza", "report", "threads")
resources:
time=wf.get_resource("sequenza", "report", "time"),
memory=wf.get_resource("sequenza", "report", "memory"),
partition=wf.get_resource("sequenza", "report", "partition"),
tmpdir=wf.get_resource("sequenza", "report", "tmpdir"),
log:
**wf.get_log_file("sequenza", "report"),
wrapper:
wf.wrapper_path("sequenza/report")


# Run PureCN ------------------------------------------------------

