feat: support for somatic variant calling without normal (#503)

bihealth · May 6, 2024 · 66f555c · 66f555c
1 parent d7420f7
commit 66f555c
Show file tree

Hide file tree

Showing 14 changed files with 678 additions and 555 deletions.
diff --git a/.tests/test-workflow/config/cancer_wes/config.yaml.jinja2 b/.tests/test-workflow/config/cancer_wes/config.yaml.jinja2
@@ -43,6 +43,7 @@ step_config:
   somatic_variant_filtration:
     path_somatic_variant: ../somatic_variant_annotation
     path_ngs_mapping: ../ngs_mapping
+    filtration_schema: list
     filter_list:
     - dkfz: {}
     - bcftools:

diff --git a/docs/step_generic.rst b/docs/step_generic.rst
@@ -79,3 +79,11 @@ The versioning allows the pipeline step to check whether there are incompatibili
 
     The execution of ``cubi-snake`` in a directory will not automatically generate these files.
     Rather, they are only generated when used in a pipeline step such as ``somatic_targeted_cnv_calling``.
+
+.. note:: **Debugging configuration errors**
+
+    The configuration for a complete pipeline can be long, and copy/paste errors can creep in.
+    When facing errors in configuration, it is advisable to 
+
+    - Use absolute paths for files not part of the workflow (panel of normal files are **NOT** part the of workflow), and
+    - Use the ``snappy-snake --verbose``, which prints the step configuration (and all the dependend workflows) as ``snappy`` has understood it.
diff --git a/snappy_pipeline/workflows/abstract/__init__.py b/snappy_pipeline/workflows/abstract/__init__.py
@@ -671,6 +671,9 @@ def __init__(
         #: Functions from sub workflows, can be used to generate output paths into these workflows
         self.sub_workflows = {}
 
+        if workflow.verbose:
+            self._write_step_config()
+
     def _setup_hooks(self):
         """Setup Snakemake workflow hooks for start/end/error"""
         # In the following, the "log" parameter to the handler functions is set to "_" as we
@@ -931,6 +934,14 @@ def _load_data_search_infos(self):
                 mixed_se_pe=False,
             )
 
+    def _write_step_config(self, f=sys.stdout):
+        print(f"\n\n----- Configuration for step {self.name}:\n", file=f)
+        yaml = ruamel_yaml.YAML()
+        yaml.preserve_quotes = True
+        yaml.indent(sequence=4, mapping=4, offset=4)
+        yaml.dump(self.config, f)
+        print(f"\n------ Configuration for {self.name} ends here\n", file=f)
+
     @classmethod
     def wrapper_path(cls, path):
         """Generate path to wrapper"""

diff --git a/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py b/snappy_pipeline/workflows/somatic_variant_annotation/__init__.py
@@ -238,8 +238,8 @@ def params_function(wildcards):
 
     def get_normal_lib_name(self, wildcards):
         """Return name of normal (non-cancer) library"""
-        pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library]
-        return pair.normal_sample.dna_ngs_library.name
+        pair = self.tumor_ngs_library_to_sample_pair.get(wildcards.tumor_library, None)
+        return pair.normal_sample.dna_ngs_library.name if pair else None
 
 
 class JannovarAnnotateSomaticVcfStepPart(AnnotateSomaticVcfStepPart):
@@ -426,20 +426,19 @@ def _yield_result_files_matched(self, tpl, **kwargs):
         Mutect.
         """
         for sheet in filter(is_not_background, self.shortcut_sheets):
-            for sample_pair in sheet.all_sample_pairs:
-                if (
-                    not sample_pair.tumor_sample.dna_ngs_library
-                    or not sample_pair.normal_sample.dna_ngs_library
-                ):
-                    msg = (
-                        "INFO: sample pair for cancer bio sample {} has is missing primary"
-                        "normal or primary cancer NGS library"
-                    )
-                    print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr)
-                    continue
-                yield from expand(
-                    tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs
-                )
+            for bio_entity in sheet.sheet.bio_entities.values():
+                for bio_sample in bio_entity.bio_samples.values():
+                    if not bio_sample.extra_infos.get("isTumor", False):
+                        continue
+                    for test_sample in bio_sample.test_samples.values():
+                        extraction_type = test_sample.extra_infos.get("extractionType", "unknown")
+                        if extraction_type.lower() != "dna":
+                            if extraction_type == "unknown":
+                                msg = "INFO: sample {} has missing extraction type, ignored"
+                                print(msg.format(test_sample.name), file=sys.stderr)
+                            continue
+                        for ngs_library in test_sample.ngs_libraries.values():
+                            yield from expand(tpl, tumor_library=[ngs_library], **kwargs)
 
     def _yield_result_files_joint(self, tpl, **kwargs):
         """Build output paths from path template and extension list.

diff --git a/snappy_pipeline/workflows/somatic_variant_calling/__init__.py b/snappy_pipeline/workflows/somatic_variant_calling/__init__.py
@@ -241,7 +241,7 @@
       germline_resource: ''     # Germline variants resource (same as panel of normals)
       common_variants: ''       # Common germline variants for contamination estimation
       extra_arguments: []       # List additional Mutect2 arguments
-                                # Each additional argument xust be in the form:
+                                # Each additional argument must be in the form:
                                 # "--<argument name> <argument value>"
                                 # For example, to filter reads prior to calling & to
                                 # add annotations to the output vcf:
@@ -387,32 +387,41 @@ def input_function(wildcards):
             ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
             # Get names of primary libraries of the selected cancer bio sample and the
             # corresponding primary normal sample
-            normal_base_path = (
-                "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format(
-                    normal_library=self.get_normal_lib_name(wildcards), **wildcards
-                )
-            )
             tumor_base_path = (
                 "output/{mapper}.{tumor_library}/out/" "{mapper}.{tumor_library}"
             ).format(**wildcards)
-            return {
-                "normal_bam": ngs_mapping(normal_base_path + ".bam"),
-                "normal_bai": ngs_mapping(normal_base_path + ".bam.bai"),
+            input_files = {
                 "tumor_bam": ngs_mapping(tumor_base_path + ".bam"),
                 "tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"),
             }
 
+            normal_library = self.get_normal_lib_name(wildcards)
+            if normal_library:
+                normal_base_path = (
+                    "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format(
+                        normal_library=normal_library, **wildcards
+                    )
+                )
+                input_files.update(
+                    {
+                        "normal_bam": ngs_mapping(normal_base_path + ".bam"),
+                        "normal_bai": ngs_mapping(normal_base_path + ".bam.bai"),
+                    }
+                )
+
+            return input_files
+
         return input_function
 
     def get_normal_lib_name(self, wildcards):
         """Return name of normal (non-cancer) library"""
-        pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library]
-        return pair.normal_sample.dna_ngs_library.name
+        pair = self.tumor_ngs_library_to_sample_pair.get(wildcards.tumor_library, None)
+        return pair.normal_sample.dna_ngs_library.name if pair else None
 
     def get_tumor_lib_name(self, wildcards):
         """Return name of tumor library"""
-        pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library]
-        return pair.tumor_sample.dna_ngs_library.name
+        pair = self.tumor_ngs_library_to_sample_pair.get(wildcards.tumor_library, None)
+        return pair.tumor_sample.dna_ngs_library.name if pair else wildcards.tumor_library
 
     def get_output_files(self, action):
         """Return output files that all somatic variant calling sub steps must
@@ -577,19 +586,30 @@ def _get_input_files_run(self, wildcards):
         ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
         # Get names of primary libraries of the selected cancer bio sample and the
         # corresponding primary normal sample
-        normal_base_path = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format(
-            normal_library=self.get_normal_lib_name(wildcards), **wildcards
-        )
         tumor_base_path = (
             "output/{mapper}.{tumor_library}/out/" "{mapper}.{tumor_library}"
         ).format(**wildcards)
-        return {
-            "normal_bam": ngs_mapping(normal_base_path + ".bam"),
-            "normal_bai": ngs_mapping(normal_base_path + ".bam.bai"),
+        input_files = {
             "tumor_bam": ngs_mapping(tumor_base_path + ".bam"),
             "tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"),
         }
 
+        normal_library = self.get_normal_lib_name(wildcards)
+        if normal_library:
+            normal_base_path = (
+                "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format(
+                    normal_library=normal_library, **wildcards
+                )
+            )
+            input_files.update(
+                {
+                    "normal_bam": ngs_mapping(normal_base_path + ".bam"),
+                    "normal_bai": ngs_mapping(normal_base_path + ".bam.bai"),
+                }
+            )
+
+        return input_files
+
     def _get_input_files_filter(self, wildcards):
         """Get input files for rule ``filter``.
 
@@ -609,9 +629,10 @@ def _get_input_files_filter(self, wildcards):
             "stats": base_path + ".raw.vcf.stats",
             "f1r2": base_path + ".raw.f1r2_tar.tar.gz",
         }
-        if "contamination" in self.actions:
-            input_files["table"] = base_path + ".contamination.tbl"
-            input_files["segments"] = base_path + ".segments.tbl"
+        if self.get_normal_lib_name(wildcards):
+            if "contamination" in self.actions:
+                input_files["table"] = base_path + ".contamination.tbl"
+                input_files["segments"] = base_path + ".segments.tbl"
         return input_files
 
     def _get_input_files_pileup_normal(self, wildcards):
@@ -1196,20 +1217,19 @@ def _yield_result_files_matched(self, tpl, **kwargs):
         Mutect.
         """
         for sheet in filter(is_not_background, self.shortcut_sheets):
-            for sample_pair in sheet.all_sample_pairs:
-                if (
-                    not sample_pair.tumor_sample.dna_ngs_library
-                    or not sample_pair.normal_sample.dna_ngs_library
-                ):
-                    msg = (
-                        "INFO: sample pair for cancer bio sample {} has is missing primary"
-                        "normal or primary cancer NGS library"
-                    )
-                    print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr)
-                    continue
-                yield from expand(
-                    tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs
-                )
+            for bio_entity in sheet.sheet.bio_entities.values():
+                for bio_sample in bio_entity.bio_samples.values():
+                    if not bio_sample.extra_infos.get("isTumor", False):
+                        continue
+                    for test_sample in bio_sample.test_samples.values():
+                        extraction_type = test_sample.extra_infos.get("extractionType", "unknown")
+                        if extraction_type.lower() != "dna":
+                            if extraction_type == "unknown":
+                                msg = "INFO: sample {} has missing extraction type, ignored"
+                                print(msg.format(test_sample.name), file=sys.stderr)
+                            continue
+                        for ngs_library in test_sample.ngs_libraries.values():
+                            yield from expand(tpl, tumor_library=[ngs_library], **kwargs)
 
     def _yield_result_files_joint(self, tpl, **kwargs):
         """Build output paths from path template and extension list.