From 3faf66e2b7971b789b3c8475019f9c0893d35967 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 24 Jul 2023 23:16:30 +0200 Subject: [PATCH 1/9] refactor: segment output as bed files --- .../somatic_targeted_seq_cnv_calling/__init__.py | 6 +++--- .../wrappers/cnvkit/postprocess/environment.yaml | 1 + .../wrappers/cnvkit/postprocess/wrapper.py | 15 ++++++++++++++- ..._workflows_somatic_targeted_seq_cnv_calling.py | 8 ++++---- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py index ddea1c5f7..1de7934ac 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py @@ -621,8 +621,8 @@ def _get_output_files_call(): @staticmethod def _get_output_files_postprocess(): name_pattern = "{mapper}.cnvkit.{library_name}" - tpl = os.path.join("work", name_pattern, "out", name_pattern + ".cns") - return {"final": tpl, "final_md5": tpl + ".md5"} + tpl = os.path.join("work", name_pattern, "out", name_pattern + ".bed.gz") + return {"final": tpl, "final_tbi": tpl + ".tbi", "final_md5": tpl + ".md5", "final_tbi_md5": tpl + ".tbi.md5"} @dictify def _get_output_files_plot(self): @@ -652,7 +652,7 @@ def _get_output_files_plot(self): @staticmethod def _get_output_files_export(): - exports = (("bed", "bed"), ("seg", "seg"), ("vcf", "vcf.gz"), ("tbi", "vcf.gz.tbi")) + exports = (("seg", "seg"), ("vcf", "vcf.gz"), ("tbi", "vcf.gz.tbi")) output_files = {} tpl = ( "work/{{mapper}}.cnvkit.{{library_name}}/out/" diff --git a/snappy_wrappers/wrappers/cnvkit/postprocess/environment.yaml b/snappy_wrappers/wrappers/cnvkit/postprocess/environment.yaml index e936461b1..28ebc9a4e 100644 --- a/snappy_wrappers/wrappers/cnvkit/postprocess/environment.yaml +++ b/snappy_wrappers/wrappers/cnvkit/postprocess/environment.yaml @@ -5,3 +5,4 @@ channels: dependencies: - r-base - bioconductor-genomicranges +- htslib diff --git a/snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py b/snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py index f79a46585..f3e4a0106 100644 --- a/snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py @@ -30,6 +30,8 @@ # ----------------------------------------------------------------------------- +uncompressed=$(echo "{snakemake.output.final}" | sed -e "s/\.gz$//") + cat << _EOF | R --vanilla --slave segments <- read.table("{snakemake.input.segment}", sep="\t", header=1, stringsAsFactors=FALSE) stopifnot(all(c("chromosome", "start", "end") %in% colnames(segments))) @@ -63,13 +65,24 @@ segments[,"start"] <- segments[["start"]]-1 segments[,"end"] <- segments[["end"]]+1 -write.table(segments, file="{snakemake.output.final}", sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE) +# Create bed file (abusing name, score & strand to make it bed-6 compatible) +segments[,"start"] <- segments[["start"]]-1 # Bed files are 0-based, semi-open intervals +segments[,"name"] <- sprintf("seg_%d_cn_%d", 1:nrow(segments), segments[["cn"]]) +segments[,"score"] <- pmin(1000, abs(segments[["log2"]])) +segments[,"strand"] <- "+" +segments[segments[["log2"]]<0,"strand"] <- "-" + +write.table(segments[,c("chromosome", "start", "end", "name", "score", "strand")], file="$uncompressed", sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE) _EOF +bgzip $uncompressed +tabix {snakemake.output.final} + d=$(dirname "{snakemake.output.final}") pushd $d fn=$(basename "{snakemake.output.final}") md5sum $fn > $fn.md5 +md5sum $fn.tbi > $fn.tbi.md5 popd """ ) diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py index fd756d9a4..975ccd978 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py @@ -472,8 +472,8 @@ def test_cnvkit_postprocess_step_part_get_input_files(somatic_targeted_seq_cnv_c def test_cnvkit_postprocess_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): - base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.cns" - expected = {"final": base_name_out, "final_md5": base_name_out + ".md5"} + base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.bed.gz" + expected = {"final": base_name_out, "final_tbi": base_name_out + ".tbi", "final_md5": base_name_out + ".md5", "final_tbi_md5": base_name_out + ".tbi.md5"} actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "postprocess") assert actual == expected @@ -582,7 +582,7 @@ def test_cnvkit_export_step_part_get_output_files(somatic_targeted_seq_cnv_calli # Define expected expected = {} base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}" - for key, ext in (("bed", "bed"), ("seg", "seg"), ("vcf", "vcf.gz"), ("tbi", "vcf.gz.tbi")): + for key, ext in (("seg", "seg"), ("vcf", "vcf.gz"), ("tbi", "vcf.gz.tbi")): expected[key] = base_name_out + "." + ext expected[key + "_md5"] = expected[key] + ".md5" # Get actual @@ -870,7 +870,7 @@ def test_somatic_targeted_seq_cnv_calling_workflow(somatic_targeted_seq_cnv_call expected += [ tpl.format(i=i, t=t, ext=ext, md5=md5) for i, t in ((1, 1), (2, 1), (2, 2)) - for ext in ("cnr", "cns", "bed", "seg", "vcf.gz", "vcf.gz.tbi") + for ext in ("cnr", "bed.gz", "bed.gz.tbi", "seg", "vcf.gz", "vcf.gz.tbi") for md5 in ("", ".md5") ] tpl = ( From b1ebeafbec3bf1dbcdcaf2d0e6e57d0aaf77b5ef Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 24 Jul 2023 23:19:14 +0200 Subject: [PATCH 2/9] feat: Add the CNV checking step (BAF of heterozygous SNPs vs CNV) --- snappy_pipeline/apps/snappy_snake.py | 2 + .../workflows/somatic_cnv_checking/Snakefile | 132 ++++++ .../somatic_cnv_checking/__init__.py | 395 ++++++++++++++++++ .../heterozygous_variants/environment.yaml | 1 + .../bcftools/heterozygous_variants/wrapper.py | 65 +++ .../bcftools/pileups/environment.yaml | 1 + .../wrappers/bcftools/pileups/wrapper.py | 55 +++ .../somatic_cnv_checking/cnv-check-plot.R | 82 ++++ .../somatic_cnv_checking/environment.yaml | 8 + .../wrappers/somatic_cnv_checking/wrapper.py | 83 ++++ .../wrappers/vcfpy/add_bed/environment.yaml | 7 + .../wrappers/vcfpy/add_bed/wrapper.py | 86 ++++ .../test_workflows_somatic_cnv_checking.py | 277 ++++++++++++ 13 files changed, 1194 insertions(+) create mode 100644 snappy_pipeline/workflows/somatic_cnv_checking/Snakefile create mode 100644 snappy_pipeline/workflows/somatic_cnv_checking/__init__.py create mode 120000 snappy_wrappers/wrappers/bcftools/heterozygous_variants/environment.yaml create mode 100644 snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py create mode 120000 snappy_wrappers/wrappers/bcftools/pileups/environment.yaml create mode 100644 snappy_wrappers/wrappers/bcftools/pileups/wrapper.py create mode 100644 snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R create mode 100644 snappy_wrappers/wrappers/somatic_cnv_checking/environment.yaml create mode 100644 snappy_wrappers/wrappers/somatic_cnv_checking/wrapper.py create mode 100644 snappy_wrappers/wrappers/vcfpy/add_bed/environment.yaml create mode 100644 snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py create mode 100644 tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py diff --git a/snappy_pipeline/apps/snappy_snake.py b/snappy_pipeline/apps/snappy_snake.py index c47f50b94..6948b0682 100644 --- a/snappy_pipeline/apps/snappy_snake.py +++ b/snappy_pipeline/apps/snappy_snake.py @@ -31,6 +31,7 @@ ngs_sanity_checking, panel_of_normals, repeat_expansion, + somatic_cnv_checking, somatic_gene_fusion_calling, somatic_hla_loh_calling, somatic_msi_calling, @@ -86,6 +87,7 @@ "panel_of_normals": panel_of_normals, "repeat_analysis": repeat_expansion, "ngs_sanity_checking": ngs_sanity_checking, + "somatic_cnv_checking": somatic_cnv_checking, "somatic_gene_fusion_calling": somatic_gene_fusion_calling, "somatic_hla_loh_calling": somatic_hla_loh_calling, "somatic_msi_calling": somatic_msi_calling, diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile b/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile new file mode 100644 index 000000000..258ca1a3c --- /dev/null +++ b/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile @@ -0,0 +1,132 @@ +# -*- coding: utf-8 -*- +"""CUBI Pipeline somatic_cnv_checking step Snakefile""" + +import os + +from snappy_pipeline import expand_ref +from snappy_pipeline.workflows.somatic_cnv_checking import SomaticCnvCheckingWorkflow + +__author__ = "Eric Blanc " + +# Configuration =============================================================== + + +configfile: "config.yaml" + + +# Expand "$ref" JSON pointers in configuration (also works for YAML) +config, lookup_paths, config_paths = expand_ref("config.yaml", config) + +# WorkflowImpl Object Setup =================================================== + +wf = SomaticCnvCheckingWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd()) + +# Rules ======================================================================= + + +localrules: + # Linking files from work/ to output/ should be done locally + somatic_cnv_checking_link_out_run, + + +rule all: + input: + wf.get_result_files(), + + +# House-Keeping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# Generic linking out --------------------------------------------------------- + + +rule somatic_cnv_checking_link_out_run: + input: + wf.get_input_files("link_out", "run"), + output: + wf.get_output_files("link_out", "run"), + run: + shell(wf.get_shell_cmd("link_out", "run", wildcards)) + + +# Somatic CNV Checking ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +# Run pileup to identify heterozygous variants in normal sample --------------- + + +rule somatic_cnv_checking_pileup_normal: + input: + unpack(wf.get_input_files("pileup", "normal")), + output: + **wf.get_output_files("pileup", "normal"), + threads: wf.get_resource("pileup", "normal", "threads") + resources: + time=wf.get_resource("pileup", "normal", "time"), + memory=wf.get_resource("pileup", "normal", "memory"), + partition=wf.get_resource("pileup", "normal", "partition"), + tmpdir=wf.get_resource("pileup", "normal", "tmpdir"), + log: + **wf.get_log_file("pileup", "normal"), + wrapper: + wf.wrapper_path("bcftools/heterozygous") + + +# Run pileup to add read counts in the tumor sample --------------------------- + + +rule somatic_cnv_checking_pileup_tumor: + input: + unpack(wf.get_input_files("pileup", "tumor")), + output: + **wf.get_output_files("pileup", "tumor"), + threads: wf.get_resource("pileup", "tumor", "threads") + resources: + time=wf.get_resource("pileup", "tumor", "time"), + memory=wf.get_resource("pileup", "tumor", "memory"), + partition=wf.get_resource("pileup", "tumor", "partition"), + tmpdir=wf.get_resource("pileup", "tumor", "tmpdir"), + log: + **wf.get_log_file("pileup", "tumor"), + wrapper: + wf.wrapper_path("bcftools/pileups") + + +# Add CNV status at the locii ------------------------------------------------- + + +rule somatic_cnv_checking_cnv_run: + input: + **wf.get_input_files("cnv", "run"), + output: + **wf.get_output_files("cnv", "run"), + threads: wf.get_resource("cnv", "run", "threads") + resources: + time=wf.get_resource("cnv", "run", "time"), + memory=wf.get_resource("cnv", "run", "memory"), + partition=wf.get_resource("cnv", "run", "partition"), + tmpdir=wf.get_resource("cnv", "run", "tmpdir"), + log: + **wf.get_log_file("cnv", "run"), + wrapper: + wf.wrapper_path("vcfpy/add_bed") + + +# Generate report & plots ----------------------------------------------------- + + +rule somatic_cnv_checking_cnv_report: + input: + **wf.get_input_files("report", "run"), + output: + **wf.get_output_files("report", "run"), + params: + **{"args": wf.get_params("report", "run")} + threads: wf.get_resource("report", "run", "threads") + resources: + time=wf.get_resource("report", "run", "time"), + memory=wf.get_resource("report", "run", "memory"), + partition=wf.get_resource("report", "run", "partition"), + tmpdir=wf.get_resource("report", "run", "tmpdir"), + log: + **wf.get_log_file("report", "run"), + wrapper: + wf.wrapper_path("somatic_cnv_checking") diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py new file mode 100644 index 000000000..46a0a2729 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py @@ -0,0 +1,395 @@ +# -*- coding: utf-8 -*- +"""Implementation of the ``somatic_cnv_checking`` step + +The ``somatic_cnv_checking`` step takes as the input the results of the ``ngs_mapping`` step +(aligned reads in BAM format) and performs pileups at germline locii to identify heterozygous +variants, and their B-allele frequency in the paired tumor sample. If one somatic CNV calling step +(either ``somatic_wgs_cnv_calling`` or ``somatic_targetd_seq_cnv_calling``) is present, the +log2 convarage ratio and copy number call are added to the output vcf. + +========== +Step Input +========== + +The somatic CNV checking step uses Snakemake sub workflows for using the result of the +``ngs_mapping`` step. Optionally, it can also use one Snakemake sub workflows for somatic CNV +calling, either``somatic_wgs_cnv_calling`` or ``somatic_targetd_seq_cnv_calling``. + +=========== +Step Output +=========== + +For each tumor DNA NGS library with name ``lib_name``/key ``lib_pk`` and each read mapper +``mapper`` that the library has been aligned with, and the CNV caller ``caller``, the +pipeline step will create a directory ``output/{mapper}.{caller}.{lib_name}-{lib_pk}/out`` +with symlinks of the following names to the resulting VCF, TBI, and MD5 files. + +- ``{mapper}.{var_caller}.{lib_name}-{lib_pk}.vcf.gz`` +- ``{mapper}.{var_caller}.{lib_name}-{lib_pk}.vcf.gz.tbi`` +- ``{mapper}.{var_caller}.{lib_name}-{lib_pk}.vcf.gz.md5`` +- ``{mapper}.{var_caller}.{lib_name}-{lib_pk}.vcf.gz.tbi.md5`` + +For example, it might look as follows for the example from above: + +:: + + output/ + +-- bwa.cnvkit.P001-T1-DNA1-WES1-4 + | `-- out + | |-- bwa.cnvkit.P001-T1-DNA1-WES1-4.vcf.gz + | |-- bwa.cnvkit.P001-T1-DNA1-WES1-4.vcf.gz.tbi + | |-- bwa.cnvkit.P001-T1-DNA1-WES1-4.vcf.gz.md5 + | `-- bwa.cnvkit.P001-T1-DNA1-WES1-4.vcf.gz.tbi.md5 + [...] + +===================== +Default Configuration +===================== + +The default configuration is as follows. + +.. include:: DEFAULT_CONFIG_somatic_cnv_checking.rst + +======= +Reports +======= + +Currently, no reports are generated. +""" + +from collections import OrderedDict +import os +import sys + +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background +from snakemake.io import expand + +from snappy_pipeline.utils import dictify, listify +from snappy_pipeline.workflows.abstract import ( + BaseStep, + BaseStepPart, + LinkOutStepPart, + ResourceUsage, +) +from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow + +__author__ = "Eric Blanc " + +#: Extensions of files to create as main payload +EXT_VALUES = (".vcf.gz", ".vcf.gz.tbi", ".vcf.gz.md5", ".vcf.gz.tbi.md5") + +#: Names of the files to create for the extension +EXT_NAMES = ("vcf", "vcf_tbi", "vcf_md5", "vcf_tbi_md5") + +#: Default configuration for the somatic_cnv_checking schema +DEFAULT_CONFIG = r""" +# Default configuration somatic_cnv_checking +step_config: + somatic_cnv_checking: + path_ngs_mapping: ../ngs_mapping # REQUIRED + path_cnv_calling: "" # Can use for instance ../somatic_targeted_seq_cnv_calling + cnv_calling_tool: "" # Can use "cnvkit" + excluded_regions: "" # Bed file of regions to be excluded + max_depth: 10000 # Max depth for pileups + min_cov: 20 # Minimum depth for reference and alternative alleles to consider variant + min_baf: 0.4 # Maximum BAF to consider variant as heterozygous (between 0 & 1/2) +""" + + +class SomaticCnvCheckingStepPart(BaseStepPart): + """Base class for CNV checking sub-steps""" + + def __init__(self, parent): + super().__init__(parent) + # Build shortcut from cancer bio sample name to matched cancer sample + self.tumor_ngs_library_to_sample_pair = OrderedDict() + for sheet in self.parent.shortcut_sheets: + self.tumor_ngs_library_to_sample_pair.update( + sheet.all_sample_pairs_by_tumor_dna_ngs_library + ) + # Get shorcut to Snakemake sub workflow + self.ngs_mapping = self.parent.sub_workflows["ngs_mapping"] + + def get_params(self, action): + # Validate action + self._validate_action(action) + + def input_function(wildcards): + pair = self.tumor_ngs_library_to_sample_pair[wildcards.library] + return {"library_name": pair.tumor_sample.dna_ngs_library.name} + + return input_function + + @dictify + def _get_log_file(self, prefix): + """Return dict of log files.""" + # Validate action + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + for key, ext in key_ext: + yield key, prefix + ext + yield key + "_md5", prefix + ext + ".md5" + + +class SomaticCnvCheckingPileupStepPart(SomaticCnvCheckingStepPart): + """Perform pileups at a set of locations""" + + name = "pileup" + actions = ("normal", "tumor") + + def get_input_files(self, action): + # Validate action + self._validate_action(action) + + def input_function_normal(wildcards): + pair = self.tumor_ngs_library_to_sample_pair[wildcards.library] + base_path = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format( + normal_library=pair.normal_sample.dna_ngs_library.name, **wildcards + ) + return { + "bam": self.ngs_mapping(base_path + ".bam"), + "bai": self.ngs_mapping(base_path + ".bam.bai"), + } + + def input_function_tumor(wildcards): + pair = self.tumor_ngs_library_to_sample_pair[wildcards.library] + base_path = "output/{mapper}.{tumor_library}/out/{mapper}.{tumor_library}".format( + tumor_library=pair.tumor_sample.dna_ngs_library.name, **wildcards + ) + return { + "vcf": "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz".format( + **wildcards + ), + "tbi": "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz.tbi".format( + **wildcards + ), + "bam": self.ngs_mapping(base_path + ".bam"), + "bai": self.ngs_mapping(base_path + ".bam.bai"), + } + + if action == "normal": + return input_function_normal + else: + return input_function_tumor + + def get_output_files(self, action): + """Return output files that all somatic variant calling sub steps must + return (VCF + TBI file) + """ + # Validate action + self._validate_action(action) + base_path_out = "work/{{mapper}}.{{library}}/out/{{mapper}}.{{library}}.{action}{ext}" + return dict(zip(EXT_NAMES, expand(base_path_out, action=action, ext=EXT_VALUES))) + + def get_log_file(self, action): + # Validate action + self._validate_action(action) + return self._get_log_file("work/{mapper}.{library}/log/{mapper}.{library}." + action) + + def get_resource_usage(self, action): + # Validate action + self._validate_action(action) + return ResourceUsage( + threads=2, + time="01:00:00" if action == "tumor" else "12:00:00", # 1 hour + memory=f"{int(3.7 * 1024 * 2)}M", + ) + + +class SomaticCnvCheckingCnvStepPart(SomaticCnvCheckingStepPart): + """Merging heterozygous variants with CNV data""" + + name = "cnv" + actions = ("run",) + + def get_input_files(self, action): + # Validate action + self._validate_action(action) + filenames = {} + filenames["normal"] = "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz" + filenames["normal_tbi"] = filenames["normal"] + ".tbi" + filenames["tumor"] = "work/{mapper}.{library}/out/{mapper}.{library}.tumor.vcf.gz" + filenames["tumor_tbi"] = filenames["tumor"] + ".tbi" + if self.config["path_cnv_calling"]: + tpl = "{{mapper}}.{caller}.{{library}}".format(caller=self.config["cnv_calling_tool"]) + filenames["cnv"] = os.path.join( + self.config["path_cnv_calling"], "output", tpl, "out", tpl + ".bed.gz" + ) + filenames["cnv_tbi"] = filenames["cnv"] + ".tbi" + return filenames + + def get_output_files(self, action): + # Validate action + self._validate_action(action) + if self.config["path_cnv_calling"] and self.config["cnv_calling_tool"]: + name_pattern = "{{mapper}}.{caller}.{{library}}".format( + caller=self.config["cnv_calling_tool"] + ) + else: + name_pattern = "{mapper}.{library}" + base_path_out = "work/" + name_pattern + "/out/" + name_pattern + return dict(zip(EXT_NAMES, [base_path_out + ext for ext in EXT_VALUES])) + + def get_log_file(self, action): + # Validate action + self._validate_action(action) + if self.config["path_cnv_calling"] and self.config["cnv_calling_tool"]: + name_pattern = "{{mapper}}.{caller}.{{library}}".format( + caller=self.config["cnv_calling_tool"] + ) + else: + name_pattern = "{mapper}.{library}" + return self._get_log_file(os.path.join("work", name_pattern, "log", name_pattern)) + + +class SomaticCnvCheckingReportStepPart(SomaticCnvCheckingStepPart): + """Plots of BAF vs CNV""" + + name = "report" + actions = ("run",) + + def get_input_files(self, action): + # Validate action + self._validate_action(action) + name_pattern = "{{mapper}}.{caller}.{{library}}".format( + caller=self.config["cnv_calling_tool"] + ) + base_path_out = "work/" + name_pattern + "/out/" + name_pattern + return {"vcf": base_path_out + ".vcf.gz"} + + def get_output_files(self, action): + # Validate action + self._validate_action(action) + name_pattern = "{{mapper}}.{caller}.{{library}}".format( + caller=self.config["cnv_calling_tool"] + ) + base_path_out = "work/" + name_pattern + "/report/" + name_pattern + return { + "cnv": base_path_out + ".cnv.pdf", + "cnv_md5": base_path_out + ".cnv.pdf.md5", + "locus": base_path_out + ".locus.pdf", + "locus_md5": base_path_out + ".locus.pdf.md5", + } + + def get_log_file(self, action): + # Validate action + self._validate_action(action) + name_pattern = "{{mapper}}.{caller}.{{library}}".format( + caller=self.config["cnv_calling_tool"] + ) + return self._get_log_file(os.path.join("work", name_pattern, "log", name_pattern + ".report")) + + +class SomaticCnvCheckingWorkflow(BaseStep): + """Perform somatic cnv checking""" + + #: Workflow name + name = "somatic_cnv_checking" + + #: Default biomed sheet class + sheet_shortcut_class = CancerCaseSheet + + sheet_shortcut_kwargs = { + "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True) + } + + @classmethod + def default_config_yaml(cls): + """Return default config YAML, to be overwritten by project-specific one""" + return DEFAULT_CONFIG + + def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): + super().__init__( + workflow, + config, + config_lookup_paths, + config_paths, + workdir, + (NgsMappingWorkflow,), + ) + self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) + # Register sub step classes so the sub steps are available + self.register_sub_step_classes( + ( + SomaticCnvCheckingPileupStepPart, + SomaticCnvCheckingCnvStepPart, + SomaticCnvCheckingReportStepPart, + LinkOutStepPart, + ) + ) + # Initialize sub-workflows + + @listify + def get_result_files(self): + """Return list of result files for the NGS mapping workflow + + We will process all NGS libraries of all bio samples in all sample sheets. + """ + if self.config["path_cnv_calling"] and self.config["cnv_calling_tool"]: + name_pattern = "{{mapper}}.{caller}.{{tumor_library.name}}".format( + caller=self.config["cnv_calling_tool"] + ) + else: + name_pattern = "{mapper}.{tumor_library.name}" + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + ext=EXT_VALUES, + ) + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + ext=[ + part + "." + ext + chksum + for part in ("", ".normal", ".tumor") + for ext in ("log", "conda_info.txt", "conda_list.txt") + for chksum in ("", ".md5") + ], + ) + if self.config["path_cnv_calling"] and self.config["cnv_calling_tool"]: + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "report", name_pattern + "{ext}"), + mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + ext=(".cnv.pdf", ".cnv.pdf.md5", ".locus.pdf", ".locus.pdf.md5"), + ) + yield from self._yield_result_files_matched( + os.path.join("output", name_pattern, "log", name_pattern + ".report" + "{ext}"), + mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], + ext=[ + "." + ext + chksum + for ext in ("log", "conda_info.txt", "conda_list.txt") + for chksum in ("", ".md5") + ], + ) + + def _yield_result_files_matched(self, tpl, **kwargs): + """Build output paths from path template and extension list. + + This function returns the results from the matched somatic variant callers such as + Mutect. + """ + for sheet in filter(is_not_background, self.shortcut_sheets): + for sample_pair in sheet.all_sample_pairs: + if ( + not sample_pair.tumor_sample.dna_ngs_library + or not sample_pair.normal_sample.dna_ngs_library + ): + msg = ( + "INFO: sample pair for cancer bio sample {} has is missing primary" + "normal or primary cancer NGS library" + ) + print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr) + continue + yield from expand( + tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs + ) + + def check_config(self): + """Check that the path to the NGS mapping is present""" + self.ensure_w_config( + ("step_config", "somatic_cnv_checking", "path_ngs_mapping"), + "Path to NGS mapping not configured but required for somatic variant calling", + ) diff --git a/snappy_wrappers/wrappers/bcftools/heterozygous_variants/environment.yaml b/snappy_wrappers/wrappers/bcftools/heterozygous_variants/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/bcftools/heterozygous_variants/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py b/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py new file mode 100644 index 000000000..43f4f51d6 --- /dev/null +++ b/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +"""Wrapper for finding heterozygous variants with bcftools +""" + +from snakemake.shell import shell + +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step] + +if "args" in snakemake.params and "intervals" in snakemake.params["args"]: + locii = "-r " + snakemake.params["args"]["intervals"] +elif "locii" in snakemake.input: + locii = "-R " + snakemake.input.locii +elif config["locii"]: + locii = "-R " + config["locii"] +else: + locii = "" + +# Convert minimum B-allele fraction into ratio of alternative to reference alleles +min_ratio = config["min_baf"] / (1 - config["min_baf"]) +max_ratio = 1 / min_ratio + +shell( + r""" +# ----------------------------------------------------------------------------- +# Redirect stderr to log file by default and enable printing executed commands +exec &> >(tee -a "{snakemake.log.log}") +set -x +# ----------------------------------------------------------------------------- +export TMPDIR=$(mktemp -d) +trap "rm -rf $TMPDIR" EXIT + +# Write out information about conda installation +conda list > {snakemake.log.conda_list} +conda info > {snakemake.log.conda_info} + +only_one_variant="N_ALT=2 & FORMAT/AD[:2]=0" +min_coverage="FORMAT/AD[:0]>{config[min_cov]} & FORMAT/AD[:1]>{config[min_cov]}" +hetero="{min_ratio}*FORMAT/AD[:0]<=FORMAT/AD[:1] & FORMAT/AD[:1]<={max_ratio}*FORMAT/AD[:0]" + +bcftools mpileup \ + {locii} \ + --max-depth {config[max_depth]} \ + -f {snakemake.config[static_data_config][reference][path]} \ + -a "FORMAT/AD" \ + {snakemake.input.bam} \ + | bcftools filter \ + --include "$only_one_variant & $min_coverage & $hetero" \ + -O z -o {snakemake.output.vcf} +tabix {snakemake.output.vcf} + +pushd $(dirname {snakemake.output.vcf}) +md5sum $(basename {snakemake.output.vcf}) > $(basename {snakemake.output.vcf_md5}) +md5sum $(basename {snakemake.output.vcf_tbi}) > $(basename {snakemake.output.vcf_tbi_md5}) +""" +) + +# Compute MD5 sums of logs +shell( + r""" +md5sum {snakemake.log.log} > {snakemake.log.log_md5} +md5sum {snakemake.log.conda_list} > {snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} > {snakemake.log.conda_info_md5} +""" +) diff --git a/snappy_wrappers/wrappers/bcftools/pileups/environment.yaml b/snappy_wrappers/wrappers/bcftools/pileups/environment.yaml new file mode 120000 index 000000000..2e107ac86 --- /dev/null +++ b/snappy_wrappers/wrappers/bcftools/pileups/environment.yaml @@ -0,0 +1 @@ +../environment.yaml \ No newline at end of file diff --git a/snappy_wrappers/wrappers/bcftools/pileups/wrapper.py b/snappy_wrappers/wrappers/bcftools/pileups/wrapper.py new file mode 100644 index 000000000..daf4c5dd1 --- /dev/null +++ b/snappy_wrappers/wrappers/bcftools/pileups/wrapper.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +"""Wrapper for running bcftools mpileup +""" + +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step] + +if "args" in snakemake.params and "intervals" in snakemake.params["args"]: + locii = "-r " + snakemake.params["args"]["intervals"] +elif "locii" in snakemake.input: + locii = "-R " + snakemake.input.locii +elif config["locii"]: + locii = "-R " + config["locii"] +else: + locii = "" + +# Actually run the script. +shell( + r""" +# ----------------------------------------------------------------------------- +# Redirect stderr to log file by default and enable printing executed commands +exec &> >(tee -a "{snakemake.log.log}") +set -x +# ----------------------------------------------------------------------------- +export TMPDIR=$(mktemp -d) +trap "rm -rf $TMPDIR" EXIT + +# Write out information about conda installation +conda list > {snakemake.log.conda_list} +conda info > {snakemake.log.conda_info} + +bcftools mpileup \ + {locii} \ + --max-depth {config[max_depth]} \ + -f {snakemake.config[static_data_config][reference][path]} \ + -a "FORMAT/AD" \ + -O z -o {snakemake.output.vcf} \ + {snakemake.input.bam} +tabix {snakemake.output.vcf} + +pushd $(dirname {snakemake.output.vcf}) +md5sum $(basename {snakemake.output.vcf}) > $(basename {snakemake.output.vcf_md5}) +md5sum $(basename {snakemake.output.vcf_tbi}) > $(basename {snakemake.output.vcf_tbi_md5}) +popd +""" +) + +# Compute MD5 sums of logs +shell( + r""" +md5sum {snakemake.log.log} > {snakemake.log.log_md5} +md5sum {snakemake.log.conda_list} > {snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} > {snakemake.log.conda_info_md5} +""" +) diff --git a/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R b/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R new file mode 100644 index 000000000..949b2bdb7 --- /dev/null +++ b/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R @@ -0,0 +1,82 @@ +vcf_to_table <- function(vcf, sample, amplification=7) { + vcf <- VariantAnnotation::readVcf(vcf) + stopifnot(all(lengths(VariantAnnotation::alt(vcf)) == 2)) + + x <- data.frame( + CHROM=GenomicRanges::seqnames(SummarizedExperiment::rowRanges(vcf)), + POS=GenomicRanges::start(SummarizedExperiment::rowRanges(vcf)), + REF=VariantAnnotation::ref(vcf), + ALT=unlist(lapply(VariantAnnotation::alt(vcf), function(a) a[1])), + t(sapply(VariantAnnotation::geno(vcf)[["AD"]][,sample], function(a) a[1:2])), + CN=VariantAnnotation::geno(vcf)[["CN"]][,sample], + LFC=VariantAnnotation::geno(vcf)[["LFC"]][,sample] + ) + colnames(x)[5:6] <- c("t_ref", "t_alt") + x <- x |> + dplyr::filter(!is.na(LFC) & !is.na(CN)) |> + dplyr::mutate(BAF=pmin(t_ref, t_alt)/(t_ref+t_alt)) |> + dplyr::mutate(Call=factor("Neutral", levels=c("Deletion", "Loss", "Neutral", "Gain", "Amplification"))) |> + dplyr::mutate(Call=replace(Call, CN == 0, "Deletion")) |> + dplyr::mutate(Call=replace(Call, CN == 1, "Loss")) |> + dplyr::mutate(Call=replace(Call, 2 < CN & CN < amplification, "Gain")) |> + dplyr::mutate(Call=replace(Call, amplification <= CN, "Amplification")) + x +} + +cnv_to_table <- function(cnv) { + y <- read.table(cnv, sep="\t", header=0) + colnames(y) <- c("CHROM", "start", "stop", "name", "LFC", "strand") + y <- y |> + dplyr::mutate(LFC=replace(LFC, strand == "-", LFC[strand=="-"])) |> + dplyr::mutate(start=start + 1) + y +} + +chromosome_lengths <- function(genome, use_fai=TRUE, use_dict=TRUE) { + if (use_fai && file.exists(paste(genome, "fai", sep="."))) { + genome <- read.table(paste(genome, "fai", sep="."), sep="\t", header=0) + colnames(genome) <- c("CHROM", "Length", "index", "ncol", "nbyte") + } else { + if (use_dict && file.exists(paste(genome, "dict", sep="."))) { + pattern <- "^SQ\tSN:([^ \t]+)\tLN:([0-9]+)\t.+$" + genome <- grep(pattern, readLines(paste(genome, "dict", sep=".")), value=TRUE) + genome <- data.frame(CHROM=sub(pattern, "\\1", genome), Length=as.numeric(sub(pattern, "\\2", genome))) + } else { + genome <- Biostrings::readDNAStringSet(genome) + genome <- data.frame(CHROM=names(genome), Length=Biostrings::width(genome)) + } + } + genome <- genome |> + dplyr::mutate(CHROM=sub(" .*", "", CHROM)) |> + dplyr::mutate(Offset=cumsum(as.numeric(dplyr::lag(Length, default=0)))) |> + dplyr::select(CHROM, Length, Offset) + genome +} + +plot_cnv <- function(x, scale=c("log2", "sqrt")) { + scale <- match.arg(scale) + if (scale == "sqrt") x_label <- "Segment Fold Change" + else x_label <- "Segment Log2 Fold Change" + if (scale == "sqrt") x <- x |> dplyr::mutate(LFC=2^LFC) + p <- x |> + ggplot2::ggplot(ggplot2::aes(x=LFC, y=BAF, color=Call)) + + ggplot2::geom_point() + + ggplot2::labs(x=x_label, y="B-allele fraction") + + ggplot2::theme_bw() + if (scale == "sqrt") p <- p + ggplot2::scale_x_sqrt() + p +} + +plot_locus <- function(x, genome_lengths) { + p <- genome_lengths |> + ggplot2::ggplot(ggplot2::aes(xmin=Offset+1, xmax=Offset+Length, ymin=0, ymax=0.5, fill=as.character(n%%2))) + + ggplot2::geom_rect() + + ggplot2::scale_fill_manual(values=c("white", "grey")) + + ggplot2::scale_x_continuous(breaks=genome_lengths$Offset+genome_lengths$Length/2, labels=genome_lengths$CHROM) + + ggplot2::geom_point(data=x, mapping=ggplot2::aes(x=x, y=BAF, color=Call), inherit.aes=FALSE) + + ggplot2::theme_bw() + + ggplot2::theme(axis.text.x=ggplot2::element_text(angle=90, vjust=0.5, hjust=1)) + + ggplot2::guides(fill="none") + + ggplot2::labs(x="Genomic position", y="B-allele fraction") + p +} diff --git a/snappy_wrappers/wrappers/somatic_cnv_checking/environment.yaml b/snappy_wrappers/wrappers/somatic_cnv_checking/environment.yaml new file mode 100644 index 000000000..88eb63da5 --- /dev/null +++ b/snappy_wrappers/wrappers/somatic_cnv_checking/environment.yaml @@ -0,0 +1,8 @@ +channels: +- conda-forge +- bioconda +dependencies: +- r-base +- r-dplyr +- bioconductor-variantannotation +- bioconductor-biostrings diff --git a/snappy_wrappers/wrappers/somatic_cnv_checking/wrapper.py b/snappy_wrappers/wrappers/somatic_cnv_checking/wrapper.py new file mode 100644 index 000000000..2fbdbda9c --- /dev/null +++ b/snappy_wrappers/wrappers/somatic_cnv_checking/wrapper.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- +"""Wrapper for running CopywriteR""" + +import os + +from snakemake import shell + +__author__ = "Eric Blanc " + +shell.executable("/bin/bash") + +rscript = os.path.join(os.path.dirname(os.path.realpath(__file__)), "cnv-check-plot.R") + +reference = snakemake.config["static_data_config"]["reference"]["path"] + +shell( + r""" +set -x + +export TMPDIR=$(mktemp -d) +# trap "rm -rf $TMPDIR" EXIT + +# Also pipe stderr to log file +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec 2> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi + +md5() {{ + filename=$1 + fn=$(basename $filename) + pushd $(dirname $filename) 1> /dev/null 2>&1 + rslt=$(md5sum $fn) + popd 1> /dev/null 2>&1 + echo "$rslt" +}} + +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +md5 {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} +md5 {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} + +# ------------------------------------------------------------------------------------------------- +# Write helper script and call R +# +R --vanilla --slave << __EOF +source("{rscript}") +x <- vcf_to_table("{snakemake.input.vcf}", sample="{snakemake.params[args][library_name]}") +genome_lengths <- chromosome_lengths("{reference}") +x <- x |> dplyr::left_join(genome_lengths, by="CHROM") |> dplyr::mutate(x=POS + Offset) +pdf("{snakemake.output.cnv}", height=6.22, width=9.33) +plot_cnv(x, scale="log2") + ggplot2::ggtitle("{snakemake.params[args][library_name]}") +plot_cnv(x, scale="sqrt") + ggplot2::ggtitle("{snakemake.params[args][library_name]}") +dev.off() +pdf("{snakemake.output.locus}", height=6.22, width=12.44) +plot_locus(x, genome_lengths |> dplyr::mutate(n=dplyr::row_number()) |> dplyr::filter(CHROM %in% x$CHROM)) + + ggplot2::ggtitle("{snakemake.params[args][library_name]}") +dev.off() +__EOF + +md5 {snakemake.output.cnv} > {snakemake.output.cnv_md5} +md5 {snakemake.output.locus} > {snakemake.output.locus_md5} +""" +) + +shell( + r""" +md5() {{ + filename=$1 + fn=$(basename $filename) + pushd $(dirname $filename) 1> /dev/null 2>&1 + rslt=$(md5sum $fn) + popd 1> /dev/null 2>&1 + echo "$rslt" +}} +md5 {snakemake.log.log} > {snakemake.log.log_md5} +""" +) diff --git a/snappy_wrappers/wrappers/vcfpy/add_bed/environment.yaml b/snappy_wrappers/wrappers/vcfpy/add_bed/environment.yaml new file mode 100644 index 000000000..388c1737a --- /dev/null +++ b/snappy_wrappers/wrappers/vcfpy/add_bed/environment.yaml @@ -0,0 +1,7 @@ +channels: +- conda-forge +- bioconda +dependencies: +- vcfpy +- pytabix +- htslib diff --git a/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py b/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py new file mode 100644 index 000000000..fdf4c1317 --- /dev/null +++ b/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py @@ -0,0 +1,86 @@ +import os +import re +import subprocess + +import tabix +import vcfpy + +# Can't add anything if there's no CNV results +if "cnv" not in snakemake.input: + os.symlink(snakemake.input.vcf, snakemake.output.vcf) + os.symlink(snakemake.input.vcf_tbi, snakemake.output.vcf_tbi) + return + +NAME_PATTERN = re.compile("^seg_([0-9]+)_cn_([0-9]+)$") + +LFC = {"ID": "LFC", "Number": 1, "Type": "Float", "Description": "Log2 fold change from coverage"} +CN = {"ID": "CN", "Number": 1, "Type": "Integer", "Description": "Number of allele called by the CNV caller"} + +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step] + +bed = tabix.open(snakemake.input.cnv) + +ps = [] +nProc = 0 +ps += [subprocess.Popen(["bcftools", "merge", snakemake.input.normal, snakemake.input.tumor], stdout=subprocess.PIPE)] +nProc += 1 +ps += [subprocess.Popen(["bcftools", "filter", "--include", "N_ALT=3 & FORMAT/AD[:2]=0"], stdin=ps[nProc-1].stdout, stdout=subprocess.PIPE, text=True)] +ps[nProc-1].stdout.close() # Allow p1 to receive a SIGPIPE if p2 exits. +nProc += 1 +if config["excluded_regions"]: + ps += [subprocess.Popen(["bcftools", "view", "--regions-file", "^" + config["excluded_regions"], stdin=ps[nProc-1].stdout, stdout=subprocess.PIPE, text=True)] + ps[nProc-1].stdout.close() + nProc += 1 +reader = vcfpy.Reader.from_stream(ps[nProc-1].stdout) + +sample = snakemake.params["args"]["library_name"] +iSample = reader.header.samples.name_to_idx[sample] + +reader.header.add_format_line(LFC) +reader.header.add_format_line(CN) + +writer = vcfpy.Writer.from_path(snakemake.output.vcf, reader.header) + +for variant in reader: + logFoldChange = None + cn = None + + # Add default (missing) values for all samples + variant.add_format("LFC", logFoldChange) + variant.add_format("CN", cn) + + try: + locii = list(bed.query(variant.CHROM, variant.affected_start, variant.affected_end+1)) + except tabix.TabixError as e: + print("Variant {}:{}{}>{} not covered by segmentation of sample {}".format(variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample)) + continue + + if len(locii) == 0: + print("Variant {}:{}{}>{} not covered by segmentation of sample {}".format(variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample)) + elif len(locii) == 1: + for locus in locii: + logFoldChange = float(locus[4]) + if locus[5] == "-": + logFoldChange = -logFoldChange + m = NAME_PATTERN.match(locus[3]) + assert m + cn = int(m.groups()[1]) + else: + print("Variant {}:{}{}>{} crosses segments in sample {}".format(variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample)) + for locus in locii: + m = NAME_PATTERN.match(locus[3]) + assert m + if cn is None: + cn = int(m.groups()[1]) + else: + if cn != int(m.groups()[1]): + cn = None + break + + variant.calls[iSample].data["LFC"] = logFoldChange + variant.calls[iSample].data["CN"] = cn + + writer.write_record(variant) + +writer.close() diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py new file mode 100644 index 000000000..393668fc1 --- /dev/null +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py @@ -0,0 +1,277 @@ +# -*- coding: utf-8 -*- +"""Tests for the somatic_cnv_checking workflow module code""" + + +import textwrap + +import pytest +import ruamel.yaml as ruamel_yaml +from snakemake.io import Wildcards + +from snappy_pipeline.workflows.somatic_cnv_checking import ( + SomaticCnvCheckingWorkflow, +) + +from .common import get_expected_log_files_dict, get_expected_output_vcf_files_dict +from .conftest import patch_module_fs + +__author__ = "Manuel Holtgrewe " + + +@pytest.fixture(scope="module") # otherwise: performance issues +def minimal_config(): + """Return YAML parsing result for (somatic) configuration""" + yaml = ruamel_yaml.YAML() + return yaml.load( + textwrap.dedent( + r""" + static_data_config: + reference: + path: /path/to/ref.fa + dbsnp: + path: /path/to/dbsnp.vcf.gz + + step_config: + ngs_mapping: + tools: + dna: ['bwa'] + + somatic_targeted_seq_cnv_calling: + tools: ["cnvkit"] + + somatic_cnv_checking: + path_ngs_mapping: ../ngs_mapping + path_cnv_calling: ../somatic_targeted_seq_cnv_calling + cnv_calling_tool: cnvkit + + data_sets: + first_batch: + file: sheet.tsv + search_patterns: + - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'} + search_paths: ['/path'] + type: matched_cancer + naming_scheme: only_secondary_id + """ + ).lstrip() + ) + + +@pytest.fixture +def somatic_cnv_checking_workflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + work_dir, + config_paths, + cancer_sheet_fake_fs, + mocker, +): + """Return SomaticTargetedSeqCnvCallingWorkflow object pre-configured with germline sheet""" + # Patch out file-system related things in abstract (the crawling link in step is defined there) + patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we + # can obtain paths from the function as if we really had a NGSMappingPipelineStep here + dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} + # Construct the workflow object + return SomaticCnvCheckingWorkflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + config_paths, + work_dir, + ) + + +# Tests for CnvCheckingPileupStepPart -------------------------------------------------------------- + + +def test_pileup_normal_step_part_get_input_files(somatic_cnv_checking_workflow): + """Tests CnvCheckingPileupStepPart.get_input_files() - action normal""" + wildcards = Wildcards(fromdict={"mapper": "bwa", "library": "P001-T1-DNA1-WGS1"}) + expected = { + "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", + "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", + } + actual = somatic_cnv_checking_workflow.get_input_files("pileup", "normal")(wildcards) + assert actual == expected + + +def test_pileup_normal_step_part_get_output_files(somatic_cnv_checking_workflow): + """Tests CnvCheckingPileupStepPart.get_output_files() - action normal""" + base_name = "work/{mapper}.{library}/out/{mapper}.{library}.normal" + expected = get_expected_output_vcf_files_dict(base_out=base_name) + actual = somatic_cnv_checking_workflow.get_output_files("pileup", "normal") + assert actual == expected + + +def test_pileup_normal_step_part_get_log_file(somatic_cnv_checking_workflow): + """Tests CnvCheckingPileupStepPart.get_log_file() - action normal""" + base_name = "work/{mapper}.{library}/log/{mapper}.{library}.normal" + expected = get_expected_log_files_dict(base_out=base_name) + actual = somatic_cnv_checking_workflow.get_log_file("pileup", "normal") + assert actual == expected + + +def test_pileup_normal_step_part_get_resource(somatic_cnv_checking_workflow): + """Tests CnvCheckingPileupStepPart.get_resource() - action normal""" + expected_dict = {"threads": 2, "time": "12:00:00", "memory": "7577M", "partition": "medium"} + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = somatic_cnv_checking_workflow.get_resource("pileup", "normal", resource) + assert actual == expected, msg_error + + +def test_pileup_tumor_step_part_get_input_files(somatic_cnv_checking_workflow): + """Tests CnvCheckingPileupStepPart.get_input_files() - action tumor""" + wildcards = Wildcards(fromdict={"mapper": "bwa", "library": "P001-T1-DNA1-WGS1"}) + expected = { + "vcf": "work/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.normal.vcf.gz", + "tbi": "work/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.normal.vcf.gz.tbi", + "bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam", + "bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai", + } + actual = somatic_cnv_checking_workflow.get_input_files("pileup", "tumor")(wildcards) + assert actual == expected + + +def test_pileup_tumor_step_part_get_output_files(somatic_cnv_checking_workflow): + """Tests CnvCheckingPileupStepPart.get_output_files() - action tumor""" + base_name = "work/{mapper}.{library}/out/{mapper}.{library}.tumor" + expected = get_expected_output_vcf_files_dict(base_out=base_name) + actual = somatic_cnv_checking_workflow.get_output_files("pileup", "tumor") + assert actual == expected + + +def test_pileup_tumor_step_part_get_log_file(somatic_cnv_checking_workflow): + """Tests CnvCheckingPileupStepPart.get_log_file() - action tumor""" + base_name = "work/{mapper}.{library}/log/{mapper}.{library}.tumor" + expected = get_expected_log_files_dict(base_out=base_name) + actual = somatic_cnv_checking_workflow.get_log_file("pileup", "tumor") + assert actual == expected + + +def test_pileup_tumor_step_part_get_resource(somatic_cnv_checking_workflow): + """Tests CnvCheckingPileupStepPart.get_resource() - action tumor""" + expected_dict = {"threads": 2, "time": "01:00:00", "memory": "7577M", "partition": "medium"} + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = somatic_cnv_checking_workflow.get_resource("pileup", "tumor", resource) + assert actual == expected, msg_error + + +def test_cnv_run_step_part_get_input_files(somatic_cnv_checking_workflow): + """Tests SomaticCnvCheckingCnvStepPart.get_input_files()""" + expected = { + "cnv": "../somatic_targeted_seq_cnv_calling/output/{mapper}.cnvkit.{library}/out/{mapper}.cnvkit.{library}.bed.gz", + "cnv_tbi": "../somatic_targeted_seq_cnv_calling/output/{mapper}.cnvkit.{library}/out/{mapper}.cnvkit.{library}.bed.gz.tbi", + "normal": "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz", + "normal_tbi": "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz.tbi", + "tumor": "work/{mapper}.{library}/out/{mapper}.{library}.tumor.vcf.gz", + "tumor_tbi": "work/{mapper}.{library}/out/{mapper}.{library}.tumor.vcf.gz.tbi", + } + actual = somatic_cnv_checking_workflow.get_input_files("cnv", "run") + assert actual == expected + + +def test_cnv_run_step_part_get_output_files(somatic_cnv_checking_workflow): + """Tests SomaticCnvCheckingCnvStepPart.get_output_files()""" + base_name = "work/{mapper}.cnvkit.{library}/out/{mapper}.cnvkit.{library}" + expected = get_expected_output_vcf_files_dict(base_out=base_name) + actual = somatic_cnv_checking_workflow.get_output_files("cnv", "run") + assert actual == expected + + +def test_cnv_run_step_part_get_log_file(somatic_cnv_checking_workflow): + """Tests SomaticCnvCheckingCnvStepPart.get_log_file()""" + base_name = "work/{mapper}.cnvkit.{library}/log/{mapper}.cnvkit.{library}" + expected = get_expected_log_files_dict(base_out=base_name) + actual = somatic_cnv_checking_workflow.get_log_file("cnv", "run") + assert actual == expected + + +def test_report_run_step_part_get_input_files(somatic_cnv_checking_workflow): + """Tests SomaticCnvCheckingReportStepPart.get_input_files()""" + expected = {"vcf": "work/{mapper}.cnvkit.{library}/out/{mapper}.cnvkit.{library}.vcf.gz"} + actual = somatic_cnv_checking_workflow.get_input_files("report", "run") + assert actual == expected + + +def test_report_run_step_part_get_output_files(somatic_cnv_checking_workflow): + """Tests SomaticCnvCheckingReportStepPart.get_output_files()""" + base_name = "work/{mapper}.cnvkit.{library}/report/{mapper}.cnvkit.{library}" + expected = { + "cnv": base_name + ".cnv.pdf", + "cnv_md5": base_name + ".cnv.pdf.md5", + "locus": base_name + ".locus.pdf", + "locus_md5": base_name + ".locus.pdf.md5", + } + actual = somatic_cnv_checking_workflow.get_output_files("report", "run") + assert actual == expected + + +def test_report_run_step_part_get_log_file(somatic_cnv_checking_workflow): + """Tests SomaticCnvCheckingReportStepPart.get_log_file()""" + base_name = "work/{mapper}.cnvkit.{library}/log/{mapper}.cnvkit.{library}.report" + expected = get_expected_log_files_dict(base_out=base_name) + actual = somatic_cnv_checking_workflow.get_log_file("report", "run") + assert actual == expected + + +def test_report_run_step_part_get_params(somatic_cnv_checking_workflow): + """Tests SomaticCnvCheckingReportStepPart.get_params()""" + wildcards = Wildcards(fromdict={"mapper": "bwa", "library": "P001-T1-DNA1-WGS1"}) + expected = {"library_name": "P001-T1-DNA1-WGS1"} + actual = somatic_cnv_checking_workflow.get_params("report", "run")(wildcards) + assert actual == expected + + +# Tests for SomaticCnvCheckingWorkflow -------------------------------------------------- + + +def test_somatic_cnv_checking_workflow(somatic_cnv_checking_workflow): + """Test simple functionality of the workflow""" + # Check created sub steps + expected = ["cnv", "link_out", "pileup", "report"] + actual = list(sorted(somatic_cnv_checking_workflow.sub_steps.keys())) + assert actual == expected + + # main output + tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/out/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}" + expected = [ + tpl.format(i=i, t=t, ext=ext) + for i, t in ((1, 1), (2, 1), (2, 2)) + for ext in ( + "vcf.gz", + "vcf.gz.tbi", + "vcf.gz.md5", + "vcf.gz.tbi.md5", + ) + ] + + # report + tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}" + expected += [ + tpl.format(i=i, t=t, ext=ext) + for i, t in ((1, 1), (2, 1), (2, 2)) + for ext in ( + "cnv.pdf", + "cnv.pdf.md5", + "locus.pdf", + "locus.pdf.md5", + ) + ] + + # logs + tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/log/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1{part}.{ext}{chksum}" + expected += [ + tpl.format(i=i, t=t, part=part, ext=ext, chksum=chksum) + for i, t in ((1, 1), (2, 1), (2, 2)) + for part in ("", ".normal", ".tumor", ".report") + for ext in ("log", "conda_info.txt", "conda_list.txt") + for chksum in ("", ".md5") + ] + + expected = list(sorted(expected)) + actual = list(sorted(somatic_cnv_checking_workflow.get_result_files())) + assert expected == actual From 942f96f359e9c03ba95212bd165331f7c5afa8fa Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Fri, 28 Jul 2023 14:26:01 +0200 Subject: [PATCH 3/9] fix: CN must be integer numbers for cBioPortal export (& make black happy) --- .../workflows/somatic_targeted_seq_cnv_calling/__init__.py | 7 ++++++- snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py | 2 +- .../test_workflows_somatic_targeted_seq_cnv_calling.py | 7 ++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py index 1de7934ac..4f80a78c5 100644 --- a/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py +++ b/snappy_pipeline/workflows/somatic_targeted_seq_cnv_calling/__init__.py @@ -622,7 +622,12 @@ def _get_output_files_call(): def _get_output_files_postprocess(): name_pattern = "{mapper}.cnvkit.{library_name}" tpl = os.path.join("work", name_pattern, "out", name_pattern + ".bed.gz") - return {"final": tpl, "final_tbi": tpl + ".tbi", "final_md5": tpl + ".md5", "final_tbi_md5": tpl + ".tbi.md5"} + return { + "final": tpl, + "final_tbi": tpl + ".tbi", + "final_md5": tpl + ".md5", + "final_tbi_md5": tpl + ".tbi.md5", + } @dictify def _get_output_files_plot(self): diff --git a/snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py b/snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py index f3e4a0106..2b35d730d 100644 --- a/snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py +++ b/snappy_wrappers/wrappers/cnvkit/postprocess/wrapper.py @@ -59,7 +59,7 @@ # Default value: diploid segment (number of copies = 2) segments[,"cn"] <- 2 -segments[S4Vectors::queryHits(i),"cn"] <- calls[S4Vectors::subjectHits(i),"cn"] +segments[S4Vectors::queryHits(i),"cn"] <- round(calls[S4Vectors::subjectHits(i),"cn"]) # Reset segment boundaries to original values segments[,"start"] <- segments[["start"]]-1 diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py index 975ccd978..ff622624c 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_targeted_seq_cnv_calling.py @@ -473,7 +473,12 @@ def test_cnvkit_postprocess_step_part_get_input_files(somatic_targeted_seq_cnv_c def test_cnvkit_postprocess_step_part_get_output_files(somatic_targeted_seq_cnv_calling_workflow): base_name_out = "work/{mapper}.cnvkit.{library_name}/out/{mapper}.cnvkit.{library_name}.bed.gz" - expected = {"final": base_name_out, "final_tbi": base_name_out + ".tbi", "final_md5": base_name_out + ".md5", "final_tbi_md5": base_name_out + ".tbi.md5"} + expected = { + "final": base_name_out, + "final_tbi": base_name_out + ".tbi", + "final_md5": base_name_out + ".md5", + "final_tbi_md5": base_name_out + ".tbi.md5", + } actual = somatic_targeted_seq_cnv_calling_workflow.get_output_files("cnvkit", "postprocess") assert actual == expected From 3d43f6021e056404ce785ebfd801b6dbb0ae22be Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Fri, 28 Jul 2023 14:31:33 +0200 Subject: [PATCH 4/9] feat: added segment summary table fix: added vcf indexation & md5 checksums feat: remove possibility of running script without CNV bed file (should be done in Snakefile/workflow) refactor: make black happy --- .../wrappers/vcfpy/add_bed/environment.yaml | 1 + .../wrappers/vcfpy/add_bed/wrapper.py | 175 ++++++++++++++++-- 2 files changed, 157 insertions(+), 19 deletions(-) diff --git a/snappy_wrappers/wrappers/vcfpy/add_bed/environment.yaml b/snappy_wrappers/wrappers/vcfpy/add_bed/environment.yaml index 388c1737a..8d0c6415c 100644 --- a/snappy_wrappers/wrappers/vcfpy/add_bed/environment.yaml +++ b/snappy_wrappers/wrappers/vcfpy/add_bed/environment.yaml @@ -2,6 +2,7 @@ channels: - conda-forge - bioconda dependencies: +- bcftools - vcfpy - pytabix - htslib diff --git a/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py b/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py index fdf4c1317..a8d7f5929 100644 --- a/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py +++ b/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py @@ -1,20 +1,21 @@ -import os +import math +import numbers import re import subprocess +from snakemake.shell import shell import tabix import vcfpy -# Can't add anything if there's no CNV results -if "cnv" not in snakemake.input: - os.symlink(snakemake.input.vcf, snakemake.output.vcf) - os.symlink(snakemake.input.vcf_tbi, snakemake.output.vcf_tbi) - return - NAME_PATTERN = re.compile("^seg_([0-9]+)_cn_([0-9]+)$") LFC = {"ID": "LFC", "Number": 1, "Type": "Float", "Description": "Log2 fold change from coverage"} -CN = {"ID": "CN", "Number": 1, "Type": "Integer", "Description": "Number of allele called by the CNV caller"} +CN = { + "ID": "CN", + "Number": 1, + "Type": "Integer", + "Description": "Number of allele called by the CNV caller", +} step = snakemake.config["pipeline_step"]["name"] config = snakemake.config["step_config"][step] @@ -23,18 +24,36 @@ ps = [] nProc = 0 -ps += [subprocess.Popen(["bcftools", "merge", snakemake.input.normal, snakemake.input.tumor], stdout=subprocess.PIPE)] +ps += [ + subprocess.Popen( + ["bcftools", "merge", snakemake.input.normal, snakemake.input.tumor], stdout=subprocess.PIPE + ) +] nProc += 1 -ps += [subprocess.Popen(["bcftools", "filter", "--include", "N_ALT=3 & FORMAT/AD[:2]=0"], stdin=ps[nProc-1].stdout, stdout=subprocess.PIPE, text=True)] -ps[nProc-1].stdout.close() # Allow p1 to receive a SIGPIPE if p2 exits. +ps += [ + subprocess.Popen( + ["bcftools", "filter", "--include", "N_ALT=2 & FORMAT/AD[:2]=0"], + stdin=ps[nProc - 1].stdout, + stdout=subprocess.PIPE, + text=True, + ) +] +ps[nProc - 1].stdout.close() # Allow p1 to receive a SIGPIPE if p2 exits. nProc += 1 if config["excluded_regions"]: - ps += [subprocess.Popen(["bcftools", "view", "--regions-file", "^" + config["excluded_regions"], stdin=ps[nProc-1].stdout, stdout=subprocess.PIPE, text=True)] - ps[nProc-1].stdout.close() + ps += [ + subprocess.Popen( + ["bcftools", "view", "--targets-file", "^" + config["excluded_regions"]], + stdin=ps[nProc - 1].stdout, + stdout=subprocess.PIPE, + text=True, + ) + ] + ps[nProc - 1].stdout.close() nProc += 1 -reader = vcfpy.Reader.from_stream(ps[nProc-1].stdout) +reader = vcfpy.Reader.from_stream(ps[nProc - 1].stdout) -sample = snakemake.params["args"]["library_name"] +sample = snakemake.wildcards["library_name"] iSample = reader.header.samples.name_to_idx[sample] reader.header.add_format_line(LFC) @@ -42,6 +61,7 @@ writer = vcfpy.Writer.from_path(snakemake.output.vcf, reader.header) +segments = {} for variant in reader: logFoldChange = None cn = None @@ -51,13 +71,21 @@ variant.add_format("CN", cn) try: - locii = list(bed.query(variant.CHROM, variant.affected_start, variant.affected_end+1)) + locii = list(bed.query(variant.CHROM, variant.affected_start, variant.affected_end + 1)) except tabix.TabixError as e: - print("Variant {}:{}{}>{} not covered by segmentation of sample {}".format(variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample)) + print( + "Variant {}:{}{}>{} not covered by segmentation of sample {}".format( + variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample + ) + ) continue if len(locii) == 0: - print("Variant {}:{}{}>{} not covered by segmentation of sample {}".format(variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample)) + print( + "Variant {}:{}{}>{} not covered by segmentation of sample {}".format( + variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample + ) + ) elif len(locii) == 1: for locus in locii: logFoldChange = float(locus[4]) @@ -67,7 +95,11 @@ assert m cn = int(m.groups()[1]) else: - print("Variant {}:{}{}>{} crosses segments in sample {}".format(variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample)) + print( + "Variant {}:{}{}>{} crosses segments in sample {}".format( + variant.CHROM, variant.POS, variant.REF, variant.ALT[0].value, sample + ) + ) for locus in locii: m = NAME_PATTERN.match(locus[3]) assert m @@ -83,4 +115,109 @@ writer.write_record(variant) + if len(locii) == 1: + locus = locii[0] + if locus[3] not in segments: + segments[locus[3]] = { + "logFoldChange": logFoldChange, + "CHROM": locus[0], + "start": locus[1], + "stop": locus[2], + "BAFs": [], + } + depth = variant.call_for_sample[sample].data["AD"] + baf = min(depth[0], depth[1]) / (depth[0] + depth[1]) + segments[locus[3]]["BAFs"] += [baf] + writer.close() + +# Implmentation of R's quantile function for continuous quantiles. +def quantile(x, probs, na_rm=False, method=7): + if na_rm: + x = list( + filter( + lambda y: y is not None + and isinstance(y, Number) + and not math.isnan(y) + and not math.isinf(y), + x, + ) + ) + x.sort() + n = len(x) + if method == 4: + m = [0 for p in probs] + elif method == 5: + m = [0.5 for p in probs] + elif method == 6: + m = [p for p in probs] + elif method == 7: + m = [1 - p for p in probs] + elif method == 8: + m = [(p + 1) / 3 for p in probs] + elif method == 9: + m = [p / 4 + 3 / 8 for p in probs] + else: + raise NotImplementedError("Only continuous methods 4 to 9 (R) are implmented") + qs = [None] * len(probs) + for i in range(len(probs)): + j = math.floor(n * probs[i] + m[i]) + if j < 1: + qs[i] = x[j] + elif j >= n: + qs[i] = x[n - 1] + else: + g = n * probs[i] + m[i] - j + qs[i] = (1 - g) * x[j - 1] + g * x[j] + return qs + + +with open(snakemake.output.tsv, "wt") as f: + print( + "Number\tCHROM\tstart\tstop\tCN\tLFC\tNvariants\tMin\t1st quartile\tMedian\t3rd quartile\tMax", + file=f, + ) + for segId, segment in segments.items(): + m = NAME_PATTERN.match(segId) + n = int(m.groups()[0]) + cn = int(m.groups()[1]) + + logFoldChange = segment["logFoldChange"] + nVar = len(segment["BAFs"]) + bafs = sorted(segment["BAFs"]) + + qs = quantile(segment["BAFs"], (0.25, 0.5, 0.75), method=8) + values = [ + n, + segment["CHROM"], + segment["start"], + segment["stop"], + cn, + segment["logFoldChange"], + nVar, + bafs[0], + qs[0], + qs[1], + qs[2], + bafs[nVar - 1], + ] + print("\t".join(map(str, values)), file=f) + +shell( + r""" +tabix {snakemake.output.vcf} + +md5() {{ + filename=$1 + fn=$(basename $filename) + pushd $(dirname $filename) 1> /dev/null 2>&1 + rslt=$(md5sum $fn) + popd 1> /dev/null 2>&1 + echo "$rslt" +}} + +md5 {snakemake.output.vcf} > {snakemake.output.vcf_md5} +md5 {snakemake.output.tbi} > {snakemake.output.tbi_md5} +md5 {snakemake.output.tsv} > {snakemake.output.tsv_md5} +""" +) From d3021cd31d20ea3afd979b131ba361928884990d Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Fri, 28 Jul 2023 14:34:55 +0200 Subject: [PATCH 5/9] fix: corrected handling of regions --- .../wrappers/bcftools/heterozygous_variants/wrapper.py | 4 ++-- snappy_wrappers/wrappers/bcftools/pileups/wrapper.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py b/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py index 43f4f51d6..385dfbfa5 100644 --- a/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py +++ b/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py @@ -9,9 +9,9 @@ if "args" in snakemake.params and "intervals" in snakemake.params["args"]: locii = "-r " + snakemake.params["args"]["intervals"] -elif "locii" in snakemake.input: +elif "locii" in snakemake.input.keys(): locii = "-R " + snakemake.input.locii -elif config["locii"]: +elif "locii" in config and config["locii"]: locii = "-R " + config["locii"] else: locii = "" diff --git a/snappy_wrappers/wrappers/bcftools/pileups/wrapper.py b/snappy_wrappers/wrappers/bcftools/pileups/wrapper.py index daf4c5dd1..6c70452f2 100644 --- a/snappy_wrappers/wrappers/bcftools/pileups/wrapper.py +++ b/snappy_wrappers/wrappers/bcftools/pileups/wrapper.py @@ -2,14 +2,16 @@ """Wrapper for running bcftools mpileup """ +from snakemake.shell import shell + step = snakemake.config["pipeline_step"]["name"] config = snakemake.config["step_config"][step] if "args" in snakemake.params and "intervals" in snakemake.params["args"]: locii = "-r " + snakemake.params["args"]["intervals"] -elif "locii" in snakemake.input: +elif "locii" in snakemake.input.keys(): locii = "-R " + snakemake.input.locii -elif config["locii"]: +elif "locii" in config and config["locii"]: locii = "-R " + config["locii"] else: locii = "" From f10ba6190bb51ea445b0b0c8aeff7251f136a972 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Fri, 28 Jul 2023 14:36:41 +0200 Subject: [PATCH 6/9] feat: added segment plot & default colors for calls --- .../somatic_cnv_checking/cnv-check-plot.R | 54 +++++++++++++++---- .../somatic_cnv_checking/environment.yaml | 1 + .../wrappers/somatic_cnv_checking/wrapper.py | 25 ++++++--- 3 files changed, 65 insertions(+), 15 deletions(-) diff --git a/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R b/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R index 949b2bdb7..5be6106f6 100644 --- a/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R +++ b/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R @@ -1,3 +1,13 @@ +cn_to_call <- function(cn, amplification=7) { + Call <- factor(rep("Neutral", length(cn)), levels=c("Deletion", "Loss", "Neutral", "Gain", "Amplification")) + Call[!is.na(cn) & cn==0] <- "Deletion" + Call[!is.na(cn) & cn==1] <- "Loss" + Call[!is.na(cn) & 2 dplyr::filter(!is.na(LFC) & !is.na(CN)) |> - dplyr::mutate(BAF=pmin(t_ref, t_alt)/(t_ref+t_alt)) |> - dplyr::mutate(Call=factor("Neutral", levels=c("Deletion", "Loss", "Neutral", "Gain", "Amplification"))) |> - dplyr::mutate(Call=replace(Call, CN == 0, "Deletion")) |> - dplyr::mutate(Call=replace(Call, CN == 1, "Loss")) |> - dplyr::mutate(Call=replace(Call, 2 < CN & CN < amplification, "Gain")) |> - dplyr::mutate(Call=replace(Call, amplification <= CN, "Amplification")) + dplyr::mutate(BAF=t_alt/(t_ref+t_alt)) |> + dplyr::mutate(Call=cn_to_call(CN)) x } @@ -53,27 +59,57 @@ chromosome_lengths <- function(genome, use_fai=TRUE, use_dict=TRUE) { genome } -plot_cnv <- function(x, scale=c("log2", "sqrt")) { +default_call_colors <- c( + "Deletion"="#E41A1C", + "Loss"="#984EA3", + "Neutral"="#4DAF4A", + "Gain"="#80B1D3", + "Amplification"="#377EB8" +) + +plot_cnv <- function(x, scale=c("log2", "sqrt"), call_colors=default_call_colors) { scale <- match.arg(scale) if (scale == "sqrt") x_label <- "Segment Fold Change" else x_label <- "Segment Log2 Fold Change" if (scale == "sqrt") x <- x |> dplyr::mutate(LFC=2^LFC) p <- x |> + dplyr::mutate(BAF=pmin(BAF, 1-BAF)) |> ggplot2::ggplot(ggplot2::aes(x=LFC, y=BAF, color=Call)) + ggplot2::geom_point() + ggplot2::labs(x=x_label, y="B-allele fraction") + + ggplot2::scale_color_manual(values=call_colors) + ggplot2::theme_bw() if (scale == "sqrt") p <- p + ggplot2::scale_x_sqrt() p } -plot_locus <- function(x, genome_lengths) { +plot_locus <- function(x, genome_lengths, call_colors=default_call_colors) { p <- genome_lengths |> - ggplot2::ggplot(ggplot2::aes(xmin=Offset+1, xmax=Offset+Length, ymin=0, ymax=0.5, fill=as.character(n%%2))) + + ggplot2::ggplot(ggplot2::aes(xmin=Offset+1, xmax=Offset+Length, ymin=0, ymax=1, fill=as.character(n%%2))) + ggplot2::geom_rect() + ggplot2::scale_fill_manual(values=c("white", "grey")) + ggplot2::scale_x_continuous(breaks=genome_lengths$Offset+genome_lengths$Length/2, labels=genome_lengths$CHROM) + ggplot2::geom_point(data=x, mapping=ggplot2::aes(x=x, y=BAF, color=Call), inherit.aes=FALSE) + + ggplot2::scale_color_manual(values=call_colors) + + ggplot2::theme_bw() + + ggplot2::theme(axis.text.x=ggplot2::element_text(angle=90, vjust=0.5, hjust=1)) + + ggplot2::guides(fill="none") + + ggplot2::labs(x="Genomic position", y="B-allele fraction") + p +} + +plot_segment <- function(y, genome_lengths, call_colors=default_call_colors) { + p <- genome_lengths |> + ggplot2::ggplot(ggplot2::aes(xmin=Offset+1, xmax=Offset+Length, ymin=0, ymax=0.5, fill=as.character(n%%2))) + + ggplot2::geom_rect() + + ggplot2::scale_fill_manual(values=c("white", "grey")) + + ggplot2::scale_x_continuous(breaks=genome_lengths$Offset+genome_lengths$Length/2, labels=genome_lengths$CHROM) + p <- p + + ggplot2::geom_point(data=y, mapping=ggplot2::aes(x=(from+to)/2, y=Median, color=Call, size=Nvariants), inherit.aes=FALSE) + + ggplot2::geom_segment(data=y, mapping=ggplot2::aes(x=(from+to)/2, y=`1st quartile`, xend=(from+to)/2, yend=`3rd quartile`, color=Call), inherit.aes=FALSE) + + ggplot2::geom_segment(data=y, mapping=ggplot2::aes(x=from, y=Median, xend=to, yend=Median, color=Call), inherit.aes=FALSE) + + ggplot2::scale_color_manual(values=call_colors) + p <- p + ggplot2::theme_bw() + ggplot2::theme(axis.text.x=ggplot2::element_text(angle=90, vjust=0.5, hjust=1)) + ggplot2::guides(fill="none") + diff --git a/snappy_wrappers/wrappers/somatic_cnv_checking/environment.yaml b/snappy_wrappers/wrappers/somatic_cnv_checking/environment.yaml index 88eb63da5..6421a7e1b 100644 --- a/snappy_wrappers/wrappers/somatic_cnv_checking/environment.yaml +++ b/snappy_wrappers/wrappers/somatic_cnv_checking/environment.yaml @@ -4,5 +4,6 @@ channels: dependencies: - r-base - r-dplyr +- r-ggplot2 - bioconductor-variantannotation - bioconductor-biostrings diff --git a/snappy_wrappers/wrappers/somatic_cnv_checking/wrapper.py b/snappy_wrappers/wrappers/somatic_cnv_checking/wrapper.py index 2fbdbda9c..924c194e0 100644 --- a/snappy_wrappers/wrappers/somatic_cnv_checking/wrapper.py +++ b/snappy_wrappers/wrappers/somatic_cnv_checking/wrapper.py @@ -50,21 +50,34 @@ # R --vanilla --slave << __EOF source("{rscript}") -x <- vcf_to_table("{snakemake.input.vcf}", sample="{snakemake.params[args][library_name]}") -genome_lengths <- chromosome_lengths("{reference}") + +genome_lengths <- chromosome_lengths("{reference}") |> dplyr::mutate(n=dplyr::row_number()) + +x <- vcf_to_table("{snakemake.input.vcf}", sample="{snakemake.wildcards[library_name]}") x <- x |> dplyr::left_join(genome_lengths, by="CHROM") |> dplyr::mutate(x=POS + Offset) + +y <- read.table("{snakemake.input.tsv}", sep="\t", header=1, check.names=FALSE) +y <- y |> dplyr::mutate(Call=cn_to_call(CN)) +y <- y |> dplyr::left_join(genome_lengths, by="CHROM") |> dplyr::mutate(from=start + Offset, to=stop + Offset) + pdf("{snakemake.output.cnv}", height=6.22, width=9.33) -plot_cnv(x, scale="log2") + ggplot2::ggtitle("{snakemake.params[args][library_name]}") -plot_cnv(x, scale="sqrt") + ggplot2::ggtitle("{snakemake.params[args][library_name]}") +plot_cnv(x, scale="log2") + ggplot2::ggtitle("{snakemake.wildcards[library_name]}") +plot_cnv(x, scale="sqrt") + ggplot2::ggtitle("{snakemake.wildcards[library_name]}") dev.off() + pdf("{snakemake.output.locus}", height=6.22, width=12.44) -plot_locus(x, genome_lengths |> dplyr::mutate(n=dplyr::row_number()) |> dplyr::filter(CHROM %in% x$CHROM)) + - ggplot2::ggtitle("{snakemake.params[args][library_name]}") +plot_locus(x, genome_lengths |> dplyr::filter(CHROM %in% x[["CHROM"]])) + + ggplot2::ggtitle("{snakemake.wildcards[library_name]}") +dev.off() + +pdf("{snakemake.output.segment}", height=6.22, width=12.44) +plot_segment(y, genome_lengths |> dplyr::filter(CHROM %in% y[["CHROM"]])) dev.off() __EOF md5 {snakemake.output.cnv} > {snakemake.output.cnv_md5} md5 {snakemake.output.locus} > {snakemake.output.locus_md5} +md5 {snakemake.output.segment} > {snakemake.output.segment_md5} """ ) From 51f594772234d96b06119ae72ad5ca27150026f7 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Fri, 28 Jul 2023 14:39:42 +0200 Subject: [PATCH 7/9] refact: finding hetero variants in normal samples done only one in presence of multiple tumors --- .../workflows/somatic_cnv_checking/Snakefile | 75 +++-- .../somatic_cnv_checking/__init__.py | 261 +++++++++--------- .../test_workflows_somatic_cnv_checking.py | 126 +++++---- 3 files changed, 246 insertions(+), 216 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile b/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile index 258ca1a3c..f670a39a4 100644 --- a/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile +++ b/snappy_pipeline/workflows/somatic_cnv_checking/Snakefile @@ -67,7 +67,7 @@ rule somatic_cnv_checking_pileup_normal: log: **wf.get_log_file("pileup", "normal"), wrapper: - wf.wrapper_path("bcftools/heterozygous") + wf.wrapper_path("bcftools/heterozygous_variants") # Run pileup to add read counts in the tumor sample --------------------------- @@ -92,41 +92,38 @@ rule somatic_cnv_checking_pileup_tumor: # Add CNV status at the locii ------------------------------------------------- - -rule somatic_cnv_checking_cnv_run: - input: - **wf.get_input_files("cnv", "run"), - output: - **wf.get_output_files("cnv", "run"), - threads: wf.get_resource("cnv", "run", "threads") - resources: - time=wf.get_resource("cnv", "run", "time"), - memory=wf.get_resource("cnv", "run", "memory"), - partition=wf.get_resource("cnv", "run", "partition"), - tmpdir=wf.get_resource("cnv", "run", "tmpdir"), - log: - **wf.get_log_file("cnv", "run"), - wrapper: - wf.wrapper_path("vcfpy/add_bed") - - -# Generate report & plots ----------------------------------------------------- - - -rule somatic_cnv_checking_cnv_report: - input: - **wf.get_input_files("report", "run"), - output: - **wf.get_output_files("report", "run"), - params: - **{"args": wf.get_params("report", "run")} - threads: wf.get_resource("report", "run", "threads") - resources: - time=wf.get_resource("report", "run", "time"), - memory=wf.get_resource("report", "run", "memory"), - partition=wf.get_resource("report", "run", "partition"), - tmpdir=wf.get_resource("report", "run", "tmpdir"), - log: - **wf.get_log_file("report", "run"), - wrapper: - wf.wrapper_path("somatic_cnv_checking") +if config["step_config"]["somatic_cnv_checking"]["path_cnv_calling"]: + + rule somatic_cnv_checking_cnv_run: + input: + unpack(wf.get_input_files("cnv", "run")), + output: + **wf.get_output_files("cnv", "run"), + threads: wf.get_resource("cnv", "run", "threads") + resources: + time=wf.get_resource("cnv", "run", "time"), + memory=wf.get_resource("cnv", "run", "memory"), + partition=wf.get_resource("cnv", "run", "partition"), + tmpdir=wf.get_resource("cnv", "run", "tmpdir"), + log: + **wf.get_log_file("cnv", "run"), + wrapper: + wf.wrapper_path("vcfpy/add_bed") + + # Generate report & plots ----------------------------------------------------- + + rule somatic_cnv_checking_cnv_report: + input: + unpack(wf.get_input_files("report", "run")), + output: + **wf.get_output_files("report", "run"), + threads: wf.get_resource("report", "run", "threads") + resources: + time=wf.get_resource("report", "run", "time"), + memory=wf.get_resource("report", "run", "memory"), + partition=wf.get_resource("report", "run", "partition"), + tmpdir=wf.get_resource("report", "run", "tmpdir"), + log: + **wf.get_log_file("report", "run"), + wrapper: + wf.wrapper_path("somatic_cnv_checking") diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py index 46a0a2729..3397a7c18 100644 --- a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py +++ b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py @@ -57,13 +57,13 @@ Currently, no reports are generated. """ -from collections import OrderedDict import os import sys from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background from snakemake.io import expand +from snappy_pipeline.base import InvalidConfiguration from snappy_pipeline.utils import dictify, listify from snappy_pipeline.workflows.abstract import ( BaseStep, @@ -72,6 +72,10 @@ ResourceUsage, ) from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow +from snappy_pipeline.workflows.somatic_targeted_seq_cnv_calling import ( + SomaticTargetedSeqCnvCallingWorkflow, +) +from snappy_pipeline.workflows.somatic_wgs_cnv_calling import SomaticWgsCnvCallingWorkflow __author__ = "Eric Blanc " @@ -88,7 +92,7 @@ somatic_cnv_checking: path_ngs_mapping: ../ngs_mapping # REQUIRED path_cnv_calling: "" # Can use for instance ../somatic_targeted_seq_cnv_calling - cnv_calling_tool: "" # Can use "cnvkit" + cnv_assay_type: "" # Empty: no CNV, WES for somatic_targeted_seq_snv_calling step, WGS for somatic_wgs_cnv_calling step excluded_regions: "" # Bed file of regions to be excluded max_depth: 10000 # Max depth for pileups min_cov: 20 # Minimum depth for reference and alternative alleles to consider variant @@ -98,32 +102,10 @@ class SomaticCnvCheckingStepPart(BaseStepPart): """Base class for CNV checking sub-steps""" - - def __init__(self, parent): - super().__init__(parent) - # Build shortcut from cancer bio sample name to matched cancer sample - self.tumor_ngs_library_to_sample_pair = OrderedDict() - for sheet in self.parent.shortcut_sheets: - self.tumor_ngs_library_to_sample_pair.update( - sheet.all_sample_pairs_by_tumor_dna_ngs_library - ) - # Get shorcut to Snakemake sub workflow - self.ngs_mapping = self.parent.sub_workflows["ngs_mapping"] - - def get_params(self, action): - # Validate action - self._validate_action(action) - - def input_function(wildcards): - pair = self.tumor_ngs_library_to_sample_pair[wildcards.library] - return {"library_name": pair.tumor_sample.dna_ngs_library.name} - - return input_function @dictify def _get_log_file(self, prefix): """Return dict of log files.""" - # Validate action key_ext = ( ("log", ".log"), ("conda_info", ".conda_info.txt"), @@ -140,14 +122,17 @@ class SomaticCnvCheckingPileupStepPart(SomaticCnvCheckingStepPart): name = "pileup" actions = ("normal", "tumor") + def __init__(self, parent): + super().__init__(parent) + self.ngs_mapping = self.parent.sub_workflows["ngs_mapping"] + def get_input_files(self, action): # Validate action self._validate_action(action) def input_function_normal(wildcards): - pair = self.tumor_ngs_library_to_sample_pair[wildcards.library] - base_path = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format( - normal_library=pair.normal_sample.dna_ngs_library.name, **wildcards + base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format( + **wildcards ) return { "bam": self.ngs_mapping(base_path + ".bam"), @@ -155,16 +140,15 @@ def input_function_normal(wildcards): } def input_function_tumor(wildcards): - pair = self.tumor_ngs_library_to_sample_pair[wildcards.library] - base_path = "output/{mapper}.{tumor_library}/out/{mapper}.{tumor_library}".format( - tumor_library=pair.tumor_sample.dna_ngs_library.name, **wildcards + base_path = "output/{mapper}.{library_name}/out/{mapper}.{library_name}".format( + **wildcards ) return { - "vcf": "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz".format( - **wildcards + "locii": "work/{mapper}.{normal_library}/out/{mapper}.{normal_library}.normal.vcf.gz".format( + normal_library=self.parent.tumor_to_normal[wildcards.library_name], **wildcards ), - "tbi": "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz.tbi".format( - **wildcards + "locii_tbi": "work/{mapper}.{normal_library}/out/{mapper}.{normal_library}.normal.vcf.gz.tbi".format( + normal_library=self.parent.tumor_to_normal[wildcards.library_name], **wildcards ), "bam": self.ngs_mapping(base_path + ".bam"), "bai": self.ngs_mapping(base_path + ".bam.bai"), @@ -181,13 +165,17 @@ def get_output_files(self, action): """ # Validate action self._validate_action(action) - base_path_out = "work/{{mapper}}.{{library}}/out/{{mapper}}.{{library}}.{action}{ext}" + base_path_out = ( + "work/{{mapper}}.{{library_name}}/out/{{mapper}}.{{library_name}}.{action}{ext}" + ) return dict(zip(EXT_NAMES, expand(base_path_out, action=action, ext=EXT_VALUES))) def get_log_file(self, action): # Validate action self._validate_action(action) - return self._get_log_file("work/{mapper}.{library}/log/{mapper}.{library}." + action) + return self._get_log_file( + "work/{mapper}.{library_name}/log/{mapper}.{library_name}." + action + ) def get_resource_usage(self, action): # Validate action @@ -208,40 +196,44 @@ class SomaticCnvCheckingCnvStepPart(SomaticCnvCheckingStepPart): def get_input_files(self, action): # Validate action self._validate_action(action) - filenames = {} - filenames["normal"] = "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz" - filenames["normal_tbi"] = filenames["normal"] + ".tbi" - filenames["tumor"] = "work/{mapper}.{library}/out/{mapper}.{library}.tumor.vcf.gz" - filenames["tumor_tbi"] = filenames["tumor"] + ".tbi" - if self.config["path_cnv_calling"]: - tpl = "{{mapper}}.{caller}.{{library}}".format(caller=self.config["cnv_calling_tool"]) - filenames["cnv"] = os.path.join( - self.config["path_cnv_calling"], "output", tpl, "out", tpl + ".bed.gz" + + def input_function(wildcards): + normal_library = self.parent.tumor_to_normal[wildcards.library_name] + filenames = {} + name_pattern = "{mapper}.{normal_library}" + tpl = os.path.join("work", name_pattern, "out", name_pattern + ".normal.vcf.gz") + filenames["normal"] = tpl.format(normal_library=normal_library, **wildcards) + filenames["normal_tbi"] = filenames["normal"] + ".tbi" + name_pattern = "{mapper}.{library_name}" + tpl = os.path.join("work", name_pattern, "out", name_pattern + ".tumor.vcf.gz") + filenames["tumor"] = tpl.format(**wildcards) + filenames["tumor_tbi"] = filenames["tumor"] + ".tbi" + cnv_calling = self.parent.sub_workflows["cnv_calling"] + base_path = "output/{mapper}.{caller}.{library_name}/out/{mapper}.{caller}.{library_name}".format( + **wildcards ) + filenames["cnv"] = cnv_calling(base_path + ".bed.gz") filenames["cnv_tbi"] = filenames["cnv"] + ".tbi" - return filenames + return filenames + + return input_function + @dictify def get_output_files(self, action): # Validate action self._validate_action(action) - if self.config["path_cnv_calling"] and self.config["cnv_calling_tool"]: - name_pattern = "{{mapper}}.{caller}.{{library}}".format( - caller=self.config["cnv_calling_tool"] - ) - else: - name_pattern = "{mapper}.{library}" + key_ext = {"vcf": ".vcf.gz", "tbi": ".vcf.gz.tbi"} + name_pattern = "{mapper}.{caller}.{library_name}" + key_ext["tsv"] = ".tsv" base_path_out = "work/" + name_pattern + "/out/" + name_pattern - return dict(zip(EXT_NAMES, [base_path_out + ext for ext in EXT_VALUES])) + for key, ext in key_ext.items(): + yield (key, base_path_out + ext) + yield (key + "_md5", base_path_out + ext + ".md5") def get_log_file(self, action): # Validate action self._validate_action(action) - if self.config["path_cnv_calling"] and self.config["cnv_calling_tool"]: - name_pattern = "{{mapper}}.{caller}.{{library}}".format( - caller=self.config["cnv_calling_tool"] - ) - else: - name_pattern = "{mapper}.{library}" + name_pattern = "{mapper}.{caller}.{library_name}" return self._get_log_file(os.path.join("work", name_pattern, "log", name_pattern)) @@ -254,33 +246,35 @@ class SomaticCnvCheckingReportStepPart(SomaticCnvCheckingStepPart): def get_input_files(self, action): # Validate action self._validate_action(action) - name_pattern = "{{mapper}}.{caller}.{{library}}".format( - caller=self.config["cnv_calling_tool"] - ) - base_path_out = "work/" + name_pattern + "/out/" + name_pattern - return {"vcf": base_path_out + ".vcf.gz"} + + def input_function(wildcards): + name_pattern = "{mapper}.{caller}.{library_name}".format(**wildcards) + base_path_out = "work/" + name_pattern + "/out/" + name_pattern + return {"vcf": base_path_out + ".vcf.gz", "tsv": base_path_out + ".tsv"} + + return input_function def get_output_files(self, action): # Validate action self._validate_action(action) - name_pattern = "{{mapper}}.{caller}.{{library}}".format( - caller=self.config["cnv_calling_tool"] - ) + name_pattern = "{mapper}.{caller}.{library_name}" base_path_out = "work/" + name_pattern + "/report/" + name_pattern return { "cnv": base_path_out + ".cnv.pdf", "cnv_md5": base_path_out + ".cnv.pdf.md5", "locus": base_path_out + ".locus.pdf", "locus_md5": base_path_out + ".locus.pdf.md5", + "segment": base_path_out + ".segment.pdf", + "segment_md5": base_path_out + ".segment.pdf.md5", } def get_log_file(self, action): # Validate action self._validate_action(action) - name_pattern = "{{mapper}}.{caller}.{{library}}".format( - caller=self.config["cnv_calling_tool"] + name_pattern = "{mapper}.{caller}.{library_name}" + return self._get_log_file( + os.path.join("work", name_pattern, "log", name_pattern + ".report") ) - return self._get_log_file(os.path.join("work", name_pattern, "log", name_pattern + ".report")) class SomaticCnvCheckingWorkflow(BaseStep): @@ -308,8 +302,24 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) config_lookup_paths, config_paths, workdir, - (NgsMappingWorkflow,), + ( + SomaticTargetedSeqCnvCallingWorkflow, + SomaticWgsCnvCallingWorkflow, + NgsMappingWorkflow, + ), ) + if self.config["path_cnv_calling"] and self.config["cnv_assay_type"]: + if self.config["cnv_assay_type"] == "WES": + cnv_calling = "somatic_targeted_seq_cnv_calling" + elif self.config["cnv_assay_type"] == "WES": + cnv_calling = "somatic_wgs_cnv_calling" + else: + raise InvalidConfiguration( + "Illegal cnv_assay_type {}, must be either WES or WGS".format( + self.config["cnv_assay_type"] + ) + ) + self.register_sub_workflow(cnv_calling, self.config["path_cnv_calling"], "cnv_calling") self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"]) # Register sub step classes so the sub steps are available self.register_sub_step_classes( @@ -320,7 +330,23 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) LinkOutStepPart, ) ) - # Initialize sub-workflows + # Assemble normal/tumor pairs + self.tumor_to_normal = {} + for sheet in filter(is_not_background, self.shortcut_sheets): + for sample_pair in sheet.all_sample_pairs: + if ( + not sample_pair.tumor_sample.dna_ngs_library + or not sample_pair.normal_sample.dna_ngs_library + ): + msg = ( + "INFO: sample pair for cancer bio sample {} has is missing primary" + "normal or primary cancer NGS library" + ) + print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr) + continue + tumor_library = sample_pair.tumor_sample.dna_ngs_library.name + normal_library = sample_pair.normal_sample.dna_ngs_library.name + self.tumor_to_normal[tumor_library] = normal_library @listify def get_result_files(self): @@ -328,68 +354,55 @@ def get_result_files(self): We will process all NGS libraries of all bio samples in all sample sheets. """ - if self.config["path_cnv_calling"] and self.config["cnv_calling_tool"]: - name_pattern = "{{mapper}}.{caller}.{{tumor_library.name}}".format( - caller=self.config["cnv_calling_tool"] - ) - else: - name_pattern = "{mapper}.{tumor_library.name}" - yield from self._yield_result_files_matched( - os.path.join("output", name_pattern, "out", name_pattern + "{ext}"), + # Log files from normal pileups + name_pattern = "{mapper}.{library_name}" + chksum = ("", ".md5") + ext = ("log", "conda_info.txt", "conda_list.txt") + yield from expand( + os.path.join("output", name_pattern, "log", name_pattern + ".normal.{ext}{chksum}"), mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], - ext=EXT_VALUES, + library_name=set(self.tumor_to_normal.values()), + ext=ext, + chksum=chksum, ) - yield from self._yield_result_files_matched( - os.path.join("output", name_pattern, "log", name_pattern + "{ext}"), + yield from expand( + os.path.join("output", name_pattern, "log", name_pattern + ".tumor.{ext}{chksum}"), mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], - ext=[ - part + "." + ext + chksum - for part in ("", ".normal", ".tumor") - for ext in ("log", "conda_info.txt", "conda_list.txt") - for chksum in ("", ".md5") - ], + library_name=self.tumor_to_normal.keys(), + ext=ext, + chksum=chksum, ) - if self.config["path_cnv_calling"] and self.config["cnv_calling_tool"]: - yield from self._yield_result_files_matched( - os.path.join("output", name_pattern, "report", name_pattern + "{ext}"), - mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], - ext=(".cnv.pdf", ".cnv.pdf.md5", ".locus.pdf", ".locus.pdf.md5"), - ) - yield from self._yield_result_files_matched( - os.path.join("output", name_pattern, "log", name_pattern + ".report" + "{ext}"), + # Main result: vcf & optionally segment table if CNV available + ext = {"out": [".vcf.gz", ".vcf.gz.tbi"]} + if self.config["path_cnv_calling"]: + # CNV avaliable + name_pattern = "{mapper}.{caller}.{library_name}" + callers = self.w_config["step_config"]["somatic_targeted_seq_cnv_calling"]["tools"] + ext["out"] += [".tsv"] + ext["report"] = (".cnv.pdf", ".locus.pdf", ".segment.pdf") + ext["log"] = [ + suffix + "." + e + for suffix in ("", ".report") + for e in ("log", "conda_info.txt", "conda_list.txt") + ] + else: + name_pattern = "{mapper}.{library_name}" + callers = [] + for subdir, exts in ext.items(): + yield from expand( + os.path.join("output", name_pattern, subdir, name_pattern + "{ext}{chksum}"), mapper=self.w_config["step_config"]["ngs_mapping"]["tools"]["dna"], - ext=[ - "." + ext + chksum - for ext in ("log", "conda_info.txt", "conda_list.txt") - for chksum in ("", ".md5") - ], + caller=callers, + library_name=self.tumor_to_normal.keys(), + ext=exts, + chksum=chksum, ) - def _yield_result_files_matched(self, tpl, **kwargs): - """Build output paths from path template and extension list. - - This function returns the results from the matched somatic variant callers such as - Mutect. - """ - for sheet in filter(is_not_background, self.shortcut_sheets): - for sample_pair in sheet.all_sample_pairs: - if ( - not sample_pair.tumor_sample.dna_ngs_library - or not sample_pair.normal_sample.dna_ngs_library - ): - msg = ( - "INFO: sample pair for cancer bio sample {} has is missing primary" - "normal or primary cancer NGS library" - ) - print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr) - continue - yield from expand( - tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs - ) - def check_config(self): """Check that the path to the NGS mapping is present""" self.ensure_w_config( ("step_config", "somatic_cnv_checking", "path_ngs_mapping"), "Path to NGS mapping not configured but required for somatic variant calling", ) + if self.config["path_cnv_calling"]: + assert self.config["cnv_assay_type"] in ("WES", "WGS") diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py index 393668fc1..2310f7676 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_cnv_checking.py @@ -8,9 +8,7 @@ import ruamel.yaml as ruamel_yaml from snakemake.io import Wildcards -from snappy_pipeline.workflows.somatic_cnv_checking import ( - SomaticCnvCheckingWorkflow, -) +from snappy_pipeline.workflows.somatic_cnv_checking import SomaticCnvCheckingWorkflow from .common import get_expected_log_files_dict, get_expected_output_vcf_files_dict from .conftest import patch_module_fs @@ -42,7 +40,7 @@ def minimal_config(): somatic_cnv_checking: path_ngs_mapping: ../ngs_mapping path_cnv_calling: ../somatic_targeted_seq_cnv_calling - cnv_calling_tool: cnvkit + cnv_assay_type: WES data_sets: first_batch: @@ -72,7 +70,10 @@ def somatic_cnv_checking_workflow( patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we # can obtain paths from the function as if we really had a NGSMappingPipelineStep here - dummy_workflow.globals = {"ngs_mapping": lambda x: "NGS_MAPPING/" + x} + dummy_workflow.globals = { + "ngs_mapping": lambda x: "NGS_MAPPING/" + x, + "cnv_calling": lambda x: "CNV_CALLING/" + x, + } # Construct the workflow object return SomaticCnvCheckingWorkflow( dummy_workflow, @@ -88,7 +89,7 @@ def somatic_cnv_checking_workflow( def test_pileup_normal_step_part_get_input_files(somatic_cnv_checking_workflow): """Tests CnvCheckingPileupStepPart.get_input_files() - action normal""" - wildcards = Wildcards(fromdict={"mapper": "bwa", "library": "P001-T1-DNA1-WGS1"}) + wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-N1-DNA1-WGS1"}) expected = { "bam": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam", "bai": "NGS_MAPPING/output/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.bam.bai", @@ -99,7 +100,7 @@ def test_pileup_normal_step_part_get_input_files(somatic_cnv_checking_workflow): def test_pileup_normal_step_part_get_output_files(somatic_cnv_checking_workflow): """Tests CnvCheckingPileupStepPart.get_output_files() - action normal""" - base_name = "work/{mapper}.{library}/out/{mapper}.{library}.normal" + base_name = "work/{mapper}.{library_name}/out/{mapper}.{library_name}.normal" expected = get_expected_output_vcf_files_dict(base_out=base_name) actual = somatic_cnv_checking_workflow.get_output_files("pileup", "normal") assert actual == expected @@ -107,7 +108,7 @@ def test_pileup_normal_step_part_get_output_files(somatic_cnv_checking_workflow) def test_pileup_normal_step_part_get_log_file(somatic_cnv_checking_workflow): """Tests CnvCheckingPileupStepPart.get_log_file() - action normal""" - base_name = "work/{mapper}.{library}/log/{mapper}.{library}.normal" + base_name = "work/{mapper}.{library_name}/log/{mapper}.{library_name}.normal" expected = get_expected_log_files_dict(base_out=base_name) actual = somatic_cnv_checking_workflow.get_log_file("pileup", "normal") assert actual == expected @@ -124,10 +125,10 @@ def test_pileup_normal_step_part_get_resource(somatic_cnv_checking_workflow): def test_pileup_tumor_step_part_get_input_files(somatic_cnv_checking_workflow): """Tests CnvCheckingPileupStepPart.get_input_files() - action tumor""" - wildcards = Wildcards(fromdict={"mapper": "bwa", "library": "P001-T1-DNA1-WGS1"}) + wildcards = Wildcards(fromdict={"mapper": "bwa", "library_name": "P001-T1-DNA1-WGS1"}) expected = { - "vcf": "work/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.normal.vcf.gz", - "tbi": "work/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.normal.vcf.gz.tbi", + "locii": "work/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.normal.vcf.gz", + "locii_tbi": "work/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.normal.vcf.gz.tbi", "bam": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam", "bai": "NGS_MAPPING/output/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.bam.bai", } @@ -137,7 +138,7 @@ def test_pileup_tumor_step_part_get_input_files(somatic_cnv_checking_workflow): def test_pileup_tumor_step_part_get_output_files(somatic_cnv_checking_workflow): """Tests CnvCheckingPileupStepPart.get_output_files() - action tumor""" - base_name = "work/{mapper}.{library}/out/{mapper}.{library}.tumor" + base_name = "work/{mapper}.{library_name}/out/{mapper}.{library_name}.tumor" expected = get_expected_output_vcf_files_dict(base_out=base_name) actual = somatic_cnv_checking_workflow.get_output_files("pileup", "tumor") assert actual == expected @@ -145,7 +146,7 @@ def test_pileup_tumor_step_part_get_output_files(somatic_cnv_checking_workflow): def test_pileup_tumor_step_part_get_log_file(somatic_cnv_checking_workflow): """Tests CnvCheckingPileupStepPart.get_log_file() - action tumor""" - base_name = "work/{mapper}.{library}/log/{mapper}.{library}.tumor" + base_name = "work/{mapper}.{library_name}/log/{mapper}.{library_name}.tumor" expected = get_expected_log_files_dict(base_out=base_name) actual = somatic_cnv_checking_workflow.get_log_file("pileup", "tumor") assert actual == expected @@ -162,29 +163,39 @@ def test_pileup_tumor_step_part_get_resource(somatic_cnv_checking_workflow): def test_cnv_run_step_part_get_input_files(somatic_cnv_checking_workflow): """Tests SomaticCnvCheckingCnvStepPart.get_input_files()""" + wildcards = Wildcards( + fromdict={"mapper": "bwa", "caller": "cnvkit", "library_name": "P001-T1-DNA1-WGS1"} + ) expected = { - "cnv": "../somatic_targeted_seq_cnv_calling/output/{mapper}.cnvkit.{library}/out/{mapper}.cnvkit.{library}.bed.gz", - "cnv_tbi": "../somatic_targeted_seq_cnv_calling/output/{mapper}.cnvkit.{library}/out/{mapper}.cnvkit.{library}.bed.gz.tbi", - "normal": "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz", - "normal_tbi": "work/{mapper}.{library}/out/{mapper}.{library}.normal.vcf.gz.tbi", - "tumor": "work/{mapper}.{library}/out/{mapper}.{library}.tumor.vcf.gz", - "tumor_tbi": "work/{mapper}.{library}/out/{mapper}.{library}.tumor.vcf.gz.tbi", + "cnv": "CNV_CALLING/output/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.bed.gz", + "cnv_tbi": "CNV_CALLING/output/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.bed.gz.tbi", + "normal": "work/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.normal.vcf.gz", + "normal_tbi": "work/bwa.P001-N1-DNA1-WGS1/out/bwa.P001-N1-DNA1-WGS1.normal.vcf.gz.tbi", + "tumor": "work/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.tumor.vcf.gz", + "tumor_tbi": "work/bwa.P001-T1-DNA1-WGS1/out/bwa.P001-T1-DNA1-WGS1.tumor.vcf.gz.tbi", } - actual = somatic_cnv_checking_workflow.get_input_files("cnv", "run") + actual = somatic_cnv_checking_workflow.get_input_files("cnv", "run")(wildcards) assert actual == expected def test_cnv_run_step_part_get_output_files(somatic_cnv_checking_workflow): """Tests SomaticCnvCheckingCnvStepPart.get_output_files()""" - base_name = "work/{mapper}.cnvkit.{library}/out/{mapper}.cnvkit.{library}" - expected = get_expected_output_vcf_files_dict(base_out=base_name) + base_name = "work/{mapper}.{caller}.{library_name}/out/{mapper}.{caller}.{library_name}" + expected = { + "vcf": base_name + ".vcf.gz", + "vcf_md5": base_name + ".vcf.gz.md5", + "tbi": base_name + ".vcf.gz.tbi", + "tbi_md5": base_name + ".vcf.gz.tbi.md5", + "tsv": base_name + ".tsv", + "tsv_md5": base_name + ".tsv.md5", + } actual = somatic_cnv_checking_workflow.get_output_files("cnv", "run") assert actual == expected def test_cnv_run_step_part_get_log_file(somatic_cnv_checking_workflow): """Tests SomaticCnvCheckingCnvStepPart.get_log_file()""" - base_name = "work/{mapper}.cnvkit.{library}/log/{mapper}.cnvkit.{library}" + base_name = "work/{mapper}.{caller}.{library_name}/log/{mapper}.{caller}.{library_name}" expected = get_expected_log_files_dict(base_out=base_name) actual = somatic_cnv_checking_workflow.get_log_file("cnv", "run") assert actual == expected @@ -192,19 +203,27 @@ def test_cnv_run_step_part_get_log_file(somatic_cnv_checking_workflow): def test_report_run_step_part_get_input_files(somatic_cnv_checking_workflow): """Tests SomaticCnvCheckingReportStepPart.get_input_files()""" - expected = {"vcf": "work/{mapper}.cnvkit.{library}/out/{mapper}.cnvkit.{library}.vcf.gz"} - actual = somatic_cnv_checking_workflow.get_input_files("report", "run") + wildcards = Wildcards( + fromdict={"mapper": "bwa", "caller": "cnvkit", "library_name": "P001-T1-DNA1-WGS1"} + ) + expected = { + "vcf": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.vcf.gz", + "tsv": "work/bwa.cnvkit.P001-T1-DNA1-WGS1/out/bwa.cnvkit.P001-T1-DNA1-WGS1.tsv", + } + actual = somatic_cnv_checking_workflow.get_input_files("report", "run")(wildcards) assert actual == expected def test_report_run_step_part_get_output_files(somatic_cnv_checking_workflow): """Tests SomaticCnvCheckingReportStepPart.get_output_files()""" - base_name = "work/{mapper}.cnvkit.{library}/report/{mapper}.cnvkit.{library}" + base_name = "work/{mapper}.{caller}.{library_name}/report/{mapper}.{caller}.{library_name}" expected = { "cnv": base_name + ".cnv.pdf", "cnv_md5": base_name + ".cnv.pdf.md5", "locus": base_name + ".locus.pdf", "locus_md5": base_name + ".locus.pdf.md5", + "segment": base_name + ".segment.pdf", + "segment_md5": base_name + ".segment.pdf.md5", } actual = somatic_cnv_checking_workflow.get_output_files("report", "run") assert actual == expected @@ -212,20 +231,12 @@ def test_report_run_step_part_get_output_files(somatic_cnv_checking_workflow): def test_report_run_step_part_get_log_file(somatic_cnv_checking_workflow): """Tests SomaticCnvCheckingReportStepPart.get_log_file()""" - base_name = "work/{mapper}.cnvkit.{library}/log/{mapper}.cnvkit.{library}.report" + base_name = "work/{mapper}.{caller}.{library_name}/log/{mapper}.{caller}.{library_name}.report" expected = get_expected_log_files_dict(base_out=base_name) actual = somatic_cnv_checking_workflow.get_log_file("report", "run") assert actual == expected -def test_report_run_step_part_get_params(somatic_cnv_checking_workflow): - """Tests SomaticCnvCheckingReportStepPart.get_params()""" - wildcards = Wildcards(fromdict={"mapper": "bwa", "library": "P001-T1-DNA1-WGS1"}) - expected = {"library_name": "P001-T1-DNA1-WGS1"} - actual = somatic_cnv_checking_workflow.get_params("report", "run")(wildcards) - assert actual == expected - - # Tests for SomaticCnvCheckingWorkflow -------------------------------------------------- @@ -237,37 +248,46 @@ def test_somatic_cnv_checking_workflow(somatic_cnv_checking_workflow): assert actual == expected # main output - tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/out/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}" + tpl = ( + "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/out/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}{chksum}" + ) expected = [ - tpl.format(i=i, t=t, ext=ext) + tpl.format(i=i, t=t, ext=ext, chksum=chksum) for i, t in ((1, 1), (2, 1), (2, 2)) - for ext in ( - "vcf.gz", - "vcf.gz.tbi", - "vcf.gz.md5", - "vcf.gz.tbi.md5", - ) + for ext in ("vcf.gz", "vcf.gz.tbi", "tsv") + for chksum in ("", ".md5") ] # report - tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}" + tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/report/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1.{ext}{chksum}" expected += [ - tpl.format(i=i, t=t, ext=ext) + tpl.format(i=i, t=t, ext=ext, chksum=chksum) for i, t in ((1, 1), (2, 1), (2, 2)) - for ext in ( - "cnv.pdf", - "cnv.pdf.md5", - "locus.pdf", - "locus.pdf.md5", - ) + for ext in ("cnv.pdf", "locus.pdf", "segment.pdf") + for chksum in ("", ".md5") ] - # logs + # logs (no caller) + tpl = "output/bwa.P00{i}-N1-DNA1-WGS1/log/bwa.P00{i}-N1-DNA1-WGS1.normal.{ext}{chksum}" + expected += [ + tpl.format(i=i, ext=ext, chksum=chksum) + for i in (1, 2) + for ext in ("log", "conda_info.txt", "conda_list.txt") + for chksum in ("", ".md5") + ] + tpl = "output/bwa.P00{i}-T{t}-DNA1-WGS1/log/bwa.P00{i}-T{t}-DNA1-WGS1.tumor.{ext}{chksum}" + expected += [ + tpl.format(i=i, t=t, ext=ext, chksum=chksum) + for i, t in ((1, 1), (2, 1), (2, 2)) + for ext in ("log", "conda_info.txt", "conda_list.txt") + for chksum in ("", ".md5") + ] + # logs (with caller) tpl = "output/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1/log/bwa.cnvkit.P00{i}-T{t}-DNA1-WGS1{part}.{ext}{chksum}" expected += [ tpl.format(i=i, t=t, part=part, ext=ext, chksum=chksum) for i, t in ((1, 1), (2, 1), (2, 2)) - for part in ("", ".normal", ".tumor", ".report") + for part in ("", ".report") for ext in ("log", "conda_info.txt", "conda_list.txt") for chksum in ("", ".md5") ] From 4996d4da33e1b64e78bc789861b8d98ae80ddb19 Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Fri, 28 Jul 2023 18:59:36 +0200 Subject: [PATCH 8/9] fix: fix problems in no coverage corner cases --- .../wrappers/vcfpy/add_bed/wrapper.py | 58 +++++++++++++------ 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py b/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py index a8d7f5929..d98068a87 100644 --- a/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py +++ b/snappy_wrappers/wrappers/vcfpy/add_bed/wrapper.py @@ -40,7 +40,7 @@ ] ps[nProc - 1].stdout.close() # Allow p1 to receive a SIGPIPE if p2 exits. nProc += 1 -if config["excluded_regions"]: +if "excluded_regions" in config and config["excluded_regions"]: ps += [ subprocess.Popen( ["bcftools", "view", "--targets-file", "^" + config["excluded_regions"]], @@ -124,10 +124,21 @@ "start": locus[1], "stop": locus[2], "BAFs": [], + "NoData": 0, } depth = variant.call_for_sample[sample].data["AD"] - baf = min(depth[0], depth[1]) / (depth[0] + depth[1]) - segments[locus[3]]["BAFs"] += [baf] + if len(depth) == 0: + segments[locus[3]]["NoData"] += 1 + else: + if depth[0] is None: + depth[0] = 0 + if depth[1] is None: + depth[1] = 0 + if depth[0] + depth[1] == 0: + segments[locus[3]]["NoData"] += 1 + else: + baf = min(depth[0], depth[1]) / (depth[0] + depth[1]) + segments[locus[3]]["BAFs"] += [baf] writer.close() @@ -174,7 +185,7 @@ def quantile(x, probs, na_rm=False, method=7): with open(snakemake.output.tsv, "wt") as f: print( - "Number\tCHROM\tstart\tstop\tCN\tLFC\tNvariants\tMin\t1st quartile\tMedian\t3rd quartile\tMax", + "Number\tCHROM\tstart\tstop\tCN\tLFC\tNvariants\tMin\t1st quartile\tMedian\t3rd quartile\tMax\tNoData", file=f, ) for segId, segment in segments.items(): @@ -186,21 +197,30 @@ def quantile(x, probs, na_rm=False, method=7): nVar = len(segment["BAFs"]) bafs = sorted(segment["BAFs"]) - qs = quantile(segment["BAFs"], (0.25, 0.5, 0.75), method=8) - values = [ - n, - segment["CHROM"], - segment["start"], - segment["stop"], - cn, - segment["logFoldChange"], - nVar, - bafs[0], - qs[0], - qs[1], - qs[2], - bafs[nVar - 1], - ] + if nVar > 1: + qs = ( + [bafs[0]] + + quantile(segment["BAFs"], (0.25, 0.5, 0.75), method=8) + + [bafs[nVar - 1]] + ) + elif nVar == 1: + qs = [bafs[0]] * 5 + else: + qs = [""] * 5 + + values = ( + [ + n, + segment["CHROM"], + segment["start"], + segment["stop"], + cn, + segment["logFoldChange"], + nVar, + ] + + qs + + [segment["NoData"]] + ) print("\t".join(map(str, values)), file=f) shell( From 882d3826d9833b076f7780e04a99cdb5e06bab4b Mon Sep 17 00:00:00 2001 From: Eric Blanc Date: Mon, 21 Aug 2023 12:31:03 +0200 Subject: [PATCH 9/9] fix: Use strand to detect negative log-fold-change values encoded in score. --- snappy_pipeline/workflows/somatic_cnv_checking/__init__.py | 2 +- .../wrappers/bcftools/heterozygous_variants/wrapper.py | 4 ++-- .../wrappers/somatic_cnv_checking/cnv-check-plot.R | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py index 3397a7c18..f5781c417 100644 --- a/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py +++ b/snappy_pipeline/workflows/somatic_cnv_checking/__init__.py @@ -95,7 +95,7 @@ cnv_assay_type: "" # Empty: no CNV, WES for somatic_targeted_seq_snv_calling step, WGS for somatic_wgs_cnv_calling step excluded_regions: "" # Bed file of regions to be excluded max_depth: 10000 # Max depth for pileups - min_cov: 20 # Minimum depth for reference and alternative alleles to consider variant + min_depth: 20 # Minimum depth for reference and alternative alleles to consider variant min_baf: 0.4 # Maximum BAF to consider variant as heterozygous (between 0 & 1/2) """ diff --git a/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py b/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py index 385dfbfa5..2d1770494 100644 --- a/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py +++ b/snappy_wrappers/wrappers/bcftools/heterozygous_variants/wrapper.py @@ -35,7 +35,7 @@ conda info > {snakemake.log.conda_info} only_one_variant="N_ALT=2 & FORMAT/AD[:2]=0" -min_coverage="FORMAT/AD[:0]>{config[min_cov]} & FORMAT/AD[:1]>{config[min_cov]}" +min_depth="FORMAT/AD[:0]>{config[min_depth]} & FORMAT/AD[:1]>{config[min_depth]}" hetero="{min_ratio}*FORMAT/AD[:0]<=FORMAT/AD[:1] & FORMAT/AD[:1]<={max_ratio}*FORMAT/AD[:0]" bcftools mpileup \ @@ -45,7 +45,7 @@ -a "FORMAT/AD" \ {snakemake.input.bam} \ | bcftools filter \ - --include "$only_one_variant & $min_coverage & $hetero" \ + --include "$only_one_variant & $min_depth & $hetero" \ -O z -o {snakemake.output.vcf} tabix {snakemake.output.vcf} diff --git a/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R b/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R index 5be6106f6..f6d54977e 100644 --- a/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R +++ b/snappy_wrappers/wrappers/somatic_cnv_checking/cnv-check-plot.R @@ -33,7 +33,7 @@ cnv_to_table <- function(cnv) { y <- read.table(cnv, sep="\t", header=0) colnames(y) <- c("CHROM", "start", "stop", "name", "LFC", "strand") y <- y |> - dplyr::mutate(LFC=replace(LFC, strand == "-", LFC[strand=="-"])) |> + dplyr::mutate(LFC=replace(LFC, strand == "-", -abs(LFC[strand=="-"]))) |> dplyr::mutate(start=start + 1) y }