Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support for somatic variant calling without normal #503

Merged
merged 13 commits into from
May 6, 2024
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 12 additions & 17 deletions snappy_pipeline/workflows/somatic_variant_annotation/__init__.py
Original file line number Diff line number Diff line change
@@ -75,7 +75,6 @@

from collections import OrderedDict
import os
import sys

from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background
from snakemake.io import expand
@@ -238,8 +237,8 @@ def params_function(wildcards):

def get_normal_lib_name(self, wildcards):
"""Return name of normal (non-cancer) library"""
pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library]
return pair.normal_sample.dna_ngs_library.name
pair = self.tumor_ngs_library_to_sample_pair.get(wildcards.tumor_library, None)
return pair.normal_sample.dna_ngs_library.name if pair else None


class JannovarAnnotateSomaticVcfStepPart(AnnotateSomaticVcfStepPart):
@@ -426,20 +425,16 @@ def _yield_result_files_matched(self, tpl, **kwargs):
Mutect.
"""
for sheet in filter(is_not_background, self.shortcut_sheets):
for sample_pair in sheet.all_sample_pairs:
if (
not sample_pair.tumor_sample.dna_ngs_library
or not sample_pair.normal_sample.dna_ngs_library
):
msg = (
"INFO: sample pair for cancer bio sample {} has is missing primary"
"normal or primary cancer NGS library"
)
print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr)
continue
yield from expand(
tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs
)
for bio_entity in sheet.sheet.bio_entities.values():
for bio_sample in bio_entity.bio_samples.values():
if not bio_sample.extra_infos.get("isTumor", False):
continue
for test_sample in bio_sample.test_samples.values():
extraction_type = test_sample.extra_infos.get("extractionType", "unknown")
if extraction_type.lower() != "dna":
continue
for ngs_library in test_sample.ngs_libraries.values():
yield from expand(tpl, tumor_library=[ngs_library], **kwargs)
tedil marked this conversation as resolved.
Show resolved Hide resolved

def _yield_result_files_joint(self, tpl, **kwargs):
"""Build output paths from path template and extension list.
98 changes: 55 additions & 43 deletions snappy_pipeline/workflows/somatic_variant_calling/__init__.py
Original file line number Diff line number Diff line change
@@ -101,7 +101,6 @@
from collections import OrderedDict
from itertools import chain
import os
import sys

from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background
from snakemake.io import expand
@@ -241,7 +240,7 @@
germline_resource: '' # Germline variants resource (same as panel of normals)
common_variants: '' # Common germline variants for contamination estimation
extra_arguments: [] # List additional Mutect2 arguments
# Each additional argument xust be in the form:
# Each additional argument must be in the form:
# "--<argument name> <argument value>"
# For example, to filter reads prior to calling & to
# add annotations to the output vcf:
@@ -387,32 +386,39 @@ def input_function(wildcards):
ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
# Get names of primary libraries of the selected cancer bio sample and the
# corresponding primary normal sample
normal_base_path = (
"output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format(
normal_library=self.get_normal_lib_name(wildcards), **wildcards
)
)
tumor_base_path = (
"output/{mapper}.{tumor_library}/out/" "{mapper}.{tumor_library}"
).format(**wildcards)
return {
"normal_bam": ngs_mapping(normal_base_path + ".bam"),
"normal_bai": ngs_mapping(normal_base_path + ".bam.bai"),
"tumor_bam": ngs_mapping(tumor_base_path + ".bam"),
"tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"),
}
normal_library = self.get_normal_lib_name(wildcards)
if normal_library:
normal_base_path = (
"output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format(
normal_library=normal_library, **wildcards
)
)
return {
"normal_bam": ngs_mapping(normal_base_path + ".bam"),
"normal_bai": ngs_mapping(normal_base_path + ".bam.bai"),
"tumor_bam": ngs_mapping(tumor_base_path + ".bam"),
"tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"),
}
else:
return {
"tumor_bam": ngs_mapping(tumor_base_path + ".bam"),
"tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"),
}
tedil marked this conversation as resolved.
Show resolved Hide resolved

return input_function

def get_normal_lib_name(self, wildcards):
"""Return name of normal (non-cancer) library"""
pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library]
return pair.normal_sample.dna_ngs_library.name
pair = self.tumor_ngs_library_to_sample_pair.get(wildcards.tumor_library, None)
return pair.normal_sample.dna_ngs_library.name if pair else None

def get_tumor_lib_name(self, wildcards):
"""Return name of tumor library"""
pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library]
return pair.tumor_sample.dna_ngs_library.name
pair = self.tumor_ngs_library_to_sample_pair.get(wildcards.tumor_library, None)
return pair.tumor_sample.dna_ngs_library.name if pair else wildcards.tumor_library

def get_output_files(self, action):
"""Return output files that all somatic variant calling sub steps must
@@ -577,18 +583,27 @@ def _get_input_files_run(self, wildcards):
ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
# Get names of primary libraries of the selected cancer bio sample and the
# corresponding primary normal sample
normal_base_path = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format(
normal_library=self.get_normal_lib_name(wildcards), **wildcards
)
tumor_base_path = (
"output/{mapper}.{tumor_library}/out/" "{mapper}.{tumor_library}"
).format(**wildcards)
return {
"normal_bam": ngs_mapping(normal_base_path + ".bam"),
"normal_bai": ngs_mapping(normal_base_path + ".bam.bai"),
"tumor_bam": ngs_mapping(tumor_base_path + ".bam"),
"tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"),
}
normal_library = self.get_normal_lib_name(wildcards)
if normal_library:
normal_base_path = (
"output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format(
normal_library=normal_library, **wildcards
)
)
return {
"normal_bam": ngs_mapping(normal_base_path + ".bam"),
"normal_bai": ngs_mapping(normal_base_path + ".bam.bai"),
"tumor_bam": ngs_mapping(tumor_base_path + ".bam"),
"tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"),
}
else:
return {
"tumor_bam": ngs_mapping(tumor_base_path + ".bam"),
"tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"),
}
tedil marked this conversation as resolved.
Show resolved Hide resolved

def _get_input_files_filter(self, wildcards):
"""Get input files for rule ``filter``.
@@ -609,9 +624,10 @@ def _get_input_files_filter(self, wildcards):
"stats": base_path + ".raw.vcf.stats",
"f1r2": base_path + ".raw.f1r2_tar.tar.gz",
}
if "contamination" in self.actions:
input_files["table"] = base_path + ".contamination.tbl"
input_files["segments"] = base_path + ".segments.tbl"
if self.get_normal_lib_name(wildcards):
if "contamination" in self.actions:
input_files["table"] = base_path + ".contamination.tbl"
input_files["segments"] = base_path + ".segments.tbl"
return input_files

def _get_input_files_pileup_normal(self, wildcards):
@@ -1196,20 +1212,16 @@ def _yield_result_files_matched(self, tpl, **kwargs):
Mutect.
"""
for sheet in filter(is_not_background, self.shortcut_sheets):
for sample_pair in sheet.all_sample_pairs:
if (
not sample_pair.tumor_sample.dna_ngs_library
or not sample_pair.normal_sample.dna_ngs_library
):
msg = (
"INFO: sample pair for cancer bio sample {} has is missing primary"
"normal or primary cancer NGS library"
)
print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr)
continue
yield from expand(
tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs
)
for bio_entity in sheet.sheet.bio_entities.values():
for bio_sample in bio_entity.bio_samples.values():
if not bio_sample.extra_infos.get("isTumor", False):
continue
for test_sample in bio_sample.test_samples.values():
extraction_type = test_sample.extra_infos.get("extractionType", "unknown")
if extraction_type.lower() != "dna":
continue
for ngs_library in test_sample.ngs_libraries.values():
yield from expand(tpl, tumor_library=[ngs_library], **kwargs)
tedil marked this conversation as resolved.
Show resolved Hide resolved

def _yield_result_files_joint(self, tpl, **kwargs):
"""Build output paths from path template and extension list.
Loading