Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support for somatic variant calling without normal #503

Merged
merged 13 commits into from
May 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .tests/test-workflow/config/cancer_wes/config.yaml.jinja2
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ step_config:
somatic_variant_filtration:
path_somatic_variant: ../somatic_variant_annotation
path_ngs_mapping: ../ngs_mapping
filtration_schema: list
filter_list:
- dkfz: {}
- bcftools:
Expand Down
8 changes: 8 additions & 0 deletions docs/step_generic.rst
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,11 @@ The versioning allows the pipeline step to check whether there are incompatibili

The execution of ``cubi-snake`` in a directory will not automatically generate these files.
Rather, they are only generated when used in a pipeline step such as ``somatic_targeted_cnv_calling``.

.. note:: **Debugging configuration errors**

The configuration for a complete pipeline can be long, and copy/paste errors can creep in.
When facing errors in configuration, it is advisable to

- Use absolute paths for files not part of the workflow (panel of normal files are **NOT** part the of workflow), and
- Use the ``snappy-snake --verbose``, which prints the step configuration (and all the dependend workflows) as ``snappy`` has understood it.
11 changes: 11 additions & 0 deletions snappy_pipeline/workflows/abstract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,6 +671,9 @@ def __init__(
#: Functions from sub workflows, can be used to generate output paths into these workflows
self.sub_workflows = {}

if workflow.verbose:
self._write_step_config()

def _setup_hooks(self):
"""Setup Snakemake workflow hooks for start/end/error"""
# In the following, the "log" parameter to the handler functions is set to "_" as we
Expand Down Expand Up @@ -931,6 +934,14 @@ def _load_data_search_infos(self):
mixed_se_pe=False,
)

def _write_step_config(self, f=sys.stdout):
print(f"\n\n----- Configuration for step {self.name}:\n", file=f)
yaml = ruamel_yaml.YAML()
yaml.preserve_quotes = True
yaml.indent(sequence=4, mapping=4, offset=4)
yaml.dump(self.config, f)
print(f"\n------ Configuration for {self.name} ends here\n", file=f)

@classmethod
def wrapper_path(cls, path):
"""Generate path to wrapper"""
Expand Down
31 changes: 15 additions & 16 deletions snappy_pipeline/workflows/somatic_variant_annotation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,8 +238,8 @@ def params_function(wildcards):

def get_normal_lib_name(self, wildcards):
"""Return name of normal (non-cancer) library"""
pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library]
return pair.normal_sample.dna_ngs_library.name
pair = self.tumor_ngs_library_to_sample_pair.get(wildcards.tumor_library, None)
return pair.normal_sample.dna_ngs_library.name if pair else None


class JannovarAnnotateSomaticVcfStepPart(AnnotateSomaticVcfStepPart):
Expand Down Expand Up @@ -426,20 +426,19 @@ def _yield_result_files_matched(self, tpl, **kwargs):
Mutect.
"""
for sheet in filter(is_not_background, self.shortcut_sheets):
for sample_pair in sheet.all_sample_pairs:
if (
not sample_pair.tumor_sample.dna_ngs_library
or not sample_pair.normal_sample.dna_ngs_library
):
msg = (
"INFO: sample pair for cancer bio sample {} has is missing primary"
"normal or primary cancer NGS library"
)
print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr)
continue
yield from expand(
tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs
)
for bio_entity in sheet.sheet.bio_entities.values():
for bio_sample in bio_entity.bio_samples.values():
if not bio_sample.extra_infos.get("isTumor", False):
continue
for test_sample in bio_sample.test_samples.values():
extraction_type = test_sample.extra_infos.get("extractionType", "unknown")
if extraction_type.lower() != "dna":
if extraction_type == "unknown":
msg = "INFO: sample {} has missing extraction type, ignored"
print(msg.format(test_sample.name), file=sys.stderr)
continue
for ngs_library in test_sample.ngs_libraries.values():
yield from expand(tpl, tumor_library=[ngs_library], **kwargs)

def _yield_result_files_joint(self, tpl, **kwargs):
"""Build output paths from path template and extension list.
Expand Down
92 changes: 56 additions & 36 deletions snappy_pipeline/workflows/somatic_variant_calling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@
germline_resource: '' # Germline variants resource (same as panel of normals)
common_variants: '' # Common germline variants for contamination estimation
extra_arguments: [] # List additional Mutect2 arguments
# Each additional argument xust be in the form:
# Each additional argument must be in the form:
# "--<argument name> <argument value>"
# For example, to filter reads prior to calling & to
# add annotations to the output vcf:
Expand Down Expand Up @@ -387,32 +387,41 @@ def input_function(wildcards):
ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
# Get names of primary libraries of the selected cancer bio sample and the
# corresponding primary normal sample
normal_base_path = (
"output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format(
normal_library=self.get_normal_lib_name(wildcards), **wildcards
)
)
tumor_base_path = (
"output/{mapper}.{tumor_library}/out/" "{mapper}.{tumor_library}"
).format(**wildcards)
return {
"normal_bam": ngs_mapping(normal_base_path + ".bam"),
"normal_bai": ngs_mapping(normal_base_path + ".bam.bai"),
input_files = {
"tumor_bam": ngs_mapping(tumor_base_path + ".bam"),
"tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"),
}

normal_library = self.get_normal_lib_name(wildcards)
if normal_library:
normal_base_path = (
"output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format(
normal_library=normal_library, **wildcards
)
)
input_files.update(
{
"normal_bam": ngs_mapping(normal_base_path + ".bam"),
"normal_bai": ngs_mapping(normal_base_path + ".bam.bai"),
}
)

return input_files

return input_function

def get_normal_lib_name(self, wildcards):
"""Return name of normal (non-cancer) library"""
pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library]
return pair.normal_sample.dna_ngs_library.name
pair = self.tumor_ngs_library_to_sample_pair.get(wildcards.tumor_library, None)
return pair.normal_sample.dna_ngs_library.name if pair else None

def get_tumor_lib_name(self, wildcards):
"""Return name of tumor library"""
pair = self.tumor_ngs_library_to_sample_pair[wildcards.tumor_library]
return pair.tumor_sample.dna_ngs_library.name
pair = self.tumor_ngs_library_to_sample_pair.get(wildcards.tumor_library, None)
return pair.tumor_sample.dna_ngs_library.name if pair else wildcards.tumor_library

def get_output_files(self, action):
"""Return output files that all somatic variant calling sub steps must
Expand Down Expand Up @@ -577,19 +586,30 @@ def _get_input_files_run(self, wildcards):
ngs_mapping = self.parent.sub_workflows["ngs_mapping"]
# Get names of primary libraries of the selected cancer bio sample and the
# corresponding primary normal sample
normal_base_path = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format(
normal_library=self.get_normal_lib_name(wildcards), **wildcards
)
tumor_base_path = (
"output/{mapper}.{tumor_library}/out/" "{mapper}.{tumor_library}"
).format(**wildcards)
return {
"normal_bam": ngs_mapping(normal_base_path + ".bam"),
"normal_bai": ngs_mapping(normal_base_path + ".bam.bai"),
input_files = {
"tumor_bam": ngs_mapping(tumor_base_path + ".bam"),
"tumor_bai": ngs_mapping(tumor_base_path + ".bam.bai"),
}

normal_library = self.get_normal_lib_name(wildcards)
if normal_library:
normal_base_path = (
"output/{mapper}.{normal_library}/out/{mapper}.{normal_library}".format(
normal_library=normal_library, **wildcards
)
)
input_files.update(
{
"normal_bam": ngs_mapping(normal_base_path + ".bam"),
"normal_bai": ngs_mapping(normal_base_path + ".bam.bai"),
}
)

return input_files

def _get_input_files_filter(self, wildcards):
"""Get input files for rule ``filter``.

Expand All @@ -609,9 +629,10 @@ def _get_input_files_filter(self, wildcards):
"stats": base_path + ".raw.vcf.stats",
"f1r2": base_path + ".raw.f1r2_tar.tar.gz",
}
if "contamination" in self.actions:
input_files["table"] = base_path + ".contamination.tbl"
input_files["segments"] = base_path + ".segments.tbl"
if self.get_normal_lib_name(wildcards):
if "contamination" in self.actions:
input_files["table"] = base_path + ".contamination.tbl"
input_files["segments"] = base_path + ".segments.tbl"
return input_files

def _get_input_files_pileup_normal(self, wildcards):
Expand Down Expand Up @@ -1196,20 +1217,19 @@ def _yield_result_files_matched(self, tpl, **kwargs):
Mutect.
"""
for sheet in filter(is_not_background, self.shortcut_sheets):
for sample_pair in sheet.all_sample_pairs:
if (
not sample_pair.tumor_sample.dna_ngs_library
or not sample_pair.normal_sample.dna_ngs_library
):
msg = (
"INFO: sample pair for cancer bio sample {} has is missing primary"
"normal or primary cancer NGS library"
)
print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr)
continue
yield from expand(
tpl, tumor_library=[sample_pair.tumor_sample.dna_ngs_library], **kwargs
)
for bio_entity in sheet.sheet.bio_entities.values():
for bio_sample in bio_entity.bio_samples.values():
if not bio_sample.extra_infos.get("isTumor", False):
continue
for test_sample in bio_sample.test_samples.values():
extraction_type = test_sample.extra_infos.get("extractionType", "unknown")
if extraction_type.lower() != "dna":
if extraction_type == "unknown":
msg = "INFO: sample {} has missing extraction type, ignored"
print(msg.format(test_sample.name), file=sys.stderr)
continue
for ngs_library in test_sample.ngs_libraries.values():
yield from expand(tpl, tumor_library=[ngs_library], **kwargs)

def _yield_result_files_joint(self, tpl, **kwargs):
"""Build output paths from path template and extension list.
Expand Down
Loading
Loading