feat: allow gcnv model building to handle more than one kit at a time (
holtgrewe committed Dec 22, 2022
1 parent 221157c commit a1a19dc
Showing 8 changed files with 427 additions and 426 deletions.
229 changes: 108 additions & 121 deletions snappy_pipeline/workflows/gcnv/gcnv_build_model.py
@@ -5,68 +5,78 @@
from snakemake.io import touch

from snappy_pipeline.utils import dictify
from snappy_pipeline.workflows.gcnv.gcnv_common import GcnvStepPart
from snappy_pipeline.workflows.gcnv.gcnv_common import GcnvCommonStepPart


class BuildGcnvModelStepPart(GcnvStepPart):
"""Class with methods to build GATK4 gCNV models"""

#: Class available actions
actions = (
"preprocess_intervals",
"annotate_gc",
"filter_intervals",
"scatter_intervals",
"coverage",
"contig_ploidy",
"call_cnvs",
"post_germline_calls",
)

def __init__(self, parent):
super().__init__(parent)
class AnnotateGcMixin:
"""Mixin providing functions for ``annotate_gc``"""

@staticmethod
@dictify
def _get_input_files_annotate_gc(wildcards):
name_pattern = "gcnv_preprocess_intervals.{wildcards.library_kit}".format(
wildcards=wildcards
)
def _get_input_files_annotate_gc(self, wildcards):
name_pattern = f"gcnv_preprocess_intervals.{wildcards.library_kit}"
ext = "interval_list"
yield ext, "work/{name_pattern}/out/{name_pattern}.{ext}".format(
name_pattern=name_pattern, ext=ext
)
yield ext, f"work/{name_pattern}/out/{name_pattern}.{ext}"

@dictify
def _get_output_files_annotate_gc(self):
ext = "tsv"
name_pattern = "gcnv_annotate_gc.{library_kit}"
yield ext, f"work/{name_pattern}/out/{name_pattern}.{ext}"

def _get_log_file_annotate_gc(self):
name_pattern = "gcnv_annotate_gc.{library_kit}"
return f"work/{name_pattern}/log/{name_pattern}.log"


class FilterIntervalsMixin:
"""Mixin providing functions for ``filter_intervals``"""

@dictify
def _get_input_files_filter_intervals(self, wildcards):
yield from self._get_input_files_annotate_gc(wildcards).items()
name_pattern = "gcnv_annotate_gc.{wildcards.library_kit}".format(wildcards=wildcards)
name_pattern = f"gcnv_annotate_gc.{wildcards.library_kit}"
ext = "tsv"
yield ext, "work/{name_pattern}/out/{name_pattern}.{ext}".format(
name_pattern=name_pattern, ext=ext
)
yield ext, f"work/{name_pattern}/out/{name_pattern}.{ext}"
key = "covs"
covs = []
for lib in sorted(self.index_ngs_library_to_donor):
if self.ngs_library_to_kit.get(lib) == wildcards.library_kit:
name_pattern = "{mapper}.gcnv_coverage.{library_name}".format(
mapper=wildcards.mapper, library_name=lib
)
covs.append(
"work/{name_pattern}/out/{name_pattern}.{ext}".format(
name_pattern=name_pattern, ext="tsv"
)
)
name_pattern = f"{wildcards.mapper}.gcnv_coverage.{lib}"
ext = "tsv"
covs.append(f"work/{name_pattern}/out/{name_pattern}.{ext}")
yield key, covs

@staticmethod
@dictify
def _get_input_files_scatter_intervals(wildcards):
def _get_output_files_filter_intervals(self):
ext = "interval_list"
name_pattern = "{mapper}.gcnv_filter_intervals.{library_kit}".format(**wildcards)
yield ext, "work/{name_pattern}/out/{name_pattern}.{ext}".format(
name_pattern=name_pattern, ext=ext
)
name_pattern = "{mapper}.gcnv_filter_intervals.{library_kit}"
yield ext, f"work/{name_pattern}/out/{name_pattern}.{ext}"

def _get_log_file_filter_intervals(self):
name_pattern = "{mapper}.gcnv_filter_intervals.{library_kit}"
return f"work/{name_pattern}/log/{name_pattern}.log"


class ScatterIntervalsMixin:
"""Mixin providing functions for ``scatter_intervals``"""

@dictify
def _get_input_files_scatter_intervals(self, wildcards):
ext = "interval_list"
name_pattern = f"{wildcards.mapper}.gcnv_filter_intervals.{wildcards.library_kit}"
yield ext, f"work/{name_pattern}/out/{name_pattern}.{ext}"

def _get_output_files_scatter_intervals(self):
name_pattern = "{mapper}.gcnv_scatter_intervals.{library_kit}"
return f"work/{name_pattern}/out/{name_pattern}"

def _get_log_file_scatter_intervals(self):
name_pattern = "{mapper}.gcnv_scatter_intervals.{library_kit}"
return f"work/{name_pattern}/log/{name_pattern}.log"


class ContigPloidyMixin:
"""Mixin providing functions for ``contig_ploidy``"""

@dictify
def _get_input_files_contig_ploidy(self, wildcards):
@@ -78,23 +88,26 @@ def _get_input_files_contig_ploidy(self, wildcards):
"""
ext = "interval_list"
name_pattern = "{mapper}.gcnv_filter_intervals.{library_kit}"
yield ext, "work/{name_pattern}/out/{name_pattern}.{ext}".format(
name_pattern=name_pattern, ext=ext
)
yield ext, f"work/{name_pattern}/out/{name_pattern}.{ext}"
ext = "tsv"
tsvs = []
for lib in sorted(self.index_ngs_library_to_donor):
if self.ngs_library_to_kit.get(lib) == wildcards.library_kit:
name_pattern = "{mapper}.gcnv_coverage.{library_name}".format(
mapper=wildcards.mapper, library_name=lib
)
tsvs.append(
"work/{name_pattern}/out/{name_pattern}.{ext}".format(
name_pattern=name_pattern, ext=ext
)
)
name_pattern = f"{wildcards.mapper}.gcnv_coverage.{lib}"
tsvs.append(f"work/{name_pattern}/out/{name_pattern}.{ext}")
yield ext, tsvs

@dictify
def _get_output_files_contig_ploidy(self):
"""Yield dictionary with output files for ``contig_ploidy`` rule in COHORT MODE."""
ext = "done"
name_pattern = "{mapper}.gcnv_contig_ploidy.{library_kit}"
yield ext, touch(f"work/{name_pattern}/out/{name_pattern}/.{ext}")


class CallCnvsMixin:
"""Mixin providing functions for ``call_cnvs``"""

@dictify
def _get_input_files_call_cnvs(self, wildcards):
"""Yield input files for ``call_cnvs`` in COHORT mode.
@@ -103,89 +116,63 @@ def _get_input_files_call_cnvs(self, wildcards):
and 'library_kit' (e.g., 'Agilent_SureSelect_Human_All_Exon_V6').
:type wildcards: snakemake.io.Wildcards
"""
name_pattern = "{mapper}.gcnv_scatter_intervals.{library_kit}"
path_pattern = (
"work/{name_pattern}/out/{name_pattern}/temp_{{shard}}/scattered.interval_list"
f"work/{name_pattern}/out/{name_pattern}/temp_{{shard}}/scattered.interval_list"
)
name_pattern = "{mapper}.gcnv_scatter_intervals.{library_kit}"
yield "interval_list_shard", path_pattern.format(name_pattern=name_pattern)
yield "interval_list_shard", path_pattern
ext = "tsv"
tsvs = []
for lib in sorted(self.index_ngs_library_to_donor):
if self.ngs_library_to_kit.get(lib) == wildcards.library_kit:
path_pattern = "{mapper}.gcnv_coverage.{library_name}".format(
mapper=wildcards.mapper, library_name=lib
)
tsvs.append(
"work/{name_pattern}/out/{name_pattern}.{ext}".format(
name_pattern=path_pattern, ext=ext
)
)
path_pattern = f"{wildcards.mapper}.gcnv_coverage.{lib}"
tsvs.append(f"work/{path_pattern}/out/{path_pattern}.{ext}")
yield ext, tsvs
ext = "ploidy"
path_pattern = "{mapper}.gcnv_contig_ploidy.{library_kit}".format(**wildcards)
yield ext, "work/{name_pattern}/out/{name_pattern}/.done".format(name_pattern=path_pattern)
path_pattern = f"{wildcards.mapper}.gcnv_contig_ploidy.{wildcards.library_kit}"
yield ext, f"work/{path_pattern}/out/{path_pattern}/.done"
key = "intervals"
path_pattern = "gcnv_annotate_gc.{library_kit}"
yield key, "work/{name_pattern}/out/{name_pattern}.{ext}".format(
name_pattern=path_pattern, ext="tsv"
)

@staticmethod
@dictify
def _get_output_files_annotate_gc():
ext = "tsv"
name_pattern = "gcnv_annotate_gc.{library_kit}"
yield ext, "work/{name_pattern}/out/{name_pattern}.{ext}".format(
name_pattern=name_pattern, ext=ext
)

@staticmethod
@dictify
def _get_output_files_filter_intervals():
ext = "interval_list"
name_pattern = "{mapper}.gcnv_filter_intervals.{library_kit}"
yield ext, "work/{name_pattern}/out/{name_pattern}.{ext}".format(
name_pattern=name_pattern, ext=ext
)

@staticmethod
def _get_output_files_scatter_intervals():
return "work/{name_pattern}/out/{name_pattern}".format(
name_pattern="{mapper}.gcnv_scatter_intervals.{library_kit}"
)

@staticmethod
@dictify
def _get_output_files_contig_ploidy():
"""Yield dictionary with output files for ``contig_ploidy`` rule in COHORT MODE."""
ext = "done"
name_pattern = "{mapper}.gcnv_contig_ploidy.{library_kit}"
yield ext, touch(
"work/{name_pattern}/out/{name_pattern}/.{ext}".format(
name_pattern=name_pattern, ext=ext
)
)
yield key, f"work/{path_pattern}/out/{path_pattern}.tsv"

@staticmethod
@dictify
def _get_output_files_call_cnvs():
def _get_output_files_call_cnvs(self):
"""Yield dictionary with output files for ``call_cnvs`` rle in COHORT MODE."""
ext = "done"
name_pattern = "{mapper}.gcnv_call_cnvs.{library_kit}.{shard}"
yield ext, touch(
"work/{name_pattern}/out/{name_pattern}/.{ext}".format(
name_pattern=name_pattern, ext=ext
)
)
yield ext, touch(f"work/{name_pattern}/out/{name_pattern}/.{ext}")


class PostGermlineCallsMixin:
"""Mixin providing functions for ``post_germline_calls``"""

@staticmethod
@dictify
def _get_output_files_post_germline_calls():
def _get_output_files_post_germline_calls(self):
name_pattern = "{mapper}.gcnv_post_germline_calls.{library_name}"
pairs = {"ratio_tsv": ".ratio.tsv", "itv_vcf": ".interval.vcf.gz", "seg_vcf": ".vcf.gz"}
for key, ext in pairs.items():
yield key, touch(
"work/{name_pattern}/out/{name_pattern}{ext}".format(
name_pattern=name_pattern, ext=ext
)
)
yield key, touch(f"work/{name_pattern}/out/{name_pattern}{ext}")


class BuildGcnvModelStepPart(
AnnotateGcMixin,
FilterIntervalsMixin,
ScatterIntervalsMixin,
ContigPloidyMixin,
CallCnvsMixin,
PostGermlineCallsMixin,
GcnvCommonStepPart,
):
"""Class with methods to build GATK4 gCNV models"""

#: Class available actions
actions = (
"preprocess_intervals",
"annotate_gc",
"filter_intervals",
"scatter_intervals",
"coverage",
"contig_ploidy",
"call_cnvs",
"post_germline_calls",
)
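
The refactored input functions above all follow the same per-kit pattern: a rule's inputs are restricted to the libraries whose kit in ``ngs_library_to_kit`` matches the ``library_kit`` wildcard, which is what allows one gCNV model to be built per kit. A minimal, hypothetical sketch of that grouping follows (the helper name and the example library/kit values are illustrative and not part of this commit):

# Illustrative helper (not in the commit): group per-library coverage TSVs by exome
# kit, mirroring the ngs_library_to_kit filtering used by the *_filter_intervals,
# *_contig_ploidy and *_call_cnvs input functions above.
from collections import defaultdict


def group_coverage_tsvs_by_kit(mapper, libraries, ngs_library_to_kit):
    """Return a dict mapping each library kit to its list of coverage TSV paths."""
    by_kit = defaultdict(list)
    for lib in sorted(libraries):
        kit = ngs_library_to_kit.get(lib)
        if kit is None:
            continue  # libraries without a known kit are skipped
        name_pattern = f"{mapper}.gcnv_coverage.{lib}"
        by_kit[kit].append(f"work/{name_pattern}/out/{name_pattern}.tsv")
    return dict(by_kit)


# Example: two kits in one cohort yield two independent path groups, hence two models.
print(
    group_coverage_tsvs_by_kit(
        "bwa",
        ["P001-N1-DNA1-WES1", "P002-N1-DNA1-WES1"],
        {"P001-N1-DNA1-WES1": "agilent_v6", "P002-N1-DNA1-WES1": "idt_xgen"},
    )
)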