Skip to content

Commit

Permalink
fix: various fixes to make gCNV work properly for WGS and targeted (#370
Browse files Browse the repository at this point in the history
)
  • Loading branch information
holtgrewe authored Feb 9, 2023
1 parent 33785b2 commit 14d86e3
Show file tree
Hide file tree
Showing 13 changed files with 196 additions and 13 deletions.
1 change: 1 addition & 0 deletions snappy_pipeline/workflows/common/gcnv/gcnv_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ def get_resource_usage(self, action):
high_resource_action_list = (
"call_cnvs",
"post_germline_calls",
"joint_germline_cnv_segmentation",
)
if action in high_resource_action_list:
return self.resource_usage_dict.get("high_resource")
Expand Down
12 changes: 12 additions & 0 deletions snappy_pipeline/workflows/helper_gcnv_model_targeted/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,23 @@ wf = HelperBuildTargetSeqGcnvModelWorkflow(
# Rules =======================================================================


localrules:
# Writing out pedigrees can be done locally
sv_calling_targeted_write_pedigree_run,


rule all:
input:
wf.get_result_files(),


rule build_gcnv_model_write_pedigree_run:
output:
wf.get_output_files("write_pedigree", "run"),
run:
wf.substep_dispatch("write_pedigree", "run", wildcards, output)


rule build_gcnv_model_preprocess_intervals:
input:
unpack(wf.get_input_files("gcnv", "preprocess_intervals")),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@
from snakemake.io import glob_wildcards

from snappy_pipeline.utils import DictQuery, dictify, listify
from snappy_pipeline.workflows.abstract import BaseStep
from snappy_pipeline.workflows.abstract import BaseStep, WritePedigreeStepPart
from snappy_pipeline.workflows.common.gcnv.gcnv_build_model import BuildGcnvModelStepPart
from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow

Expand Down Expand Up @@ -195,7 +195,12 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
(NgsMappingWorkflow,),
)
# Register sub step classes so the sub steps are available
self.register_sub_step_classes((BuildGcnvTargetSeqModelStepPart,))
self.register_sub_step_classes(
(
WritePedigreeStepPart,
BuildGcnvTargetSeqModelStepPart,
)
)
# Register sub workflows
self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"])
# Build mapping from NGS DNA library to library kit
Expand Down
12 changes: 12 additions & 0 deletions snappy_pipeline/workflows/helper_gcnv_model_wgs/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,23 @@ wf = HelperBuildWgsGcnvModelWorkflow(workflow, config, lookup_paths, config_path
# Rules =======================================================================


localrules:
# Writing out pedigrees can be done locally
sv_calling_targeted_write_pedigree_run,


rule all:
input:
wf.get_result_files(),


rule build_gcnv_model_write_pedigree_run:
output:
wf.get_output_files("write_pedigree", "run"),
run:
wf.substep_dispatch("write_pedigree", "run", wildcards, output)


rule build_gcnv_model_preprocess_intervals:
input:
unpack(wf.get_input_files("gcnv", "preprocess_intervals")),
Expand Down
9 changes: 7 additions & 2 deletions snappy_pipeline/workflows/helper_gcnv_model_wgs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@
from snakemake.io import glob_wildcards

from snappy_pipeline.utils import dictify, listify
from snappy_pipeline.workflows.abstract import BaseStep
from snappy_pipeline.workflows.abstract import BaseStep, WritePedigreeStepPart
from snappy_pipeline.workflows.common.gcnv.gcnv_build_model import BuildGcnvModelStepPart
from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow

Expand Down Expand Up @@ -231,7 +231,12 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir)
(NgsMappingWorkflow,),
)
# Register sub step classes so the sub steps are available
self.register_sub_step_classes((BuildGcnvWgsModelStepPart,))
self.register_sub_step_classes(
(
WritePedigreeStepPart,
BuildGcnvWgsModelStepPart,
)
)
# Register sub workflows
self.register_sub_workflow("ngs_mapping", self.config["path_ngs_mapping"])

Expand Down
1 change: 1 addition & 0 deletions snappy_pipeline/workflows/sv_calling_targeted/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ rule sv_calling_targeted_gcnv_contig_ploidy:
wf.get_log_file("gcnv", "contig_ploidy"),
params:
args=wf.get_params("gcnv", "contig_ploidy"),
step_key="sv_calling_targeted",
wrapper:
wf.wrapper_path("gcnv/contig_ploidy_case_mode")

Expand Down
17 changes: 15 additions & 2 deletions snappy_pipeline/workflows/sv_calling_targeted/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@
path_target_interval_list_mapping: []
gcnv:
# Path to interval block list with PAR region for contig calling.
path_par_intervals: null # REQUIRED
# Path to gCNV model - will execute analysis in CASE MODE.
#
# Example:
Expand All @@ -59,17 +61,25 @@
# contig_ploidy: /path/to/ploidy-model # Output from `DetermineGermlineContigPloidy`
# model_pattern: /path/to/model_* # Output from `GermlineCNVCaller`
precomputed_model_paths: []
# Skip processing of the following libraries. If the library is in
# family/pedigree then all of the family/pedigree will be skipped.
skip_libraries: []
delly2:
path_exclude_tsv: null # optional
max_threads: 16
map_qual: 1
geno_qual: 5
qual_tra: 20
mad_cutoff: 9
# Skip processing of the following libraries. If the library is in
# family/pedigree then all of the family/pedigree will be skipped.
skip_libraries: []
manta:
max_threads: 16
num_threads: 16
# Skip processing of the following libraries. If the library is in
# family/pedigree then all of the family/pedigree will be skipped.
skip_libraries: []
melt:
me_refs_infix: 1KGP_Hg19
Expand All @@ -79,6 +89,9 @@
- SVA
jar_file: REQUIRED
genes_file: add_bed_files/1KGP_Hg19/hg19.genes.bed # adjust, e.g., Hg38/Hg38.genes.bed
# Skip processing of the following libraries. If the library is in
# family/pedigree then all of the family/pedigree will be skipped.
skip_libraries: []
"""


Expand Down
1 change: 1 addition & 0 deletions snappy_pipeline/workflows/sv_calling_wgs/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,7 @@ rule sv_calling_wgs_gcnv_contig_ploidy:
wf.get_log_file("gcnv", "contig_ploidy"),
params:
args=wf.get_params("gcnv", "contig_ploidy"),
step_key="sv_calling_targeted",
wrapper:
wf.wrapper_path("gcnv/contig_ploidy_case_mode")

Expand Down
25 changes: 21 additions & 4 deletions snappy_pipeline/workflows/sv_calling_wgs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,27 +50,38 @@
# Short-read SV calling tool configuration
delly2:
path_exclude_tsv: null # optional
max_threads: 16
map_qual: 1
geno_qual: 5
qual_tra: 20
mad_cutoff: 9
# Skip processing of the following libraries. If the library is in
# family/pedigree then all of the family/pedigree will be skipped.
skip_libraries: []
manta:
max_threads: 16
num_threads: 16
# Skip processing of the following libraries. If the library is in
# family/pedigree then all of the family/pedigree will be skipped.
skip_libraries: []
popdel:
window_size: 10000000
max_sv_size: 20000 # == padding
# Skip processing of the following libraries. If the library is in
# family/pedigree then all of the family/pedigree will be skipped.
skip_libraries: []
gcnv:
# Path to interval block list with PAR region for contig calling.
path_par_intervals: null # REQUIRED
# Path to gCNV model - will execute analysis in CASE MODE.
#
# Example of precomputed model:
# - library: "Agilent SureSelect Human All Exon V6" # Library name
# contig_ploidy: /path/to/ploidy-model # Output from `DetermineGermlineContigPloidy`
# model_pattern: /path/to/model_* # Output from `GermlineCNVCaller`
precomputed_model_paths: [] # REQUIRED
# Path to BED file with uniquely mappable regions.
path_uniquely_mapable_bed: null # REQUIRED
# Skip processing of the following libraries. If the library is in
# family/pedigree then all of the family/pedigree will be skipped.
skip_libraries: []
melt:
me_refs_infix: 1KGP_Hg19
me_types:
Expand All @@ -79,10 +90,16 @@
- SVA
jar_file: REQUIRED
genes_file: add_bed_files/1KGP_Hg19/hg19.genes.bed # adjust, e.g., Hg38/Hg38.genes.bed
# Skip processing of the following libraries. If the library is in
# family/pedigree then all of the family/pedigree will be skipped.
skip_libraries: []
# Long-read SV calling tool configuration
sniffles2:
tandem_repeats: /fast/groups/cubi/work/projects/biotools/sniffles2/trf/GRCh37/human_hs37d5.trf.bed # REQUIRED
# Skip processing of the following libraries. If the library is in
# family/pedigree then all of the family/pedigree will be skipped.
skip_libraries: []
# Common configuration
ignore_chroms:
Expand Down
61 changes: 60 additions & 1 deletion snappy_wrappers/wrappers/gcnv/contig_ploidy/wrapper.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,39 @@
# -*- coding: utf-8 -*-
import pathlib

from snakemake.shell import shell

# We will first call the GATK tool. However, there are issues with the reliability of the contig
# ploidy calls. This causes an exception in the JointGermlineCNVSegmentation further downstream.
# We thus rewrite the genotype calls based on the ones from the PED files as a workaround.
#
# cf. https://github.com/broadinstitute/gatk/issues/8164

out_path = pathlib.Path(snakemake.output.done).parent

MALE = "male"
FEMALE = "female"

sex_map = {}
for ped_path in snakemake.input.ped:
with open(ped_path, "rt") as inputf:
for line in inputf:
arr = line.strip().split("\t")
sex_map[arr[1]] = {"1": MALE, "2": FEMALE}[arr[4]]

ploidy_x = {MALE: 1, FEMALE: 2}
ploidy_y = {MALE: 1, FEMALE: 0}

paths_tsv = " ".join(snakemake.input.tsv)

# Add interval block list for PAR regions if configured.
par_intervals = snakemake.config["step_config"]["helper_gcnv_model_targeted"]["gcnv"].get(
"path_par_intervals"
)
if par_intervals:
par_args = f"-XL {par_intervals}"
else:
par_args = ""

shell(
r"""
export TMPDIR=$(mktemp -d)
Expand All @@ -26,10 +56,39 @@
gatk DetermineGermlineContigPloidy \
-L {snakemake.input.interval_list} \
{par_args} \
--interval-merging-rule OVERLAPPING_ONLY \
$(for tsv in {paths_tsv}; do echo -I $tsv; done) \
--contig-ploidy-priors $PRIORS \
--output $(dirname {snakemake.output}) \
--output-prefix ploidy
"""
)

ploidy_calls = out_path / "ploidy-calls"
for sample_dir in ploidy_calls.glob("SAMPLE_*"):
path_name = sample_dir / "sample_name.txt"
with path_name.open("rt") as inputf:
sample_name = inputf.read().strip()
sample_sex = sex_map[sample_name]
path_call = sample_dir / "contig_ploidy.tsv"
with path_call.open("rt") as inputf:
call_lines = inputf.readlines()
for i in range(len(call_lines)):
line = call_lines[i]
arr = line.split("\t")
chrom = arr[0].lower()
if "x" in chrom or "y" in chrom:
ploidy = int(arr[1])
if "x" in arr[0].lower():
if ploidy != ploidy_x[sample_sex]:
arr[1] = str(ploidy_x[sample_sex])
arr[2] = f"42.000{ploidy}" # magic marker
elif "y" in arr[0].lower():
if ploidy != ploidy_y[sample_sex]:
arr[1] = str(ploidy_y[sample_sex])
arr[2] = f"42.000{ploidy}" # magic marker
call_lines[i] = "\t".join(arr)
with path_call.open("wt") as outputf:
for line in call_lines:
print(line.strip(), file=outputf)
58 changes: 57 additions & 1 deletion snappy_wrappers/wrappers/gcnv/contig_ploidy_case_mode/wrapper.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,37 @@
# -*- coding: utf-8 -*-
import pathlib

from snakemake.shell import shell

# We will first call the GATK tool. However, there are issues with the reliability of the contig
# ploidy calls. This causes an exception in the JointGermlineCNVSegmentation further downstream.
# We thus rewrite the genotype calls based on the ones from the PED files as a workaround.
#
# cf. https://github.com/broadinstitute/gatk/issues/8164

out_path = pathlib.Path(snakemake.output.done).parent

MALE = "male"
FEMALE = "female"

sex_map = {}
for ped_path in snakemake.input.ped:
with open(ped_path, "rt") as inputf:
for line in inputf:
arr = line.strip().split("\t")
sex_map[arr[1]] = {"1": MALE, "2": FEMALE}[arr[4]]

ploidy_x = {MALE: 1, FEMALE: 2}
ploidy_y = {MALE: 1, FEMALE: 0}

paths_tsv = " ".join(snakemake.input.tsv)

## Add interval block list for PAR regions if configured.
# par_intervals = snakemake.config["step_config"]["helper_gcnv_model_targeted"].get("path_par_intervals")
# if par_intervals:
# par_args = f"-XL {par_intervals}"
# else:
# par_args = ""

shell(
r"""
export THEANO_FLAGS="base_compiledir=$TMPDIR/theano_compile_dir"
Expand All @@ -17,3 +45,31 @@
--output-prefix ploidy
"""
)

ploidy_calls = out_path / "ploidy-calls"
for sample_dir in ploidy_calls.glob("SAMPLE_*"):
path_name = sample_dir / "sample_name.txt"
with path_name.open("rt") as inputf:
sample_name = inputf.read().strip()
sample_sex = sex_map[sample_name]
path_call = sample_dir / "contig_ploidy.tsv"
with path_call.open("rt") as inputf:
call_lines = inputf.readlines()
for i in range(len(call_lines)):
line = call_lines[i]
arr = line.split("\t")
chrom = arr[0].lower()
if "x" in chrom or "y" in chrom:
ploidy = int(arr[1])
if "x" in arr[0].lower():
if ploidy != ploidy_x[sample_sex]:
arr[1] = str(ploidy_x[sample_sex])
arr[2] = f"42.000{ploidy}" # magic marker
elif "y" in arr[0].lower():
if ploidy != ploidy_y[sample_sex]:
arr[1] = str(ploidy_y[sample_sex])
arr[2] = f"42.000{ploidy}" # magic marker
call_lines[i] = "\t".join(arr)
with path_call.open("wt") as outputf:
for line in call_lines:
print(line.strip(), file=outputf)
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
target_interval_bed = item["path"]
break
else: # of for, did not break out
raise Exception("Found no target intervals for %s" % item["name"])
raise Exception("Found no target intervals for %s" % snakemake.wildcards.library_kit)

shell(
r"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ def test_gcnv_step_part_get_resource_usage(sv_calling_targeted_workflow):
high_resource_actions = (
"call_cnvs",
"post_germline_calls",
"joint_germline_cnv_segmentation",
)
all_actions = sv_calling_targeted_workflow.substep_getattr("gcnv", "actions")
default_actions = [action for action in all_actions if action not in high_resource_actions]
Expand Down

0 comments on commit 14d86e3

Please sign in to comment.