Commit 4874074

feat: feature-complete end-to-end run of the Becnel dataset on hs37d5 & GRCh38.d1.vd1 (#460)

Co-authored-by: Manuela Benary <manuela.benary@bihealth.de>
ericblanc20 and Manuela Benary authored Oct 31, 2023
1 parent e394e8c commit 4874074
Showing 65 changed files with 1,009 additions and 525 deletions.
1 change: 1 addition & 0 deletions requirements/test.txt
@@ -5,6 +5,7 @@ pytest
coverage
pytest-cov
pytest-mock
pytest-subprocess

# Fake file system for testing
pyfakefs
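The newly added `pytest-subprocess` package lets tests fake external commands instead of running them. A minimal sketch of its `fp` fixture, assuming a test that exercises code shelling out to `samtools` (the command and output below are illustrative, not taken from this commit):

import subprocess

def test_samtools_version(fp):
    # Register the faked command before the code under test invokes it.
    fp.register(["samtools", "--version"], stdout="samtools 1.17")
    result = subprocess.run(["samtools", "--version"], capture_output=True, text=True)
    assert "samtools 1.17" in result.stdout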
50 changes: 31 additions & 19 deletions snappy_pipeline/workflows/cbioportal_export/__init__.py
@@ -77,6 +77,7 @@
cbioportal_export:
# Required for RNA expression
path_ngs_mapping: "" # When missing, no expression data is uploaded to cBioPortal
mapping_tool: "bwa"
expression_tool: "star"
# Required for somatic variants
path_somatic_variant: ../somatic_variant_filtration # REQUIRED (before or after filtration)
@@ -334,7 +335,7 @@ def __init__(self, parent):
name_pattern = "{mapper}.{caller}.{annotator}.{{library_name}}"
tpl = os.path.join("work/maf", name_pattern, "out", name_pattern + "{ext}")
self.input_tpl = tpl.format(
mapper="bwa",
mapper=self.config["mapping_tool"],
caller=self.config["somatic_variant_calling_tool"],
annotator=self.config["somatic_variant_annotation_tool"],
filter_set=self.config["filter_set"],
@@ -358,8 +359,12 @@ def get_input_files(self, action):
# Validate action
self._validate_action(action)
name_pattern = "{mapper}.{caller}.{tumor_library}"
yield "cns", os.path.join(
self.config["path_copy_number"], "output", name_pattern, "out", name_pattern + ".cns"
yield "DNAcopy", os.path.join(
self.config["path_copy_number"],
"output",
name_pattern,
"out",
name_pattern + "_dnacopy.seg",
)

@dictify
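The copy-number input switches from CNVkit's `.cns` files to `_dnacopy.seg` segmentation tables. A sketch of what consumers can expect, assuming the files follow the usual DNAcopy/SEG layout (the file name below is hypothetical):

import pandas as pd

# SEG files are tab-separated; DNAcopy conventionally writes the columns
# ID, chrom, loc.start, loc.end, num.mark, seg.mean (one row per segment).
seg = pd.read_csv("bwa.cnvkit.P001-T1-DNA1-WES1_dnacopy.seg", sep="\t")
print(seg.head())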
@@ -390,10 +395,8 @@ def get_args(self, action):
# Validate action
self._validate_action(action)
return {
"features": self.parent.w_config["step_config"]["ngs_mapping"][
self.config["expression_tool"]
]["path_features"],
"pipeline_id": "ENSEMBL",
"features": self.parent.w_config["static_data_config"]["features"]["path"],
}

def get_resource_usage(self, action):
@@ -427,7 +430,9 @@ class cbioportalCnaFilesStepPart(cbioportalExportStepPart):

def __init__(self, parent):
super().__init__(parent)
name_pattern = "bwa." + self.config["copy_number_tool"] + ".{library_name}"
name_pattern = (
self.config["mapping_tool"] + "." + self.config["copy_number_tool"] + ".{library_name}"
)
self.input_tpl = os.path.join("work/cna", name_pattern, "out", name_pattern + ".cna")

def get_args(self, action):
@@ -482,9 +487,15 @@ class cbioportalSegmentStepPart(cbioportalExportStepPart):

def __init__(self, parent):
super().__init__(parent)
name_pattern = "bwa." + self.config["copy_number_tool"] + ".{library_name}"
name_pattern = (
self.config["mapping_tool"] + "." + self.config["copy_number_tool"] + ".{library_name}"
)
self.input_tpl = os.path.join(
self.config["path_copy_number"], "output", name_pattern, "out", name_pattern + ".cns"
self.config["path_copy_number"],
"output",
name_pattern,
"out",
name_pattern + "_dnacopy.seg",
)

def get_resource_usage(self, action):
@@ -537,9 +548,7 @@ def get_args(self, action):
"action_type": "expression",
"extra_args": {
"pipeline_id": "ENSEMBL",
"tx_obj": self.parent.w_config["step_config"]["ngs_mapping"][
self.config["expression_tool"]
]["path_features"],
"tx_obj": self.parent.w_config["static_data_config"]["features"]["path"],
},
}
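Both this hunk and the `features` lookup above now take the gene model GTF from the global `static_data_config` section instead of a per-tool `path_features` option under `ngs_mapping`. A toy illustration of the new lookup (the GTF path is made up):

w_config = {
    "static_data_config": {"features": {"path": "/static_data/GRCh38/gencode.v33.gtf"}},
}
# Every step needing feature annotations reads the same shared GTF:
features = w_config["static_data_config"]["features"]["path"]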

@@ -813,13 +822,16 @@ def get_result_files(self):
def check_config(self):
"""Check config attributes for presence"""
msg = []
if self.config["path_somatic_variant"] and (
not self.config["somatic_variant_calling_tool"]
or not self.config["somatic_variant_annotation_tool"]
):
msg += [
"Somatic variant calling tool and somatic variant annotation tool must be defined"
]
if self.config["path_somatic_variant"]:
if not self.config["mapping_tool"]:
msg += ["DNA mapping tool must be defined"]
if (
not self.config["somatic_variant_calling_tool"]
or not self.config["somatic_variant_annotation_tool"]
):
msg += [
"Somatic variant calling tool and somatic variant annotation tool must be defined"
]
if self.config["path_copy_number"] and not self.config["copy_number_tool"]:
msg += [
"Somatic copy number calling tool must be defined when CNV results are available"
@@ -75,7 +75,7 @@ rule homologous_recombination_deficiency_scarHRD_install:
log:
**wf.get_log_file("scarHRD", "install"),
wrapper:
wf.wrapper_path("r")
wf.wrapper_path("scarHRD/install")


rule homologous_recombination_deficiency_scarHRD_run:
60 changes: 30 additions & 30 deletions snappy_pipeline/workflows/ngs_mapping/__init__.py
@@ -171,13 +171,18 @@
This removes the need to include gene models into the generation of indices, so that the user can select the
gene models (either from ENSEMBL or GENCODE, for example).
When the configuration option `path_features` is set, the step will output a table of expression counts
for all genes, in `output/star.{library_name}/out/star.{library_name}.GeneCounts.tab`.
The computation of gene counts relies on the features defined in the `static_data_config` section of the
configuration file. The other steps relying on feature annotations should use this too.
`STAR` outputs the counts for unstranded, forward and reverse strand protocols. When the user doesn't supply the
protocol code (0 for unstranded, 1 for forward & 2 for reverse), the step runs `infer_experiment`
(from the `rseqc` library) to infer the protocol strandedness. In both cases, the step generates a copy of
the `STAR` output containing only the relevant column. This final version of the gene counts is found in
`output/star.{library_name}/out/star.{library_name}.GeneCounts.tab`.
If the configuration option `transcriptome` is set to `true`, the step will output a bam file of reads
mapped to the transcriptome (`output/star.{library_name}/out/star.{library_name}.toTranscriptome.bam`).
`STAR` will rely on the `path_features` configuration option, or on the gene models embedded in
the indices to generate the mappings. If both are absent, the step will fail.
As with gene counts, `STAR` will rely on the static data configuration to generate the mappings.
Note that the mappings to the transcriptome will not be indexed using `samtools index`, because of the
absence of positional mappings.
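For reference, STAR's gene counts table carries one counts column per protocol (unstranded, forward, reverse). A minimal sketch of the column selection described above, assuming the decision file stores the protocol code under a `decision` key (the key name and function are illustrative, not the actual wrapper code):

import csv
import json

def select_counts(gene_counts_in: str, decision_json: str, gene_counts_out: str) -> None:
    # Protocol code: 0 = unstranded, 1 = forward, 2 = reverse (as described above).
    with open(decision_json) as f:
        protocol = json.load(f)["decision"]  # key name is an assumption
    column = {0: 1, 1: 2, 2: 3}[protocol]  # STAR column holding the relevant counts
    with open(gene_counts_in) as fin, open(gene_counts_out, "w", newline="") as fout:
        writer = csv.writer(fout, delimiter="\t")
        for row in csv.reader(fin, delimiter="\t"):
            writer.writerow([row[0], row[column]])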
@@ -387,7 +392,6 @@
# Configuration for STAR
star:
path_index: REQUIRED # Required if listed in ngs_mapping.tools.rna; otherwise, can be removed.
path_features: "" # Required for computing gene counts
num_threads_align: 16
num_threads_trimming: 8
num_threads_bam_view: 4
@@ -801,11 +805,10 @@ def check_config(self):
def _get_output_files_run_work(self):
"""Override base class' function to make Snakemake aware of extra files for STAR."""
output_files = super()._get_output_files_run_work()
if self.config[self.name]["path_features"]:
output_files["gene_counts"] = self.base_path_out.format(
mapper=self.name, ext=".GeneCounts.tab"
)
output_files["gene_counts_md5"] = output_files["gene_counts"] + ".md5"
output_files["gene_counts"] = self.base_path_out.format(
mapper=self.name, ext=".GeneCounts.tab"
)
output_files["gene_counts_md5"] = output_files["gene_counts"] + ".md5"
if self.config[self.name]["transcriptome"]:
output_files["transcriptome"] = self.base_path_out.format(
mapper=self.name, ext=".toTranscriptome.bam"
@@ -900,27 +903,24 @@ def get_output_files(self, action):

def get_result_files(self):
for mapper in self.config["tools"]["rna"]:
if self.config[mapper]["path_features"]:
tpl_out = (
"output/{mapper}.{library_name}/out/{mapper}.{library_name}.GeneCounts.tab"
)
tpl_strandedness = "output/{mapper}.{library_name}/strandedness/{mapper}.{library_name}.decision.json"
tpl_log = (
"output/{mapper}.{library_name}/log/{mapper}.{library_name}.strandedness.{ext}"
)
for library_name, extra_info in self.parent.ngs_library_to_extra_infos.items():
if extra_info["extractionType"] == "RNA":
yield tpl_out.format(mapper=mapper, library_name=library_name)
yield tpl_out.format(mapper=mapper, library_name=library_name) + ".md5"
yield tpl_strandedness.format(mapper=mapper, library_name=library_name)
yield tpl_strandedness.format(
mapper=mapper, library_name=library_name
tpl_out = "output/{mapper}.{library_name}/out/{mapper}.{library_name}.GeneCounts.tab"
tpl_strandedness = (
"output/{mapper}.{library_name}/strandedness/{mapper}.{library_name}.decision.json"
)
tpl_log = (
"output/{mapper}.{library_name}/log/{mapper}.{library_name}.strandedness.{ext}"
)
for library_name, extra_info in self.parent.ngs_library_to_extra_infos.items():
if extra_info["extractionType"] == "RNA":
yield tpl_out.format(mapper=mapper, library_name=library_name)
yield tpl_out.format(mapper=mapper, library_name=library_name) + ".md5"
yield tpl_strandedness.format(mapper=mapper, library_name=library_name)
yield tpl_strandedness.format(mapper=mapper, library_name=library_name) + ".md5"
for ext in ("log", "conda_info.txt", "conda_list.txt"):
yield tpl_log.format(mapper=mapper, library_name=library_name, ext=ext)
yield tpl_log.format(
mapper=mapper, library_name=library_name, ext=ext
) + ".md5"
for ext in ("log", "conda_info.txt", "conda_list.txt"):
yield tpl_log.format(mapper=mapper, library_name=library_name, ext=ext)
yield tpl_log.format(
mapper=mapper, library_name=library_name, ext=ext
) + ".md5"

@dictify
def get_log_file(self, action):
11 changes: 6 additions & 5 deletions snappy_pipeline/workflows/panel_of_normals/__init__.py
@@ -162,6 +162,7 @@
purecn:
path_normals_list: "" # Optional file listing libraries to include in panel
path_bait_regions: REQUIRED # Bed files of enrichment kit sequences (MergedProbes for Agilent SureSelect), recommended by PureCN author
path_genomicsDB: REQUIRED # Mutect2 genomicsDB created during panel_of_normals
genome_name: "unknown" # Must be one from hg18, hg19, hg38, mm9, mm10, rn4, rn5, rn6, canFam3
enrichment_kit_name: "unknown" # For filename only...
mappability: "" # GRCh38: /fast/work/groups/cubi/projects/biotools/static_data/app_support/PureCN/hg38/mappability.bw
@@ -259,19 +260,19 @@ def get_input_files(self, action):
@dictify
def _get_input_files_coverage(self, wildcards):
yield "container", "work/containers/out/purecn.simg"
tpl = "output/{mapper}.{normal_library}/out/{mapper}.{normal_library}.bam"
yield "intervals", "work/purecn/out/{}_{}.list".format(
self.config["purecn"]["enrichment_kit_name"],
self.config["purecn"]["genome_name"],
)
tpl = "output/{mapper}.{library_name}/out/{mapper}.{library_name}.bam"
yield "bam", self.ngs_mapping(tpl.format(**wildcards))

@dictify
def _get_input_files_create(self, wildcards):
yield "container", "work/containers/out/purecn.simg"
tpl = "work/{mapper}.purecn/out/{mapper}.{normal_library}_coverage_loess.txt.gz"
tpl = "work/{mapper}.purecn/out/{mapper}.purecn.{library_name}_coverage_loess.txt.gz"
yield "normals", [
tpl.format(mapper=wildcards.mapper, normal_library=lib) for lib in self.normal_libraries
tpl.format(mapper=wildcards.mapper, library_name=lib) for lib in self.normal_libraries
]

def get_output_files(self, action):
@@ -294,7 +295,7 @@ def get_output_files(self, action):
}
if action == "coverage":
return {
"coverage": "work/{mapper}.purecn/out/{mapper}.{normal_library,.+-DNA[0-9]+-WES[0-9]+}_coverage_loess.txt.gz"
"coverage": "work/{mapper}.purecn/out/{mapper}.purecn.{library_name,.+-DNA[0-9]+-WES[0-9]+}_coverage_loess.txt.gz"
}
if action == "create_panel":
return {
@@ -314,7 +315,7 @@ def get_log_file(self, action):
self.config["purecn"]["enrichment_kit_name"],
self.config["purecn"]["genome_name"],
),
"coverage": "work/{mapper}.purecn/log/{mapper}.{normal_library,.+-DNA[0-9]+-WES[0-9]+}",
"coverage": "work/{mapper}.purecn/log/{mapper}.purecn.{library_name,.+-DNA[0-9]+-WES[0-9]+}",
"create_panel": "work/{mapper}.purecn/log/{mapper}.purecn.panel_of_normals",
}
assert action in self.actions
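The `library_name` wildcard in the coverage patterns is constrained by the regex `.+-DNA[0-9]+-WES[0-9]+`, so only WES DNA libraries produce coverage files. A quick check of what the constraint accepts (the library names are made up):

import re

constraint = re.compile(r".+-DNA[0-9]+-WES[0-9]+")
assert constraint.fullmatch("P001-N1-DNA1-WES1")               # WES normal library: accepted
assert constraint.fullmatch("P001-T1-RNA1-mRNA_seq1") is None  # RNA library: rejected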
3 changes: 1 addition & 2 deletions snappy_pipeline/workflows/somatic_cnv_checking/__init__.py
@@ -212,8 +212,7 @@ def input_function(wildcards):
base_path = "output/{mapper}.{caller}.{library_name}/out/{mapper}.{caller}.{library_name}".format(
**wildcards
)
filenames["cnv"] = cnv_calling(base_path + ".bed.gz")
filenames["cnv_tbi"] = filenames["cnv"] + ".tbi"
filenames["cnv"] = cnv_calling(base_path + "_dnacopy.seg")
return filenames

return input_function
@@ -90,7 +90,6 @@
path_dataset_directory: REQUIRED
arriba:
path_index: REQUIRED # REQUIRED STAR path index (preferably 2.7.10 or later)
features: REQUIRED # REQUIRED Gene features (for ex. ENCODE or ENSEMBL) in gtf format
blacklist: "" # optional (provided in the arriba distribution, see /fast/work/groups/cubi/projects/biotools/static_data/app_support/arriba/v2.3.0)
known_fusions: "" # optional
tags: "" # optional (can be set to the same path as known_fusions)
@@ -423,10 +422,6 @@ def check_config(self):
config_keys=("step_config", "somatic_gene_fusion_calling", "arriba", "path_index"),
msg="Path to STAR indices is required",
)
self.parent.ensure_w_config(
config_keys=("step_config", "somatic_gene_fusion_calling", "arriba", "features"),
msg="Path to genomic features gtf file is required",
)

# Check that the path to the STAR index is valid.
for fn in ("Genome", "SA", "SAindex"):
@@ -244,6 +244,7 @@ rule somatic_targeted_seq_cnv_calling_sequenza_install:
params:
packages=[
{"name": "aroneklund/copynumber", "repo": "github"},
{"name": "sequenzatools/sequenza", "repo": "bitbucket"},
],
threads: wf.get_resource("sequenza", "install", "threads")
resources:
@@ -254,7 +255,7 @@ rule somatic_targeted_seq_cnv_calling_sequenza_install:
log:
**wf.get_log_file("sequenza", "install"),
wrapper:
wf.wrapper_path("r")
wf.wrapper_path("sequenza/install")


rule somatic_targeted_seq_cnv_calling_sequenza_gcreference:
@@ -272,6 +273,25 @@ rule somatic_targeted_seq_cnv_calling_sequenza_gcreference:
wf.wrapper_path("sequenza/gcreference")


rule somatic_targeted_seq_cnv_calling_sequenza_coverage:
input:
unpack(wf.get_input_files("sequenza", "coverage")),
output:
**wf.get_output_files("sequenza", "coverage"),
params:
sample_id=wf.get_params("sequenza", "coverage"),
threads: wf.get_resource("sequenza", "coverage", "threads")
resources:
time=wf.get_resource("sequenza", "coverage", "time"),
memory=wf.get_resource("sequenza", "coverage", "memory"),
partition=wf.get_resource("sequenza", "coverage", "partition"),
tmpdir=wf.get_resource("sequenza", "coverage", "tmpdir"),
log:
**wf.get_log_file("sequenza", "coverage"),
wrapper:
wf.wrapper_path("sequenza/coverage")


rule somatic_targeted_seq_cnv_calling_sequenza_run:
input:
unpack(wf.get_input_files("sequenza", "run")),
@@ -289,25 +309,6 @@ rule somatic_targeted_seq_cnv_calling_sequenza_run:
wf.wrapper_path("sequenza/run")


rule somatic_targeted_seq_cnv_calling_sequenza_report:
input:
**wf.get_input_files("sequenza", "report"),
output:
**wf.get_output_files("sequenza", "report"),
params:
sample_id=wf.get_params("sequenza", "report"),
threads: wf.get_resource("sequenza", "report", "threads")
resources:
time=wf.get_resource("sequenza", "report", "time"),
memory=wf.get_resource("sequenza", "report", "memory"),
partition=wf.get_resource("sequenza", "report", "partition"),
tmpdir=wf.get_resource("sequenza", "report", "tmpdir"),
log:
**wf.get_log_file("sequenza", "report"),
wrapper:
wf.wrapper_path("sequenza/report")


# Run PureCN ------------------------------------------------------

