diff --git a/README.md b/README.md index 7af1cbfff..9cf115321 100644 --- a/README.md +++ b/README.md @@ -97,3 +97,9 @@ Moving GenomeAnalysisTK-3.8-1-0-gf15c1c3ef.tar.bz2 to /home/holtgrem_c/miniconda ``` You are now ready to run GATK v3 from this environment. + +## Development Notes + +Here, you can find the required layout for post-PR commit messages: + +- https://github.com/amannn/action-semantic-pull-request#configuration diff --git a/docs/index.rst b/docs/index.rst index 0d29098b4..dce630dd1 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -54,7 +54,6 @@ Project Info step/ngs_data_qc step/ngs_mapping step/ngs_sanity_checking - step/roh_calling step/somatic_gene_fusion_calling step/somatic_neoepitope_prediction step/somatic_ngs_sanity_checking diff --git a/docs/step/roh_calling.rst b/docs/step/roh_calling.rst deleted file mode 100644 index b3d03939e..000000000 --- a/docs/step/roh_calling.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. _step_roh_calling: - -==================== -Germline RoH Calling -==================== - -.. automodule:: snappy_pipeline.workflows.roh_calling - diff --git a/snappy_pipeline/apps/snappy_snake.py b/snappy_pipeline/apps/snappy_snake.py index a73bd5e8a..782b7d0fa 100644 --- a/snappy_pipeline/apps/snappy_snake.py +++ b/snappy_pipeline/apps/snappy_snake.py @@ -31,7 +31,6 @@ ngs_sanity_checking, panel_of_normals, repeat_expansion, - roh_calling, somatic_gene_fusion_calling, somatic_hla_loh_calling, somatic_msi_calling, @@ -100,7 +99,6 @@ "panel_of_normals": panel_of_normals, "repeat_analysis": repeat_expansion, "ngs_sanity_checking": ngs_sanity_checking, - "roh_calling": roh_calling, "somatic_gene_fusion_calling": somatic_gene_fusion_calling, "somatic_hla_loh_calling": somatic_hla_loh_calling, "somatic_msi_calling": somatic_msi_calling, diff --git a/snappy_pipeline/workflows/roh_calling/Snakefile b/snappy_pipeline/workflows/roh_calling/Snakefile deleted file mode 100644 index 926e6c098..000000000 --- a/snappy_pipeline/workflows/roh_calling/Snakefile +++ /dev/null @@ -1,70 +0,0 @@ -# -*- coding: utf-8 -*- -"""CUBI Pipeline roh_calling step Snakefile""" - -import os - -from snappy_pipeline import expand_ref -from snappy_pipeline.workflows.roh_calling import RohCallingWorkflow - -__author__ = "Manuel Holtgrewe " - - -# Configuration =============================================================== - - -configfile: "config.yaml" - - -# Expand "$ref" JSON pointers in configuration (also works for YAML) -config, lookup_paths, config_paths = expand_ref("config.yaml", config) - -# WorkflowImpl Object Setup =================================================== - -wf = RohCallingWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd()) - -# Rules ======================================================================= - - -localrules: - # Linking files from work/ to output/ should be done locally - roh_calling_link_out_run, - - -rule all: - input: - wf.get_result_files(), - - -# House-Keeping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -# Generic linking out --------------------------------------------------------- - - -rule roh_calling_link_out_run: - input: - wf.get_input_files("link_out", "run"), - output: - wf.get_output_files("link_out", "run"), - run: - shell(wf.get_shell_cmd("link_out", "run", wildcards)) - - -# ROH Calling ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -# Run bcftools roh ------------------------------------------------------------ - - -rule roh_calling_bcftools_roh_run: - input: - unpack(wf.get_input_files("bcftools_roh", "run")), - output: - **wf.get_output_files("bcftools_roh", "run"), - threads: wf.get_resource("bcftools_roh", "run", "threads") - resources: - time=wf.get_resource("bcftools_roh", "run", "time"), - memory=wf.get_resource("bcftools_roh", "run", "memory"), - partition=wf.get_resource("bcftools_roh", "run", "partition"), - log: - wf.get_log_file("bcftools_roh", "run"), - wrapper: - wf.wrapper_path("bcftools_roh") diff --git a/snappy_pipeline/workflows/roh_calling/__init__.py b/snappy_pipeline/workflows/roh_calling/__init__.py deleted file mode 100644 index 6f4dced19..000000000 --- a/snappy_pipeline/workflows/roh_calling/__init__.py +++ /dev/null @@ -1,294 +0,0 @@ -# -*- coding: utf-8 -*- -"""Implementation of the ``roh_calling`` step - -The roh_calling step allows for calling runs of homozygosity from germline VCF files. - -Currently, it only employs ``bcftools roh`` with simple parameters but in the future more -involved approaches could be added. - -========== -Stability -========== - -The step itself is quite new. Except for default parameter changes in the ROH calling, no large -changes are expected. - -========== -Step Input -========== - -The ``roh_calling`` step uses Snakemake sub workflows for using the result of the -``variant_calling`` step. - -=========== -Step Output -=========== - -Run-of-homozygosity will be performed on the input VCF files, on all samples at the same time. -The only tool for this currently implemented is ``bcftools roh``. The result of this is one -gzip-compressed text files containing the assignment at the variant loci and the resulting -segmentation. - -- ``{mapper}.{var_caller}.{roh_caller}.{ngs_library_name}.txt.gz`` -- ``{mapper}.{var_caller}.{roh_caller}.{ngs_library_name}.txt.gz.tbi`` - -This file is then further processed to generate one (tabixed) BED file for each individual in the -pedigree listing the ROH variants. - -- ``{mapper}.{var_caller}.{roh_caller}.{ngs_library_name}.vcf.gz`` -- ``{mapper}.{var_caller}.{roh_caller}.{ngs_library_name}.vcf.gz.md5`` -- ``{mapper}.{var_caller}.{roh_caller}.{ngs_library_name}.vcf.gz.tbi`` -- ``{mapper}.{var_caller}.{roh_caller}.{ngs_library_name}.vcf.gz.tbi.md5`` - -For example, it could look like this: - -:: - - output/ - +-- bwa.gatk3_ug.bcftools_roh.P001-N1-DNA1-WGS1 - |   `-- out - │   +-- bwa.gatk3_ug.bcftools_roh.P001-N1-DNA1-WGS1.bed.gz - │   +-- bwa.gatk3_ug.bcftools_roh.P001-N1-DNA1-WGS1.bed.gz.md5 - │   +-- bwa.gatk3_ug.bcftools_roh.P001-N1-DNA1-WGS1.bed.gz.tbi - │   +-- bwa.gatk3_ug.bcftools_roh.P001-N1-DNA1-WGS1.bed.gz.tbi.md5 - │   +-- bwa.gatk3_ug.bcftools_roh.P001-N1-DNA1-WGS1.txt.gz - │   +-- bwa.gatk3_ug.bcftools_roh.P001-N1-DNA1-WGS1.txt.gz.md5 - │   +-- bwa.gatk3_ug.bcftools_roh.P002-N1-DNA1-WGS1.bed.gz - │   +-- bwa.gatk3_ug.bcftools_roh.P002-N1-DNA1-WGS1.bed.gz.md5 - │   +-- bwa.gatk3_ug.bcftools_roh.P002-N1-DNA1-WGS1.bed.gz.tbi - │   +-- bwa.gatk3_ug.bcftools_roh.P002-N1-DNA1-WGS1.bed.gz.tbi.md5 - │   +-- bwa.gatk3_ug.bcftools_roh.P003-N1-DNA1-WGS1.bed.gz - │   +-- bwa.gatk3_ug.bcftools_roh.P003-N1-DNA1-WGS1.bed.gz.md5 - │   +-- bwa.gatk3_ug.bcftools_roh.P003-N1-DNA1-WGS1.bed.gz.tbi - │   `-- bwa.gatk3_ug.bcftools_roh.P003-N1-DNA1-WGS1.bed.gz.tbi.md5 - [...] - -==================== -Global Configuration -==================== - -No global configuration is used. - -===================== -Default Configuration -===================== - -The default configuration is as follows. - -.. include:: DEFAULT_CONFIG_roh_calling.rst - -===================== -Available ROH Callers -===================== - -At the moment, only ``bcftools_roh`` is available as a caller. - -======= -Reports -======= - -No reports are generated. -""" - -from collections import OrderedDict -import os # noqa: F401 -import sys - -from biomedsheets.shortcuts import GermlineCaseSheet, is_not_background -from snakemake.io import expand - -from snappy_pipeline.base import UnsupportedActionException -from snappy_pipeline.utils import dictify, listify -from snappy_pipeline.workflows.abstract import ( - BaseStep, - BaseStepPart, - LinkOutStepPart, - ResourceUsage, -) -from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow -from snappy_pipeline.workflows.variant_calling import VariantCallingWorkflow - -__author__ = "Manuel Holtgrewe " - -#: Default configuration for the roh_calling step -DEFAULT_CONFIG = r""" -# Default configuration roh_calling -step_config: - roh_calling: - path_variant_calling: ../variant_calling - tools_ngs_mapping: - - bwa - tools_variant_calling: - - gatk3_hc - tools: [bcftools_roh] # REQUIRED, available: 'bcftools_roh'. - bcftools_roh: - path_targets: null - path_af_file: null - ignore_homref: false - skip_indels: false - rec_rate: 1e-8 -""" - - -#: Key => extension mapping for BED file. -BED_EXTENSIONS = { - "bed": ".bed.gz", - "vcf_tbi": ".bed.gz.tbi", - "bed_md5": ".bed.gz.md5", - "tbi_vcf_md5": ".bed.gz.tbi.md5", -} - - -class BcftoolsRohStepPart(BaseStepPart): - """Perform ROH calling using ``bcftools roh``.""" - - name = "bcftools_roh" - - actions = ("run",) - - def __init__(self, parent): - super().__init__(parent) - self.base_path_out = ( - "work/{{mapper}}.{name}.{{library_name}}/out/{{mapper}}.{name}.{{library_name}}{ext}" - ) - # Build shortcut from index library name to pedigree - self.donor_ngs_library_to_pedigree = OrderedDict() - for sheet in self.parent.shortcut_sheets: - self.donor_ngs_library_to_pedigree.update(sheet.donor_ngs_library_to_pedigree) - - def get_input_files(self, action): - """Return input function for bcftools_roh rule""" - assert action in self.actions - return getattr(self, "_get_input_files_{}".format(action)) - - @dictify - def _get_input_files_run(self, wildcards): - """Return path to input VCF file""" - tpl = ( - "output/{mapper}.{var_caller}.{index_ngs_library}/out/" - "{mapper}.{var_caller}.{index_ngs_library}" - ) - key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"} - variant_calling = self.parent.sub_workflows["variant_calling"] - for key, ext in key_ext.items(): - yield key, variant_calling(tpl.format(**wildcards) + ext) - - def get_output_files(self, action): - """Return output function for bcftools_roh rule""" - assert action in self.actions - return getattr(self, "_get_output_files_{}".format(action))() - - @dictify - def _get_output_files_run(self): - """Return output files for ROH calling""" - path = ( - "work/{mapper}.{var_caller}.bcftools_roh.{index_ngs_library}/out/" - "{mapper}.{var_caller}.bcftools_roh.{index_ngs_library}" - ) - yield "txt", path + ".regions.txt.gz" - yield "txt_md5", path + ".regions.txt.gz.md5" - - def get_log_file(self, action): - assert action in self.actions - return ( - "work/{mapper}.{var_caller}.bcftools_roh.{index_ngs_library}/" - "log/snakemake.bcftools_roh.log" - ) - - def get_resource_usage(self, action): - """Get Resource Usage - - :param action: Action (i.e., step) in the workflow, example: 'run'. - :type action: str - - :return: Returns ResourceUsage for step. - - :raises UnsupportedActionException: if action not in class defined list of valid actions. - """ - if action not in self.actions: - actions_str = ", ".join(self.actions) - error_message = f"Action '{action}' is not supported. Valid options: {actions_str}" - raise UnsupportedActionException(error_message) - return ResourceUsage( - threads=1, - time="00:10:00", - memory="4G", - ) - - -class RohCallingWorkflow(BaseStep): - """Perform run-of-homozygosity calling workflow""" - - name = "roh_calling" - sheet_shortcut_class = GermlineCaseSheet - - @classmethod - def default_config_yaml(cls): - """Return default config YAML, to be overwritten by project-specific one""" - return DEFAULT_CONFIG - - def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): - super().__init__( - workflow, - config, - config_lookup_paths, - config_paths, - workdir, - (VariantCallingWorkflow, NgsMappingWorkflow), - ) - # Register sub step classes so the sub steps are available - self.register_sub_step_classes((BcftoolsRohStepPart, LinkOutStepPart)) - # Register sub workflows - self.register_sub_workflow("variant_calling", self.config["path_variant_calling"]) - # Copy over "tools" setting from variant_calling/ngs_mapping if not set here - if not self.config["tools_ngs_mapping"]: - self.config["tools_ngs_mapping"] = self.w_config["step_config"]["ngs_mapping"]["tools"][ - "dna" - ] - if not self.config["tools_variant_calling"]: - self.config["tools_variant_calling"] = self.w_config["step_config"]["variant_calling"][ - "tools" - ] - - @listify - def get_result_files(self): - """Return list of result files for the NGS mapping workflow - - We will process all primary DNA libraries and perform joint calling within pedigrees - """ - yield from self._yield_result_files( - ( - "output/{mapper}.{caller}.bcftools_roh.{index_library}/out/" - "{mapper}.{caller}.bcftools_roh.{donor_library}{ext}" - ), - mapper=self.config["tools_ngs_mapping"], - caller=self.config["tools_variant_calling"], - ) - - def _yield_result_files(self, tpl, **kwargs): - """Build output paths from path template and extension list""" - for sheet in filter(is_not_background, self.shortcut_sheets): - for pedigree in sheet.cohort.pedigrees: - if not pedigree.index: # pragma: no cover - msg = "INFO: pedigree without index (names: {})" - print( - msg.format(list(sorted(d.name for d in pedigree.donors))), file=sys.stderr - ) - continue - elif not pedigree.index.dna_ngs_library: # pragma: no cover - msg = "INFO: pedigree index without DNA NGS library (names: {})" - print( - msg.format( # pragma: no cover - list(sorted(d.name for d in pedigree.donors)) - ), - file=sys.stderr, - ) - continue # pragma: no cover - # Text file only for the index - yield from expand( - tpl, - ext=[".regions.txt.gz", ".regions.txt.gz.md5"], - donor_library=[pedigree.index.dna_ngs_library.name], - index_library=[pedigree.index.dna_ngs_library.name], - **kwargs, - ) diff --git a/snappy_pipeline/workflows/variant_calling/Snakefile b/snappy_pipeline/workflows/variant_calling/Snakefile index 5f5304f62..d5f5ee265 100644 --- a/snappy_pipeline/workflows/variant_calling/Snakefile +++ b/snappy_pipeline/workflows/variant_calling/Snakefile @@ -247,3 +247,22 @@ rule variant_calling_baf_file_generation: **wf.get_log_file("baf_file_generation", "run"), wrapper: wf.wrapper_path("baf_file_generation") + + +# Run bcftools roh ------------------------------------------------------------ + + +rule variant_calling_bcftools_roh: + input: + **wf.get_input_files("bcftools_roh", "run"), + output: + **wf.get_output_files("bcftools_roh", "run"), + threads: wf.get_resource("bcftools_roh", "run", "threads") + resources: + time=wf.get_resource("bcftools_roh", "run", "time"), + memory=wf.get_resource("bcftools_roh", "run", "memory"), + partition=wf.get_resource("bcftools_roh", "run", "partition"), + log: + **wf.get_log_file("bcftools_roh", "run"), + wrapper: + wf.wrapper_path("bcftools_roh") diff --git a/snappy_pipeline/workflows/variant_calling/__init__.py b/snappy_pipeline/workflows/variant_calling/__init__.py index b372cb127..5c038ced6 100644 --- a/snappy_pipeline/workflows/variant_calling/__init__.py +++ b/snappy_pipeline/workflows/variant_calling/__init__.py @@ -148,6 +148,10 @@ report/baf/{mapper}.{caller}.{index_library}.{donor_library}.bw +``roh_calling`` + + Perform run-of-homozygosity calling with ``bcftools roh``. + ========= Log Files ========= @@ -302,6 +306,13 @@ jannovar_stats: enabled: true path_ser: REQUIRED # REQUIRED + bcftools_roh: + enabled: true + path_targets: null # REQUIRED; optional + path_af_file: null # REQUIRED + ignore_homref: false + skip_indels: false + rec_rate: 1e-8 # Variant calling tools and their configuration # @@ -730,28 +741,70 @@ def get_resource_usage(self, action: str) -> ResourceUsage: ) +class BcftoolsRohStepPart(GetResultFilesMixin, ReportGetLogFileMixin, BaseStepPart): + """ROH calling with `bcftools roh`.""" + + name = "bcftools_roh" + actions = ("run",) + report_per_donor = False + + def get_input_files(self, action: str) -> SnakemakeDict: + """Return required input files""" + self._validate_action(action) + return getattr(self, f"_get_input_files_{action}")() + + @dictify + def _get_input_files_run(self) -> SnakemakeDictItemsGenerator: + yield "vcf", ( + "output/{mapper}.{var_caller}.{index_library_name}/out/" + "{mapper}.{var_caller}.{index_library_name}.vcf.gz" + ) + + def get_output_files(self, action: str) -> SnakemakeDict: + """Return step part output files""" + self._validate_action(action) + return getattr(self, f"_get_output_files_{action}")() + + @dictify + def _get_output_files_run(self) -> SnakemakeDictItemsGenerator: + ext_names = {"txt": ".txt", "txt_md5": ".txt.md5"} + base_path = ( + "work/{mapper}.{var_caller}.{index_library_name}/report/roh/" + "{mapper}.{var_caller}.{index_library_name}" + ) + work_files = {key: f"{base_path}{ext}" for key, ext in ext_names.items()} + yield from work_files.items() + yield "output_links", [ + re.sub(r"^work/", "output/", work_path) + for work_path in chain(work_files.values(), self.get_log_file("run").values()) + ] + + def get_resource_usage(self, action: str) -> ResourceUsage: + """Get Resource Usage + + :param action: Action (i.e., step) in the workflow, example: 'run'. + :type action: str + + :return: Returns ResourceUsage for step. + """ + self._validate_action(action) + return ResourceUsage( + threads=1, + time="00:10:00", + memory="4G", + ) + + class JannovarStatisticsStepPart(GetResultFilesMixin, ReportGetLogFileMixin, BaseStepPart): """Base class for VCF statistics computation with "jannovar statistics" Statistics are computed overall and per-sample """ - #: Step name name = "jannovar_stats" - - #: Class available actions actions = ("run",) - - #: Whether we generate per-donor files. report_per_donor = False - def __init__(self, parent): - super().__init__(parent) - self.base_path_out = ( - "work/{mapper}.{var_caller}.{index_library_name}/report/jannovar_stats/" - "{mapper}.{var_caller}.{index_library_name}" - ) - @dictify def get_input_files(self, action) -> SnakemakeDictItemsGenerator: """Return path to input files""" @@ -771,10 +824,12 @@ def _get_output_files_run(self) -> SnakemakeDictItemsGenerator: """Return output files that all germline variant calling sub steps must return (VCF + TBI file) """ + base_path = ( + "work/{mapper}.{var_caller}.{index_library_name}/report/jannovar_stats/" + "{mapper}.{var_caller}.{index_library_name}" + ) ext_names = {"report": ".txt", "report_md5": ".txt.md5"} - work_files = {} - for key, ext in ext_names.items(): - work_files[key] = self.base_path_out + ext + work_files = {key: f"{base_path}{ext}" for key, ext in ext_names.items()} yield from work_files.items() yield "output_links", [ re.sub(r"^work/", "output/", work_path) @@ -876,6 +931,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) Gatk4HaplotypeCallerJointStepPart, Gatk4HaplotypeCallerGvcfStepPart, BcftoolsStatsStepPart, + BcftoolsRohStepPart, JannovarStatisticsStepPart, BafFileGenerationStepPart, ) @@ -887,7 +943,7 @@ def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir) def get_result_files(self) -> SnakemakeListItemsGenerator: for tool in self.config["tools"]: yield from self.sub_steps[tool].get_result_files() - for name in ("baf_file_generation", "bcftools_stats", "jannovar_stats"): + for name in ("baf_file_generation", "bcftools_stats", "jannovar_stats", "bcftools_roh"): if self.w_config["step_config"]["variant_calling"][name]["enabled"]: yield from self.sub_steps[name].get_result_files() diff --git a/snappy_wrappers/wrappers/bcftools_roh/wrapper.py b/snappy_wrappers/wrappers/bcftools_roh/wrapper.py index 92e7c07b4..76e1bf2fa 100644 --- a/snappy_wrappers/wrappers/bcftools_roh/wrapper.py +++ b/snappy_wrappers/wrappers/bcftools_roh/wrapper.py @@ -1,49 +1,75 @@ -# -*- coding: utf-8 -*- -"""CUBI+Snakemake wrapper code for bcftools roh -""" - -from snakemake.shell import shell +from snakemake import shell __author__ = "Manuel Holtgrewe " +DEF_HELPER_FUNCS = r""" +compute-md5() +{ + if [[ $# -ne 2 ]]; then + >&2 echo "Invalid number of arguments: $#" + exit 1 + fi + md5sum $1 \ + | awk '{ gsub(/.*\//, "", $2); print; }' \ + > $2 +} +""" + shell( r""" set -x -export TMPDIR=$(mktemp -d) -trap "rm -rf $TMPDIR" EXIT +# Write files for reproducibility ----------------------------------------------------------------- + +{DEF_HELPER_FUNCS} + +# Write out information about conda and save a copy of the wrapper with picked variables +# as well as the environment.yaml file. +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +compute-md5 {snakemake.log.conda_list} {snakemake.log.conda_list_md5} +compute-md5 {snakemake.log.conda_info} {snakemake.log.conda_info_md5} +cp {__real_file__} {snakemake.log.wrapper} +compute-md5 {snakemake.log.wrapper} {snakemake.log.wrapper_md5} +cp $(dirname {__file__})/environment.yaml {snakemake.log.env_yaml} +compute-md5 {snakemake.log.env_yaml} {snakemake.log.env_yaml_md5} -# Also pipe stderr to log file -if [[ -n "{snakemake.log}" ]]; then +# Also pipe stderr to log file -------------------------------------------------------------------- + +if [[ -n "{snakemake.log.log}" ]]; then if [[ "$(set +e; tty; set -e)" != "" ]]; then - rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log}) - exec 2> >(tee -a "{snakemake.log}" >&2) + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec 2> >(tee -a "{snakemake.log.log}" >&2) else - rm -f "{snakemake.log}" && mkdir -p $(dirname {snakemake.log}) - echo "No tty, logging disabled" >"{snakemake.log}" + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" fi fi +# Create auto-cleaned temporary directory +export TMPDIR=$(mktemp -d) +trap "rm -rf $TMPDIR" EXIT + # Perform ROH Calling ----------------------------------------------------------------------------- out={snakemake.output.txt} raw_out=${{out%.regions.txt.gz}}.raw.txt.gz bcftools roh \ - $(if [[ "{snakemake.config[step_config][roh_calling][bcftools_roh][path_targets]}" != "None" ]]; then - echo --regions-file "{snakemake.config[step_config][roh_calling][bcftools_roh][path_targets]}" + $(if [[ "{snakemake.config[step_config][variant_calling][bcftools_roh][path_targets]}" != "None" ]]; then + echo --regions-file "{snakemake.config[step_config][variant_calling][bcftools_roh][path_targets]}" fi) \ - $(if [[ "{snakemake.config[step_config][roh_calling][bcftools_roh][path_af_file]}" != "None" ]]; then - echo --AF-file "{snakemake.config[step_config][roh_calling][bcftools_roh][path_af_file]}" + $(if [[ "{snakemake.config[step_config][variant_calling][bcftools_roh][path_af_file]}" != "None" ]]; then + echo --AF-file "{snakemake.config[step_config][variant_calling][bcftools_roh][path_af_file]}" fi) \ - $(if [[ "{snakemake.config[step_config][roh_calling][bcftools_roh][ignore_homref]}" != "False" ]]; then + $(if [[ "{snakemake.config[step_config][variant_calling][bcftools_roh][ignore_homref]}" != "False" ]]; then echo --ignore-homref fi) \ - $(if [[ "{snakemake.config[step_config][roh_calling][bcftools_roh][skip_indels]}" != "False" ]]; then + $(if [[ "{snakemake.config[step_config][variant_calling][bcftools_roh][skip_indels]}" != "False" ]]; then echo --skip-indels fi) \ - $(if [[ "{snakemake.config[step_config][roh_calling][bcftools_roh][rec_rate]}" != "None" ]]; then - echo --rec-rate "{snakemake.config[step_config][roh_calling][bcftools_roh][rec_rate]}" + $(if [[ "{snakemake.config[step_config][variant_calling][bcftools_roh][rec_rate]}" != "None" ]]; then + echo --rec-rate "{snakemake.config[step_config][variant_calling][bcftools_roh][rec_rate]}" fi) \ --output $raw_out \ --output-type srz \ @@ -60,7 +86,24 @@ ) | bgzip -c \ > {snakemake.output.txt} -pushd $(dirname {snakemake.output.txt}) -md5sum $(basename {snakemake.output.txt}) >$(basename {snakemake.output.txt}).md5 +compute-md5 {snakemake.output.txt} {snakemake.output.txt_md5} + +# Create output links ----------------------------------------------------------------------------- + +for path in {snakemake.output.output_links}; do + dst=$path + src=work/${{dst#output/}} + ln -sr $src $dst +done +""" +) + +# Compute MD5 sums of logs. +shell( + r""" +{DEF_HELPER_FUNCS} + +sleep 1s # try to wait for log file flush +compute-md5 {snakemake.log.log} {snakemake.log.log_md5} """ ) diff --git a/snappy_wrappers/wrappers/bcftools_stats/run/wrapper.py b/snappy_wrappers/wrappers/bcftools_stats/run/wrapper.py index f5e597887..ca8293432 100644 --- a/snappy_wrappers/wrappers/bcftools_stats/run/wrapper.py +++ b/snappy_wrappers/wrappers/bcftools_stats/run/wrapper.py @@ -2,22 +2,37 @@ __author__ = "Manuel Holtgrewe " +DEF_HELPER_FUNCS = r""" +compute-md5() +{ + if [[ $# -ne 2 ]]; then + >&2 echo "Invalid number of arguments: $#" + exit 1 + fi + md5sum $1 \ + | awk '{ gsub(/.*\//, "", $2); print; }' \ + > $2 +} +""" + shell( r""" set -x # Write files for reproducibility ----------------------------------------------------------------- +{DEF_HELPER_FUNCS} + # Write out information about conda and save a copy of the wrapper with picked variables # as well as the environment.yaml file. conda list >{snakemake.log.conda_list} conda info >{snakemake.log.conda_info} -md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} -md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} +compute-md5 {snakemake.log.conda_list} {snakemake.log.conda_list_md5} +compute-md5 {snakemake.log.conda_info} {snakemake.log.conda_info_md5} cp {__real_file__} {snakemake.log.wrapper} -md5sum {snakemake.log.wrapper} >{snakemake.log.wrapper_md5} +compute-md5 {snakemake.log.wrapper} {snakemake.log.wrapper_md5} cp $(dirname {__file__})/environment.yaml {snakemake.log.env_yaml} -md5sum {snakemake.log.env_yaml} >{snakemake.log.env_yaml_md5} +compute-md5 {snakemake.log.env_yaml} {snakemake.log.env_yaml_md5} # Also pipe stderr to log file -------------------------------------------------------------------- @@ -45,7 +60,7 @@ > {snakemake.output.txt} # Compute MD5 sums on output files -md5sum {snakemake.output.txt} >{snakemake.output.txt_md5} +compute-md5 {snakemake.output.txt} {snakemake.output.txt_md5} # Create output links ----------------------------------------------------------------------------- @@ -60,7 +75,9 @@ # Compute MD5 sums of logs. shell( r""" +{DEF_HELPER_FUNCS} + sleep 1s # try to wait for log file flush -md5sum {snakemake.log.log} >{snakemake.log.log_md5} +compute-md5 {snakemake.log.log} {snakemake.log.log_md5} """ ) diff --git a/tests/snappy_pipeline/workflows/test_workflows_roh_calling.py b/tests/snappy_pipeline/workflows/test_workflows_roh_calling.py deleted file mode 100644 index 858620101..000000000 --- a/tests/snappy_pipeline/workflows/test_workflows_roh_calling.py +++ /dev/null @@ -1,180 +0,0 @@ -# -*- coding: utf-8 -*- -"""Tests for the ``roh_calling`` workflow module code""" - - -import textwrap - -import pytest -import ruamel.yaml as ruamel_yaml -from snakemake.io import Wildcards - -from snappy_pipeline.workflows.roh_calling import RohCallingWorkflow - -from .conftest import patch_module_fs - -__author__ = "Manuel Holtgrewe " - - -@pytest.fixture(scope="module") # otherwise: performance issues -def minimal_config(): - """Return YAML parsing result for (germline) configuration""" - yaml = ruamel_yaml.YAML() - return yaml.load( - textwrap.dedent( - r""" - static_data_config: - reference: - path: /path/to/ref.fa - dbsnp: - path: /path/to/dbsnp.vcf.gz - - step_config: - ngs_mapping: - tools: - dna: ['bwa'] - compute_coverage_bed: true - path_target_regions: /path/to/regions.bed - bwa: - path_index: /path/to/bwa/index.fa - - variant_calling: - tools: - - gatk3_hc - roh_calling: - path_variant_callilng: ../variant_calling - - data_sets: - first_batch: - file: sheet.tsv - search_patterns: - - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'} - search_paths: ['/path'] - type: germline_variants - naming_scheme: only_secondary_id - """ - ).lstrip() - ) - - -@pytest.fixture -def roh_calling_workflow( - dummy_workflow, - minimal_config, - config_lookup_paths, - work_dir, - config_paths, - germline_sheet_fake_fs, - mocker, -): - """Return RohCallingWorkflow object pre-configured with germline sheet""" - # Patch out file-system related things in abstract (the crawling link in step is defined there) - germline_sheet_fake_fs.fs.create_file( - file_path="/path/to/ref.fa.fai", - contents="1\t249250621\t52\t60\t61\n2\t243199373\t253404903\t60\t61\n", - create_missing_dirs=True, - ) - patch_module_fs("snappy_pipeline.workflows.abstract", germline_sheet_fake_fs, mocker) - patch_module_fs("snappy_pipeline.workflows.roh_calling", germline_sheet_fake_fs, mocker) - # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we - # can obtain paths from the function as if we really had a NGSMappingPipelineStep there - dummy_workflow.globals = { - "ngs_mapping": lambda x: "NGS_MAPPING/" + x, - "variant_calling": lambda x: "VAR_CALLING/" + x, - } - # Construct the workflow object - return RohCallingWorkflow( - dummy_workflow, - minimal_config, - config_lookup_paths, - config_paths, - work_dir, - ) - - -# Tests for BcftoolsRohStepPart -------------------------------------------------------------------- - - -def test_roh_calling_bcftools_roh_step_part_get_input_files_run(roh_calling_workflow): - """Tests BcftoolsRohStepPart._get_input_files_run()""" - # Define expected - base_name_out = ( - "VAR_CALLING/output/bwa.gatk3_hc.P001-N1-DNA1-WGS1/out/bwa.gatk3_hc.P001-N1-DNA1-WGS1" - ) - expected = { - "vcf_tbi": base_name_out + ".vcf.gz.tbi", - "vcf": base_name_out + ".vcf.gz", - } - # Get actual - wildcards = Wildcards( - fromdict={ - "mapper": "bwa", - "var_caller": "gatk3_hc", - "index_ngs_library": "P001-N1-DNA1-WGS1", - } - ) - actual = roh_calling_workflow.get_input_files("bcftools_roh", "run")(wildcards) - assert actual == expected - - -def test_roh_calling_bcftools_roh_step_part_get_output_files_run(roh_calling_workflow): - """Tests BcftoolsRohStepPart._get_output_files_run()""" - # Define actual - base_name_out = ( - "work/{mapper}.{var_caller}.bcftools_roh.{index_ngs_library}/out/" - "{mapper}.{var_caller}.bcftools_roh.{index_ngs_library}" - ) - expected = { - "txt": base_name_out + ".regions.txt.gz", - "txt_md5": base_name_out + ".regions.txt.gz.md5", - } - # Get actual - actual = roh_calling_workflow.get_output_files("bcftools_roh", "run") - - assert actual == expected - - -def test_roh_calling_bcftools_roh_step_part_get_log_file(roh_calling_workflow): - """Tests BcftoolsRohStepPart.get_log_file() - run""" - # Define expected - expected = ( - "work/{mapper}.{var_caller}.bcftools_roh.{index_ngs_library}/log/" - "snakemake.bcftools_roh.log" - ) - # Get actual - actual = roh_calling_workflow.get_log_file("bcftools_roh", "run") - assert actual == expected - - -def test_roh_calling_bcftools_roh_step_part_get_resource_usage(roh_calling_workflow): - """Tests BcftoolsRohStepPart.get_resource()""" - # Define expected - expected_dict = {"threads": 1, "time": "00:10:00", "memory": "4G", "partition": "medium"} - # Evaluate - for action in ("run",): - for resource, expected in expected_dict.items(): - msg_error = f"Assertion error for resource '{resource}' in action '{action}'." - actual = roh_calling_workflow.get_resource("bcftools_roh", action, resource) - assert actual == expected, msg_error - - -# Tests for RohCallingWorkflow --------------------------------------------------------------------- - - -def test_roh_calling_workflow(roh_calling_workflow): - """Test simple functionality of the workflow""" - # Check created sub steps - expected = ["bcftools_roh", "link_out"] - assert expected == list(sorted(roh_calling_workflow.sub_steps.keys())) - - # Check result file construction - p0001_base_out = "output/bwa.gatk3_hc.bcftools_roh.P001-N1-DNA1-WGS1/out/" - p0004_base_out = "output/bwa.gatk3_hc.bcftools_roh.P004-N1-DNA1-WGS1/out/" - expected = [ - p0001_base_out + "bwa.gatk3_hc.bcftools_roh.P001-N1-DNA1-WGS1.regions.txt.gz", - p0001_base_out + "bwa.gatk3_hc.bcftools_roh.P001-N1-DNA1-WGS1.regions.txt.gz.md5", - p0004_base_out + "bwa.gatk3_hc.bcftools_roh.P004-N1-DNA1-WGS1.regions.txt.gz", - p0004_base_out + "bwa.gatk3_hc.bcftools_roh.P004-N1-DNA1-WGS1.regions.txt.gz.md5", - ] - expected = list(sorted(expected)) - actual = list(sorted(roh_calling_workflow.get_result_files())) - assert actual == expected diff --git a/tests/snappy_pipeline/workflows/test_workflows_variant_calling.py b/tests/snappy_pipeline/workflows/test_workflows_variant_calling.py index c0d6567cc..de9bf64ed 100644 --- a/tests/snappy_pipeline/workflows/test_workflows_variant_calling.py +++ b/tests/snappy_pipeline/workflows/test_workflows_variant_calling.py @@ -482,6 +482,7 @@ def test_variant_calling_workflow(variant_calling_workflow): expected = [ "baf_file_generation", "bcftools_call", + "bcftools_roh", "bcftools_stats", "gatk3_hc", "gatk3_ug", @@ -535,6 +536,7 @@ def test_variant_calling_workflow(variant_calling_workflow): for step in ( f"{var_caller}_run", "jannovar_stats_run", + "bcftools_roh_run", ) ] base_out = ( @@ -563,10 +565,7 @@ def test_variant_calling_workflow(variant_calling_workflow): "gatk3_hc", "gatk3_ug", ) - for step in ( - "bcftools_stats_run", - "baf_file_generation_run", - ) + for step in ("bcftools_stats_run", "baf_file_generation_run") ] tpl = ( "output/{mapper}.{var_caller}.P00{i}-N1-DNA1-WGS1/report/" @@ -613,6 +612,21 @@ def test_variant_calling_workflow(variant_calling_workflow): ) for ext in ("bw", "bw.md5") ] + tpl = ( + "output/{mapper}.{var_caller}.P00{i}-N1-DNA1-WGS1/report/" + "roh/{mapper}.{var_caller}.P00{i}-N1-DNA1-WGS1.{ext}" + ) + expected += [ + tpl.format(mapper=mapper, var_caller=var_caller, i=i, ext=ext) + for i in (1, 4) + for mapper in ("bwa",) + for var_caller in ( + "bcftools_call", + "gatk3_hc", + "gatk3_ug", + ) + for ext in ("txt", "txt.md5") + ] expected = list(sorted(expected)) actual = list(sorted(variant_calling_workflow.get_result_files())) assert actual == expected