Skip to content

Commit

Permalink
fix(defaults.toml): remove unnecessary options and add more comment
Browse files Browse the repository at this point in the history
  • Loading branch information
violetbrina committed Jan 8, 2025
1 parent 803069b commit 57f80ca
Showing 1 changed file with 68 additions and 98 deletions.
166 changes: 68 additions & 98 deletions src/cpg_flow/defaults.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
[workflow]
# Only print the final merged config and a list of stages to be submitted.
# Will skip any communication with Metamist, Hail Batch, and Cloud Storage, so
# the code can be run without permissions.
#dry_run = true

# Cohorts to use as inputs.
#input_cohorts = []

Expand All @@ -19,10 +24,6 @@
# Force stage rerun
#force_stages = []

# Map of stages to lists of sequencing groups, to skip for specific stages
#[workflow.skip_stages_for_sgs]
#CramQC = ['CPGaaa']

# Name of the workflow (to prefix output paths)
#name =

Expand All @@ -33,9 +34,12 @@
# By default, the hash of all input paths will be used.
#output_version = "0.1"

# Check input file existence (e.g. FASTQ files). When they are missing,
# Limit to data of this sequencing type
#sequencing_type = 'genome'

# Check input file existence. When they are missing,
# the `skip_sgs_with_missing_input` option controls whether such
# sequencing groups should be ignored, or it should cause raising an error.
# sequencing groups should be ignored, or it should raise an error.
check_inputs = true

# For the first (not-skipped) stage, if the input for a target does
Expand All @@ -53,107 +57,73 @@ check_intermediates = true
# already exist. If it exists, do not submit stage jobs.
check_expected_outputs = true

# Limit to data of this sequencing type
#sequencing_type = 'genome'

# Realign CRAM when available, instead of using FASTQ.
# The parameter value should correspond to CRAM version
# (e.g. v0 in gs://cpg-fewgenomes-main/cram/v0/CPGaaa.cram)
#realign_from_cram_version = 'v0'

# Calling intervals (defaults to whole-genome intervals)
# TODO: Remove from cpg_flow and move to cpg_utils, then remove from defaults.toml
#intervals_path =

# The GQ bands used for ReblockGVCF, specified as exclusive upper bounds for reference
# confidence GQ bands (must be in [1, 100] and specified in increasing order). Finer
# granularity bands result in more reference blocks and therefore larger GVCFs.
reblock_gq_bands = [20, 30, 40]

# Only print the final merged config and a list of stages to be submitted.
# Will skip any communication with Metamist, Hail Batch, and Cloud Storage, so
# the code can be run without permissions.
#dry_run = true
# Map of stages to lists of sequencing groups, to skip for specific stages
# [workflow.skip_stages_for_sgs]
#CramQC = ['CPGaaa']

# By default, BamToCram stage will create CRAM analysis types, this can be overridden
# bam_to_cram_analysis_type = 'pacbio_cram'
[hail]

# Map internally used validation sample external_id to truth sample names
[validation.sample_map]
HG001_NA12878 = 'na12878'
SYNDIP = 'syndip'
# This is different from the cpg_flow Workflow dry_run option.
# This will create Hail Batch jobs, but will enable Hail Batch's dry_run option.
dry_run = false

[hail]
# Delete temporary directories with intermediate files.
delete_scratch_on_exit = false

[resource_overrides]
# Override default resource requirements for unusually large seq data without
# demanding higher resources for all operations as standard. Examples below

# picard MarkDuplicates overrides for unreasonably large sequencing groups
#picard_mem_gb = 100
#picard_storage_gb = 350

# haplotype caller overrides, see production-pipelines PR#381
# defaults in code are 40 for genomes, none for exomes
#haplotypecaller_storage = 80

# Use highmem machine type for alignment step
# align_use_highmem = true

# Use additional storage in postproc_gvcf job for large gVCFs
# postproc_gvcf_storage = 50

# JointGenotyping GenomicsDBImport job overrides
# genomicsdb_import_mem_gb = 32
# genomicsdb_import_use_highmem = false

# JointGenotyping GenotypeGVCFs job overrides
# genotype_gvcfs_mem_gb = 15
# genotype_gvcfs_use_highmem = false

[mito_snv]
# Example config for broad wdl found here:
# https://raw.githubusercontent.com/broadinstitute/gatk/master/scripts/mitochondria_m2_wdl/ExampleInputsMitochondriaPipeline.json
# f_score_beta is not configured so will use tool default of 1.0
f_score_beta = 1.0
# Sarah Stenton from Broad runs this pipeline for seqr ingest and indicated they use a
# threshold of 0.01 for seqr cohorts.
vaf_filter_threshold = 0.01
# Use verifybamid in addition to haplocheck for contamination estimate
use_verifybamid = true

[stripy]
# Analysis_type can be "standard" (fast) or "extended" (marginally slower
# but also uses unmapped reads for genotyping)
analysis_type = "extended"
# See https://gitlab.com/andreassh/stripy-pipeline#list-of-loci
# Excluded by default: C9orf72, HTT
target_loci = """AFF2,AR,ARX_1,ARX_2,ATN1,ATXN1,ATXN10,ATXN2,ATXN3,ATXN7,ATXN8OS,\
BEAN1,CACNA1A,CBL,CNBP,COMP,CSTB,DAB1,DIP2B,DMD,DMPK,EIF4A3,FGF14,FMR1,FOXL2,FXN,GIPC1,\
GLS,HOXA13_1,HOXA13_2,HOXA13_3,HOXD13,JPH3,LRP12,MARCHF6,NIPA1,NOP56,NOTCH2NLC,\
NUTM2B-AS1,PABPN1,PHOX2B,PPP2R2B,PRDM12,PRNP,RAPGEF2,RFC1,RILPL1,RUNX2,SAMD12,SOX3,\
STARD7,TBP,TBX1,TCF4,TNRC6A,VWA1,XYLT1,YEATS2,ZFHX3,ZIC2,ZIC3"""
# Path to bed+ file containing extra loci to include in the analysis. Tab-delimited BED
# file containing at least the following four values: chromosome, start and end position
# of the STR locus and motif on the plus strand. Optionally, the locus name/ID can be
# specified as fifth value. Additionally, you can also specify disease name, inheritance,
# normal range and pathogenic cut-off values which are then being used to colourise
# results e.g.:
# https://gitlab.com/andreassh/stripy-pipeline/-/blob/main/examples/vntr.bed
# custom_loci_path = "gs://cpg-reference/hg38/loci/seqr/seqr_stripy_custom_loci.bed"
# Set to empty string if no custom loci are to be used.
custom_loci_path = ""
# Change the path the stripy report is saved to, useful when testing novel loci
output_prefix = "stripy"

# Add in specific multiQC report config options
# See https://multiqc.info/docs/getting_started/config for more details
# [workflow.cram_multiqc]
# send_to_slack = true
# [workflow.cram_multiqc.extra_config]
# plots_force_interactive = true

# [workflow.gvcf_multiqc]
# send_to_slack = true
# [workflow.gvcf_multiqc.extra_config]
# plots_force_interactive = true
# The value should be an int representing the memory in GB
# This is true for both memory and storage overrides
# job_mem_override = 50

# Other common overrides could be for number of cpus or GBs of storage
# job_cpu_override = 4
# job_storage_override = 100

# To use this override in the job python file do something like this:
# def my_job(
# b: hb.Batch,
# input_file: hb.ResourceFile,
# job_attrs: dict | None = None,
# output_path: Path | None = None,
# second_output_path: Path | None = None,
# fasta_reference: hb.ResourceGroup | None = None,
# overwrite: bool = False,
# ) -> Job | None:
# """
# My super awesome custom job
# """
# job = b.new_job(job_name, job_attrs)
# ...
#
# # check for a memory override for this job
# memory_override = get_config()['resource_overrides'].get('job_mem_override')
# assert isinstance(memory_override, (int, type(None)))
#
# resource = HIGHMEM.request_resources(ncpu=4, mem_gb=memory_override)
# # Any other resource logic or modifications...
#
# # This line sets the resource for the job created above
# resource.set_to_job(j)
# ....
#
# return job

# Adding custom options for workflow stages and jobs
# You are more than welcome to add your own custom options, however they should not
# live in the defaults.toml file. Instead, create a new toml file in the same directory
# That you pass to the analysis runner. Any options you create following the toml syntax
# will be available to you in the workflow and job python files.
# For example, if you create a file called custom_options.toml with the following content:

# [custom]
# my_custom_option = "my_custom_value"

# You can access this value in your workflow or job python file like this:
# get_config().get('custom', {}).get('my_custom_option')

0 comments on commit 57f80ca

Please sign in to comment.