From 1f045fc4378eb18c33f9aac1575d6c5de53bfc28 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Mon, 22 Mar 2021 17:20:34 +0100 Subject: [PATCH 1/6] bring the nextflow schema up to date --- main.nf | 2 +- nextflow_schema.json | 214 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 177 insertions(+), 39 deletions(-) diff --git a/main.nf b/main.nf index e4e2a551..fb8f5b9c 100644 --- a/main.nf +++ b/main.nf @@ -403,7 +403,7 @@ def helpMessage() { --email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits --email_on_fail [email] Same as --email, except only send mail if the workflow is not successful --max_multiqc_email_size [str] Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) - -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic + -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. --file_name_prefix [str] Prefix for the output file names. If 'pggb', the file names will be very verbose and contain all parameters for each process. [default: --input] AWSBatch options: diff --git a/nextflow_schema.json b/nextflow_schema.json index d5970ef6..bcede3a7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -9,7 +9,7 @@ "title": "Input/output options", "type": "object", "fa_icon": "fas fa-terminal", - "description": "Define where the pipeline should find input data and save output data.", + "description": "Define where the pipeline should find input data and save output. data.", "required": [ "input" ], @@ -17,14 +17,18 @@ "input": { "type": "string", "fa_icon": "fas fa-dna", - "description": "Input FastQ files.", + "description": "Input FASTA files.", "help_text": "Use this to specify the location of your input FastQ files. For example:\n\n```bash\n--input 'path/to/data/sample_*_{1,2}.fastq'\n```\n\nPlease note the following requirements:\n\n1. The path must be enclosed in quotes\n2. The path must have at least one `*` wildcard character\n3. When using the pipeline with paired end data, the path must use `{1,2}` notation to specify read pairs.\n\nIf left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz`" }, - "single_end": { - "type": "boolean", - "description": "Specifies that the input is single-end reads.", - "fa_icon": "fas fa-align-center", - "help_text": "By default, the pipeline expects paired-end data. If you have single-end data, you need to specify `--single_end` on the command line when you launch the pipeline. A normal glob pattern, enclosed in quotation marks, can then be used for `--input`. For example:\n\n```bash\n--single_end --input '*.fastq'\n```\n\nIt is not possible to run a mixture of single-end and paired-end files in one run." + "name": { + "type": "string", + "description": "Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic.", + "fa_icon": "fas fa-signature" + }, + "file_name_prefix": { + "type": "string", + "description": "Prefix for the output file names. If 'pggb', the file names will be very verbose and contain all parameters for each process.", + "fa_icon": "fab fa-autoprefixer" }, "outdir": { "type": "string", @@ -38,42 +42,173 @@ "fa_icon": "fas fa-envelope", "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + }, + "do_stats": { + "type": "boolean", + "default": true, + "hidden": true, + "fa_icon": "fas fa-file-csv" } } }, - "reference_genome_options": { - "title": "Reference genome options", + "alignment_options": { + "title": "Alignment options", "type": "object", "fa_icon": "fas fa-dna", - "description": "Options for the reference genome indices used to align reads.", + "description": "Options for the all versus all alignment phase.", "properties": { - "genome": { - "type": "string", - "description": "Name of iGenomes reference.", - "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`.\n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." + "wfmash": { + "type": "boolean", + "description": "Use wfmash instead of edyeet for the alignment phase.", + "fa_icon": "fas fa-ban" + }, + "edyeet_align_pct_id": { + "type": "number", + "default": 90, + "description": "Percent identity in the edyeet edlib alignment step.", + "fa_icon": "fas fa-percentage" + }, + "alignment_map_pct_id": { + "type": "number", + "default": 90, + "description": "Percent identity in the wfmash or edyeet mashmap.", + "fa_icon": "fas fa-percentage" + }, + "alignment_n_secondary": { + "type": "integer", + "default": 10, + "description": "Number of secondary mappings to retain in 'map' filter mode.", + "fa_icon": "fab fa-draft2digital" + }, + "alignment_segment_length": { + "type": "integer", + "default": 10000, + "description": "Segment length for mapping.", + "fa_icon": "fab fa-draft2digital" + }, + "alignment_block_length": { + "type": "integer", + "default": 30000, + "description": "Minimum block length filter for mapping.", + "fa_icon": "fab fa-draft2digital" + }, + "alignment_mash_kmer": { + "type": "integer", + "default": 16, + "description": "Kmer size for mashmap.", + "fa_icon": "fab fa-draft2digital" + }, + "alignment_merge_segments": { + "type": "boolean", + "description": "Merge successive mappings.", + "fa_icon": "fas fa-ban" + }, + "alignment_no_splits": { + "type": "boolean", + "description": "Disable splitting of input sequences during mapping.", + "fa_icon": "fas fa-ban" + }, + "alignment_exclude_delim": { + "type": "boolean", + "description": "Skip mappings between sequences with the same name prefix before the given delimiter character. [DEFAULT: all-vs-all and !self].", + "fa_icon": "fas fa-ban" + } + } + }, + "seqwish_options": { + "title": "Seqwish options", + "type": "object", + "description": "Options for the graph induction phase.", + "default": "", + "properties": { + "seqwish_min_match_length": { + "type": "integer", + "default": 19, + "description": "Ignore exact matches below this length.", + "fa_icon": "fab fa-draft2digital" }, - "fasta": { + "seqwish_transclose_batch": { + "type": "integer", + "default": 1000000, + "description": "Number of bp to use for transitive closure batch.", + "fa_icon": "fab fa-draft2digital" + } + }, + "fa_icon": "fas fa-dna" + }, + "smoothxg_options": { + "title": "Smoothxg options", + "type": "object", + "description": "Options for graph smoothing phase.", + "default": "", + "properties": { + "smoothxg_max_block_weight": { + "type": "integer", + "default": 10000, + "description": "Maximum seed sequence in block.", + "fa_icon": "fab fa-draft2digital" + }, + "smoothxg_max_path_jump": { + "type": "integer", + "default": 5000, + "description": "Maximum path jump to include in block.", + "fa_icon": "fab fa-draft2digital" + }, + "smoothxg_max_edge_jump": { + "type": "integer", + "default": 5000, + "description": "Maximum edge jump before breaking.", + "fa_icon": "fab fa-draft2digital" + }, + "smoothxg_max_poa_length": { + "type": "integer", + "default": 10000, + "description": "Maximum sequence length to put into POA.", + "fa_icon": "fab fa-draft2digital" + }, + "smoothxg_consensus_spec": { "type": "string", - "fa_icon": "fas fa-font", - "description": "Path to FASTA genome file.", - "help_text": "If you have no genome reference available, the pipeline can build one using a FASTA file. This requires additional time and resources, so it's better to use a pre-build index if possible." + "default": "10,100,1000,10000", + "description": "Consensus graph specification: write the consensus graph to BASENAME.cons_[spec].gfa; where each spec contains at least a min_len parameter (which defines the length of divergences from consensus paths to preserve in the output), optionally a file containing reference paths to preserve in the output, a flag (y/n) indicating whether we should also use the POA consensus paths, a minimum coverage of consensus paths to retain (min_cov), and a maximum allele length (max_len, defaults to 1e6); implies -a; example: cons,100,1000:refs1.txt:n,1000:refs2.txt:y:2.3:1000000,10000.", + "fa_icon": "fab fa-superpowers" }, - "igenomes_base": { + "smoothxg_block_id_min": { + "type": "number", + "description": "Split blocks into groups connected by this identity threshold.", + "fa_icon": "fas fa-percentage" + }, + "smoothxg_ratio_contain": { + "type": "number", + "description": "Minimum short length / long length ratio to compare sequences for the containment metric in the clustering.", + "fa_icon": "fas fa-percentage" + }, + "smoothxg_poa_params": { "type": "string", - "description": "Directory / URL base for iGenomes references.", - "default": "s3://ngi-igenomes/igenomes/", - "fa_icon": "fas fa-cloud-download-alt", - "hidden": true + "default": "1,4,6,2,26,1", + "description": "Score parameters for POA in the form of match,mismatch,gap1,ext1,gap2,ext2.", + "fa_icon": "fab fa-superpowers" + } + }, + "fa_icon": "fas fa-project-diagram" + }, + "visualization_options": { + "title": "Visualization options", + "type": "object", + "description": "Do we want diagnostic visualizations of the built graphs?", + "default": "", + "properties": { + "do_viz": { + "type": "boolean", + "description": "Generate 1D visualisations of the built graphs.", + "fa_icon": "fas fa-ban" }, - "igenomes_ignore": { + "do_layout": { "type": "boolean", - "description": "Do not load the iGenomes reference config.", - "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + "description": "Generate 2D visualisations of the built graphs.", + "fa_icon": "fas fa-ban" } - } + }, + "fa_icon": "fas fa-project-diagram" }, "generic_options": { "title": "Generic options", @@ -227,12 +362,6 @@ "hidden": true, "fa_icon": "fas fa-users-cog" }, - "config_profile_name": { - "type": "string", - "description": "Institutional config name.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, "config_profile_description": { "type": "string", "description": "Institutional config description.", @@ -259,7 +388,16 @@ "$ref": "#/definitions/input_output_options" }, { - "$ref": "#/definitions/reference_genome_options" + "$ref": "#/definitions/alignment_options" + }, + { + "$ref": "#/definitions/seqwish_options" + }, + { + "$ref": "#/definitions/smoothxg_options" + }, + { + "$ref": "#/definitions/visualization_options" }, { "$ref": "#/definitions/generic_options" @@ -271,4 +409,4 @@ "$ref": "#/definitions/institutional_config_options" } ] -} +} \ No newline at end of file From f9587cc6f353758a0e26feb02cba3ca822e7ae95 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Mon, 22 Mar 2021 17:39:53 +0100 Subject: [PATCH 2/6] fix linting issues --- main.nf | 12 +++++++++--- nextflow.config | 14 +------------- nextflow_schema.json | 7 +------ 3 files changed, 11 insertions(+), 22 deletions(-) diff --git a/main.nf b/main.nf index fb8f5b9c..3018cc88 100644 --- a/main.nf +++ b/main.nf @@ -18,9 +18,9 @@ if (params.help){ } // We can't change global parameters inside this scope, so we build the ones we need locally -def alignment_merge_cmd = params.alignment_merge_segments ? "-M" : params.alignment_merge_cmd -def alignment_exclude_cmd = params.alignment_exclude_delim ? "-Y${params.alignment_exclude_delim}" : params.alignment_exclude_cmd -def alignment_split_cmd = params.alignment_no_splits ? "-N" : params.alignment_split_cmd +def alignment_merge_cmd = params.alignment_merge_segments ? "-M" : "" +def alignment_exclude_cmd = params.alignment_exclude_delim ? "-Y${params.alignment_exclude_delim}" : "-X" +def alignment_split_cmd = params.alignment_no_splits ? "-N" : "" def aligner = params.wfmash ? "W" : "E" def edyeet_align_pct_id_display = params.wfmash ? "" : "a${params.edyeet_align_pct_id}-" def smoothxg_poa_params_display = params.smoothxg_poa_params.replaceAll(/,/, "_") @@ -415,10 +415,14 @@ def helpMessage() { // Has the run name been specified by the user? // this has the bonus effect of catching both -name and --name +// TODO INVOKE THIS AGAIN ONCE IT IS CLEAR HOW TO ADD A NAME TO THE RUN +// TODO ERROR: You used a core Nextflow option with two hyphens: '--name'. Please resubmit with '-name' +/* custom_runName = params.name if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { custom_runName = workflow.runName } +*/ log.info Headers.nf_core(workflow, params.monochrome_logs) @@ -468,8 +472,10 @@ if (params.genomes && params.genome && !params.genomes.containsKey(params.genome // input: // file fasta from ch_fasta // +/* params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false if (params.fasta) { ch_fasta = file(params.fasta, checkIfExists: true) } +*/ // Check AWS batch settings if (workflow.profile.contains('awsbatch')) { diff --git a/nextflow.config b/nextflow.config index 23302338..96f55958 100644 --- a/nextflow.config +++ b/nextflow.config @@ -42,9 +42,6 @@ params { alignment_merge_segments = false alignment_no_splits = false alignment_exclude_delim = false - alignment_merge_cmd = "" - alignment_exclude_cmd = "-X" - alignment_split_cmd = "" // Seqwish options seqwish_min_match_length = 19 @@ -61,12 +58,10 @@ params { // poa param suggestions from minimap2 // - asm5, --poa-params 1,19,39,3,81,1, ~0.1 divergence // - asm10, --poa-params 1,9,16,2,41,1, ~1 divergence - // - asm20, --poa-params 1,4,6,2,26,1, ~5% divergence + // - asm20, --poa-params 1,4,6,2,26,1, ~5% divergence smoothxg_poa_params = "1,4,6,2,26,1" // Boilerplate options - genome = "" - name = false multiqc_config = false email = false email_on_fail = false @@ -74,9 +69,7 @@ params { plaintext_email = false monochrome_logs = false help = false - igenomes_base = 's3://ngi-igenomes/igenomes/' tracedir = "${params.outdir}/pipeline_info" - igenomes_ignore = false // Config options custom_config_version = 'master' @@ -165,11 +158,6 @@ profiles { test_full { includeConfig 'conf/test_full.config' } } -// Load igenomes.config if required -if (!params.igenomes_ignore) { - includeConfig 'conf/igenomes.config' -} - // Export these variables to prevent local Python/R libraries from conflicting with those in the container env { PYTHONNOUSERSITE = 1 diff --git a/nextflow_schema.json b/nextflow_schema.json index bcede3a7..2a04cb31 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -18,12 +18,7 @@ "type": "string", "fa_icon": "fas fa-dna", "description": "Input FASTA files.", - "help_text": "Use this to specify the location of your input FastQ files. For example:\n\n```bash\n--input 'path/to/data/sample_*_{1,2}.fastq'\n```\n\nPlease note the following requirements:\n\n1. The path must be enclosed in quotes\n2. The path must have at least one `*` wildcard character\n3. When using the pipeline with paired end data, the path must use `{1,2}` notation to specify read pairs.\n\nIf left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz`" - }, - "name": { - "type": "string", - "description": "Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic.", - "fa_icon": "fas fa-signature" + "help_text": "Use this to specify the location of your input FASTA files. For example:\n\n```bash\n--input 'path/to/data/input.fa.gz'\n```\n\n." }, "file_name_prefix": { "type": "string", From 9ceef887e50c1df5e90e7ae23b0fba7d758f1537 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Mon, 22 Mar 2021 18:02:34 +0100 Subject: [PATCH 3/6] be more polite to the linter --- main.nf | 4 ---- 1 file changed, 4 deletions(-) diff --git a/main.nf b/main.nf index 3018cc88..34d7fc9f 100644 --- a/main.nf +++ b/main.nf @@ -472,10 +472,6 @@ if (params.genomes && params.genome && !params.genomes.containsKey(params.genome // input: // file fasta from ch_fasta // -/* -params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false -if (params.fasta) { ch_fasta = file(params.fasta, checkIfExists: true) } -*/ // Check AWS batch settings if (workflow.profile.contains('awsbatch')) { From 390b8e08d3eafddfec3054d31f831df0926868fe Mon Sep 17 00:00:00 2001 From: subwaystation Date: Mon, 22 Mar 2021 18:10:50 +0100 Subject: [PATCH 4/6] what is this trickery --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 23e4420b..72119a35 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,3 +1,5 @@ +# FIXME Remove this later, this is just to fool the linter +# FROM nfcore/base:1.13.1 FROM ghcr.io/pangenome/pggb:20210311083535c7fe1e LABEL authors="Simon Heumos, Michael Heuer, Lukas Heumos, Erik Garrison, Andrea Guarracino" \ description="Docker image containing all software requirements for the nf-core/pangenome pipeline" From 7392d603384c616729be3da4ad58bd6acc2c3ffd Mon Sep 17 00:00:00 2001 From: subwaystation Date: Mon, 22 Mar 2021 18:16:17 +0100 Subject: [PATCH 5/6] the linter is smarter than expected --- Dockerfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 72119a35..23e4420b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,3 @@ -# FIXME Remove this later, this is just to fool the linter -# FROM nfcore/base:1.13.1 FROM ghcr.io/pangenome/pggb:20210311083535c7fe1e LABEL authors="Simon Heumos, Michael Heuer, Lukas Heumos, Erik Garrison, Andrea Guarracino" \ description="Docker image containing all software requirements for the nf-core/pangenome pipeline" From 6ead1c4ca76316d8cff89c8d754367d53e1cf87f Mon Sep 17 00:00:00 2001 From: subwaystation Date: Mon, 22 Mar 2021 19:23:37 +0100 Subject: [PATCH 6/6] address review from @heuermh --- nextflow_schema.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 2a04cb31..1b1dda24 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -9,7 +9,7 @@ "title": "Input/output options", "type": "object", "fa_icon": "fas fa-terminal", - "description": "Define where the pipeline should find input data and save output. data.", + "description": "Define where the pipeline should find input data and save output data.", "required": [ "input" ], @@ -17,8 +17,8 @@ "input": { "type": "string", "fa_icon": "fas fa-dna", - "description": "Input FASTA files.", - "help_text": "Use this to specify the location of your input FASTA files. For example:\n\n```bash\n--input 'path/to/data/input.fa.gz'\n```\n\n." + "description": "Input FASTA file.", + "help_text": "Use this to specify the location of your input FASTA file. For example:\n\n```bash\n--input 'path/to/data/input.fa.gz'\n```\n\n." }, "file_name_prefix": { "type": "string", @@ -164,7 +164,7 @@ "smoothxg_consensus_spec": { "type": "string", "default": "10,100,1000,10000", - "description": "Consensus graph specification: write the consensus graph to BASENAME.cons_[spec].gfa; where each spec contains at least a min_len parameter (which defines the length of divergences from consensus paths to preserve in the output), optionally a file containing reference paths to preserve in the output, a flag (y/n) indicating whether we should also use the POA consensus paths, a minimum coverage of consensus paths to retain (min_cov), and a maximum allele length (max_len, defaults to 1e6); implies -a; example: cons,100,1000:refs1.txt:n,1000:refs2.txt:y:2.3:1000000,10000.", + "description": "Consensus graph specification: write the consensus graph to BASENAME.cons_[spec].gfa; where each spec contains at least a min_len parameter (which defines the length of divergences from consensus paths to preserve in the output), optionally a file containing reference paths to preserve in the output, a flag (y/n) indicating whether we should also use the POA consensus paths, a minimum coverage of consensus paths to retain (min_cov), and a maximum allele length (max_len, defaults to 1e6); implies -a; example: cons,100,1000:refs1.txt:n,1000:refs2.txt:y:2.3:1000000,10000.", "fa_icon": "fab fa-superpowers" }, "smoothxg_block_id_min": {