diff --git a/CHANGELOG.md b/CHANGELOG.md index 67c6606f..c732afa7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- Better support for custom protocols ([#273](https://github.com/nf-core/scrnaseq/pull/273)). + - The universc protocol is now specified via the `--protocol` flag + - Any protocol specified is now passed to the respective aligner + - Added a section to the documentation + ## v2.4.1 - 2023-09-28 - Fix whitelist logic for dropseq ([#267](https://github.com/nf-core/scrnaseq/pull/267)) diff --git a/assets/protocols.json b/assets/protocols.json new file mode 100644 index 00000000..23ff1328 --- /dev/null +++ b/assets/protocols.json @@ -0,0 +1,90 @@ +{ + "alevin": { + "10XV1": { + "protocol": "10xv1", + "whitelist": "assets/whitelist/10x_V1_barcode_whitelist.txt.gz" + }, + "10XV2": { + "protocol": "10xv2", + "whitelist": "assets/whitelist/10x_V2_barcode_whitelist.txt.gz" + }, + "10XV3": { + "protocol": "10xv3", + "whitelist": "assets/whitelist/10x_V3_barcode_whitelist.txt.gz" + }, + "dropseq": { + "protocol": "dropseq" + } + }, + "cellranger": { + "auto": { + "protocol": "auto" + }, + "10XV1": { + "protocol": "SC3Pv1" + }, + "10XV2": { + "protocol": "SC3Pv2" + }, + "10XV3": { + "protocol": "SC3Pv3" + } + }, + "star": { + "10XV1": { + "protocol": "CB_UMI_Simple", + "extra_args": "--soloUMIlen 10", + "whitelist": "assets/whitelist/10x_V1_barcode_whitelist.txt.gz" + }, + "10XV2": { + "protocol": "CB_UMI_Simple", + "extra_args": "--soloUMIlen 10", + "whitelist": "assets/whitelist/10x_V2_barcode_whitelist.txt.gz" + }, + "10XV3": { + "protocol": "CB_UMI_Simple", + "extra_args": "--soloUMIlen 12", + "whitelist": "assets/whitelist/10x_V3_barcode_whitelist.txt.gz" + }, + "dropseq": { + "protocol": "CB_UMI_Simple" + }, + "smartseq": { + "protocol": "SmartSeq" + } + }, + "kallisto": { + "10XV1": { + "protocol": "10XV1" + }, + "10XV2": { + "protocol": "10XV2" + 
}, + "10XV3": { + "protocol": "10XV3" + }, + "dropseq": { + "protocol": "DROPSEQ" + }, + "smartseq": { + "protocol": "SMARTSEQ" + } + }, + "universc": { + "auto": { + "protocol": "10x" + }, + "10XV1": { + "protocol": "10x-v1" + }, + "10XV2": { + "protocol": "10x-v2" + }, + "10XV3": { + "protocol": "10x-v3" + }, + "dropseq": { + "protocol": "dropseq" + } + } +} diff --git a/conf/modules.config b/conf/modules.config index 2fd974c5..c60949fc 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -78,7 +78,7 @@ if(params.aligner == "cellranger") { path: "${params.outdir}/${params.aligner}/count", mode: params.publish_dir_mode ] - ext.args = {meta.expected_cells ? "--expect-cells ${meta.expected_cells}" : ''} + ext.args = {"--chemistry ${meta.chemistry} " + (meta.expected_cells ? "--expect-cells ${meta.expected_cells}" : '')} } } } diff --git a/docs/usage.md b/docs/usage.md index f90dc242..abe170fd 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -25,19 +25,6 @@ CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz There is a strict requirement for the first 3 columns to match those defined in the table below. -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. 
- -```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, -``` - | Column | Description | | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `sample` | Required. Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | @@ -54,9 +41,9 @@ This parameter is currently supported by - [Salmon Alevin](https://salmon.readthedocs.io/en/latest/alevin.html#expectcells) - [STARsolo](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md) +- [Cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) -In the future, support for this paramter will be added to cellranger and UniverSC. Note that since cellranger v7, -it is not recommended anymore to supply the `--expected-cells` parameter. +Note that since cellranger v7, it is **not recommended** anymore to supply the `--expected-cells` parameter. ## Aligning options @@ -71,7 +58,7 @@ Other aligner options for running the pipeline are: - [Cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) to perform both alignment and downstream analysis. 
- `--aligner cellranger` - [UniverSC](https://github.com/minoda-lab/universc) to run an open-source version of Cell Ranger on any technology - '--aligner universc' + - `--aligner universc` ### If using cellranger or universc @@ -91,9 +78,36 @@ For more details, see As a sanity check, we verify that filenames of a pair of FASTQ files only differ by `R1`/`R2`. -#### UniverSC technology configuration +### Support for different scRNA-seq protocols + +The single-cell protocol used in the experiment can be specified using the `--protocol` flag. +For cellranger, it is recommended to stick with the default value `'auto'` for automatic detection of the protocol. +For all other aligners, you need to specify the protocol manually. + +The three 10x Genomics protocols 3' v1 (`10XV1`), 3' v2 (`10XV2`) and 3' v3 (`10XV3`) are universally supported +by all aligners in the pipeline and mapped to the correct options automatically. If the protocol is unknown to the +nf-core pipeline, the value specified to `--protocol` is passed to the aligner _verbatim_ to support additional protocols. + +Here are some hints on running the various aligners with different protocols + +#### Kallisto/bustools + +The command `kb --list` shows all supported, preconfigured protocols. Additionally, a custom technology string such as +`0,0,16:0,16,26:1,0,0` can be specified: + +> Additionally kallisto bus will accept a string specifying a new technology in the format of bc:umi:seq where each of bc,umi and seq are a triplet of integers separated by a comma, denoting the file index, start and stop of the sequence used. For example to specify the 10xV2 technology we would use 0,0,16:0,16,26:1,0,0 + +For more details, please refer to the [Kallisto/bustools documentation](https://pachterlab.github.io/kallisto/manual#bus). + +#### Alevin/fry + +Alevin/fry also supports custom chemistries in a slightly different format, e.g. `1{b[16]u[12]x:}2{r:}`. 
+ +For more details, see the [simpleaf documentation](https://simpleaf.readthedocs.io/en/latest/quant-command.html#a-note-on-the-chemistry-flag) + +#### UniverSC -UniverSC automatically updates the barcode whitelist and chemistry parameters. Use "universc_technology" to set the 'technology' parameter to configure the run. +See the [UniverSC GitHub page](https://github.com/minoda-lab/universc#pre-set-configurations) for all supported protocols. Currently only 3\' scRNA-Seq parameters are supported in nextflow, although chemistry parameters for 5\' scRNA-Seq and full-length scRNA-Seq libraries are supported by teh container. diff --git a/lib/WorkflowScrnaseq.groovy b/lib/WorkflowScrnaseq.groovy index b58a89db..e4273887 100755 --- a/lib/WorkflowScrnaseq.groovy +++ b/lib/WorkflowScrnaseq.groovy @@ -4,6 +4,8 @@ import nextflow.Nextflow import groovy.text.SimpleTemplateEngine +import groovy.json.JsonSlurper + class WorkflowScrnaseq { @@ -121,90 +123,21 @@ class WorkflowScrnaseq { } } - /* - * Format the protocol - * Given the protocol paramter (params.protocol) and the aligner (params.aligner), - * this function formats the protocol such that it is fit for the respective - * subworkflow - */ - static formatProtocol(protocol, aligner) { - String new_protocol = protocol - String chemistry = '' - String other_parameters = '' - - // alevin - if (aligner == 'alevin') { - switch (protocol) { - case '10XV1': - new_protocol = '10xv1' - chemistry = 'V1' - break - case '10XV2': - new_protocol = '10xv2' - chemistry = 'V2' - break - case '10XV3': - new_protocol = '10xv3' - chemistry = 'V3' - break - // case 'dropseq': - // new_protocol = 'dropseq' - } - } - - // star - else if (aligner == 'star') { - switch (protocol) { - case '10XV1': - new_protocol = 'CB_UMI_Simple' - chemistry = 'V1' - other_parameters = '--soloUMIlen 10' - break - case '10XV2': - new_protocol = 'CB_UMI_Simple' - chemistry = 'V2' - other_parameters = '--soloUMIlen 10' - break - case '10XV3': - new_protocol = 
'CB_UMI_Simple' - chemistry = 'V3' - other_parameters = '--soloUMIlen 12' - break - case 'dropseq': - new_protocol = 'CB_UMI_Simple' - break - case 'smartseq': - new_protocol = 'SmartSeq' - } - } - - // kallisto bustools - else if (aligner = 'kallisto' ) { - switch (protocol) { - case '10XV1': - new_protocol = '10XV1' - chemistry = 'V1' - break - case '10XV2': - new_protocol = '10XV2' - chemistry = 'V2' - break - case '10XV3': - new_protocol = '10XV3' - chemistry = 'V3' - break - case 'dropseq': - new_protocol = 'DROPSEQ' - break - case 'smartseq': - new_protocol = 'SMARTSEQ' - } - } - else { - exit 1, 'Aligner not recognized.' + // + // Retrieve the aligner-specific protocol based on the specified protocol. + // Returns a map ["protocol": aligner-specific protocol name, "extra_args": additional aligner arguments, "whitelist": whitelist path relative to the project directory] + // extra_args and whitelist are optional. + public static Map getProtocol(workflow, log, aligner, protocol) { + def jsonSlurper = new JsonSlurper() + def json = new File("${workflow.projectDir}/assets/protocols.json").text + def protocols = jsonSlurper.parseText(json) + def aligner_map = protocols[aligner] + if(aligner_map.containsKey(protocol)) { + return aligner_map[protocol] + } else { + log.warn("Protocol '${protocol}' not recognized by the pipeline. 
Passing on the protocol to the aligner unmodified.") + return ["protocol": protocol] } - - return [new_protocol, chemistry, other_parameters] } } diff --git a/modules/local/simpleaf_quant.nf b/modules/local/simpleaf_quant.nf index 0c879ceb..f350acf3 100644 --- a/modules/local/simpleaf_quant.nf +++ b/modules/local/simpleaf_quant.nf @@ -66,7 +66,7 @@ process SIMPLEAF_QUANT { -o ${prefix}_alevin_results \\ -m $txp2gene \\ -t $task.cpus \\ - -c $protocol \\ + -c "$protocol" \\ $expect_cells \\ $unfiltered_command \\ $args diff --git a/nextflow.config b/nextflow.config index b911ce28..a4a24255 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,7 +14,7 @@ params { outdir = null input = null save_reference = false - protocol = '10XV3' + protocol = 'auto' // reference files genome = null @@ -42,7 +42,6 @@ params { // UniverSC paramaters universc_index = null - universc_technology = '10x' // Template Boilerplate options skip_multiqc = false diff --git a/nextflow_schema.json b/nextflow_schema.json index c2642a1b..d3d5fec6 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -62,10 +62,10 @@ }, "protocol": { "type": "string", - "description": "The protocol that was used to generate the single cell data, e.g. 10XV2 (default).", - "default": "10XV2", - "fa_icon": "fas fa-cogs", - "enum": ["10XV3", "10XV2", "10XV1", "dropseq"] + "description": "The protocol that was used to generate the single cell data, e.g. 10x Genomics v2 Chemistry.\n\n Can be 'auto' (cellranger only), '10XV1', '10XV2', '10XV3', or any other protocol string that will get directly passed the respective aligner.", + "help_text": "The default is to auto-detect the protocol when running cellranger. For all other aligners the protocol MUST be manually specified. \n\n The following protocols are recognized by the pipeline and mapped to the corresponding protocol name of the respective aligner: '10XV1', '10XV2', '10XV3'. 
\n\nAny other protocol value is passed to the aligner in verbatim to support other sequencing platforms. See the [kallisto](https://pachterlab.github.io/kallisto/manual#bus), [simpleaf](https://simpleaf.readthedocs.io/en/latest/quant-command.html#a-note-on-the-chemistry-flag), [starsolo](https://gensoft.pasteur.fr/docs/STAR/2.7.9a/STARsolo.html), and [universc](https://github.com/minoda-lab/universc#pre-set-configurations) documentations for more details.", + "default": "auto", + "fa_icon": "fas fa-cogs" } }, "fa_icon": "fas fa-terminal" @@ -243,11 +243,6 @@ "universc_index": { "type": "string", "description": "Specify a pre-calculated cellranger index. Readily prepared indexes can be obtained from the 10x Genomics website." - }, - "universc_technology": { - "type": "string", - "description": "Specify a single-cell technology, vendor, or platform. See the UniverSC documentation or GitHub repository for more details.", - "default": "10x" } } }, diff --git a/subworkflows/local/alevin.nf b/subworkflows/local/alevin.nf index 8fc0a983..764c08f8 100644 --- a/subworkflows/local/alevin.nf +++ b/subworkflows/local/alevin.nf @@ -20,7 +20,6 @@ workflow SCRNASEQ_ALEVIN { txp2gene barcode_whitelist protocol - chemistry ch_fastq diff --git a/subworkflows/local/align_cellranger.nf b/subworkflows/local/align_cellranger.nf index 228edb06..bfdd533e 100644 --- a/subworkflows/local/align_cellranger.nf +++ b/subworkflows/local/align_cellranger.nf @@ -13,6 +13,7 @@ workflow CELLRANGER_ALIGN { gtf cellranger_index ch_fastq + protocol main: ch_versions = Channel.empty() @@ -34,7 +35,7 @@ workflow CELLRANGER_ALIGN { // Obtain read counts CELLRANGER_COUNT ( // TODO what is `gem` and why is it needed? 
- ch_fastq.map{ meta, reads -> [meta + ["gem": meta.id, "samples": [meta.id]], reads] }, + ch_fastq.map{ meta, reads -> [meta + ["chemistry": protocol, "gem": meta.id, "samples": [meta.id]], reads] }, cellranger_index ) ch_versions = ch_versions.mix(CELLRANGER_COUNT.out.versions) diff --git a/subworkflows/local/kallisto_bustools.nf b/subworkflows/local/kallisto_bustools.nf index 9d63ef1e..3210e47a 100644 --- a/subworkflows/local/kallisto_bustools.nf +++ b/subworkflows/local/kallisto_bustools.nf @@ -15,7 +15,6 @@ workflow KALLISTO_BUSTOOLS { kallisto_index txp2gene protocol - chemistry kb_workflow ch_fastq diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 581bf2c4..aeed5a0a 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -68,7 +68,10 @@ include { MULTIQC } from '../modules/nf-core/multiqc/main' // TODO: Are this channels still necessary? ch_output_docs = file("$projectDir/docs/output.md", checkIfExists: true) ch_output_docs_images = file("$projectDir/docs/images/", checkIfExists: true) -(protocol, chemistry, other_parameters) = WorkflowScrnaseq.formatProtocol(params.protocol, params.aligner) +protocol_config = WorkflowScrnaseq.getProtocol(workflow, log, params.aligner, params.protocol) +if (protocol_config['protocol'] == 'auto' && params.aligner != "cellranger") { + error "Only cellranger supports `protocol = 'auto'`. Please specify the protocol manually!" 
+} // general input and params ch_input = file(params.input) @@ -81,8 +84,8 @@ ch_multiqc_star = Channel.empty() ch_multiqc_cellranger = Channel.empty() if (params.barcode_whitelist) { ch_barcode_whitelist = file(params.barcode_whitelist) -} else if (params.protocol.contains("10X")) { - ch_barcode_whitelist = file("$baseDir/assets/whitelist/10x_${chemistry}_barcode_whitelist.txt.gz", checkIfExists: true) +} else if (protocol_config.containsKey("whitelist")) { + ch_barcode_whitelist = file("$projectDir/${protocol_config['whitelist']}", checkIfExists: true) } else { ch_barcode_whitelist = [] } @@ -137,8 +140,7 @@ workflow SCRNASEQ { ch_filter_gtf, ch_kallisto_index, ch_txp2gene, - protocol, - chemistry, + protocol_config['protocol'], kb_workflow, ch_fastq ) @@ -156,8 +158,7 @@ workflow SCRNASEQ { ch_salmon_index, ch_txp2gene, ch_barcode_whitelist, - protocol, - chemistry, + protocol_config['protocol'], ch_fastq ) ch_versions = ch_versions.mix(SCRNASEQ_ALEVIN.out.ch_versions) @@ -171,11 +172,11 @@ workflow SCRNASEQ { ch_genome_fasta, ch_filter_gtf, ch_star_index, - protocol, + protocol_config['protocol'], ch_barcode_whitelist, ch_fastq, star_feature, - other_parameters + protocol_config.get('extra_args', "") ) ch_versions = ch_versions.mix(STARSOLO.out.ch_versions) ch_mtx_matrices = ch_mtx_matrices.mix(STARSOLO.out.star_counts) @@ -189,7 +190,8 @@ workflow SCRNASEQ { ch_genome_fasta, ch_filter_gtf, ch_cellranger_index, - ch_fastq + ch_fastq, + protocol_config['protocol'] ) ch_versions = ch_versions.mix(CELLRANGER_ALIGN.out.ch_versions) ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_ALIGN.out.cellranger_out) @@ -205,7 +207,7 @@ workflow SCRNASEQ { ch_genome_fasta, ch_filter_gtf, ch_universc_index, - params.universc_technology, + protocol_config['protocol'], ch_fastq ) ch_versions = ch_versions.mix(UNIVERSC_ALIGN.out.ch_versions)