From fdaf9d6666ccfd92d021591434b2a216ff2f00e0 Mon Sep 17 00:00:00 2001 From: LaurenceKuhl Date: Thu, 23 Nov 2023 14:00:04 +0100 Subject: [PATCH 1/7] cutadapt --- workflows/crisprseq_screening.nf | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/workflows/crisprseq_screening.nf b/workflows/crisprseq_screening.nf index e37619ce..aae058c9 100644 --- a/workflows/crisprseq_screening.nf +++ b/workflows/crisprseq_screening.nf @@ -55,6 +55,7 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // MODULE: Installed directly from nf-core/modules // include { FASTQC } from '../modules/nf-core/fastqc/main' +include { CUTADAPT } from '../modules/nf-core/cutadapt/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { MAGECK_COUNT } from '../modules/nf-core/mageck/count/main' include { MAGECK_MLE } from '../modules/nf-core/mageck/mle/main' @@ -98,14 +99,21 @@ workflow CRISPRSEQ_SCREENING { ch_input ) + ch_input.dump(tag: "ch_input") + CUTADAPT( + ch_input + ) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + // this is to concatenate everything for mageck count ch_input .map { meta, fastq -> [meta.condition, fastq, meta.single_end] } + // if one element is paired-end and the other single-end throw an error + // otherwise just concatenate the conditions and the fastqs .reduce { a, b -> if(a[2] != b[2] ) { error "Your samplesheet contains a mix of single-end and paired-end data. This is not supported." 
From 77e94707bd592ca71395de2c160637aafad1c39d Mon Sep 17 00:00:00 2001 From: LaurenceKuhl Date: Wed, 29 Nov 2023 10:21:23 +0100 Subject: [PATCH 2/7] Add cutadapt in crispr screening --- conf/modules.config | 5 +++-- conf/test_targeted.config | 3 +++ modules/nf-core/cutadapt/main.nf | 10 +--------- nextflow.config | 1 + nextflow_schema.json | 4 ++++ workflows/crisprseq_screening.nf | 17 +++++++++++++---- 6 files changed, 25 insertions(+), 15 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index afae335d..e1aca9a7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -100,7 +100,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ext.prefix = 'count_table' - } withName: MAGECK_MLE { @@ -131,7 +130,9 @@ process { } withName: CUTADAPT { - ext.args = '-g file:overrepresented.fasta -N' + if(params.analysis == 'targeted') { + ext.args = '-g file:overrepresented.fasta -N' + } publishDir = [ path: { "${params.outdir}/preprocessing/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, diff --git a/conf/test_targeted.config b/conf/test_targeted.config index c2e9ef7f..9bf6cabc 100644 --- a/conf/test_targeted.config +++ b/conf/test_targeted.config @@ -25,4 +25,7 @@ params { // Aligner aligner = 'minimap2' + + // Steps + overrepresented = true } diff --git a/modules/nf-core/cutadapt/main.nf b/modules/nf-core/cutadapt/main.nf index 936c259b..07fe5f89 100644 --- a/modules/nf-core/cutadapt/main.nf +++ b/modules/nf-core/cutadapt/main.nf @@ -21,7 +21,6 @@ process CUTADAPT { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - if (adapter_seq != []) """ cutadapt \\ --cores $task.cpus \\ @@ -34,14 +33,7 @@ process CUTADAPT { cutadapt: \$(cutadapt --version) END_VERSIONS """ - else - """ - cat <<-END_VERSIONS > versions.yml - "${task.process}": - cutadapt: \$(cutadapt --version) - END_VERSIONS - """ - + stub: def prefix = task.ext.prefix ?: 
"${meta.id}" def trimmed = meta.single_end ? "${prefix}.trim.fastq.gz" : "${prefix}_1.trim.fastq.gz ${prefix}_2.trim.fastq.gz" diff --git a/nextflow.config b/nextflow.config index 5f898729..9c2274d5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,6 +16,7 @@ params { protospacer = null library = null crisprcleanr = null + cutadapt = null rra_contrasts = null mle_design_matrix = null count_table = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 286554b9..3c3a955f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -183,6 +183,10 @@ "type": "string", "description": "sgRNA library annotation for crisprcleanR" }, + "cutadapt": { + "type": "string", + "description": "cutadapt adapter" + }, "min_reads": { "type": "number", "description": "a filter threshold value for sgRNAs, based on their average counts in the control sample", diff --git a/workflows/crisprseq_screening.nf b/workflows/crisprseq_screening.nf index aae058c9..5e5980af 100644 --- a/workflows/crisprseq_screening.nf +++ b/workflows/crisprseq_screening.nf @@ -98,14 +98,22 @@ workflow CRISPRSEQ_SCREENING { FASTQC ( ch_input ) + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + + + empty_channel = Channel.value([[]]) + ch_input_cutadapt = ch_input.combine(Channel.value([[]])) - ch_input.dump(tag: "ch_input") CUTADAPT( - ch_input + ch_input_cutadapt ) + ch_versions = ch_versions.mix(CUTADAPT.out.versions) - - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + CUTADAPT.out.reads + .map{ meta, fastq -> + [meta, [fastq]] + } + .set { ch_input } // this is to concatenate everything for mageck count ch_input @@ -126,6 +134,7 @@ workflow CRISPRSEQ_SCREENING { .set { joined } + // // MODULE: Run mageck count // From ee83a95ca8df306036a2418478557c4d1ed673ee Mon Sep 17 00:00:00 2001 From: LaurenceKuhl Date: Wed, 29 Nov 2023 14:29:29 +0100 Subject: [PATCH 3/7] add cutadapt --- conf/modules.config | 3 +++ workflows/crisprseq_screening.nf | 2 ++ 2 files changed, 5 
insertions(+) diff --git a/conf/modules.config b/conf/modules.config index e1aca9a7..fb82328d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -133,6 +133,9 @@ process { if(params.analysis == 'targeted') { ext.args = '-g file:overrepresented.fasta -N' } + if(params.analysis == 'screening' && params.cutadapt) { + ext.args = "-a ${params.cutadapt}" + } publishDir = [ path: { "${params.outdir}/preprocessing/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, diff --git a/workflows/crisprseq_screening.nf b/workflows/crisprseq_screening.nf index 5e5980af..c637b6cc 100644 --- a/workflows/crisprseq_screening.nf +++ b/workflows/crisprseq_screening.nf @@ -104,6 +104,7 @@ workflow CRISPRSEQ_SCREENING { empty_channel = Channel.value([[]]) ch_input_cutadapt = ch_input.combine(Channel.value([[]])) + if(params.cutadapt) { CUTADAPT( ch_input_cutadapt ) @@ -114,6 +115,7 @@ workflow CRISPRSEQ_SCREENING { [meta, [fastq]] } .set { ch_input } + } // this is to concatenate everything for mageck count ch_input From 6eab0eb41e01a8fa8d9cb23235cb0ade926806a5 Mon Sep 17 00:00:00 2001 From: LaurenceKuhl Date: Wed, 29 Nov 2023 15:45:26 +0100 Subject: [PATCH 4/7] Add docs --- docs/output/screening.md | 16 ++++++++++++++++ docs/usage/screening.md | 14 ++++++++------ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/docs/output/screening.md b/docs/output/screening.md index 869a7d33..29e5fa5d 100644 --- a/docs/output/screening.md +++ b/docs/output/screening.md @@ -16,6 +16,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Preprocessing](#preprocessing) - [FastQC](#fastqc) - Read Quality Control + - [cutadapt](#cutadapt) - Trimming reads from fastq files - [Counting](#counting) - [MAGeCK count](#mageck-count) - Mapping reads to reference - [CNV correction](#cnv-correction)) @@ -42,6 +43,21 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d 
[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). + +### cutadapt + +
+Output files + +- `cutadapt/` + - `*.log`: log file of the command ran and the output + - `*.trim.fastq.gz`: trimmed fastq files + +
+ +[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). + + ## Counting ### MAGeCK count diff --git a/docs/usage/screening.md b/docs/usage/screening.md index 63fba93e..63a4db99 100644 --- a/docs/usage/screening.md +++ b/docs/usage/screening.md @@ -52,15 +52,17 @@ After the alignment step, the pipeline currently supports 3 algorithms to detect MAGeCK RRA performs robust ranking aggregation to identify genes that are consistently ranked highly across multiple replicate screens. To run MAGeCK rra, `--rra_contrasts` contains two columns : treatment and reference. These two columns should be separated with a dot comma (;) and contain the `csv` extension. You can also integrate several samples/conditions by comma separating them. Please find an example here below : -| treatment | reference | -| ----------------------- | ------------------- | -| treatment1 | control1 | -| treatment1,treatment2 | control1,control2 | -| ----------------------- | ------------------- | -| treatment1 | control1 | +| reference | treatment | +|-------------------|-----------------------| +| control1 | treatment1 | +| control1,control2 | treatment1,treatment2 | A full example can be found [here](https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/full_test/samplesheet_full.csv). +### cutadapt + +MAGeCK is normally able to automatically determine the trimming length and sgRNA length, in most cases. Therefore, you don't need to go to this step unless MAGeCK fails to do so by itself. 
If the nucleotide length in front of sgRNA varies between different reads, you can use cutadapt to remove the adaptor sequences by using the flag `--crisprcleanr ADAPTER`. + ### MAGeCK mle MAGeCK MLE uses a maximum likelihood estimation approach to estimate the effects of gene knockout on cell fitness. It models the read count data of guide RNAs targeting each gene and estimates the dropout probability for each gene. MAGeCK mle requires a design matrix. The design matrix is a `txt` file indicating the effects of different conditions on different samples. From 62c0525fc1ac475543df25485f2ab063bedd4ddd Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Thu, 30 Nov 2023 09:48:41 +0000 Subject: [PATCH 5/7] [automated] Fix linting with Prettier --- docs/output/screening.md | 2 -- docs/usage/screening.md | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/output/screening.md b/docs/output/screening.md index 29e5fa5d..d351c596 100644 --- a/docs/output/screening.md +++ b/docs/output/screening.md @@ -43,7 +43,6 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). - ### cutadapt
@@ -57,7 +56,6 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). - ## Counting ### MAGeCK count diff --git a/docs/usage/screening.md b/docs/usage/screening.md index 63a4db99..ab40d62c 100644 --- a/docs/usage/screening.md +++ b/docs/usage/screening.md @@ -53,7 +53,7 @@ After the alignment step, the pipeline currently supports 3 algorithms to detect MAGeCK RRA performs robust ranking aggregation to identify genes that are consistently ranked highly across multiple replicate screens. To run MAGeCK rra, `--rra_contrasts` contains two columns : treatment and reference. These two columns should be separated with a dot comma (;) and contain the `csv` extension. You can also integrate several samples/conditions by comma separating them. Please find an example here below : | reference | treatment | -|-------------------|-----------------------| +| ----------------- | --------------------- | | control1 | treatment1 | | control1,control2 | treatment1,treatment2 | @@ -61,7 +61,7 @@ A full example can be found [here](https://raw.githubusercontent.com/nf-core/tes ### cutadapt -MAGeCK is normally able to automatically determine the trimming length and sgRNA length, in most cases. Therefore, you don't need to go to this step unless MAGeCK fails to do so by itself. If the nucleotide length in front of sgRNA varies between different reads, you can use cutadapt to remove the adaptor sequences by using the flag `--crisprcleanr ADAPTER`. 
+MAGeCK is normally able to automatically determine the trimming length and sgRNA length, in most cases. Therefore, you don't need to go to this step unless MAGeCK fails to do so by itself. If the nucleotide length in front of sgRNA varies between different reads, you can use cutadapt to remove the adaptor sequences by using the flag `--crisprcleanr ADAPTER`. ### MAGeCK mle From 1c7fedbabb21ffeb8f86f2ed91bce16162ea4571 Mon Sep 17 00:00:00 2001 From: LaurenceKuhl Date: Thu, 7 Dec 2023 13:55:48 +0100 Subject: [PATCH 6/7] fix cutadapt part --- docs/output/screening.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/output/screening.md b/docs/output/screening.md index d351c596..aa163333 100644 --- a/docs/output/screening.md +++ b/docs/output/screening.md @@ -54,7 +54,8 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +[cutadapt](https://cutadapt.readthedocs.io/en/stable/) finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput sequencing reads. MAGeCK count normally detects and trims adapter sequences automatically; however, if trimming lengths differ between reads, cutadapt can be used, as mentioned [here](https://sourceforge.net/p/mageck/wiki/advanced_tutorial/). +For further reading and documentation see the [cutadapt helper page](https://cutadapt.readthedocs.io/en/stable/guide.html). ## Counting From 94c45f7447d6877e88fef3ee82674b7522576827 Mon Sep 17 00:00:00 2001 From: LaurenceKuhl Date: Thu, 7 Dec 2023 14:09:13 +0100 Subject: [PATCH 7/7] Fix typos in the docs --- docs/usage/screening.md | 2 +- nextflow_schema.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/usage/screening.md b/docs/usage/screening.md index ab40d62c..34325513 100644 --- a/docs/usage/screening.md +++ b/docs/usage/screening.md @@ -61,7 +61,7 @@ A full example can be found [here](https://raw.githubusercontent.com/nf-core/tes ### cutadapt -MAGeCK is normally able to automatically determine the trimming length and sgRNA length, in most cases. Therefore, you don't need to go to this step unless MAGeCK fails to do so by itself. If the nucleotide length in front of sgRNA varies between different reads, you can use cutadapt to remove the adaptor sequences by using the flag `--crisprcleanr ADAPTER`. 
+MAGeCK can usually determine the trimming length and sgRNA length automatically, so you only need this step if MAGeCK fails to do so by itself. If the number of nucleotides in front of the sgRNA varies between reads, you can use cutadapt to remove the adapter sequences by using the flag `--cutadapt ADAPTER`. ### MAGeCK mle diff --git a/nextflow_schema.json b/nextflow_schema.json index 3c3a955f..0248e10d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -185,7 +185,7 @@ }, "cutadapt": { "type": "string", - "description": "cutadapt adapter" + "description": "cutadapt adapter for screening analysis" }, "min_reads": { "type": "number",