add samplesheet functionality

This is inspired by the functionality in nf-core/nanoseq and nf-core/rnaseq. The idea is to require a samplesheet to run the pipeline, which will allow for single/paired-end auto-detection and mapping samples against different reference genomes. Addresses nf-core#181
phue · Mar 22, 2021 · eaf7ad1 · eaf7ad1
1 parent baf4faa
commit eaf7ad1
Show file tree

Hide file tree

Showing 5 changed files with 331 additions and 0 deletions.
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import errno
+import argparse
+
+
+def parse_args(args=None):
+    """
+    Parse the command-line arguments of the samplesheet checker.
+
+    args: optional list of argument strings; when None, argparse reads sys.argv[1:].
+    Returns the argparse namespace with the attributes FILE_IN and FILE_OUT.
+    """
+    ## The help text names the pipeline this script is shipped with (nf-core/methylseq)
+    description = "Reformat nf-core/methylseq samplesheet file and check its contents."
+    epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"
+
+    parser = argparse.ArgumentParser(description=description, epilog=epilog)
+    parser.add_argument("FILE_IN", help="Input samplesheet file.")
+    parser.add_argument("FILE_OUT", help="Output file.")
+    return parser.parse_args(args)
+
+
+def make_dir(path):
+    """
+    Create the directory `path`, including missing parents, unless `path` is empty.
+
+    An OSError with errno EEXIST is ignored; any other OSError is re-raised.
+    """
+    if len(path) == 0:
+        return
+    try:
+        os.makedirs(path)
+    except OSError as err:
+        already_exists = err.errno == errno.EEXIST
+        if not already_exists:
+            raise err
+
+def print_error(error, context='Line', context_str=''):
+    """
+    Print a samplesheet error message to stdout and exit with status 1.
+
+    error:       description of the problem.
+    context:     label for the offending item (default 'Line').
+    context_str: the offending item itself; the context is only appended to the
+                 message when both context and context_str are non-empty.
+    """
+    if context == '' or context_str == '':
+        message = "ERROR: Please check samplesheet -> {}".format(error)
+    else:
+        message = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(error, context.strip(), context_str.strip())
+    print(message)
+    sys.exit(1)
+
+
+def check_samplesheet(file_in, file_out):
+    """
+    Validate the samplesheet `file_in` and write a reformatted copy to `file_out`.
+
+    The samplesheet must start with the header below and provide all four columns in
+    every row; fastq_2 and genome may be left empty:
+
+    sample,fastq_1,fastq_2,genome
+    WT_LIB1_REP1,WT_LIB1_REP1_1.fastq.gz,WT_LIB1_REP1_2.fastq.gz,GRCh37
+    KO_LIB1_REP1,KO_LIB1_REP1_1.fastq.gz,,genome.fa
+
+    Per row the function checks that
+      - at least four columns exist and at least two of them are populated,
+      - the sample name is present and contains no spaces,
+      - FastQ entries contain no spaces and end in '.fastq.gz' or '.fq.gz',
+      - a genome entry contains no spaces and, if it contains a '.', ends in
+        '.fasta', '.fa', '.fasta.gz' or '.fa.gz',
+      - the sample name has not been seen before.
+
+    The output file has the columns sample,single_end,fastq_1,fastq_2,genome, where
+    single_end is "1" when fastq_2 is empty and "0" otherwise; rows are sorted by sample.
+
+    Any violation prints an error message and exits with status 1.
+    """
+
+    FASTQ_EXTENSIONS = (".fastq.gz", ".fq.gz")
+    GENOME_EXTENSIONS = ('.fasta', '.fa', '.fasta.gz', '.fa.gz')
+
+    sample_run_dict = {}
+    with open(file_in, "r") as fin:
+
+        ## Check header
+        MIN_COLS = 2
+        HEADER = ['sample', 'fastq_1', 'fastq_2', 'genome']
+        header = [x.strip('"') for x in fin.readline().strip().split(",")]
+        if header[:len(HEADER)] != HEADER:
+            print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
+            sys.exit(1)
+
+        ## Check sample entries
+        for line in fin:
+            lspl = [x.strip().strip('"') for x in line.strip().split(",")]
+
+            ## Check valid number of columns per row
+            if len(lspl) < len(HEADER):
+                print_error("Invalid number of columns (minimum = {})!".format(len(HEADER)), 'Line', line)
+
+            num_cols = len([x for x in lspl if x])
+            if num_cols < MIN_COLS:
+                print_error("Invalid number of populated columns (minimum = {})!".format(MIN_COLS), 'Line', line)
+
+            ## Check sample name entries
+            sample, fastq_1, fastq_2, genome = lspl[:len(HEADER)]
+            if not sample:
+                print_error("Sample entry has not been specified!", 'Line', line)
+            if " " in sample:
+                print_error("Sample entry contains spaces!", 'Line', line)
+
+            ## Check FastQ file extension
+            for fastq in [fastq_1, fastq_2]:
+                if fastq:
+                    if " " in fastq:
+                        print_error("FastQ file contains spaces!", 'Line', line)
+                    if not fastq.endswith(FASTQ_EXTENSIONS):
+                        print_error("FastQ file does not have extension '.fastq.gz' or '.fq.gz'!", 'Line', line)
+
+            ## Check genome entries
+            if genome:
+                if ' ' in genome:
+                    print_error("Genome entry contains spaces!", 'Line', line)
+                ## Only entries containing a '.' are treated as file names and need a FASTA extension
+                if '.' in genome and not genome.endswith(GENOME_EXTENSIONS):
+                    print_error("Genome entry does not have extension '.fasta', '.fa', '.fasta.gz' or '.fa.gz'!", 'Line', line)
+
+            ## Auto-detect paired-end/single-end
+            sample_info = []  ## [single_end, fastq_1, fastq_2, genome]
+            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
+                sample_info = ["0", fastq_1, fastq_2, genome]
+            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
+                sample_info = ["1", fastq_1, fastq_2, genome]
+            else:
+                print_error("Invalid combination of columns provided!", 'Line', line)
+
+            ## Create sample mapping dictionary = {sample: [ single_end, fastq_1, fastq_2, genome ]}
+            if sample not in sample_run_dict:
+                sample_run_dict[sample] = sample_info
+            else:
+                print_error("Samplesheet contains duplicate rows!", 'Line', line)
+
+    ## A samplesheet without any sample rows would otherwise exit successfully without
+    ## producing the output file, so report it explicitly
+    if not sample_run_dict:
+        print_error("No entries to process!", 'Samplesheet', file_in)
+
+    ## Write validated samplesheet with appropriate columns
+    out_dir = os.path.dirname(file_out)
+    make_dir(out_dir)
+    with open(file_out, "w") as fout:
+        fout.write(",".join(['sample', 'single_end', 'fastq_1', 'fastq_2', 'genome']) + "\n")
+        for sample_id, sample_info in sorted(sample_run_dict.items()):
+            ## Write to file
+            fout.write(','.join([sample_id] + sample_info) + '\n')
+
+
+def main(args=None):
+    """Parse the command line and validate/reformat the given samplesheet."""
+    parsed = parse_args(args)
+    check_samplesheet(file_in=parsed.FILE_IN, file_out=parsed.FILE_OUT)
+
+
+## Script entry point: main() returns None, so a run that gets this far exits with status 0
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/modules/local/process/functions.nf b/modules/local/process/functions.nf
@@ -0,0 +1,59 @@
+/*
+ * -----------------------------------------------------
+ *  Utility functions used in nf-core DSL2 module files
+ * -----------------------------------------------------
+ */
+
+/*
+ * Derive the lower-case software tool name from a process name ($task.process):
+ * take the last ':'-separated element and keep its first '_'-separated token
+ */
+def getSoftwareName(task_process) {
+    def process_name = task_process.tokenize(':')[-1]
+    def tool_name    = process_name.tokenize('_')[0]
+    return tool_name.toLowerCase()
+}
+
+/*
+ * Build the Groovy Map of options available to nf-core modules, filling in defaults:
+ * empty strings for args, args2, publish_dir and suffix, false for publish_by_id;
+ * publish_files is passed through unchanged (null when not supplied)
+ */
+def initOptions(Map args) {
+    return [
+        'args'          : args.args ?: '',
+        'args2'         : args.args2 ?: '',
+        'publish_by_id' : args.publish_by_id ?: false,
+        'publish_dir'   : args.publish_dir ?: '',
+        'publish_files' : args.publish_files,
+        'suffix'        : args.suffix ?: ''
+    ]
+}
+
+/*
+ * Tidy up the elements of a list and join them with '/' to return a path string
+ */
+def getPathFromList(path_list) {
+    // Drop entries that are empty once surrounding whitespace is removed
+    def non_empty = path_list.findAll { entry -> !entry?.trim().isEmpty() }
+    // Trim whitespace, then strip leading and trailing slashes from every entry
+    def cleaned = non_empty.collect { entry -> entry.trim().replaceAll("^[/]+|[/]+\$", "") }
+    return cleaned.join('/')
+}
+
+/*
+ * Function to save/publish module results
+ *
+ * Returns the publish path for args.filename, or null when the file is not published:
+ *   - files ending in '.version.txt' are never published
+ *   - publish_files == null      -> every file goes to the publish directory
+ *   - publish_files is a Map     -> only files ending in one of its keys are published,
+ *                                   inside the sub-directory given by the matching value
+ *   - publish_files anything else -> nothing is published
+ */
+def saveFiles(Map args) {
+    if (args.filename.endsWith('.version.txt')) {
+        return null
+    }
+
+    def ioptions  = initOptions(args.options)
+    // Module options take precedence over the publish_dir passed by the caller
+    def base_dirs = [ ioptions.publish_dir ?: args.publish_dir ]
+    if (ioptions.publish_by_id) {
+        base_dirs << args.publish_id
+    }
+
+    if (ioptions.publish_files == null) {
+        return "${getPathFromList(base_dirs)}/$args.filename"
+    }
+
+    if (ioptions.publish_files instanceof Map) {
+        for (entry in ioptions.publish_files) {
+            if (args.filename.endsWith(entry.key)) {
+                // The first matching suffix decides the target sub-directory
+                def target_dirs = base_dirs + [ entry.value ]
+                return "${getPathFromList(target_dirs)}/$args.filename"
+            }
+        }
+    }
+
+    return null
+}
diff --git a/modules/local/process/get_software_versions.nf b/modules/local/process/get_software_versions.nf
@@ -0,0 +1,36 @@
+// Import generic module functions
+include { saveFiles } from './functions'
+
+params.options = [:]
+
+/*
+ * Parse software version numbers
+ *
+ * Stages the collected version files, records the pipeline and Nextflow versions
+ * next to them and runs scrape_software_versions.py to summarise everything.
+ */
+process GET_SOFTWARE_VERSIONS {
+    // Publish below <outdir>/pipeline_info; saveFiles() returns no path for '*.version.txt' files
+    publishDir "${params.outdir}",
+        mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', publish_id:'') }
+
+    // Software environment: conda only when enabled, otherwise a Singularity image or a Docker container
+    conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
+    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
+        container "https://depot.galaxyproject.org/singularity/python:3.8.3"
+    } else {
+        container "quay.io/biocontainers/python:3.8.3"
+    }
+
+    // Disable task caching for this process
+    cache false
+
+    input:
+    path versions
+
+    output:
+    path "software_versions.csv"     , emit: csv
+    path 'software_versions_mqc.yaml', emit: yaml
+
+    // The YAML output captures both stdout and stderr of the scrape script ('&>').
+    // NOTE(review): software_versions.csv is not written by any command visible here;
+    // presumably scrape_software_versions.py creates it - confirm against that script.
+    script:
+    """
+    echo $workflow.manifest.version > pipeline.version.txt
+    echo $workflow.nextflow.version > nextflow.version.txt
+    scrape_software_versions.py &> software_versions_mqc.yaml
+    """
+}
diff --git a/modules/local/process/samplesheet_check.nf b/modules/local/process/samplesheet_check.nf
@@ -0,0 +1,85 @@
+// Import generic module functions
+include { saveFiles } from './functions'
+
+params.options = [:]
+
+/*
+ * Reformat design file and check validity
+ *
+ * Runs check_samplesheet.py on the input samplesheet and emits every CSV file
+ * found in the task directory afterwards (output glob '*.csv').
+ */
+process SAMPLESHEET_CHECK {
+    tag "$samplesheet"
+    // Publish below <outdir>/pipeline_info via the shared saveFiles() helper
+    publishDir "${params.outdir}",
+        mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', publish_id:'') }
+
+    // Software environment: conda only when enabled, otherwise the biocontainers python image
+    conda     (params.enable_conda ? "conda-forge::python=3.8.3" : null)
+    container "quay.io/biocontainers/python:3.8.3"
+
+    input:
+    path samplesheet
+
+    output:
+    path '*.csv'
+
+
+    // The validated samplesheet is written to samplesheet.valid.csv
+    script:  // This script is bundled with the pipeline, in nf-core/methylseq/bin/
+    """
+    check_samplesheet.py $samplesheet samplesheet.valid.csv
+    """
+}
+
+
+// Function to build the sample meta map: id (the sample name) and single_end (as boolean)
+def get_metadata(LinkedHashMap sample) {
+    return [
+        'id'         : sample.sample,
+        'single_end' : sample.single_end.toBoolean()
+    ]
+}
+
+// Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
+// Single-end samples yield [ meta, [ fastq_1 ] ]; the run is aborted when a required FastQ file is missing
+def get_samplesheet_paths(LinkedHashMap sample) {
+    def meta = get_metadata(sample)
+
+    if (!file(sample.fastq_1).exists()) {
+        exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${sample.fastq_1}"
+    }
+    def reads = [ file(sample.fastq_1) ]
+
+    // Read 2 is only looked at for paired-end samples
+    if (!meta.single_end) {
+        if (!file(sample.fastq_2).exists()) {
+            exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${sample.fastq_2}"
+        }
+        reads << file(sample.fastq_2)
+    }
+
+    return [ meta, reads ]
+}
+
+// Function to get list of [ meta, genome ]
+//
+// genome is a map that always holds 'fasta' and, for entries found in genomeMap,
+// an aligner-specific index:
+//   bismark_index - when params.aligner matches /bismark/
+//   fasta_index   - when params.aligner is 'bwameth'
+//
+// Resolution order for the reference:
+//   1. samplesheet genome column naming a key of genomeMap
+//   2. samplesheet genome column used directly as a fasta path
+//   3. params.fasta
+// The run is aborted when none of these is available.
+def get_genome_paths(LinkedHashMap sample, LinkedHashMap genomeMap) {
+    def meta = get_metadata(sample)
+
+    def genome = [:]
+
+    if (sample.genome) {
+        if (genomeMap && genomeMap.containsKey(sample.genome)) {
+            // get fasta and indices from iGenomes
+            genome.fasta = file(genomeMap[sample.genome].fasta, checkIfExists: true)
+            if (params.aligner =~ /bismark/) {
+                genome.bismark_index = file(genomeMap[sample.genome].bismark, checkIfExists: true)
+            }
+            // Compare with '==': a single '=' would assign to params.aligner and always be truthy
+            else if (params.aligner == 'bwameth') {
+                genome.fasta_index = file(genomeMap[sample.genome].fasta_index, checkIfExists: true)
+            }
+        } else {
+            // genome is a fasta file, or not part of iGenomes
+            genome.fasta = file(sample.genome, checkIfExists: true)
+        }
+    } else if (params.fasta) {
+        // samplesheet does not contain genome column, fall back to params.fasta
+        genome.fasta = file(params.fasta, checkIfExists: true)
+    } else {
+        exit 1, "ERROR: Please either supply a fasta file with --fasta or specify genome column in the samplesheet"
+    }
+
+    return [ meta, genome ]
+}
diff --git a/modules/local/subworkflow/input_check.nf b/modules/local/subworkflow/input_check.nf
@@ -0,0 +1,27 @@
+/*
+ * Check input samplesheet and get read channels
+ */
+
+params.options = [:]
+
+include {
+    SAMPLESHEET_CHECK;
+    get_samplesheet_paths;
+    get_genome_paths} from '../process/samplesheet_check' addParams( options: params.options )
+
+// Validate the samplesheet, split the validated CSV into one record per row and
+// derive the read and genome channels from those records
+workflow INPUT_CHECK {
+    take:
+    samplesheet // file: /path/to/samplesheet.csv
+
+    main:
+    // Each row of the validated CSV becomes a map keyed by the CSV header
+    SAMPLESHEET_CHECK ( samplesheet )
+        .splitCsv ( header:true, sep:',' )
+        .set { sample }
+
+    reads = sample.map { get_samplesheet_paths(it) }
+    // params.genomes is handed over as the lookup table for genome names
+    genome = sample.map { get_genome_paths(it, params.genomes) }
+
+    emit:
+    reads // channel: [ val(meta), [ reads ] ]
+    genome // channel: [ val(meta), genome ] - genome is the map built by get_genome_paths (fasta plus optional index entry)
+}