Skip to content

Commit

Permalink
add samplesheet functionality
Browse files Browse the repository at this point in the history
This is inspired by the samplesheet functionality in nf-core/nanoseq and
nf-core/rnaseq.
The idea is to require a samplesheet to run the pipeline, which will
allow for single-/paired-end auto-detection and for mapping samples against
different reference genomes.

addresses nf-core#181
  • Loading branch information
phue committed Mar 22, 2021
1 parent baf4faa commit eaf7ad1
Show file tree
Hide file tree
Showing 5 changed files with 331 additions and 0 deletions.
124 changes: 124 additions & 0 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/usr/bin/env python

import os
import sys
import errno
import argparse


def parse_args(args=None):
    """Parse the command-line arguments of the samplesheet checker.

    Args:
        args: Optional list of argument strings. When ``None``, argparse
            reads the arguments from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with the attributes ``FILE_IN`` (input
        samplesheet) and ``FILE_OUT`` (validated output file).
    """
    # This script is bundled with nf-core/methylseq, so the help text names
    # that pipeline (the original text was copied over from nf-core/rnaseq).
    description = "Reformat nf-core/methylseq samplesheet file and check its contents."
    epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"

    parser = argparse.ArgumentParser(description=description, epilog=epilog)
    parser.add_argument("FILE_IN", help="Input samplesheet file.")
    parser.add_argument("FILE_OUT", help="Output file.")
    return parser.parse_args(args)


def make_dir(path):
    """Create the directory *path*, including missing parents.

    An empty *path* is a no-op, and an already existing directory is not an
    error.

    Args:
        path: Directory to create; may be an empty string.

    Raises:
        OSError: If the directory cannot be created, including the case
            where *path* already exists but is not a directory.
    """
    if not path:
        return
    try:
        os.makedirs(path)
    except OSError as exception:
        # Only an already existing *directory* is harmless. If the path exists
        # as a regular file, swallowing EEXIST would merely postpone the
        # failure to the later attempt to write into it.
        if exception.errno != errno.EEXIST or not os.path.isdir(path):
            raise


def print_error(error, context='Line', context_str=''):
    """Print a samplesheet error message and exit with status 1.

    Args:
        error: Description of the problem.
        context: Label for the offending item (e.g. ``'Line'``).
        context_str: The offending item itself. The label and item are only
            appended to the message when both are non-empty.
    """
    has_context = not (context == '' or context_str == '')
    if has_context:
        error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
            error, context.strip(), context_str.strip()
        )
    else:
        error_str = "ERROR: Please check samplesheet -> {}".format(error)
    print(error_str)
    sys.exit(1)


def check_samplesheet(file_in, file_out):
    """
    Validate a samplesheet and write a reformatted copy of it.

    The input is expected to be a comma-separated file with the following
    structure (fastq_2 and genome may be left empty):

    sample,fastq_1,fastq_2,genome
    WT_LIB1_REP1,WT_LIB1_REP1_1.fastq.gz,WT_LIB1_REP1_2.fastq.gz,GRCh38
    KO_LIB1_REP1,KO_LIB1_REP1_1.fastq.gz,,reference.fa.gz

    The output has the columns sample,single_end,fastq_1,fastq_2,genome,
    where single_end is "1" when only fastq_1 is given and "0" when both
    FastQ files are given. Rows are written sorted by sample name.

    Blank lines in the input are ignored. Any validation failure (including
    a samplesheet without any sample rows) prints an error message and
    terminates the program with exit status 1.

    Args:
        file_in: Path of the samplesheet to validate.
        file_out: Path of the reformatted samplesheet to write; its parent
            directory is created if necessary.
    """

    MIN_COLS = 2
    HEADER = ['sample', 'fastq_1', 'fastq_2', 'genome']
    FASTQ_EXTENSIONS = (".fastq.gz", ".fq.gz")
    GENOME_EXTENSIONS = ('.fasta', '.fa', '.fasta.gz', '.fa.gz')

    sample_run_dict = {}
    with open(file_in, "r") as fin:

        ## Check header
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[:len(HEADER)] != HEADER:
            print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
            sys.exit(1)

        ## Check sample entries
        for line in fin:
            ## Ignore blank lines (e.g. trailing newlines at the end of the file)
            if not line.strip():
                continue

            lspl = [x.strip().strip('"') for x in line.strip().split(",")]

            ## Check valid number of columns per row
            if len(lspl) < len(HEADER):
                print_error("Invalid number of columns (minimum = {})!".format(len(HEADER)), 'Line', line)

            num_cols = len([x for x in lspl if x])
            if num_cols < MIN_COLS:
                print_error("Invalid number of populated columns (minimum = {})!".format(MIN_COLS), 'Line', line)

            ## Check sample name entries
            sample, fastq_1, fastq_2, genome = lspl[:len(HEADER)]
            if sample:
                if " " in sample:
                    print_error("Sample entry contains spaces!", 'Line', line)
            else:
                print_error("Sample entry has not been specified!", 'Line', line)

            ## Check FastQ file extension
            for fastq in [fastq_1, fastq_2]:
                if fastq:
                    if " " in fastq:
                        print_error("FastQ file contains spaces!", 'Line', line)
                    if not fastq.endswith(FASTQ_EXTENSIONS):
                        print_error("FastQ file does not have extension '.fastq.gz' or '.fq.gz'!", 'Line', line)

            ## Check genome entries
            if genome:
                if ' ' in genome:
                    print_error("Genome entry contains spaces!", 'Line', line)
                ## An entry containing a dot is treated as a fasta file path,
                ## anything else is passed through unchecked as a genome name.
                ## NOTE(review): genome names that themselves contain a dot
                ## would be rejected here - confirm this is intended.
                if '.' in genome and not genome.endswith(GENOME_EXTENSIONS):
                    print_error("Genome entry does not have extension '.fasta', '.fa', '.fasta.gz' or '.fa.gz'!", 'Line', line)

            ## Auto-detect paired-end/single-end
            sample_info = []  ## [single_end, fastq_1, fastq_2, genome]
            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
                sample_info = ["0", fastq_1, fastq_2, genome]
            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
                sample_info = ["1", fastq_1, fastq_2, genome]
            else:
                print_error("Invalid combination of columns provided!", 'Line', line)

            ## Create sample mapping dictionary = {sample: [ single_end, fastq_1, fastq_2, genome ]}
            if sample not in sample_run_dict:
                sample_run_dict[sample] = sample_info
            else:
                print_error("Samplesheet contains duplicate rows!", 'Line', line)

    ## Write validated samplesheet with appropriate columns
    if len(sample_run_dict) > 0:
        out_dir = os.path.dirname(file_out)
        make_dir(out_dir)
        with open(file_out, "w") as fout:
            fout.write(",".join(['sample', 'single_end', 'fastq_1', 'fastq_2', 'genome']) + "\n")
            for sample_id, sample_info in sorted(sample_run_dict.items()):
                ## Write to file
                fout.write(','.join([sample_id] + sample_info) + '\n')
    else:
        ## Fail loudly instead of exiting successfully without an output file
        print_error("No entries to process!", 'Samplesheet', file_in)


def main(args=None):
    """Command-line entry point: parse the arguments and check the samplesheet.

    Args:
        args: Optional list of argument strings, handed to ``parse_args``.
    """
    options = parse_args(args)
    check_samplesheet(options.FILE_IN, options.FILE_OUT)


if __name__ == "__main__":
    sys.exit(main())
59 changes: 59 additions & 0 deletions modules/local/process/functions.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* -----------------------------------------------------
* Utility functions used in nf-core DSL2 module files
* -----------------------------------------------------
*/

/*
 * Derive the lower-case software tool name from a process name ($task.process):
 * take the last ':'-separated element and keep the part before its first '_'
 */
def getSoftwareName(task_process) {
    def process_name = task_process.tokenize(':')[-1]
    def tool_name    = process_name.tokenize('_')[0]
    return tool_name.toLowerCase()
}

/*
 * Build the Groovy Map of options understood by nf-core modules, filling in
 * a default for every option that is missing or empty in 'args'
 * ('publish_files' is copied as-is and stays null when not given)
 */
def initOptions(Map args) {
    return [
        args          : (args.args ?: ''),
        args2         : (args.args2 ?: ''),
        publish_by_id : (args.publish_by_id ?: false),
        publish_dir   : (args.publish_dir ?: ''),
        publish_files : args.publish_files,
        suffix        : (args.suffix ?: '')
    ]
}

/*
 * Tidy up and join elements of a list to return a path string.
 * Null, empty and whitespace-only elements are dropped; the remaining elements
 * are trimmed, stripped of leading/trailing slashes and joined with '/'.
 */
def getPathFromList(path_list) {
    // Rely on Groovy truth: 'item?.trim()' is falsy for null as well as for blank strings.
    // (Calling '.isEmpty()' on the result would throw a NullPointerException for null items.)
    def paths = path_list.findAll { item -> item?.trim() }                 // Remove null/empty entries
    paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") }     // Trim whitespace and leading/trailing slashes
    return paths.join('/')
}

/*
 * Function to save/publish module results: works out the path (relative to the
 * publish directory) under which a file is saved, or returns null for files
 * that match none of the rules below
 */
def saveFiles(Map args) {
    // Files ending in '.version.txt' never get a publish path
    if (args.filename.endsWith('.version.txt')) {
        return null
    }

    def ioptions  = initOptions(args.options)
    def base_path = [ ioptions.publish_dir ?: args.publish_dir ]
    if (ioptions.publish_by_id) {
        base_path << args.publish_id
    }

    // No file filter supplied: every file goes directly below the base path
    if (ioptions.publish_files == null) {
        return "${getPathFromList(base_path)}/$args.filename"
    }

    // Map of [ suffix: sub-directory ]: the first suffix matching the filename wins
    if (ioptions.publish_files instanceof Map) {
        def match = ioptions.publish_files.find { entry -> args.filename.endsWith(entry.key) }
        if (match != null) {
            return "${getPathFromList(base_path + [ match.value ])}/$args.filename"
        }
    }
    return null
}
36 changes: 36 additions & 0 deletions modules/local/process/get_software_versions.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Import generic module functions
include { saveFiles } from './functions'

params.options = [:]

/*
 * Parse software version numbers:
 * records the pipeline and Nextflow versions and runs scrape_software_versions.py
 * to produce the software version reports
 */
process GET_SOFTWARE_VERSIONS {
// Publish below '<params.outdir>/pipeline_info'; saveFiles() decides the final path
// and returns null for '*.version.txt' files
publishDir "${params.outdir}",
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', publish_id:'') }

// Software environment: conda package when params.enable_conda is set,
// otherwise a Singularity image or a Docker (biocontainers) image
conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/python:3.8.3"
} else {
container "quay.io/biocontainers/python:3.8.3"
}

// Caching is switched off for this process
cache false

input:
// presumably the per-tool version files read by scrape_software_versions.py - TODO confirm
path versions

output:
// NOTE(review): the script below does not write 'software_versions.csv' itself;
// presumably scrape_software_versions.py creates it - confirm
path "software_versions.csv" , emit: csv
path 'software_versions_mqc.yaml', emit: yaml

script:
// Write the pipeline and Nextflow versions to '*.version.txt' files, then run the
// scraper with both stdout and stderr redirected into the YAML report
"""
echo $workflow.manifest.version > pipeline.version.txt
echo $workflow.nextflow.version > nextflow.version.txt
scrape_software_versions.py &> software_versions_mqc.yaml
"""
}
85 changes: 85 additions & 0 deletions modules/local/process/samplesheet_check.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// Import generic module functions
include { saveFiles } from './functions'

params.options = [:]

/*
 * Reformat samplesheet file and check validity
 */
process SAMPLESHEET_CHECK {
    tag "$samplesheet"
    // Publish the validated samplesheet below '<params.outdir>/pipeline_info'
    publishDir "${params.outdir}",
        mode: params.publish_dir_mode,
        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', publish_id:'') }

    // Same container selection as the other local python-based process
    // (GET_SOFTWARE_VERSIONS): use the Singularity image when running with
    // Singularity, unless pulling Docker containers was explicitly requested
    conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
        container "https://depot.galaxyproject.org/singularity/python:3.8.3"
    } else {
        container "quay.io/biocontainers/python:3.8.3"
    }

    input:
    path samplesheet

    output:
    path '*.csv'


    script: // This script is bundled with the pipeline, in nf-core/methylseq/bin/
    """
    check_samplesheet.py $samplesheet samplesheet.valid.csv
    """
}


// Function to build the meta map [ id, single_end ] from one samplesheet row
def get_metadata(LinkedHashMap sample) {
    return [
        id         : sample.sample,
        single_end : sample.single_end.toBoolean()
    ]
}

// Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
// (only [ fastq_1 ] for single-end samples); aborts the run when a FastQ file is missing
def get_samplesheet_paths(LinkedHashMap sample) {
    def meta  = get_metadata(sample)
    def read1 = file(sample.fastq_1)

    if (!read1.exists()) {
        exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${sample.fastq_1}"
    }
    if (meta.single_end) {
        return [ meta, [ read1 ] ]
    }

    // Paired-end: the second read file is required as well
    def read2 = file(sample.fastq_2)
    if (!read2.exists()) {
        exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${sample.fastq_2}"
    }
    return [ meta, [ read1, read2 ] ]
}

// Function to get list of [ meta, genome ], where genome is a map holding 'fasta' and,
// for genomes found in genomeMap, the aligner-specific index ('bismark_index' or 'fasta_index')
def get_genome_paths(LinkedHashMap sample, LinkedHashMap genomeMap) {
    def meta   = get_metadata(sample)
    def genome = [:]

    if (sample.genome) {
        if (genomeMap && genomeMap.containsKey(sample.genome)) {
            // get fasta and indices from iGenomes
            genome.fasta = file(genomeMap[sample.genome].fasta, checkIfExists: true)
            if (params.aligner =~ /bismark/) {
                genome.bismark_index = file(genomeMap[sample.genome].bismark, checkIfExists: true)
            } else if (params.aligner == 'bwameth') {
                // '==' is required here: a single '=' would assign to params.aligner
                // and make this branch unconditionally true
                genome.fasta_index = file(genomeMap[sample.genome].fasta_index, checkIfExists: true)
            }
        } else {
            // genome is a fasta file, or not part of iGenomes
            genome.fasta = file(sample.genome, checkIfExists: true)
        }
    } else if ( params.fasta ) {
        // genome entry of this sample is empty, fall back to params.fasta
        genome.fasta = file(params.fasta, checkIfExists: true)
    } else {
        exit 1, "ERROR: Please either supply a fasta file with --fasta or specify genome column in the samplesheet"
    }

    return [ meta, genome ]
}
27 changes: 27 additions & 0 deletions modules/local/subworkflow/input_check.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
* Check input samplesheet and get read channels
*/

params.options = [:]

include {
SAMPLESHEET_CHECK;
get_samplesheet_paths;
get_genome_paths} from '../process/samplesheet_check' addParams( options: params.options )

// Validate the samplesheet and turn each of its rows into a reads entry and a genome entry
workflow INPUT_CHECK {
take:
samplesheet // file: /path/to/samplesheet.csv

main:
// Run the samplesheet check and split its CSV output into one map per row, keyed by the header
SAMPLESHEET_CHECK ( samplesheet )
.splitCsv ( header:true, sep:',' )
.set { sample }

// The same row channel feeds both mappings
reads = sample.map { get_samplesheet_paths(it) }
genome = sample.map { get_genome_paths(it, params.genomes) }

emit:
reads // channel: [ val(meta), [ reads ] ]
genome // channel: [ val(meta), genome ] where genome is a map with 'fasta' and, for genomes found in params.genomes, 'bismark_index' or 'fasta_index'
}

0 comments on commit eaf7ad1

Please sign in to comment.