forked from nf-core/methylseq
This is inspired by the functionality in nf-core/nanoseq and nf-core/rnaseq. The idea is to require a samplesheet to run the pipeline, which allows for single/paired-end auto-detection and for mapping samples against different reference genomes. Addresses nf-core#181.
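
For context, a minimal input samplesheet of the kind this change expects might look like the example below (sample names, file names and the GRCh38 key are illustrative, not taken from the commit; the genome column can hold an iGenomes key or a fasta path, or be left empty to fall back to --fasta):

sample,fastq_1,fastq_2,genome
WT_REP1,WT_REP1_1.fastq.gz,WT_REP1_2.fastq.gz,GRCh38
SE_SAMPLE,SE_SAMPLE.fastq.gz,,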
Showing 5 changed files with 331 additions and 0 deletions.
@@ -0,0 +1,124 @@
#!/usr/bin/env python

import os
import sys
import errno
import argparse


def parse_args(args=None):
    Description = "Reformat nf-core/methylseq samplesheet file and check its contents."
    Epilog = "Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>"

    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
    parser.add_argument("FILE_IN", help="Input samplesheet file.")
    parser.add_argument("FILE_OUT", help="Output file.")
    return parser.parse_args(args)


def make_dir(path):
    if len(path) > 0:
        try:
            os.makedirs(path)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise exception


def print_error(error, context='Line', context_str=''):
    error_str = "ERROR: Please check samplesheet -> {}".format(error)
    if context != '' and context_str != '':
        error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(error, context.strip(), context_str.strip())
    print(error_str)
    sys.exit(1)


def check_samplesheet(file_in, file_out):
""" | ||
This function checks that the samplesheet follows the following structure: | ||
sample,fastq_1,fastq_2 | ||
WT_LIB1_REP1_1,WT_LIB1_REP1_1.fastq.gz,WT_LIB1_REP1_2.fastq.gz | ||
KO_LIB1_REP1,KO_LIB1_REP1_1.fastq.gz,KO_LIB1_REP1_2.fastq.gz | ||
""" | ||
|
||
    sample_run_dict = {}
    with open(file_in, "r") as fin:

        ## Check header
        MIN_COLS = 2
        HEADER = ['sample', 'fastq_1', 'fastq_2', 'genome']
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[:len(HEADER)] != HEADER:
            print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
            sys.exit(1)

        ## Check sample entries
        for line in fin:
            lspl = [x.strip().strip('"') for x in line.strip().split(",")]

            ## Check valid number of columns per row
            if len(lspl) < len(HEADER):
                print_error("Invalid number of columns (minimum = {})!".format(len(HEADER)), 'Line', line)

            num_cols = len([x for x in lspl if x])
            if num_cols < MIN_COLS:
                print_error("Invalid number of populated columns (minimum = {})!".format(MIN_COLS), 'Line', line)

            ## Check sample name entries
            sample, fastq_1, fastq_2, genome = lspl[:len(HEADER)]
            if sample:
                if sample.find(" ") != -1:
                    print_error("Sample entry contains spaces!", 'Line', line)
            else:
                print_error("Sample entry has not been specified!", 'Line', line)

            ## Check FastQ file extension
            for fastq in [fastq_1, fastq_2]:
                if fastq:
                    if fastq.find(" ") != -1:
                        print_error("FastQ file contains spaces!", 'Line', line)
                    if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"):
                        print_error("FastQ file does not have extension '.fastq.gz' or '.fq.gz'!", 'Line', line)

            ## Check genome entries
            if genome:
                if genome.find(' ') != -1:
                    print_error("Genome entry contains spaces!", 'Line', line)
                if len(genome.split('.')) > 1:
                    if not genome.endswith(('.fasta', '.fa', '.fasta.gz', '.fa.gz')):
                        print_error("Genome entry does not have extension '.fasta', '.fa', '.fasta.gz' or '.fa.gz'!", 'Line', line)

            ## Auto-detect paired-end/single-end
            sample_info = []  ## [single_end, fastq_1, fastq_2, genome]
            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
                sample_info = ["0", fastq_1, fastq_2, genome]
            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
                sample_info = ["1", fastq_1, fastq_2, genome]
            else:
                print_error("Invalid combination of columns provided!", 'Line', line)

            ## Create sample mapping dictionary = {sample: [ single_end, fastq_1, fastq_2, genome ]}
            if sample not in sample_run_dict:
                sample_run_dict[sample] = sample_info
            else:
                print_error("Samplesheet contains duplicate rows!", 'Line', line)

    ## Write validated samplesheet with appropriate columns
    if len(sample_run_dict) > 0:
        out_dir = os.path.dirname(file_out)
        make_dir(out_dir)
        with open(file_out, "w") as fout:
            fout.write(",".join(['sample', 'single_end', 'fastq_1', 'fastq_2', 'genome']) + "\n")
            for sample_id, sample_info in sorted(sample_run_dict.items()):
                ## Write to file
                fout.write(','.join([sample_id] + sample_info) + '\n')


def main(args=None):
    args = parse_args(args)
    check_samplesheet(args.FILE_IN, args.FILE_OUT)


if __name__ == "__main__":
    sys.exit(main())
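
After validation, the script writes the output file with the auto-detected single_end flag inserted as the second column. Continuing the illustrative samplesheet from above (not from the commit itself), the validated output would look like:

sample,single_end,fastq_1,fastq_2,genome
SE_SAMPLE,1,SE_SAMPLE.fastq.gz,,
WT_REP1,0,WT_REP1_1.fastq.gz,WT_REP1_2.fastq.gz,GRCh38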
@@ -0,0 +1,59 @@
/*
 * -----------------------------------------------------
 * Utility functions used in nf-core DSL2 module files
 * -----------------------------------------------------
 */

/*
 * Extract name of software tool from process name using $task.process
 */
def getSoftwareName(task_process) {
    return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()
}

/*
 * Function to initialise default values and to generate a Groovy Map of available options for nf-core modules
 */
def initOptions(Map args) {
    def Map options = [:]
    options.args          = args.args ?: ''
    options.args2         = args.args2 ?: ''
    options.publish_by_id = args.publish_by_id ?: false
    options.publish_dir   = args.publish_dir ?: ''
    options.publish_files = args.publish_files
    options.suffix        = args.suffix ?: ''
    return options
}

/*
 * Tidy up and join elements of a list to return a path string
 */
def getPathFromList(path_list) {
    def paths = path_list.findAll { item -> !item?.trim().isEmpty() }  // Remove empty entries
    paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes
    return paths.join('/')
}

/*
 * Function to save/publish module results
 */
def saveFiles(Map args) {
    if (!args.filename.endsWith('.version.txt')) {
        def ioptions = initOptions(args.options)
        def path_list = [ ioptions.publish_dir ?: args.publish_dir ]
        if (ioptions.publish_by_id) {
            path_list.add(args.publish_id)
        }
        if (ioptions.publish_files instanceof Map) {
            for (ext in ioptions.publish_files) {
                if (args.filename.endsWith(ext.key)) {
                    def ext_list = path_list.collect()
                    ext_list.add(ext.value)
                    return "${getPathFromList(ext_list)}/$args.filename"
                }
            }
        } else if (ioptions.publish_files == null) {
            return "${getPathFromList(path_list)}/$args.filename"
        }
    }
}
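
For a quick sense of what the two pure helpers above return, a minimal Groovy sketch (not part of the commit; the process name and paths are made-up examples):

// Illustrative only: exercises getSoftwareName() and getPathFromList() as defined above.
assert getSoftwareName('NFCORE_METHYLSEQ:METHYLSEQ:SAMPLESHEET_CHECK') == 'samplesheet'  // last ':' token, first '_' token, lowercased
assert getPathFromList(['results/', '', 'pipeline_info']) == 'results/pipeline_info'     // drops empty entries, trims slashes, joins with '/'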
@@ -0,0 +1,36 @@
// Import generic module functions
include { saveFiles } from './functions'

params.options = [:]

/*
 * Parse software version numbers
 */
process GET_SOFTWARE_VERSIONS {
    publishDir "${params.outdir}",
        mode: params.publish_dir_mode,
        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', publish_id:'') }

    conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
        container "https://depot.galaxyproject.org/singularity/python:3.8.3"
    } else {
        container "quay.io/biocontainers/python:3.8.3"
    }

    cache false

    input:
    path versions

    output:
    path "software_versions.csv"     , emit: csv
    path 'software_versions_mqc.yaml', emit: yaml

    script:
    """
    echo $workflow.manifest.version > pipeline.version.txt
    echo $workflow.nextflow.version > nextflow.version.txt
    scrape_software_versions.py &> software_versions_mqc.yaml
    """
}
@@ -0,0 +1,85 @@
// Import generic module functions
include { saveFiles } from './functions'

params.options = [:]

/*
 * Reformat design file and check validity
 */
process SAMPLESHEET_CHECK {
    tag "$samplesheet"
    publishDir "${params.outdir}",
        mode: params.publish_dir_mode,
        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', publish_id:'') }

    conda (params.enable_conda ? "conda-forge::python=3.8.3" : null)
    container "quay.io/biocontainers/python:3.8.3"

    input:
    path samplesheet

    output:
    path '*.csv'


    script: // This script is bundled with the pipeline, in nf-core/methylseq/bin/
    """
    check_samplesheet.py $samplesheet samplesheet.valid.csv
    """
}


def get_metadata(LinkedHashMap sample) {
    def meta = [:]
    meta.id         = sample.sample
    meta.single_end = sample.single_end.toBoolean()

    return meta
}


// Function to get list of [ meta, [ fastq_1, fastq_2 ] ]
def get_samplesheet_paths(LinkedHashMap sample) {
    def meta = get_metadata(sample)

    if (!file(sample.fastq_1).exists()) {
        exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${sample.fastq_1}"
    }
    if (meta.single_end) {
        return [ meta, [ file(sample.fastq_1) ] ]
    } else {
        if (!file(sample.fastq_2).exists()) {
            exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${sample.fastq_2}"
        }
        return [ meta, [ file(sample.fastq_1), file(sample.fastq_2) ] ]
    }
}


// Function to get list of [ meta, genome ]
def get_genome_paths(LinkedHashMap sample, LinkedHashMap genomeMap) {
    def meta = get_metadata(sample)

    def genome = [:]

    if (sample.genome) {
        if (genomeMap && genomeMap.containsKey(sample.genome)) {
            // get fasta and indices from iGenomes
            genome.fasta = file(genomeMap[sample.genome].fasta, checkIfExists: true)
            if (params.aligner =~ /bismark/) {
                genome.bismark_index = file(genomeMap[sample.genome].bismark, checkIfExists: true)
            } else if (params.aligner == 'bwameth') {
                genome.fasta_index = file(genomeMap[sample.genome].fasta_index, checkIfExists: true)
            }
        } else {
            // genome is a fasta file, or not part of iGenomes
            genome.fasta = file(sample.genome, checkIfExists: true)
        }
    } else if (params.fasta) {
        // samplesheet does not contain a genome column, fall back to params.fasta
        genome.fasta = file(params.fasta, checkIfExists: true)
    } else {
        exit 1, "ERROR: Please either supply a fasta file with --fasta or specify a genome column in the samplesheet"
    }

    return [ meta, genome ]
}
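
To make the channel shapes concrete, a short sketch of what these helpers produce for one parsed row (values are illustrative and assume the FastQ paths exist on disk, since file(...).exists() is checked above):

// Illustrative only -- not part of the commit.
// A row parsed from samplesheet.valid.csv by splitCsv(header:true) arrives as a map like
//   [ sample:'WT_REP1', single_end:'0', fastq_1:'WT_REP1_1.fastq.gz', fastq_2:'WT_REP1_2.fastq.gz', genome:'' ]
// get_samplesheet_paths(row) turns it into
//   [ [ id:'WT_REP1', single_end:false ], [ file('WT_REP1_1.fastq.gz'), file('WT_REP1_2.fastq.gz') ] ]
// and get_genome_paths(row, genomeMap) resolves the genome column (or params.fasta) into a map such as
//   [ fasta: <path>, bismark_index: <path> ] depending on params.aligner.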
@@ -0,0 +1,27 @@
/*
 * Check input samplesheet and get read channels
 */

params.options = [:]

include {
    SAMPLESHEET_CHECK;
    get_samplesheet_paths;
    get_genome_paths } from '../process/samplesheet_check' addParams( options: params.options )

workflow INPUT_CHECK {
    take:
    samplesheet // file: /path/to/samplesheet.csv

    main:
    SAMPLESHEET_CHECK ( samplesheet )
        .splitCsv ( header:true, sep:',' )
        .set { sample }

    reads  = sample.map { get_samplesheet_paths(it) }
    genome = sample.map { get_genome_paths(it, params.genomes) }

    emit:
    reads  // channel: [ val(meta), [ reads ] ]
    genome // channel: [ val(meta), [ fasta ] ]
}
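
For completeness, a minimal sketch of how a calling workflow might consume the two emitted channels. The include path, params.input and the view() calls are illustrative assumptions, not part of this commit:

// Illustrative only -- not included in this commit.
include { INPUT_CHECK } from './subworkflows/local/input_check' addParams( options: [:] )

workflow {
    INPUT_CHECK ( file(params.input) )
    INPUT_CHECK.out.reads.view()   // [ [ id:..., single_end:... ], [ fastq_1, (fastq_2) ] ]
    INPUT_CHECK.out.genome.view()  // [ [ id:..., single_end:... ], [ fasta:..., bismark_index/fasta_index:... ] ]
}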