-
Notifications
You must be signed in to change notification settings - Fork 4
/
build_pangenome.nf
155 lines (126 loc) · 5.72 KB
/
build_pangenome.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env nextflow
// Using DSL-2
nextflow.enable.dsl=2
// Import helpers
GroovyShell shell = new GroovyShell()
def helpers = shell.parse(new File("${workflow.projectDir}/helpers.gvy"))
// Import sub-workflows
include { align_genomes } from './modules/align_genomes' addParams(output: "${params.output}/align")
include { ani } from './modules/ani'
include { download_genes } from './modules/download_genes'
include { download_genomes } from './modules/download_genomes'
include { deduplicate } from './modules/deduplicate' addParams(output: "${params.output}/gene_catalog")
include { bin_genes } from './modules/processes/bin_genes' addParams(output: "${params.output}/bin_pangenome")
// Standalone entrypoint
workflow {
// Show help message if the user specifies the --help flag at runtime
helpers.help_message(
"""
Build a pangenome dataset from a collection of genes and genomes
1. Genomes and genes will be downloaded from NCBI if needed
2. Genes will be deduplicated to yield a nonredundant gene catalog
3. Genomes will be aligned against the gene catalog
4. Genes and genomes will be binned by co-occurrence
Parameters:
--genome_tsv (optional) Table of genomes to download from NCBI (e.g. https://www.ncbi.nlm.nih.gov/datasets/genome/)
--genome_csv (optional) Table of genomes to download from legacy NCBI (e.g. https://www.ncbi.nlm.nih.gov/genome/browse#!/prokaryotes/)
--genes (optional) Folder containing the set of gene amino acid FASTAs to deduplicate
--genomes (optional) Folder containing the set of genome nucleotide FASTAs to align against
--output Folder where output files will be written
--min_coverage Minimum proportion of a gene which must align in order to retain the alignment
(default: ${params.min_coverage}, ranges 0-100)
--min_identity Minimum percent identity of the amino acid alignment required to retain the alignment
(default: ${params.min_identity}, ranges 0-100)
--max_evalue Maximum E-value threshold used to filter all alignments
(default: ${params.max_evalue})
--aligner Algorithm used for alignment (default: ${params.aligner}, options: diamond, blast)
--query_gencode Genetic code used for conceptual translation of genome sequences
(default: ${params.query_gencode})
--max_overlap Any alignment which overlaps a higher-scoring alignment by more than this
amount will be filtered out (default: ${params.max_overlap}, range: 0-100)
--min_genomes_per_gene Minimum number of genomes for a gene to be found in to be included (default: 1)
--max_dist_genes Maximum Jaccard distance threshold used to group genes into bins
--min_bin_size Minimum number of genes needed to retain a bin
--max_dist_genomes Maximum Euclidean distance threshold used to group genomes based on gene bin content
""",
params.help
)
// Make sure that the required parameters were provided
helpers.require_param(params.output, "output")
//////////////////////////////////////
// MANUALLY INPUT GENES AND GENOMES //
//////////////////////////////////////
// Remove any trailing slash from the genome and gene folder
if(params.genomes){params.genomes = params.genomes.replaceAll('/$', '')}
if(params.genes){params.genes = params.genes.replaceAll('/$', '')}
// Get all of the genomes
Channel
.fromPath("${params.genomes}/**.f*")
.filter { it.exists() }
.set { genomes_ch }
// Get all of the genes
Channel
.fromPath("${params.genes}/**.f*")
.filter { it.exists() }
.set { genes_ch }
////////////////////////////////
// DOWNLOAD GENOMES FROM NCBI //
////////////////////////////////
// Parse the CSV path(s), if provided
if(params.genome_csv.length() > 1){
Channel
.fromPath( params.genome_csv.split(',').toList() )
.filter { it.exists() }
.set { genome_manifest_csv }
}else{
Channel.empty().set { genome_manifest_csv }
}
// Parse the TSV path(s), if provided
if(params.genome_tsv.length() > 1){
Channel
.fromPath( params.genome_tsv.split(',').toList() )
.filter { it.exists() }
.set { genome_manifest_tsv }
}else{
Channel.empty().set { genome_manifest_tsv }
}
// Download the genomes for each genome
download_genomes(
genome_manifest_csv,
genome_manifest_tsv
)
//////////////////////////////
// DOWNLOAD GENES FROM NCBI //
//////////////////////////////
// Download the genes for each genome
download_genes(
genome_manifest_csv,
genome_manifest_tsv
)
//////////////////////////////
// DEDUPLICATE GENE CATALOG //
//////////////////////////////
// Combine genes from NCBI and from the folder input
deduplicate(genes_ch.mix(download_genes.out.genes))
//////////////////////
// ALIGN TO GENOMES //
//////////////////////
// Run the genome alignment sub-workflow
align_genomes(
genomes_ch.mix(download_genomes.out.genomes),
deduplicate.out.fasta
)
// Calculate the ANI for all of those genomes
ani(
genomes_ch.mix(download_genomes.out.genomes)
)
///////////////////////////
// BIN GENES AND GENOMES //
///////////////////////////
// Bin the genes
bin_genes(
align_genomes.out.concat_alignments,
deduplicate.out.annot,
download_genomes.out.annot
)
}