-
Notifications
You must be signed in to change notification settings - Fork 39
/
nextflow_schema.json
323 lines (322 loc) · 17.8 KB
/
nextflow_schema.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
{
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json",
"title": "epi2me-labs/wf-single-cell",
"workflow_title": "Single cell workflow",
"description": "Identification of cell- and UMI barcodes from single-cell sequencing.",
"demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-single-cell/wf-single-cell-demo.tar.gz",
"aws_demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-single-cell/wf-single-cell-demo/aws.nextflow.config",
"url": "https://github.com/epi2me-labs/wf-single-cell",
"type": "object",
"definitions": {
"input": {
"title": "Input Options",
"type": "object",
"fa_icon": "fas fa-arrow-right",
"description": "Parameters for finding and handling input data for analysis.",
"properties": {
"fastq": {
"type": "string",
"format": "path",
"title": "FASTQ",
"description": "FASTQ files to use in the analysis.",
"help_text": "This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`."
},
"bam": {
"type": "string",
"format": "path",
"description": "BAM or unaligned BAM (uBAM) files to use in the analysis.",
"help_text": "This accepts one of three cases: (i) the path to a single BAM file; (ii) the path to a top-level directory containing BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain BAM files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`."
},
"ref_genome_dir": {
"type": "string",
"format": "directory-path",
"title": "Reference genome directory",
"description": "The path to the 10x reference directory",
"help_text": "Human reference data can be downloaded from 10x [here](https://cf.10xgenomics.com/supp/cell-exp/refdata-gex-GRCh38-2020-A.tar.gz). Instructions for preparing reference data can be found [here](https://www.10xgenomics.com/support/software/cell-ranger/tutorials/cr-tutorial-mr#overview)"
},
"kit": {
"type": "string",
"title": "10x kit and version",
"description": "The 10x kit and version separated by a colon (eg: 3prime:v3)",
"help_text": "10x kits can be released with different versions, each requiring a specific whitelist that is looked-up by the workflow. If `single_cell_sample_sheet` is not defined, the 10x kit is applied to all samples. This parameter is ignored if `single_cell_sample_sheet` is supplied.",
"default": "3prime:v3",
"enum": [
"3prime:v2",
"3prime:v3",
"3prime:v4",
"5prime:v1",
"5prime:v2",
"multiome:v1",
"visium:v1"
]
},
"expected_cells": {
"type": "integer",
"title": "Expected cell number",
"description": "Number of expected cells in the sample.",
"help_text": "The number of expected cells. If `single_cell_sample_sheet` is not defined, `expected_cells` is applied to all samples. This parameter is ignored if `single_cell_sample_sheet` is supplied.",
"default": 500
},
"full_length_only": {
"type": "boolean",
"title": "Full length reads only",
"description": "Only process full length reads.",
"help_text": "If set to true, only process reads or subreads that are classified as full length (read segments flanked by compatible adapters in the expected orientation).",
"default": true
}
},
"allOf": [
{
"required": [
"ref_genome_dir"
]
},
{
"oneOf": [
{
"required": [
"fastq"
]
},
{
"required": [
"bam"
]
}
]
}
]
},
"samples": {
"title": "Sample Options",
"type": "object",
"fa_icon": "fas fa-vials",
"description": "Parameters that relate to samples such as sample sheets and sample names.",
"properties": {
"single_cell_sample_sheet": {
"type": "string",
"title": "Single cell sample sheet",
"description": "An optional CSV file used to assign library metadata to the different samples. If all samples have the same library metadata, this can be supplied instead by using the parameters (kit, expected cells).",
"help_text": "Columns should be: [sample_id, kit, exp_cells]. This must not be confused with the MinKNOW sample_sheet. `sample_id` should correspond to `sample_name` which is defined either in the `sample_sheet`, given by the `sample` parameter (for single sample runs) or if no `sample_sheet` or `sample` is given, is derived from the folder name containing the FASTQ files.",
"format": "file-path"
},
"sample_sheet": {
"type": "string",
"format": "file-path",
"description": "A CSV file used to map barcodes to sample aliases. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files.",
"help_text": "The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Extra columns are allowed. A `type` column is required for certain workflows and should have the following values; `test_sample`, `positive_control`, `negative_control`, `no_template_control`."
},
"sample": {
"type": "string",
"description": "A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files."
}
}
},
"output": {
"title": "Output Options",
"type": "object",
"fa_icon": "fas fa-arrow-left",
"description": "Parameters for saving and naming workflow outputs.",
"properties": {
"out_dir": {
"type": "string",
"default": "output",
"format": "directory-path",
"description": "Directory for output of all workflow results."
}
}
},
"advanced_options": {
"title": "Advanced options",
"type": "object",
"fa_icon": "far fa-question-circle",
"description": "Advanced options for configuring processes inside the workflow.",
"help_text": "These advanced options do not need to be changed for typical use, but allow fine tuning of workflows for users who want more control over the workflow.",
"properties": {
"kit_config": {
"type": "string",
"format": "file-path",
"title": "10x kit config file",
"description": "A file defining the configurations associated with the various supported 10x kits.",
"help_text": "A CSV file is expected with the following headers [kit, barcode_length, umi_length]. If not specified, a default `kit_configs.csv` (found in the project directory root) will be used. This parameter does not typically need be changed."
},
"threads": {
"type": "integer",
"title": "Maximum number of threads",
"description": "Number of CPU threads to use in resource intensive processes.",
"help_text": "The total CPU resource used by the workflow is constrained by the executor configuration.",
"default": 8
},
"fastq_chunk": {
"type": "integer",
"title": "Fastq chunk size",
"description": "Sets the maximum number of reads per chunk for the initial processing of reads.",
"default": 1000000,
"help_text": "Controls batching of reads for processing."
},
"barcode_adapter1_suff_length": {
"type": "integer",
"title": "Adapter 1 suffix length",
"description": "Suffix length of the read1 adapter to use in creating the probe sequence for identifying barcode/UMI bases.",
"help_text": "For example, specifying 12 would mean that the last 12 bases of the specified read1 sequence will be included in the probe sequence.",
"default": 10
},
"barcode_min_quality": {
"type": "integer",
"title": "Barcode minimum quality",
"description": "Minimum allowed nucleotide-level quality score in the extracted/uncorrected barcode sequence.",
"help_text": "Values equal or higher to this this will be considered 'high-quality' and used for generating the barcode whitelist.",
"default": 15
},
"barcode_max_ed": {
"type": "integer",
"title": "Barcode maximum edit distance",
"description": "Maximum allowable edit distance between uncorrected barcode and the best matching corrected barcode from the sample whitelist.",
"help_text": "Barcodes are corrected by searching from a list of barcodes known to exist in the dataset. A maximum edit distance of 2 between query and whitelist barcode is recommended.",
"default": 2
},
"barcode_min_ed_diff": {
"type": "integer",
"title": "Barcode minimum edit distance difference",
"description": "Minimum allowable edit distance difference between whitelist barcode candidates.",
"help_text": "If there is more than one candidate barcode found in the whitelist, the edit distance difference of the top hit and second best hits (in relation to the uncorrected barcode) must be at least this value to be able to assign a barcode. If the edit distance difference is less than this, it is assumed that barcode identity is amiguous, and the read is not tagged with a corrected barcode.",
"default": 2
},
"gene_assigns_minqv": {
"type": "integer",
"title": "Gene assignment minimum MAPQ",
"description": "Minimum MAPQ score allowed for a read to be assigned to a gene.",
"default": 30
},
"matrix_min_genes": {
"type": "integer",
"title": "Cell filter: Minimum number of genes",
"description": "Filter cells from the gene expression matrix if they contain fewer than <matrix_min_genes> genes.",
"default": 200
},
"matrix_min_cells": {
"type": "integer",
"title": "Gene filter: minimum cells",
"description": "Filter genes from the gene expression matrix that are observed in fewer than <matrix_min_cells> cells.",
"default": 3
},
"matrix_max_mito": {
"type": "integer",
"title": "Cell filter: Mitochondrial expression",
"description": "Filter cells from the gene expression matrix if more than <matrix_max_mito> percent of UMI counts come from mitochondrial genes.",
"default": 20
},
"matrix_norm_count": {
"type": "integer",
"title": "Normalize counts",
"description": "Normalize expression matrix to <matrix_norm_count> counts per cell.",
"default": 10000
},
"genes_of_interest": {
"type": "string",
"format": "path",
"title": "Gene of interest file",
"description": "File containing a list of gene symbols (one symbol per line) to annotate with expression values in the UMAP projections. If doing visium spatial analysis, these genes will be used to annotate the spatial plots. "
},
"mito_prefix": {
"type": "string",
"title": "Mitochondrial gene name prefix",
"description": "Gene name prefix to identify for mitochondrial genes.",
"help_text": "Parts of the workflow analyse mitochondrial genes separately. These genes are identified by searching for a gene name prefix. Human mitochondrial genes can be identified with prefix 'MT-' and mouse genes with prefix 'mt-'. If the reference genome contains data from multiple organisms with different nomenclature, multiple prefixes can be supplied like so: 'MT-,mt-'",
"default": "MT-"
},
"umap_n_repeats": {
"type": "integer",
"title": "Number of UMAP repetitions",
"default": 3,
"description": "Number of UMAP projection to repeat for each dataset.",
"help_text": "The UMAP algorithm contains elements of randomness that can mislead users into seeing associations between cells that are not meaningful. It is recommended to view multiple plots generated with the same parameters and check that any observed structure is consistent across runs."
},
"stringtie_opts": {
"type": "string",
"title": "Stringtie2 options",
"default": "-c 2",
"description": "StringTie options for transcriptome assembly.",
"help_text": "StringTie option string can be supplied at the command line as in this example: `--stringtie_opts=\"-c 5 -m 100 \"`. StringTie options can be found here: http://ccb.jhu.edu/software/stringtie/index.shtml?t=manual. The default option (-c 2) ensures that only transcripts with a coverage of 2 or higher are included in the generated transcriptome"
}
}
},
"misc": {
"title": "Miscellaneous Options",
"type": "object",
"description": "Everything else.",
"default": "",
"properties": {
"disable_ping": {
"type": "boolean",
"default": false,
"description": "Enable to prevent sending a workflow ping."
},
"help": {
"type": "boolean",
"default": false,
"description": "Display help text.",
"fa_icon": "fas fa-question-circle",
"hidden": true
},
"version": {
"type": "boolean",
"default": false,
"description": "Display version and exit.",
"fa_icon": "fas fa-question-circle",
"hidden": true
}
}
}
},
"allOf": [
{
"$ref": "#/definitions/input"
},
{
"$ref": "#/definitions/samples"
},
{
"$ref": "#/definitions/output"
},
{
"$ref": "#/definitions/advanced_options"
},
{
"$ref": "#/definitions/misc"
}
],
"properties": {
"aws_image_prefix": {
"type": "string",
"hidden": true
},
"aws_queue": {
"type": "string",
"hidden": true
},
"monochrome_logs": {
"type": "boolean"
},
"validate_params": {
"type": "boolean",
"default": true
},
"show_hidden_params": {
"type": "boolean"
}
},
"resources": {
"recommended": {
"cpus": 64,
"memory": "256GB"
},
"minimum": {
"cpus": 8,
"memory": "32GB"
},
"run_time": "Approximately 8h for 120M reads with the recommended requirements.",
"arm_support": false
}
}