#!/usr/bin/env python3
'''
Name: tapes.py
Author: Alexandre Xavier
Version: 0.1
Contact: alexandre.xavier@uon.edu.au
Python Version: 3.x
from https://github.com/a-xavier/tapes
Licence: MIT
'''
import sys
import argparse
import os
import json
from time import time
from numpy import array_split
from multiprocessing.pool import Pool
import src.vep_process as vp
import src.t_func as tf
parser = argparse.ArgumentParser(prog='tapes', usage=tf.help_message(), add_help=False)
parser.add_argument('option', type=str, nargs=1,
                    help="Choose the main function:\n"
                         "annotate: use annovar to annotate a vcf file\n"
                         "sort: use tapes to prioritise variants\n"
                         "db: see or download databases\n"
                         "decompose: decompose a vcf file\n"
                         "analyse: use an already analysed file to produce a specific report",
                    choices=['annotate', 'sort', 'db', 'decompose', 'analyse', 'analyze'])
parser.add_argument('-i', '--input',
required=False,
help="Path of the input csv file annotated by annovar",
type=str)
parser.add_argument('-o', '--output',
required=False,
help="Path of the sorted output csv file",
type=str)
parser.add_argument('-b', '--build_db',
                    help="Download databases flagged for download: /humandb will be populated with relevant "
                         "databases\nAlso downloads the required tapes databases if needed",
                    action="store_true",
                    required=False)
parser.add_argument('-A', '--annovar',
nargs='?',
help="Path of the annovar FOLDER",
type=str,
required=False)
parser.add_argument('-s', '--see_db',
                    help="Path of the annovar FOLDER: /humandb will be searched for relevant databases and the "
                         "result written to db_config.json in the tapes directory",
                    action="store_true",
                    required=False)
parser.add_argument('-e', '--enrichr',
                    help="use --enrichr to analyse top mutations (>0.85 in Total score) with the EnrichR API. "
                         "Default is GO_Biological_Process_2018, see http://amp.pharm.mssm.edu/Enrichr/#stats "
                         "for all available databases",
                    nargs="?", const="GO_Biological_Process_2018",
                    required=False)
parser.add_argument("-d", "--disease",
help="excluse genes non involved in specified disease (based on Disease column description in a new"
" spreadsheet in xlsx file, use quotes '' to use a multiple words term (eg; 'multiple sclerosis' "
"Default = cancer",
nargs="?", const="cancer",
required=False)
parser.add_argument('--BIG',
                    help="If the vcf is too big, remove probably benign variants (below the given pathogenicity "
                         "probability) from the exported file",
                    nargs='?', const=0.35, default=None, type=float,
                    required=False)
parser.add_argument('--by_sample',
                    help='Add a spreadsheet to the xlsx report with the top 5 pathogenic variants per sample',
                    required=False,
                    action="store_true")
parser.add_argument('--list',
                    help="A txt file containing a list of refGene gene symbols, or a quoted string of gene "
                         "symbols",
                    required=False)
parser.add_argument("-a", "--assembly",
help="Name of the assembly, hg19 / hg38"
"Default is hg19", nargs="?", const="hg19", default="hg19", choices={"hg19", "hg38"},
required=False)
parser.add_argument("--ref_anno",
help="Genes format to use RefSeq 'refGene', ENSEMBL 'ensGene' or USCS 'knownGene "
"default is refGene", nargs="?", const="refGene", default="refGene",
choices={"refGene", "ensGene", "knownGene"},
required=False)
parser.add_argument("--acmg",
help="Do the ACMG criteria computation",
action='store_true',
required=False)
parser.add_argument("--trio",
help="Specify pedigree data for trio analysis as .txt file' "
"default is None", nargs='?', default=None,
required=False)
parser.add_argument("--kegg",
help="Similar to --list."
" Indicate a kegg pathway and tapes will return a spreasheet with"
"genes only contained in that pathway",
required=False)
parser.add_argument('--cutoff',
                    help='Select the frequency cutoff for BS1 (rare disease). Default is 0.005',
                    type=float,
                    default=0.005,
                    required=False)
parser.add_argument('--acmg_db',
help='The location of the databases used to assign ACMG criteria\n'
'Default is the directory where tapes.py is located',
nargs='?',
default=os.path.join(os.path.dirname(os.path.realpath(__file__)), 'acmg_db'),
required=False)
parser.add_argument("--tab",
help="Export in tab-separated values format instead of csv format",
action='store_true',
required=False)
parser.add_argument('--by_gene',
help='Add a spreadsheet to xlsx report with gene-grouped variants',
required=False,
action="store_true")
parser.add_argument('-t', '--threads',
                    help='Number of threads used for multiprocessing', type=int,
                    default=1,
                    required=False)
parser.add_argument('--pp2_percent',
                    help='Threshold for PP2: the criterion is assigned if the variant is missense in a gene '
                         'where more than this percentage of pathogenic variants are missense', type=int,
                    default=80,
                    required=False)
parser.add_argument('--pp2_min',
                    help='Minimum number of pathogenic missense variants in a gene to consider PP2', type=int,
                    default=1,
                    required=False)
parser.add_argument('--bp1_percent',
                    help='Threshold for BP1: the criterion is assigned if the variant is missense in a gene '
                         'where less than this percentage of pathogenic variants are missense', type=int,
                    default=15,
                    required=False)
parser.add_argument("--test",
help="test flag - Will override all options and run a test of the sort function",
action='store_true',
required=False)
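# Example invocations (paths are illustrative, not shipped defaults):
#   python tapes.py db -b -A /path/to/annovar/            # download the flagged annovar databases
#   python tapes.py annotate -i sample.vcf -o sample.csv  # annotate a vcf through annovar
#   python tapes.py sort -i sample.csv -o ./report/ --acmg --by_gene --by_sample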
args = parser.parse_args()
if args.test:
args = parser.parse_args(['sort','-i./toy_dataset/test_input.csv', '-o./test_output/', '--tab', '--test', '--enrichr', '--by_gene' , '--by_sample'])
args.list = "FH CDC73"
args.kegg = "Pathways in cancer"
args.disease = "autosomal dominant"
args.trio = "./toy_dataset/trio.txt"
for item in vars(args):
print(item, '=', getattr(args, item))
# Process paths to transform into absolute
if args.annovar:
args.annovar = os.path.abspath(args.annovar)
if args.input is not None:
    args.input = os.path.abspath(args.input)
if args.output is not None:
    if args.output[-4:] in ('.csv', '.vcf', '.txt', '.tsv'):
        args.output = os.path.abspath(args.output)
    elif args.output[-1] == '/':
        args.output = os.path.abspath(args.output) + '/'
    elif args.output[-1] == '\\':
        args.output = os.path.abspath(args.output) + '\\'
# Move to the script directory so relative paths (src/, acmg_db/) resolve consistently
script_absolute_path = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_absolute_path)
# PROCESS ACMG_DB PATH
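# Expected db_config.json layout (inferred from the lookups in this script; both keys are optional):
# {"acmg_db_path": {"acmg_db_path": "..."}, "annovar_path": {"annovar_path": "..."}}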
try:
with open('./src/db_config.json', 'r') as db_cfg:
cfg_dict = json.load(db_cfg)
try:
acmg_db_path = cfg_dict["acmg_db_path"]['acmg_db_path']
except KeyError:
acmg_db_path = args.acmg_db
except FileNotFoundError:
print("No acmg_db path given and no db_config.json found\n"
"Default is: {}".format(args.acmg_db))
acmg_db_path = args.acmg_db
def main():
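    """Run the sort pipeline: load an annovar- or VEP-annotated file, assign ACMG
    criteria, rank variants by predicted pathogenicity and write the requested reports."""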
start_time = time()
output_type = tf.output_type(args.output) # Determine output type either directory or csv or txt/tsv
    # No platform discrimination anymore, so Windows users can also use forward slashes
if output_type == 'directory':
if '/' in args.output:
output_prefix = args.output.split('/')[-2]
elif '\\' in args.output:
output_prefix = args.output.split('\\')[-2]
file_path = tf.process_path(args.input) # process relative to absolute path
full_stuff, soft_used = tf.open_csv_file(file_path, acmg_db_path) # Load annotated csv in pandas
print(tf.tmp_stmp()+'{} annotated file'.format(soft_used))
ref_anno = tf.reference_used(full_stuff) # Detect reference annotation used (refseq/UCSC/ENSEMBL)
    # If there are fewer than 15 samples, PS4 is not computed with Fisher's exact test but with the regular db
number_of_samples = tf.counting_number_of_samples(full_stuff)
tf.check_if_data_already_processed(full_stuff) # Check if file was already processed
full_stuff = tf.replace_dash_by_dots(full_stuff) # Replace dashes by dots in ref/alt columns
    ################################ PROCESSING AND CLASSIFYING VARIANTS ################################
# ACMG TAG CHECKS FOR ALL ANNOTATION REQUIREMENT
    if args.acmg:
        tf.check_acmg_tag_requirement(full_stuff, ref_anno)
    # First, load all the necessary databases
global tup_databases
if soft_used == 'annovar':
tup_databases = tf.load_databases(acmg_db_path, ref_anno, args.assembly, args.trio, args.pp2_percent, args.pp2_min, args.bp1_percent)
elif soft_used == 'vep':
tup_databases = vp.load_vep_databases(acmg_db_path, args.assembly, ref_anno, args.trio, args.pp2_percent, args.pp2_min, args.bp1_percent)
# IF TRIO DATA IS THERE EXCLUDE HEALTHY PARENTS
if args.trio is not None:
print(tf.tmp_stmp() + 'Excluding Healthy Parents from trios...', end='', flush=True)
full_stuff_tup = tf.process_df_for_trio(tup_databases, full_stuff)
full_stuff = full_stuff_tup[0]
list_healthy = full_stuff_tup[1]
print('Done')
else:
list_healthy = []
# CHECK FOR MULTIPROCESSING
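    # With --threads >= 2 the dataframe is split into one chunk per thread; each chunk
    # is scored in its own process and the pieces are concatenated at the end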
if args.threads >= 2:
        split_df = array_split(full_stuff, args.threads)  # Split dataframe into equal chunks
list_args = []
        for chunk in split_df:
            argument_tuple = (chunk, ref_anno, number_of_samples, tup_databases, args.cutoff)
            list_args.append(argument_tuple)
        # TODO: tune multiprocessing further; RAM usage is high because the databases
        # are copied into every child process
pool = Pool(processes=args.threads)
if soft_used == "annovar":
result = pool.starmap(tf.process_data, list_args)
elif soft_used == 'vep':
result = pool.starmap(vp.process_data, list_args)
pool.close()
pool.join()
final_stuff = tf.pd.concat(result) # Merge all pieces for final dataframe
if soft_used == 'vep':
final_stuff = vp.rename_columns(final_stuff, ref_anno)
final_stuff.drop(list_healthy, axis=1, inplace=True)
        ### END MULTIPROCESSING ###
else:
if soft_used == 'annovar':
final_stuff = tf.process_data(full_stuff, ref_anno, number_of_samples, tup_databases, args.cutoff)
final_stuff.drop(list_healthy, axis=1, inplace=True)
elif soft_used == 'vep':
final_stuff = vp.process_data(full_stuff, ref_anno, number_of_samples, tup_databases, args.cutoff)
final_stuff = vp.rename_columns(final_stuff, ref_anno)
final_stuff.drop(list_healthy, axis=1, inplace=True)
final_stuff = tf.sort_by_pathogenicity(final_stuff)
############################## OUTPUT RAW REPORT ###############################################
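    # --BIG filter: keep only variants whose pathogenicity probability reaches the cutoff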
if args.BIG is not None:
final_stuff = final_stuff[final_stuff['Probability_Path'] >= float(args.BIG)]
    if output_type == 'csv' or output_type == 'txt':
        tf.export_sorted_csv(final_stuff, tf.process_path(args.output), args.tab)
    elif output_type == 'directory':
        ext = '.txt' if args.tab else '.csv'
        outfile = os.path.join(args.output, output_prefix + ext)
        tf.export_sorted_csv(final_stuff, outfile, args.tab)
################################ OPTIONS ############################################################
    if args.kegg is not None:
        tf.kegg_inverted_patway(args.kegg, final_stuff, ref_anno, acmg_db_path)
    if args.by_gene:
        tf.gene_grouped_report(final_stuff, ref_anno, acmg_db_path, number_of_samples)
    if args.enrichr is not None:
        tf.pathway_analysis(final_stuff, args.enrichr, ref_anno)
    if args.by_sample:
        tf.individual_reports(final_stuff, ref_anno)
    if args.disease is not None:
        tf.create_cancer_dataframe(final_stuff, args.disease, ref_anno)
    if args.list is not None:
        if os.path.isfile(args.list):
            with open(args.list, 'r') as list_file:
                gene_list = [line.rstrip('\n') for line in list_file]
        else:
            gene_list = set(args.list.split(" "))
        tf.find_list_of_genes(final_stuff, gene_list, ref_anno)
################################## OUTPUT EXTRA REPORT ################################################
    if output_type == 'csv':
        path_output = args.output.replace('.csv', '.xlsx')
        tf.export_to_formatted_xlsx(final_stuff, path_output, number_of_samples, ref_anno)
        print(tf.tmp_stmp() + "Output written : {}".format(path_output))
    elif output_type == 'txt':
        path_output = args.output[:-4] + '.xlsx'
        tf.export_to_formatted_xlsx(final_stuff, path_output, number_of_samples, ref_anno)
        print(tf.tmp_stmp() + "Output written : {}".format(path_output))
elif output_type == 'directory':
tf.csv_report(final_stuff, args.output, output_prefix, ref_anno, args.tab)
stop_time = time()
print('|| Process finished in ' + str(tf.ceil(stop_time - start_time)) + " seconds")
# USED FOR TESTING PURPOSE IN TRAVIS
def test_main():
print("ACTIVATING MAIN TEST")
main()
sys.exit(0)
if __name__ == "__main__":
    if args.test:
test_main()
    # Process data from annovar annotated files (works even without the acmg flag)
elif (args.output and args.input and args.assembly) and args.option == ['sort']:
print('''
***TAPES: SORT***
''')
main()
# BUILD annovar DATABASES
    elif args.option == ['db'] and args.build_db:
print('''
***TAPES: DOWNLOAD DATABASE***
''')
# PROCESS annovar PATH
        if args.annovar is None:  # If the annovar folder is not supplied, look at the config file
            try:
                with open('./src/db_config.json', 'r') as db_cfg:
                    cfg_dict = json.load(db_cfg)
                    annovar_path = cfg_dict["annovar_path"]['annovar_path']
            except FileNotFoundError:
                print("No annovar path given and no db_config.json found")
                sys.exit(1)
        else:
            annovar_path = args.annovar
tf.build_annovar_db(annovar_path, args.assembly, args.acmg)
tf.check_for_acmg_databases(acmg_db_path)
tf.check_online_annovar_dbs(annovar_path)
tf.scan_for_db(annovar_path)
tf.writing_db_to_file(annovar_path, args.acmg_db)
tf.amcg_db_download(acmg_db_path)
# ANNOTATE VCF FILE
    elif args.input is not None and args.output is not None and args.option == ['annotate']:
print('''
***TAPES: ANNOTATE***
''')
# PROCESS annovar PATH
        if args.annovar is None:  # If the annovar folder is not supplied, look at the config file
            try:
                with open('./src/db_config.json', 'r') as db_cfg:
                    cfg_dict = json.load(db_cfg)
                    annovar_path = cfg_dict["annovar_path"]['annovar_path']
            except FileNotFoundError:
                print("No annovar path given and no db_config.json found")
                sys.exit(1)
        else:
            annovar_path = args.annovar
if args.output[-4:] == '.csv' or args.output[-4:] == '.txt':
tf.process_annotate_vcf(args.input, args.output, annovar_path, args.assembly, args.ref_anno, args.acmg)
tf.merge_sample_info_in_annotated_csv(args.output, args.assembly)
elif args.output[-4:] == '.vcf':
tf.process_annotate_vcf(args.input, args.output, annovar_path, args.assembly, args.ref_anno, args.acmg)
else:
            print('|| Could not determine the desired output type, please use either a ".csv" or ".vcf" '
                  'extension for the output')
            sys.exit(1)
# SCAN annovar DATABASES ALREADY DOWNLOADED AND STORE THEM INSIDE OF A FILE
    elif args.option == ['db'] and args.see_db:
print('''
***TAPES: SEE DATABASE***
''')
# PROCESS annovar PATH
        if args.annovar is None:  # If the annovar folder is not supplied, look at the config file
            try:
                with open('./src/db_config.json', 'r') as db_cfg:
                    cfg_dict = json.load(db_cfg)
                    annovar_path = cfg_dict["annovar_path"]['annovar_path']
            except FileNotFoundError:
                print("No annovar path given and no db_config.json found")
                sys.exit(1)
        else:
            args.annovar = os.path.abspath(args.annovar)
            annovar_path = args.annovar
tf.check_online_annovar_dbs(annovar_path)
tf.scan_for_db(annovar_path)
tf.writing_db_to_file(annovar_path, args.acmg_db)
# Decompose vcf
    elif args.option == ['decompose'] and args.input is not None and args.output is not None:
print('''
***TAPES: DECOMPOSE***
''')
print(tf.tmp_stmp()+'Decomposing...', end=' ')
tf.decompose_vcf(args.input, args.output)
print(tf.tmp_stmp()+"Done")
    # Re-analyse  TODO: refine the process to run only the requested analysis (skip the xlsx report)
elif (args.output and args.input and args.assembly) and (args.option == ['analyse'] or args.option == ['analyze']):
print('''
***TAPES: RE-ANALYSE***
''')
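        # Re-use an already processed file: reload it, apply the requested filters and
        # regenerate only the selected reports (no full re-scoring)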
final_stuff, soft_used = tf.open_csv_file(args.input, acmg_db_path)
ref_anno = args.ref_anno
number_of_samples = tf.counting_number_of_samples(final_stuff)
output_type = tf.output_type(args.output) # Determine output type either directory or csv or txt/tsv
if output_type == 'directory':
print('|| Please specify output type : txt or csv')
if args.BIG is not None:
final_stuff = final_stuff[final_stuff['Probability_Path'] >= float(args.BIG)]
        if args.kegg is not None:
            tf.kegg_inverted_patway(args.kegg, final_stuff, ref_anno, acmg_db_path)
        if args.by_gene:
            tf.gene_grouped_report(final_stuff, ref_anno, acmg_db_path, number_of_samples)
        if args.enrichr is not None:
            tf.pathway_analysis(final_stuff, args.enrichr, ref_anno)
        if args.by_sample:
            tf.individual_reports(final_stuff, ref_anno)
        if args.disease is not None:
            tf.create_cancer_dataframe(final_stuff, args.disease, ref_anno)
        if args.list is not None:
            if os.path.isfile(args.list):
                with open(args.list, 'r') as list_file:
                    gene_list = [line.rstrip('\n') for line in list_file]
            else:
                gene_list = set(args.list.split(" "))
            tf.find_list_of_genes(final_stuff, gene_list, ref_anno)
################################## OUTPUT EXTRA REPORT ################################################
if output_type == 'csv':
tf.csv_report_reanalyse(args.output, False)
print(tf.tmp_stmp() + "Done")
elif output_type == 'txt':
tf.csv_report_reanalyse(args.output, True)
print(tf.tmp_stmp() + "Done")
else:
print(tf.help_message())
print("Wrong combination of arguments")