-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpangia_db_build.py
executable file
·751 lines (650 loc) · 25.2 KB
/
pangia_db_build.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
#!/usr/bin/env python
import argparse as ap, textwrap as tw
import taxonomy as t
import gzip
import sqlite3
import sys
import os
import subprocess
from re import search
def parse_params():
p = ap.ArgumentParser( prog='pangia_db_build.py', description="""Convert sequences to PanGIA v2 format.""" )
p.add_argument(
'-i','--input', metavar='[FILE(S)]', type=str, nargs='*',
help="input one or multiple FASTA file(s) (default: none)")
p.add_argument(
'-a', '--accPrefixOnly', metavar='[STR]', type=str, default=None,
help="""Parsed sequences with particular prefix in the accession number only [default: None]""")
p.add_argument(
'-dp', '--dbPath', metavar='[PATH]', type=str, default=None,
help="""Path of taxonomy databases [default: <BIN_DIR>/database/]""")
p.add_argument(
'-sdb','--sqlitedb', metavar='[FILE]', nargs='?',
help="Custom SQLite3 taxonomy db [default: <PREFIX>.custom.db]")
p.add_argument(
'-asr', '--assemblySummaryRefseq', metavar='[PATH]', type=str, default=None,
help="""Use assembly_summary_refseq.txt file to define strain [default: None]""")
p.add_argument(
'-asd', '--assemblyRefseqDir', metavar='[PATH]', type=str,
help="""local directory for FASTA files [default: <PREFIX>_refseq_genomes]""")
p.add_argument(
'-rc', '--refseqCategoryOnly', nargs='*', metavar='[STR]', type=str,
help="""Only parse specified refseq category(ies), like 'reference' or 'representative'. Only works with '--assemblySummaryRefseq' option. [default: None]""")
p.add_argument(
'-al', '--assemblyLevelOnly', nargs='*', metavar='[STR]', type=str,
help="""Only parse specified assembly level(s), like 'complete'. Only works with '--assemblySummaryRefseq' option. [default: None]""")
p.add_argument(
'-t','--tag', metavar='[STR]', type=str,
help="Specify a TAG and added to all input sequences. Use 'H' when you specify host reference sequences. The initial of superkingdom name will be used by default, for example 'B' for Bacteria and 'V' for Viruses.")
p.add_argument(
'-ssf', '--strainSeparateFasta', action="store_true",
help="""Each FASTA file represents a strain. This option is ALWAYS TRUE if --assemblySummaryRefseq is used. [default: FALSE]""")
p.add_argument(
'-sr', '--splitDatabaseByRank', metavar='[RANK]', type=str, default="superkingdom",
help="""Split database by rank [default: superkingdom]""")
p.add_argument(
'-x', '--taxid', metavar='[TAXID]', nargs='*', type=str, default=None,
help="""Only process sequences belong to specified taxid(s) [default: None]""")
p.add_argument(
'-s','--skipSeqType', metavar='[STR]', nargs='*', type=str,
help="""Skip specific type(s) of sequence: 'p' for plasmid and 'g' for phage. [default: None]""")
p.add_argument(
'-ngl','--notGottchaDbList', action="store_true",
help="""Not generate gottcha_db input list. [default: False]""")
p.add_argument(
'-p', '--prefix', metavar='[STR]', type=str, default="pangia",
help="""Output file [default: pangia]""")
p.add_argument(
'-rs', '--redundantStrain', action="store_true",
help="""Allow redundant strains [default: FALSE]""")
p.add_argument(
'-iso', '--includeIsolate', action="store_true",
help="""Different isolates will be treated as differnet strains [default: FALSE]""")
args_parsed = p.parse_args()
if not args_parsed.dbPath:
bin_dir = os.path.dirname(os.path.realpath(__file__))
args_parsed.dbPath = bin_dir + "/database"
if not args_parsed.assemblyRefseqDir:
args_parsed.assemblyRefseqDir = args_parsed.prefix + "_refseq_genomes"
if not args_parsed.sqlitedb:
args_parsed.sqlitedb = args_parsed.prefix + ".custom.db"
if args_parsed.assemblySummaryRefseq:
args_parsed.strainSeparateFasta = True
if not args_parsed.input and not args_parsed.assemblySummaryRefseq:
sys.stderr.write( "[ERROR] Neither --input nor --assemblySummaryRefseq wer specified.\n" )
sys.exit(1)
return args_parsed
def getAccFromHeader( full_header ):
acc = 'unknown'
header = full_header.split(" ")[0]
# old RefSeq header:
# >gi|546540813|ref|NZ_CBML010000001.1| Clostridium chauvoei JF4335 WGS project CBML000000000 data, contig 00016
if 'ref|' in header:
m = search( 'ref\|([^\|]+)', header)
try:
acc = m.group(1)
except:
sys.stderr.write( "\n[WARNING] seems 'ref', but failed to parse; header: %s\n" % header )
elif 'gb|' in header:
m = search( 'gb\|([^\|]+)', header)
try:
acc = m.group(1)
except:
sys.stderr.write( "\n[WARNING] seems 'gb', but failed to parse; header: %s\n" % header )
# >CBML010
elif '|' not in header:
acc = header[1:]
else:
sys.stderr.write( "\n[WARNING] Unrecognized format; header: %s\n" % header )
return acc
def generateCustomStrain(taxid, name, desc, c, asr_strain, asr_isolate, include_iso):
#get custom strain name
(str_name, iso_name) = getStrainName(taxid, name, desc, asr_strain, asr_isolate)
# get full custom strain name
cus_name = name
if str_name and not str_name in cus_name:
cus_name += " strain " + str_name
if '[strain|' in desc:
m = search( '\[strain\|([^\]]+)\]', desc )
if m and not m.group(1) in name:
cus_name = m.group(1)
if iso_name and include_iso and not iso_name in cus_name:
cus_name += " isolate " + iso_name
#check custom name's existence
c.execute( 'SELECT * FROM taxonomy_custom WHERE P_TAXID="%s" and NAME="%s"' % (taxid, cus_name) )
tax_rec = c.fetchone()
if tax_rec:
cus_taxid = tax_rec[0]
else:
cus_taxid = getNewCusTaxonomy( c, taxid, cus_name )
return cus_taxid, cus_name, str_name, iso_name
def getNewCusTaxonomy( c, taxid, cus_name, retry=0 ):
#insert new custom name
cus_taxid = ""
str_id = ""
depth = ""
c.execute('SELECT * FROM taxonomy_custom WHERE P_TAXID="%s" ORDER BY STR_ID DESC' % taxid )
tax_rec = c.fetchone()
# if parent taxid (usually a species) already in the database
if tax_rec:
str_id = tax_rec[2]+1
depth = tax_rec[4]
else:
str_id = 1
depth = int(t.taxid2depth(taxid)) + 1
cus_taxid = "%s.%s" % (taxid, str_id)
c.execute('INSERT INTO taxonomy_custom VALUES ("%s", "%s", %s, "%s", %s, "%s")' % ( cus_taxid, taxid, str_id, cus_name, depth, "no rank") )
return cus_taxid
def checkNewTaxid( c, tid, ptid, name, str_name="", iso_name="", aacc="", filename="" ):
"""
ARGUMENTS
c
tid
name
str_name
iso_name
aacc
filename
RETURN
BOOL
"""
c.execute('SELECT * FROM strain_taxonomy WHERE TAXID="%s"' % tid )
tax_rec = c.fetchone()
# if another assembly is named the same strain, skip this file (assembly)
if tax_rec:
sys.stderr.write( "[INFO] Strain exists (Taxid: %s, Name: %s).\n" % (tid, tax_rec[2]) )
return False
# else, add this strain to the taxanomy database
else:
# TAXID CHAR(30) NOT NULL, /* taxid */
# P_TAXID CHAR(30) NOT NULL, /* parent taxid */
# NAME CHAR(200) NOT NULL, /* taxonomy name */
# STR_NAME CHAR(200) NULL, /* strain name */
# ISO_NAME CHAR(200) NULL, /* isolate name */
# ASSEM_ACC CHAR(50) NULL, /* assembly acc# */
# FILE CHAR(50) NULL, /* file name */
# PRIMARY KEY (TAXID)
c.execute('INSERT INTO strain_taxonomy VALUES ("%s", "%s", "%s", "%s", "%s", "%s", "%s")' % (
tid,
ptid,
name,
str_name,
iso_name,
aacc,
filename
)
)
return True
def getStrainName(taxid, name, desc, asr_strain, asr_isolate):
"""
ARGUMENTS
taxid
name
desc
asr_strain
asr_isolate
RETURN
str_name
iso_name
"""
str_name = ""
iso_name = ""
# get strain info from assembly summary report
if asr_strain:
str_name = asr_strain
iso_name = asr_isolate
#replace ", " with space
mod_desc = desc.replace( ", ", " " )
mod_desc = desc.replace( '"', "'" )
#if custom strain name exists, use it; otherwise, try to identify it from desc
if '[isolate|' in desc:
m = search( '\[isolate\|([^\]]+)\]', mod_desc )
if m and not m.group(1) in name: iso_name = m.group(1)
else:
if 'isolate' in desc:
m = search( '(isolate \S+)', mod_desc )
if m and not m.group(1) in name: iso_name = m.group(1)
elif 'clone' in desc:
m = search( '(clone \S+)', mod_desc )
if m and not m.group(1) in name: iso_name = m.group(1)
#for custom isolate name exists, use it; otherwise, try to identify it from desc
if '[strain|' in desc:
m = search( '\[strain\|([^\]]+)\]', mod_desc )
if m and not m.group(1) in name: str_name = m.group(1)
else:
if 'strain' in desc:
m = search( '(strain \S+)', mod_desc )
if m and not m.group(1) in name: str_name += " "+m.group(1)
elif 'strain:' in desc:
m = search( '(strain: \S+)', mod_desc )
if m and not m.group(1) in name: str_name += " "+m.group(1)
elif ' str ' in desc:
m = search( ' str (\S+)', mod_desc )
if m and not m.group(1) in name: str_name += " "+m.group(1)
elif ' str. ' in desc:
m = search( ' str\. (\S+)', mod_desc )
if m and not m.group(1) in name: str_name += " "+m.group(1)
elif 'sp.' in desc:
m = search( '(sp\. \S+)', mod_desc )
if m and not m.group(1) in name: str_name += " "+m.group(1)
elif '/' in desc: # for viruses especially
m = search( '(\S+\/\S+\/\S+\/\S+)', mod_desc )
if m and not m.group(1) in name: str_name += " "+m.group(1)
str_name = str_name.rstrip(',')
#more strain name search if all of above failed
if not str_name:
if "ATCC " in desc:
m = search( '(ATCC \S+)', mod_desc )
if m and not m.group(1) in name: str_name += " "+m.group(1)
if "NCTC " in desc:
m = search( '(NCTC \S+)', mod_desc )
if m and not m.group(1) in name: str_name += " "+m.group(1)
elif "genome assembly " in desc:
m = search( 'genome assembly (\S+)', mod_desc )
if m and not m.group(1) in name: str_name += " "+m.group(1)
elif "," in desc:
if name in desc:
m = search( '%s (\S+),'%name, desc )
if m and not m.group(1) in name: str_name += " "+m.group(1)
elif name in desc:
m = search( '%s (\S+)'%name, desc )
if m and not m.group(1) in name: str_name += " "+m.group(1)
elif name in desc:
m = search( '%s ([^,]+),'%name, desc )
if m and not m.group(1) in name: str_name += " "+m.group(1)
else:
m = search( '(\S+),', desc )
if m and not m.group(1) in name:
if not m.group(1).startswith('('):
str_name += " "+m.group(1).rstrip(')')
else:
str_name += " "+m.group(1)
# desperate mode
if not str_name:
str_name = mod_desc
return (str_name, iso_name)
def initTaxaDB(c):
c.execute("""
CREATE TABLE IF NOT EXISTS taxonomy_custom(
TAXID CHAR(30) NOT NULL,
P_TAXID CHAR(20) NOT NULL,
STR_ID INT NOT NULL,
NAME CHAR(200) NOT NULL,
DEPTH INT NOT NULL,
RANK CHAR(20) NOT NULL,
PRIMARY KEY (TAXID)
);
""")
c.execute("""
CREATE TABLE IF NOT EXISTS strain_taxonomy(
TAXID CHAR(30) NOT NULL, /* taxid */
P_TAXID CHAR(30) NOT NULL, /* parent taxid */
NAME CHAR(200) NOT NULL, /* taxonomy name */
STR_NAME CHAR(200) NULL, /* strain name */
ISO_NAME CHAR(200) NULL, /* isolate name */
ASSEM_ACC CHAR(50) NULL, /* assembly acc# */
FILE CHAR(50) NULL, /* file name */
PRIMARY KEY (TAXID)
);
""")
conn.commit()
def processASR( c, tid2lineage, asr, rc, al, asr_dir, target_tid, tag, flag_red_str, flag_ind_iso=0 ):
"""
ARGVS:
c OBJ sqlite3 connection obj
asr STR asr file
rc LIST refseq category
al LIST assembly level
asr_dir STR asr local directory
target_tid LIST target taxid
tag STR tag
flag_red_str BOOL flag for allowing redundant strains
flag_ind_iso BOOL flag for including isolate in strain name
RETURN:
asr_info DICT
"""
# init vars
asr_info = t._autoVivification()
lineage = t._autoVivification()
cnt_q = 0 #qualified assembly
cnt_tol = 0 #total genomes
# parsing assembly_summary_refseq.txt file
with open(asr) as f:
for line in f:
if line.startswith('#'):
continue
else:
cnt_tol += 1
line = line.strip('\n')
#split each line in assembly_summary_refseq.txt:
# 0- 4 assembly_accession bioproject biosample wgs_master refseq_category
# 5- 9 taxid species_taxid organism_name infraspecific_name isolate
# 10-14 version_status assembly_level release_type genome_rep seq_rel_date
# 15-19 asm_name submitter gbrs_paired_asm paired_asm_comp ftp_path
# 20-21 excluded_from_refseq relation_to_type_material
tmp = line.split('\t')
filename = ""
local_path = ""
sys.stderr.write( "[INFO] Processing: %s..."%tmp[0] )
# try to get taxonomy lineage
try:
lineage = t.taxid2lineageDICT(tmp[5], 1, 1)
except:
sys.stderr.write( "skipped. Removed TaxID (%s) found for %s.\n" % (tmp[5], tmp[0]) )
continue
# SKIPPING following records
# 1) not specified refseq_category, example: reference
if rc:
flag=0
for cate in rc:
if cate.lower() in tmp[4].lower():
flag=1
break
if not flag:
sys.stderr.write( "skipped. Not belongs to specific refseq_category %s.\n"%rc )
continue
# 2) not specified assembly_level, example: complete
if al:
flag=0
for l in al:
if l.lower() in tmp[11].lower():
flag=1
break
if not flag:
sys.stderr.write( "skipped. Not belongs to specific assembly_level %s.\n"%al )
continue
# 3) assemblies that marked "excluded_from_refseq"
if tmp[20]:
sys.stderr.write( "skipped. Marked as excluded_from_refseq.\n" )
continue
# 4) not belongs to specified tax id
if tmp[5] and target_tid:
flag_tid_in_lineage=0
for t_tid in target_tid:
if not flag_tid_in_lineage:
for t_rank in lineage:
if lineage[t_rank]['taxid'] == t_tid:
flag_tid_in_lineage=1
break
if not flag_tid_in_lineage:
sys.stderr.write( "skipped. Not belongs to specified tid.\n" )
continue
# 5) sequence location is not available
if tmp[19] != "na":
if tmp[19].startswith('ftp'):
# real filename of assembly
filename = tmp[19].split('/')[-1] + "_genomic.fna.gz"
# directory structure is retained to keep local filesystem healthy
local_path = asr_dir + tmp[19].split('genomes')[1]
else:
filename = tmp[19].split('/')[-1]
local_path = "/".join(tmp[19].split('/')[:-1])
else:
sys.stderr.write( "skipped. No URL provided.\n" )
continue
# 6) not an unique strain
tid = tmp[5]
rank = t.taxid2rank(tid)
name = t.taxid2name(tid)
if rank != "strain" and rank != "unknown":
# generate custom strain if not exists and add strains to database
(cus_str_taxid, cus_str_name, str_name, iso_name) = generateCustomStrain( tid, name, '', c, tmp[8].replace("strain=",""), tmp[9], flag_ind_iso )
#add custom strain to lineage
lineage["strain"]['taxid'] = cus_str_taxid
lineage["strain"]['name'] = cus_str_name
flag_new_strain = checkNewTaxid(c, lineage["strain"]['taxid'], "", lineage["strain"]['name'], tmp[8].replace("strain=",""), tmp[9], tmp[0], filename )
if not flag_new_strain and not flag_red_str:
sys.stderr.write( "skipped. Not an unique strain.\n" )
continue
sys.stderr.write( "qualified.\n" )
# download sequence files if not available at local directory
if not os.path.isfile( local_path+"/"+filename ):
url = tmp[19]+"/"+filename
wget( url, local_path )
else:
sys.stderr.write( "[INFO] Found local file: %s.\n"%(local_path+"/"+filename) )
# tag
if not tag:
tag = lineage["superkingdom"]['name'][0]
# use assembly_accession as key
cnt_q += 1
tid = lineage["strain"]['taxid']
asr_info[tmp[0]]['taxid'] = tid
asr_info[tmp[0]]['full_str_name'] = lineage["strain"]['name']
asr_info[tmp[0]]['ftp_path'] = tmp[19]
asr_info[tmp[0]]['local_path'] = local_path
asr_info[tmp[0]]['filename'] = filename
asr_info[tmp[0]]['type_material'] = True if tmp[21] else False
asr_info[tmp[0]]['cate_tag'] = tag
tid2lineage[tid] = lineage
f.close()
return asr_info, tid2lineage, cnt_q, cnt_tol
def processFASTA( filename, asm_asr_info, tid2lineage, tag, flag_str_sep_fa, accPrefixOnly, skipSeqType, output, flag_ind_iso=False, flag_no_gdb_list=0, flag_redundant_str=False ):
"""
ARGVS:
filename STR FASTA filename
asm_asr_info DICT asr_info[asm]
tag STR tag
flag_str_sep_fa BOOL all sequences in the file belong to one strain
accPrefixOnly STR process sequences with accession number start with particular prefix only
skipSeqType LIST skip type(s) of sequence
output OBJ file object for output FASTA
RETURN:
cnt INT total # of output sequences
cnt_tol INT total # of input sequences
"""
#init vars
(acc, rtype, rdesc, rank, rtid, rname, rseq, rnote) = ("", "", "", "", "", "", "", "")
ignore = False
lineage = {}
gdb_list = {}
cnt_tol = 0
cnt = 0
# open files
if os.path.exists(filename):
if filename.endswith('gz'):
infile = gzip.open(filename, mode='rt', encoding='utf-8')
else:
infile = open(filename, 'r')
else:
sys.stderr.write( "[ERROR] %s doesn't exists.\n"%filename )
exit
# processing FASTA files
for line in infile:
if line.startswith(">"):
cnt_tol+=1
#print last sequences
if acc and not ignore:
cnt+=1
output.write( ">%s|%s|%s|%s [%s|%s] %s\n%s\n" % (acc, len(rseq), rtid, rtype, rank, rname, rdesc, rseq) )
if rank == 'unknown': sys.stderr.write( "\n[WARNING] Unknown sequence: >%s|%s|%s|%s %s\n" % (acc, len(rseq), rtid, rtype, rdesc) )
#reset taxa info for sequences
if not flag_str_sep_fa:
(acc, rtype, rdesc, rank, rtid, rname, rseq, rank) = ("", "", "", "", "", "", "", "")
else:
(acc, rtype, rseq) = ("", "", "")
#parse header
temp = line.strip().split(' ')
header = temp[0]
acc = getAccFromHeader( header )
rdesc = " ".join(temp[1:])
#somehow failed to parse acc from header
if not acc or acc=="unknown":
sys.stderr.write( "[WARNING] Unknown acc#: >%s|%s|%s|%s %s\n" % (acc, len(rseq), rtid, rtype, rdesc) )
ignore = True
continue
# check prefix
if accPrefixOnly:
if not acc.startswith(accPrefixOnly):
sys.stderr.write( "[WARNING] Ignored, acc prefix not match: >%s|%s|%s|%s %s\n" % (acc, len(rseq), rtid, rtype, rdesc) )
ignore = True
continue
# get taxonomy info
if "taxid" in asm_asr_info:
#asm file is provided
rtid = asm_asr_info["taxid"]
rname = asm_asr_info["full_str_name"]
rank = "strain"
lineage = tid2lineage[rtid]
else:
if not rtid:
# for custom acc#, parse [taxid|xxxx]
if "[taxid|" in rdesc:
m = search( '\[taxid\|(\d+)\]', rdesc)
try:
rtid = m.group(1)
except:
sys.stderr.write( "[WARNING] Failed to parse taxid from [taxid|xxxx].\n")
else:
# convert from acc2taxid
rtid = t.acc2taxid(acc)
if rtid and t.taxidStatus( rtid ) != "invalid":
rank = t.taxid2rank( rtid )
name = t.taxid2name( rtid )
else: # skip the reference if not able to find a tax id
ignore = True
sys.stderr.write( "[WARNING] Ignored. Failed mapping to TaxID.\n")
continue
lineage = t.taxid2lineageDICT(rtid, 1, 1)
# if rtid is not strain level taxonomy
(str_name, iso_name) = ("","")
if rank != "strain" and rank != "unknown":
(new_taxid, new_name, str_name, iso_name) = generateCustomStrain( rtid, name, rdesc, c, '', '', flag_ind_iso )
#write custom tax
rtid = new_taxid
rname = new_name
rank = "strain"
lineage[rank]['taxid'] = rtid
lineage[rank]['name'] = rname
else: # if rtid is strain level tax
rname = name
#check if the strain is unique
flag_new_strain = checkNewTaxid(c, rtid, "", rname, str_name, iso_name, "", filename )
if not rtid in tid2lineage:
tid2lineage[rtid] = lineage
if not flag_new_strain and not flag_redundant_str:
sys.stderr.write( "[WARNING] Ignored, strain exists.\n")
ignore = True
continue
# tags for different sequence types
if 'cate_tag' in asm_asr_info:
rtype = asm_asr_info['cate_tag']
elif tag:
rtype = tag
else:
rtype = lineage["superkingdom"]['name'][0]
if 'plasmid' in line or 'Plasmid' in line:
rtype += 'p' #plasmid
elif 'phage' in line or 'Phage' in line:
rtype += 'g' #Phage
else:
rtype += 'c' #chromosome
if skipSeqType and rtype in skipSeqType:
sys.stderr.write( "[WARNING] Ignored, unwanted type.\n")
ignore = True
continue
# PLASMID ONLY: if a plasmid doesn't belong to a species or up, roll it up to species
#if 'p' in rtype and ( rank == 'strain' or rank == 'no rank' ):
# rank = "species"
# rtid = t.taxid2taxidOnRank(rtid, rank)
# rname = t.taxid2name(rtid)
# hash for gotthca_db input list
if not flag_no_gdb_list:
gdb_list[filename] = rtid
ignore = False
else:
if ignore: continue
line = line.strip()
if not line: continue #empty line
rseq += line
#last data
if acc and not ignore:
cnt+=1
output.write( ">%s|%s|%s|%s [%s|%s] %s\n%s\n" % (acc, len(rseq), rtid, rtype, rank, rname, rdesc, rseq) )
infile.close()
return (cnt, cnt_tol, gdb_list)
def generateGottchaDbList(gdb_list, output_gdbl_buffer, tid2lineage):
for fn in gdb_list:
tid = gdb_list[fn]
ranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']
for rank in ranks:
text = "%s\t%s\tlinear\t%s\n"%( tid2lineage[tid][rank]['name'], fn, tid )
if not rank in output_gdbl_buffer:
output_gdbl_buffer[rank] = text
else:
output_gdbl_buffer[rank] += text
return output_gdbl_buffer
def wget(url, lpath="./"):
subprocess.run("mkdir -p %s"%lpath, shell=True, check=True)
subprocess.run("wget '%s' -P %s"%(url, lpath), shell=True, check=True)
if __name__ == '__main__':
argvs = parse_params()
cnt = 0
cnt_fa = 0 # accepted genomes
cnt_fa_tol = 0 # total genomes
cnt_seq = 0
cnt_seq_tol = 0
tid2lineage = t._autoVivification()
output = sys.stdout
output_gdbl_buffer = {}
# loading taxonomy
sys.stderr.write( "Loading taxonomy..." )
t.loadTaxonomy( argvs.dbPath )
sys.stderr.write( "completed.\n" )
# init the taxonomy sqlite3 db file
conn = sqlite3.connect(argvs.sqlitedb)
conn.isolation_level = None
c = conn.cursor()
initTaxaDB(c)
# create output file
output_fn = "%s.fasta"%argvs.prefix
sys.stderr.write( "The script will save output sequences to %s.\n"%output_fn )
output = open( "%s.fasta"%argvs.prefix, "w")
sys.stderr.write( "Start processing...\n" )
# input from assembly summary refseq report
if argvs.assemblySummaryRefseq and os.path.isfile(argvs.assemblySummaryRefseq):
sys.stderr.write( "Parsing assembly summary refseq report...\n" )
(asr_info, tid2lineage, cnt_fa, cnt_fa_tol) = processASR( c, tid2lineage, argvs.assemblySummaryRefseq, argvs.refseqCategoryOnly, argvs.assemblyLevelOnly, argvs.assemblyRefseqDir, argvs.taxid, argvs.tag, argvs.redundantStrain, argvs.includeIsolate )
sys.stderr.write( "Done. %s/%s genome(s) qualified.\n"%(cnt_fa, cnt_fa_tol) )
sys.stderr.write( "Parsing genomes sequences...\n" )
for asm in asr_info:
input_filename = asr_info[asm]['local_path']+"/"+asr_info[asm]['filename']
sys.stderr.write( "Parsing %s from %s...\n"%(asm,input_filename) )
(cnt_fa_seq, cnt_fa_seq_tol, gdb_list) = processFASTA( input_filename, asr_info[asm], tid2lineage, argvs.tag, True, argvs.accPrefixOnly, argvs.skipSeqType, output, argvs.includeIsolate )
output_gdbl_buffer = generateGottchaDbList(gdb_list, output_gdbl_buffer, tid2lineage)
cnt_seq += cnt_fa_seq
cnt_seq_tol += cnt_fa_seq_tol
cnt+=1
sys.stderr.write( "Done. %s/%s sequence(s) parsed.\n"%(cnt_fa_seq, cnt_fa_seq_tol) )
sys.stderr.write( "%s/%s qualified genome(s) parsed.\n"%(cnt, cnt_fa) )
if not argvs.notGottchaDbList:
sys.stderr.write( "Generating gottcha_db input lists...\n" )
ranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']
for rank in ranks:
with open( "%s.gottcha_db.%s.list"%(argvs.prefix, rank), "w") as f:
f.write( output_gdbl_buffer[rank] )
f.close()
sys.stderr.write( "Done.\n" )
# input from FASTA files
if argvs.input:
for input_filename in argvs.input:
cnt_fa += 1
if os.path.isfile(input_filename):
sys.stderr.write( "Parsing %s...\n"%input_filename )
(cnt_fa_seq, cnt_fa_seq_tol, gdb_list) = processFASTA( input_filename, {}, tid2lineage, argvs.tag, argvs.strainSeparateFasta, argvs.accPrefixOnly, argvs.skipSeqType, output, argvs.includeIsolate, argvs.notGottchaDbList, argvs.redundantStrain )
cnt += 1
cnt_seq += cnt_fa_seq
cnt_seq_tol += cnt_fa_seq_tol
sys.stderr.write( "Done. %s/%s sequence(s) parsed.\n"%(cnt_fa_seq, cnt_fa_seq_tol) )
else:
sys.stderr.write( "[WARNING] Skipped: file is not available: %s\n"%input_filename )
# output custom database from sqlite3 to tsv
with open(argvs.sqlitedb+".tsv", "w") as tsv_output:
try:
c.execute( 'SELECT * FROM taxonomy_custom ORDER BY NAME' )
for row in c.fetchall():
tsv_output.write( "%s\t%s\t%s\t%s\t%s\n"%(row[0], row[4], row[1], row[5], row[3]) )
except sqlite3.Error as e:
sys.stderr.write( "[ERROR] An error occurred: %s\n" % e.args[0] )
conn.close()
sys.stderr.write( "\nComplete! %d/%d qualified genome file(s); %d/%d sequence(s) processed. \n" % (cnt, cnt_fa, cnt_seq, cnt_seq_tol) )