-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbulk_makeblastdb.py
executable file
·87 lines (68 loc) · 2.92 KB
/
bulk_makeblastdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python3
"""
> bulk_makeblastdb.py <
Script to automate the makeblastdb process of (re-)making BLAST databases.
Useful for:
1. compiling new databases
2. changing descriptions of existing databases
To control which databases get recompiled, there's an external hardcoded file
that maps filename --> database type & description. Filename passed into
argparse MUST match these filenames identically.
Passing 'all' as the filename will process ALL files contained in the hardcoded
file. Use with caution.
NOTE: to prevent SequenceServer keeling over, FASTA sequences will have trailing
stop codons ('*') removed and internal stop codons replaced with 'X'.
"""
import argparse
import csv
import gzip
import os
import re
import subprocess
import tempfile
parser = argparse.ArgumentParser(description='''
Script to automate the makeblastdb process of (re-)making BLAST databases.''')
parser.add_argument('database_name', metavar='database_name',
type=str, nargs='+',
help='FASTA files for makeblastdb.')
args = parser.parse_args()
# read hardcoded file
blast_dict = {}
script_abspath = os.path.abspath(__file__)
hardcoded_tsv = script_abspath[:-2] + 'tsv'
tsv_reader = csv.reader(open(hardcoded_tsv), delimiter='\t')
for row in tsv_reader:
if not row: continue
if row[0][0] == '#': continue # ignore commented lines
blast_dict[row[0]] = (row[1], row[2], row[3])
if args.database_name == ['all']:
filenames = [x for x in blast_dict]
else:
filenames = args.database_name
for f in filenames:
if f not in blast_dict:
print (f, 'not found in hardcoded input file!')
else:
db_type, db_title, db_filename = blast_dict[f]
# crude way to determine whether file is gzipped or not
if db_filename[-3:] == '.gz':
sequences = gzip.open(db_filename, 'rt').read()
else:
sequences = open(db_filename).read()
if db_type == 'prot':
# replace/remove stop codons appropriately
# note: while these steps can be easily accomplished with sed (via
# subprocess), there's a chance that malicious filenames might
# compromise the system
sequences = re.sub(r'\*\n>', '\n>', sequences)
sequences = re.sub(r'\*\r\n>', '\r\n>', sequences) # deal with CRLF
sequences = re.sub(r'\*', 'X', sequences)
with tempfile.NamedTemporaryFile() as tf:
tf.write(sequences.encode())
tf.seek(0)
subprocess.run(['makeblastdb', '-in', tf.name,
'-out', f,
'-dbtype', db_type,
'-title', db_title,
'-parse_seqids'])
print (f'SUCCESS: database {f} created, with type {db_type} and title "{db_title}"')