-
Notifications
You must be signed in to change notification settings - Fork 3
/
format_db1.py
executable file
·292 lines (237 loc) · 12.5 KB
/
format_db1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#!/usr/bin/python
from multiprocessing import Pool
import time
import os
import sys
import argparse
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqUtils import GC
from Bio.SeqRecord import SeqRecord
#from Bio.Seq import Seq
#from Bio.Alphabet import generic_dna
# Copyright(C) 2015 David Ream
# Released under GPL version 3 licence. http://www.gnu.org/licenses/lgpl.html
# Do not remove this comment
# This exists to make the main function easier to read. It contains code to run the argument parser, and does nothing else.
def parser_code():
parser = argparse.ArgumentParser(description='Convert all genbank files found in a specified folder, and optionally a file containing the accessions that you wish to include, and create BLAST searchable databases from them.')
#parser.add_argument("-i", "--infolder", dest="infolder", metavar="DIRECTORY", default='./genomes/',
# help="Folder containing all genbank files for use by the program.")
parser.add_argument("-G", "--genbank_directory", dest="genbank_directory", metavar="DIRECTORY", default='./genomes/',
help="Folder containing all genbank files for use by the program.")
parser.add_argument("-o", "--outfolder", dest="outfolder", metavar="DIRECTORY", default='./db/',
help="Folder where the BLAST searchable databases will be stored.")
parser.add_argument("-f", "--filter", dest="filter", metavar="FILE", default='',
help="File restrictiong which accession numbers this script will process. If no file is provided, filtering is not performed.")
parser.add_argument("-n", "--num_proc", dest="num_proc", metavar="INT", default = os.sysconf("SC_NPROCESSORS_CONF"), type=int,
help="Number of processors that you want this script to run on. The default is every CPU that the system has.")
parser.add_argument("-p", "--protein", dest="protein", default=True, action='store_false',
help="Flag to toggle mode from protein to DNA sequences for use as database construction. The default is protein.")
parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False,
help="Suppresses most program text outputs.")
return parser.parse_args()
def check_options(parsed_args):
'''
# section of code that checks the infolder entry
if os.path.isdir(parsed_args.infolder):
infolder = parsed_args.infolder
else:
print "The folder %s does not exist." % parsed_args.infolder
sys.exit()
'''
# check the genbank folder
if os.path.isdir(parsed_args.genbank_directory):
genbank_directory = parsed_args.genbank_directory
else:
print "The folder %s does not exist." % parsed_args.genbank_directory
sys.exit()
# if the directory that the user specifies does not exist, then the program makes it for them.
if not os.path.isdir(parsed_args.outfolder):
os.makedirs(parsed_args.outfolder)
outfolder = parsed_args.outfolder
if outfolder[-1] != '/':
outfolder = outfolder + '/'
# Check the filter file
if parsed_args.filter == '' or os.path.exists(parsed_args.filter):
filter_file = parsed_args.filter
else:
print "The file %s does not exist." % parsed_args.filter
sys.exit()
# section of code that deals determining the number of CPU cores that will be used by the program
if parsed_args.num_proc > os.sysconf("SC_NPROCESSORS_CONF"):
num_proc = os.sysconf("SC_NPROCESSORS_CONF")
elif parsed_args.num_proc < 1:
num_proc = 1
else:
num_proc = int(parsed_args.num_proc)
do_protein = parsed_args.protein
quiet = parsed_args.quiet
#return infolder, outfolder, filter_file, num_proc, do_protein
return genbank_directory, outfolder, filter_file, num_proc, do_protein, quiet
#this function will return all of the files that are in a directory. os.walk is recursive traversal.
def returnRecursiveDirFiles(root_dir):
result = []
for path, dir_name, flist in os.walk(root_dir):
for f in flist:
fname = os.path.join(path, f)
if os.path.isfile(fname):
result.append(fname)
return result
#################################################################
#### so yeah.... not gonna do this right now... #####
#### I feel there is no reason to, but keeping #####
#### the code in here just in case i am wrong about that #####
#################################################################
# I need to do an analysis on gc content, and gc skew.
# currently this will return the organism gc, (mean, SD, varience) of the GC in coding regions
# the results will we retained in a file the calling program opens tab delimited
def GCAnalysis(NC, organism, gc_list, seq, outfile):
handle = open(outfile, 'a')
organism_gc = "%3.2f" % GC(seq)
mean = "%3.2f" % numpy.mean(gc_list)
#mode = "%5.2f" % numpy.mode(gc_list)
var = "%5.2f" % numpy.var(gc_list)
std = "%5.2f" % numpy.std(gc_list)
handle.write('\t'.join([NC, organism, organism_gc, mean, var, std]) + NEW_LINE)
# take the genbank file specified by genbank path, and save the customized result file in the db_directory folder
#def convert_genbank(genbank_path, db_directory, error_fname): #, gc_outfile = 'gc_analysis.txt'):
def convert_genbank(genbank_tuple):
genbank_path, db_directory, error_fname, do_protein = genbank_tuple
record_list = []
seq_record = SeqIO.parse(open(genbank_path), "genbank").next()
accession = seq_record.annotations['accessions'][0]
organism = seq_record.annotations['organism'].replace(' ', '_')
err_log = []
gc_list = [] # no need for this right now, but leaving in
# loop over the genbank file
for fnum, feature in enumerate(seq_record.features):
err_flag = False
error_in_field = False
if feature.type == 'CDS':
#print dir(feature.location)
try:
start = int(feature.location.start)
stop = int(feature.location.end)
except:
error_in_field = True
strand = feature.strand
dna_seq = seq_record.seq[start:stop]
#print "dna_seq", type(dna_seq), dna_seq
gc = GC(dna_seq)
gc_list.append(gc)
gc = "%3.2f" % gc
try:
locus = feature.qualifiers['locus_tag'][0]
except:
try:
locus = feature.qualifiers['gene'][0]
except:
locus = 'error'
print "Error in the organism %s with NC # %s" % (organism, accession)
err_flag = True
err_log.append([organism, accession])
if do_protein:
#seq = seq.translate()
#print type(seq)
#print feature.qualifiers.keys()
#seq = dir(feature)
try:
if 'translation' in feature.qualifiers.keys():
prot_seq = Seq(''.join(feature.qualifiers['translation']), IUPAC.protein)
#print "prot_seq", type(prot_seq), prot_seq
if 'gene' in feature.qualifiers:
gene = feature.qualifiers['gene'][0]
#record_list.append(SeqRecord(prot_seq, id = '|'.join([accession, organism, locus, gene, str(start), str(stop), str(strand), gc]).replace(' ', ''), description = ''))
seq_rec_to_store = SeqRecord(prot_seq, id = '|'.join([accession, organism, locus, gene, str(start), str(stop), str(strand), gc]).replace(' ', ''), description = '')
else:
#record_list.append(SeqRecord(prot_seq, id = '|'.join([accession, organism, locus, 'unknown', str(start), str(stop), str(strand), gc]).replace(' ', ''),description = ''))
seq_rec_to_store = SeqRecord(prot_seq, id = '|'.join([accession, organism, locus, 'unknown', str(start), str(stop), str(strand), gc]).replace(' ', ''),description = '')
#print prot_seq
else:
pass
#print "This was not a protein sequence"
except:
print "Error in function convert_genbank(genbank_tuple) from the format_db.py script, unhandled error in the genbank parse."
else:
# put something in here that will deal with RNA later, if we plan to go that route.
pass
if not error_in_field:
record_list.append(seq_rec_to_store)
else:
print "a record was omitted"
'''
#print len(seq)
if len(seq) < 2:
#pass
print "len seq", len(seq)
elif do_protein:
if 'gene' in feature.qualifiers:
gene = feature.qualifiers['gene'][0]
record_list.append(SeqRecord(seq, id = '|'.join([accession, organism, locus, gene, str(start), str(stop), str(strand), gc]).replace(' ', ''),
description = ''))
else:
record_list.append(
SeqRecord(seq, id = '|'.join([accession, organism, locus, 'unknown', str(start), str(stop), str(strand), gc]).replace(' ', ''),
description = ''))
else:
if 'gene' in feature.qualifiers:
gene = feature.qualifiers['gene'][0]
record_list.append(SeqRecord(seq, id = '|'.join([accession, organism, locus, gene, str(start), str(stop), str(strand), gc]).replace(' ', ''),
description = ''))
else:
record_list.append(
SeqRecord(seq, id = '|'.join([accession, organism, locus, 'unknown', str(start), str(stop), str(strand), gc]).replace(' ', ''),
description = ''))
'''
#if os.path.isfile(gc_outfile):
# os.remove(gc_outfile)
#GCAnalysis(accession, organism, gc_list, seq_record.seq, gc_outfile)
handle = open(error_fname, 'a')
for i in err_log:
handle.write('\t'.join(i) + '\n')
handle.close()
if not err_flag:
outpath = db_directory + os.path.splitext(os.path.basename(genbank_path))[0] + '.ffc'
#print outpath
out_handle = open(outpath,"w")
SeqIO.write(record_list, out_handle, "fasta")
out_handle.close()
if do_protein:
cmd = "formatdb -i %s -p T -o F" % (outpath)
#print "got here"
else:
cmd = "formatdb -i %s -p F -o F" % (outpath)
os.system(cmd)
#print "Passed main loop"
return outpath, err_flag
def parallel_convert_genbank(file_list, outfolder, num_proc, do_protein, error_fname = "./error_log.txt"):
# Make sure that we have a new error log each time the program is run
if os.path.isfile(error_fname):
os.remove(error_fname)
# Package the variables for the convert_genbank function so everything can be run in parallel
tuple_list = [(i, outfolder, error_fname, do_protein) for i in file_list]
pool = Pool(processes = num_proc)
result = dict(pool.map(convert_genbank, tuple_list))
def main():
start = time.time()
parsed_args = parser_code()
#infolder, outfolder, filter_file, num_proc, do_protein = check_options(parsed_args)
genbank_directory, outfolder, filter_file, num_proc, do_protein, quiet = check_options(parsed_args)
#flist = returnRecursiveDirFiles(infolder)
flist = returnRecursiveDirFiles(genbank_directory)
if filter_file != '':
filter_list = [i.strip() for i in open(filter_file).readlines()]
file_list = [i for i in flist if i.split('/')[-1].split('.')[0] in filter_list]
else:
file_list = flist
#print "do_protein", do_protein
parallel_convert_genbank(file_list, outfolder, num_proc, do_protein)
if not quiet:
print time.time() - start
# A successful command could look like this:
# ./format_db.py -f ./phylo_order.txt
# ./format_db.py -i /home/dave/Desktop/all_genbank -o ./db1/ -f ./phylo_order.txt
if __name__ == '__main__':
main()