-
Notifications
You must be signed in to change notification settings - Fork 2
/
split_fastas.py
executable file
·84 lines (64 loc) · 2.16 KB
/
split_fastas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python2
import os, optparse
import shutil
import math
def main():
    """Parse the command line and split the query FASTA into the temp directory.

    The directory given with ``-t/--temp`` is removed if it already exists,
    created afresh, and made the current working directory; the sub-files
    produced by ``split_fasta`` are therefore written inside it.

    Exits through ``option_parser.error`` (usage message, status 2) when
    ``--query`` or ``--temp`` is missing.
    """
    usage = "usage: %prog [options]"
    option_parser = optparse.OptionParser(usage)
    option_parser.add_option(
        '-q', '--query', help = 'Fasta query file, can be multiple sequences'
    )
    option_parser.add_option(
        '-t', '--temp',
        help='Output directory; it is deleted and recreated if it already exists'
    )
    option_parser.add_option(
        '--numProcs', default=4, type=int,
        help='Maximum number of sub-files to split the query into (default: 4)'
    )
    options, _ = option_parser.parse_args()

    # Fail with a usage message instead of a TypeError on a None path.
    if not options.query:
        option_parser.error('a query fasta file is required (-q/--query)')
    if not options.temp:
        option_parser.error('a temp directory is required (-t/--temp)')

    # Resolve the query path now: after the chdir below a relative path
    # would be looked up inside the temp directory and not be found.
    options.query = os.path.abspath(options.query)

    input_directory = options.temp
    if os.path.exists(input_directory):
        shutil.rmtree(input_directory, ignore_errors=True)
    os.mkdir(input_directory)
    os.chdir(input_directory)
    split_fasta(options)
def split_fasta(opts):
    """Split the query FASTA into consecutive sub-files in the current directory.

    Sequences are read from ``opts.query`` and written out, in their original
    order, in chunks of equal size (the last chunk may be shorter), so that at
    most ``opts.numProcs`` files are created.

    Args:
        opts: Options object with a ``query`` attribute (path of the FASTA
            file to split) and a ``numProcs`` attribute (requested number of
            sub-files). When the query holds fewer sequences than
            ``numProcs``, ``opts.numProcs`` is lowered to the sequence count.

    Returns:
        List of the file names created, in order; empty when the query
        contains no sequences.
    """
    # Will hold the names of all the files created
    sub_files = []
    names, seqs = read_fasta_lists(opts.query)
    num_seqs = len(names)
    if num_seqs == 0:
        return sub_files

    if num_seqs >= opts.numProcs:
        # Integer ceiling division. The previous math.ceil(a/b) floored under
        # Python 2 (int/int), giving a chunk size that was too small and
        # therefore more files than numProcs.
        sub_size = (num_seqs + opts.numProcs - 1) // opts.numProcs
    else:
        # Fewer sequences than requested files: one sequence per file.
        opts.numProcs = num_seqs
        sub_size = 1

    for start in range(0, num_seqs, sub_size):
        sub_names = names[start:start + sub_size]
        sub_seqs = seqs[start:start + sub_size]
        # Name is "<first>_<last>.fasta" using 1-based positions.
        # NOTE: <last> is always start+sub_size, so for a short final chunk
        # it can exceed the number of sequences in the query.
        new_filename = '%d_%d.fasta' % (start + 1, start + sub_size)
        sub_files.append(new_filename)
        write_fasta(sub_names, sub_seqs, new_filename)
    return sub_files
def read_fasta_lists(in_file):
    """Read a FASTA file into parallel lists of names and sequences.

    Args:
        in_file: Path of the FASTA file to read.

    Returns:
        A ``(names, seqs)`` tuple of two lists of equal length. ``names[i]``
        is the i-th header line without its leading ``>`` (surrounding
        whitespace stripped); ``seqs[i]`` is the concatenation of the
        stripped lines that follow that header. Text appearing before the
        first header belongs to no record and is ignored.
    """
    names = []
    seqs = []
    # Lines of the record currently being read; None until the first header
    # so that leading text is not glued onto the first sequence.
    chunks = None
    with open(in_file, 'r') as fin:
        for line in fin:
            line = line.strip()
            if line.startswith('>'):  # indicates the name of the sequence
                if chunks is not None:
                    seqs.append(''.join(chunks))
                names.append(line[1:])
                chunks = []
            elif chunks is not None:
                chunks.append(line)
    # Flush the last record; nothing to flush if no header was ever seen.
    if chunks is not None:
        seqs.append(''.join(chunks))
    return names, seqs
def write_fasta(names, seqs, new_filename):
    """Write sequences to a FASTA file, one header line and one sequence line each.

    Args:
        names: Sequence names, written after a leading ``>``.
        seqs: Sequences, parallel to ``names``; ``seqs[i]`` is written on the
            line after ``names[i]``.
        new_filename: Path of the file to create (overwritten if it exists).

    Raises:
        IndexError: If ``seqs`` is shorter than ``names``.
    """
    # The context manager closes the file even if a write fails part-way.
    with open(new_filename, 'w') as fout:
        for index, name in enumerate(names):
            fout.write(">%s\n%s\n" % (name, seqs[index]))
# Run the splitter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()