-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathseparate_proteins.py
142 lines (121 loc) · 5.54 KB
/
separate_proteins.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os
import sys
import gzip
import pdb
from Bio import SeqIO
class Protein:
def __init__(self, name, p_id, product, start, end, translation, parent_dir):
self.name = name
self.id = p_id
self.product = product
self.start = start
self.end = end
self.translation = translation
self.parent_dir = parent_dir
def return_fasta(self):
"""
generate FASTA sequence with a sequence name
"""
seq_name = [self.parent_dir]
# I can also add 'product' to this list but product was causing problems
# because it has special characters sometimes
for k in ['name', 'id']:
if getattr(self, k):
seq_name.append(getattr(self, k))
seq_name.append("coord_" + str(self.start) + "-" + str(self.end))
return ">" + "_".join(seq_name) + "\n" + self.translation + "\n"
def write_sequences(proteins, prot_name, out_file):
"""
takes the proteins dict and the protein name
outputs the sequences for that protein in the out_file in fasta format
"""
with open(out_file, "w") as out_file:
if prot_name in proteins:
for prot in proteins[prot_name]:
out_file.write(prot.return_fasta())
# check all gbf and gbff files in directory given
if len(sys.argv) < 3:
print("Takes a txt file with a list of gbk files and an output file name/path")
print("accepted file types (gbf, gbff, gbk) or gzipped version of these")
sys.exit()
in_list = sys.argv[1]
out_file = sys.argv[2]
gene_files = []
with open(in_list, "r") as infile:
for f in infile:
f = f.strip()
gene_files.append(f)
"""
I will loop through each file store the features and check if they are CDS
if so, then I take the qualifiers and take the "gene" key for the name
and the "translation" key for the amino acid sequence
I should keep this information for the name of the sequences
file name the sequence came from, gene name, location (start, end)
to get start and end, I take the feature.location.nofuzzy_start and nofuzzy_end
"""
# proteins = dict()
outfile = open(out_file, "w")
for f in gene_files:
# if gzipped then opening in text mode then giving the handle to seqio
if f.endswith("gz"):
handle = gzip.open(f, "rt")
else:
handle = open(f, "r")
for rec in SeqIO.parse(handle, 'genbank'):
for feature in rec.features:
if feature.type == "CDS":
# If the script was running from somewhere else, I take the name of the directory
# where the genbank files are and use it as parent_dir
# if the scrip was ran in the same directory the I get the current working directoy
# and take the last part
splits = f.split(os.sep)
if (len(f.split(os.sep)) == 2) and (splits[0] == "."):
parent_dir = os.getcwd().split(os.sep)[-1]
else:
parent_dir = f.split(os.sep)[-2]
# if "gene" not in feature.qualifiers:
if "translation" in feature.qualifiers:
try:
name = feature.qualifiers['gene'][0]
except KeyError:
name = ""
try:
p_id = feature.qualifiers['protein_id'][0]
except KeyError:
p_id = ""
try:
product_name = feature.qualifiers['product'][0]
product_name = product_name.replace(" ", "-")
except KeyError:
product_name = ""
# pdb.set_trace()
prot = Protein(name=name, p_id=p_id, product=product_name,
start=feature.location.nofuzzy_start, end=feature.location.nofuzzy_end,
translation=feature.qualifiers["translation"][0], parent_dir=parent_dir)
outfile.write(prot.return_fasta())
# if prot.id not in proteins:
# proteins[prot.id] = [prot]
# else:
# proteins[prot.id].append(prot)
# else:
# if "translation" in feature.qualifiers:
#
# prot = Protein(name=feature.qualifiers["gene"][0], p_id=feature.qualifiers["protein_id"][0],
# start=feature.location.nofuzzy_start, end=feature.location.nofuzzy_end,
# translation=feature.qualifiers["translation"][0], parent_dir=parent_dir)
# outfile.write(prot.return_fasta())
# # if prot.name not in proteins:
# # proteins[prot.name] = [prot]
# # else:
# # proteins[prot.name].append(prot)
outfile.close()
## TESTING, need to delete later
# random_prots = ['fldA', 'rpoA', 'yoaE', 'ymgG', 'ydiK', 'ybeL', 'cdsA', 'yqhC', 'rsxG', 'arcB', 'alkA', 'ycaL']
# for prot in random_prots:
# out_f_name = "e_coli_" + prot + ".fasta"
# with open(out_f_name, "w") as out_file:
# for protein in proteins[prot]:
# out_file.write(protein.return_fasta())
# I might add a module that calls clustalo on each outputted protein to generate an MSA and maybe remove the original
# to save space
# $clustalo -i all_proteins.fasta -o all_proteins.aln --threads 10 --distmat-out=all_proteins.mat --force --full --percent-id