forked from tsa4002/find_orthologs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathF18_Final_Project.py
362 lines (315 loc) · 14.3 KB
/
F18_Final_Project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
import subprocess
import sys
from Bio.Align.Applications import MuscleCommandline
from StringIO import StringIO
from Bio import AlignIO
def combine_fasta(filename1, filename2): #DM
'''
This function combines two FASTA files into once
It takes in two different FASTA files, one with the human sequence and the
other with the nonhuman sequences and combines the two so that they can be
aligned using MUSCLE
Args:
filename1 - the non-human FASTA file
filename2 - the human FASTA file
Returns:
combined.fasta - a new file with the combined sequences
'''
with open("combined.fasta", "w") as output:
with open(filename1, "r") as f1:
line = f1.readline()
while line:
output.write(line)
line = f1.readline()
with open(filename2, "r") as f2:
output.write('\n')
line = f2.readline()
while line:
output.write(line)
line = f2.readline()
def get_hum_access(filename2):
'''
This function parses out the accession number of the human gene from
filename2. This will later be used to search the ClinVar file for only
relevant mutations.
Args:
filename2 - the human FASTA file
Returns:
hum_access - the accession of the human gene
'''
hum_access = ''
with open(filename2, 'r') as f2:
for line in f2:
if line[0] == '>':
line = line.strip('>')
line = line.split(' ')
print(line[0])
hum_access = hum_access + line[0]
return hum_access
#Creates a new files where the ClustalW alignment will be written to
out_file = "aligned_combined.aln"
#Takes in the combined file of human and non-human sequences and aligns them
muscle_cline=MuscleCommandline(input="combined.fasta", out=out_file, clw=True)
stdout, stderr = muscle_cline()
align=AlignIO.read(out_file, "clustal")
def cleanup_dict(clin_var_data_dict): #NK&NS
'''
Converts the amino acids from three to one letter codes.
Takes in the clin_var_data_dict and accesses the positions containing the
original and mutant amino acids. Depending on length, the function will
either convert from three to one letter code or include an arrow (->) to
indicate a frame shift mutation. In the conversion dictionary, terminating
amino acids are converted into stars (*) as well as deletions. If the amino
acid data is not in the dictionary, the data is passed over.
Args:
clin_var_data_dict - dictionary with accession numbers and data
Returns:
nothing is returned. Only modifies clin_var_data_dict.
'''
amino_conversion= {'Ala':'A','Arg':'R','Asn':'N','Asp':'D','Cys':'C',
'Glu':'E','Gln':'Q','Gly':'G','His':'H','Ile':'I',
'Leu':'L','Lys':'K','Met':'M','Phe':'F','Pro':'P',
'Ser':'S','Thr':'T','Trp':'W','Tyr':'Y','Val':'V',
'Ter':'*', 'del':'*'}
for accession in clin_var_data_dict:
for entry in clin_var_data_dict[accession]:
#Converts original amino acid to one letter code
if len(entry[0]) == 0:
pass
elif len(entry[0]) == 3:
if amino_conversion.get(entry[0]) == None:
pass
else:
entry[0] = amino_conversion[entry[0]]
else:
pass
#Converts mutant amino acid to one letter code
if len(entry[2]) == 0:
pass
elif len(entry[2]) == 3:
if amino_conversion.get(entry[2]) == None:
pass
else:
entry[2] = amino_conversion[entry[2]]
#If mutant results in a frameshift, arrow indicates it
elif len(entry[2]) == 5:
if amino_conversion.get(entry[2][0:3]) == None:
pass
else:
entry[2] = amino_conversion[entry[2][0:3]] + '->'
else:
pass
def clin_var_parser(): #NK&NS
'''
Parses variant_summary.txt into accession number and data.
Creates a dictionary which holds accession numbers as keys and each value is
a list containing each entry under that accession number as a nested list.
Each entry list contains: the original amino, the mutation position, the
mutant amino, whether a deletion occurred ("yes"/"no"), mutation type, its
clinical significance, the phenotype, and the chromosome the mutation occurs
in. It excludes entries that lack protein information or if a large deletion
and an insertion occurred.
Returns:
clin_var_data_dict - dictionary with accession numbers and data
'''
#Initializes the clin_var_data_dict
clin_var_data_dict = {}
with open('variant_summary2.txt', 'r') as fh:
#Variable containing all column labels in fh
column_labels = fh.readline()
#List to contain each line in fh
lines = []
#Parses out the mRNA accession number of each line and creates a
#dictionary entry for each accession number
for line in fh:
line = line.strip('\n')
line = line.split('\t')
lines.append(line)
#Variable containing the accession number of the entry
accession_num = line[2].split('(')[0]
other_Name_data = line[2].split('(')[-1]
#Creates an entry in dictionary for mRNA accession number
if accession_num.find('NM') >= 0:
if other_Name_data.find('p.') >= 0 and \
other_Name_data.find('delins') == -1 and \
other_Name_data.find('_') == -1:
clin_var_data_dict[accession_num] = []
for line in lines:
accession_num = line[2].split('(')[0]
if accession_num.find('NM') >= 0:
#Replacing first parentheses with star to find amino info
line[2] = line[2].replace('(','*',1)
current_line = line[2].split('(')
amino_info = current_line[-1]
original_amino = ''
position = ''
mutant_amino = ''
deletion_y_n = ''
#CASE: if no protein data is given
if amino_info.find('p.') == -1:
#print 'noprotdata', amino_info
pass
#CASE: if the mutation is a large deletion and insertion
elif amino_info.find('delins') >= 0:
pass
#CASE: if the nucleotide mutation is the same amino acid
elif amino_info.find('=') >= 0:
original_amino = amino_info[2:5]
for char in amino_info:
if char.isdigit():
position += char
position = position
mutant_amino = original_amino
deletion_y_n = 'no'
#CASE: if the mutation is a deletion
elif amino_info.find('del') >= 0:
original_amino = amino_info[2:5]
for char in amino_info:
if char.isdigit():
position += char
position = position
mutant_amino = amino_info[(len(amino_info)-4):-1]
deletion_y_n = 'yes'
#CASE: if the mutation is a frameshift
elif amino_info.find('fs') >= 0:
original_amino = amino_info[2:5]
for char in amino_info:
if char.isdigit():
position += char
position = position
mutant_amino = amino_info[(len(amino_info)-6):-1]
deletion_y_n = 'no'
#CASE: if the mutation leads to a termination (Ter)
elif amino_info.find('Ter') >= 0:
original_amino = amino_info[2:5]
for char in amino_info:
if char.isdigit():
position += char
position = position
mutant_amino = amino_info[(len(amino_info)-4):-1]
deletion_y_n = 'no'
#CASE: if the mutation is a simple substitution
else:
original_amino = amino_info[2:5]
for char in amino_info:
if char.isdigit():
position += char
position = position
mutant_amino = amino_info[(len(amino_info)-4):-1]
deletion_y_n = 'no'
#Appends the relevant information to the entry for each
#accession number
other_Name_data = line[2].split('(')[-1]
if other_Name_data.find('p.') >= 0 and \
other_Name_data.find('delins') == -1 and \
other_Name_data.find('_') == -1:
clin_var_data_dict[accession_num].append([original_amino,\
position, mutant_amino, deletion_y_n, line[1],line[6],\
line[13], line[18]])
#Removes duplicates
for accession_num in clin_var_data_dict:
copy = []
for entry in clin_var_data_dict[accession_num]:
if entry not in copy:
copy.append(entry)
clin_var_data_dict[accession_num] = copy
cleanup_dict(clin_var_data_dict)
return clin_var_data_dict
def convert_to_string(align):
'''
MUSCLE outputs the alignment in type Multipe Sequence Alignment (MSA), which is
not mutable and so not very helpful for our needs. This function converts
the alignment into a string.
Args:
align - the MSA output from MUSCLE
Returns:
symbols - a list of the alignment symbols (*,:, or .)
'''
with open('aligned.aln', 'r') as aligned_seq:
header = ''
symbols = ''
for line in aligned_seq:
#takes the header of the file and sets it to its own variable name
if line[0] == 'M':
header = header + line
#if the line does not contain the header or an accession number,
#it appends the contents of the line to the symbols string
if line[0] == ' ':
line = line.strip('\n')
symbols = symbols + line
return symbols
def conserve_CVs(clin_var_data_dict, symbols, hum_access): #KG&AP&AC
'''
Identify Conserved Clinical Variants Between Protein Sequences
This method identifies the clinical variants conserved between the non-human
protein sequence and the human protein sequence. Creates a list of the cv
index positions. Determines if matches are conserved clinical variants and
formats a new alignment string. Identify the conserved clinical variants and
their parameters to generate data for a conserved clinical variant table.
Args:
clin_var_data_dict - dictionary of accession numbers and associated clinical
variants
symbols - list of alignment symbols
hum_access - the accession number of the human gene of interest
Returns:
conserved_var_output - nested list of CONSERVED clinical variants and parameters
'''
#Create List of All Human CV Index Positions
cv_aa_positions = []
for nest in clin_var_data_dict.values():
for inner in nest:
cv_aa_positions.append(inner[1])
#Identify Conserved Clinical Variants & Format Alignment String
index = 0
cv_alignment_str = ''
for symbol in symbols:
if symbol == "*":
index_pos = symbols.find("*", index, len(symbols))
# Find the first aa match from index position
aa_pos = index_pos + 1 #Convert index position to aa position
aa_pos_str = str(aa_pos) #Convert aa to string to search var_output
if aa_pos_str in cv_aa_positions: #If aa position in cv index list
cv_alignment_str = cv_alignment_str + "|"
else: #Else, conserve the alignment symbol from align_ortho
cv_alignment_str = cv_alignment_str + symbol
index = index + 1
else: #Else, conserve the alignment symbol from align_ortho
cv_alignment_str = cv_alignment_str + symbol
index = index + 1
#Identifying Conserved Clinical Variant and Parameters (Table)
conserved_cv_index = [] #List of Conserved CV Index positions
start = 0
#add in conserved indexes for strongly and weakly conserved nucleotides
for symbol in cv_alignment_str:
if symbol == "|":
index = cv_alignment_str.find("|", start) #Find index position of conserved clinical variant
start = index + 1
index = str(index+1)
conserved_cv_index.append(index) #Append conserved CV index position to list
else:
start = start + 1 #Iterate through cv_alignment_str
conserved_var_output = [] # # AP, KG, AC: List of Conserved CVs w/ Parameters (for Table)
for index in conserved_cv_index: # For every conserved index...
for key in clin_var_data_dict:
key_short = key[:len(hum_access)]
if key_short == hum_access:
print('key_short')
#for cv in clin_var_data_dict[hum_access]: # and for every cv in var_output from id_cv...\
for item in clin_var_data_dict[key]:
#print(item)
if index == item[1]: # if conserved cv index is present in the table
print("index")
conserved_var_output.append(item[1]) # append conserved cvs to table list
else:
pass
return conserved_var_output
#return [align, conserved_var_output] # AP, KG, AC: list containing String of cosnerved Alignment symbols and conserved clincial variants
def main():
combine_fasta("NR5A_orthologs.fasta", "human.fasta")
hum_access = get_hum_access("human.fasta")
clin_var_data_dict = clin_var_parser()
symbols = convert_to_string(align)
conserved_var_output = conserve_CVs(clin_var_data_dict, symbols, hum_access)
print('output',conserved_var_output)
if __name__ == '__main__':
main()