-
Notifications
You must be signed in to change notification settings - Fork 0
/
simdifcount.py
134 lines (103 loc) · 4.5 KB
/
simdifcount.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!usr/bin/python3
#---------------------------------------------------#
# PACKAGES CALL
#---------------------------------------------------#
import sys
#---------------------------------------------------#
# MAIN
#---------------------------------------------------#
def main():
# Retrieve arguments
input_name, fasta_name = getArgs()
# Retrieve accession ID of sequences aligned
seqs = retrieveseqID(fasta_name)
# Counts
conservedRes, strongConserv, weakConserv, noConserv, total = simdifcount(input_name, seqs)
print("Alignment similarities and differences :")
print("Percentage of conserved residues : ", round(conservedRes/total*100,2), "%")
print("Percentage of strongly conserved residue properties : ", round(strongConserv/total*100,2), "%")
print("Percentage of weakly conserved residue properties : ", round(weakConserv/total*100,2), "%")
print("Percentage of residues not conserved : ", round(noConserv/total*100,2), "%")
print("Percentage of overall conservation : ", round((conservedRes+strongConserv+weakConserv)/total*100,2), "%")
#---------------------------------------------------#
# FUNCTIONS
#---------------------------------------------------#
def usage():
print("""
simdifcount.py calculates the percentages of similarities and differences
of an alignment of sequences.
exemple of command line:
=======================
python3 simdifcount.py -i clustalo-res.txt
obligatory:
==========
-i -> result file from clustal omega
-f -> fasta file which was the input for clustal omega
""")
def getArgs():
try:
input_file = sys.argv[sys.argv.index("-i")+1]
print("file to treat : ", input_file)
except:
usage()
print("ERROR : please enter the input file")
sys.exit()
try:
fasta_file = sys.argv[sys.argv.index("-f")+1]
print("fasta to treat : ", fasta_file)
except:
usage()
print("ERROR : please enter the fasta file")
sys.exit()
return (input_file, fasta_file)
def retrieveseqID(fasta):
fastafile = open(fasta, 'r')
fastalines = fastafile.readlines()
fastafile.close()
# Lines starting with '>'
res = []
for i in range(len(fastalines)):
if fastalines[i][0] == '>':
ID = fastalines[i].split(" ")[0]
ID = ID[1:] # remove the >
res.append(ID)
return res
def simdifcount(clustalres, seq):
'''
Results from clustal omega meaning
* -> indicates positions which have a single, fully conserved residue.
: -> indicates conservation between groups of strongly similar properties as below - roughly equivalent to scoring > 0.5 in the Gonnet PAM 250 matrix.
. -> indicates conservation between groups of weakly similar properties as below - roughly equivalent to scoring =< 0.5 and > 0 in the Gonnet PAM 250 matrix.
-> indicates a gap or a non conservation.
'''
# Create a dictionnary for each sequence ID to reform the sequence
dico_seq = {}
for access in seq:
dico_seq[access] = ''
dico_seq["alignment"] = ''
clustalfile = open(clustalres, 'r')
clustallines = clustalfile.readlines()
clustalfile.close()
# Remove header from the Results
clustallines = clustallines[3:]
# calculate the number of caracter befor begining of sequences
start = len(clustallines[0].split("\t")[0]) - 60
for i in range(len(clustallines)):
if (i+1)%4 != 0: # Every 4 lines there is an empty line, just with a \n character
line = clustallines[i].strip("\n") # remov \n character of the string
ID = line.split(" ")[0]
line = line.split("\t")[0]
sequence = line[start :] # retrieving only the 60 characters of the alignment as clustal omega only print 60 characters per line
if len(ID)>0:
dico_seq[ID] += sequence
else:
dico_seq["alignment"] += sequence
conservedRes = dico_seq["alignment"].count('*')
strongConserv = dico_seq["alignment"].count(':')
weakConserv = dico_seq["alignment"].count('.')
noConserv = dico_seq["alignment"].count(' ')
total = conservedRes + strongConserv + weakConserv + noConserv
return conservedRes, strongConserv, weakConserv, noConserv, total
#---------------------------------------------------#
if __name__ == "__main__":
main()