-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsoftconsensus.py
147 lines (109 loc) · 4.27 KB
/
softconsensus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import sys
import numpy as np
from Bio import SeqIO
import RNA
def read_alignment(fpath, ali_format='fasta'):
with open(fpath) as f:
for record in SeqIO.parse(f, ali_format):
seq = str(record.seq)
seq = seq.upper().replace("T", "U")
yield seq
class Alignment:
def __init__(self, alignment, md=RNA.md()):
"""Create alignment object from a given aligned file
"""
self.md = md
self.aligned = [seq for seq in alignment]
self.ungapped = [seq.replace('-','') for seq in self.aligned]
self.consensus = ""
self.fc = None
self.ps_energy = [] # 1-indexed
self.ps_energy_up = [] # 1-indexed
self.dominant_bps = [] # 1-indexed
self._fold()
@classmethod
def from_file(cls, filepath, md=RNA.md(), ali_format='fasta'):
return cls(read_alignment(fpath, ali_format), md)
def _fold(self):
"""Get consensus structure and bpp
"""
fc = RNA.fold_compound(self.aligned, self.md)
ss, mfe = fc.mfe()
fc.exp_params_rescale(mfe)
_, self.fee = fc.pf()
self.consensus = ss
self.fc = fc
self.ps_energy, self.ps_energy_up = self._pseudo_energy()
self.dominant_bps = [(ai, aj) for ai, aj in zip(*np.where(self.ps_energy<0))]
def _pseudo_energy(self):
"""Create pseudo energy matrix
"""
RT = RNA.GASCONST*(self.md.temperature + RNA.K0)/1000
bpp = np.array(self.fc.bpp())
bpp_pp = bpp.copy()
np.set_printoptions(threshold=np.inf)
paired = bpp>=0.99
unpaired = bpp==0
bpp[paired] = np.NaN
bpp[unpaired] = np.NaN
tmp = np.minimum(0, -RT*np.log(bpp/(1-bpp)))
pos_paired = np.argwhere(paired==True)
fc = RNA.fold_compound(self.aligned)
ss, mfe = fc.mfe()
fc.exp_params_rescale(mfe)
for bp in pos_paired:
fc.hc_init()
#this really enforces the base pair
fc.hc_add_bp(int(bp[0]), int(bp[1]), RNA.CONSTRAINT_CONTEXT_ALL_LOOPS | RNA.CONSTRAINT_CONTEXT_ENFORCE)
_, fee_w = fc.pf()
fc.hc_init()
#this one forbids the base pair but accepts all other base pairs that include i and j
fc.hc_add_bp(int(bp[0]), int(bp[1]), RNA.CONSTRAINT_CONTEXT_NONE)
_, fee_wo = fc.pf()
tmp[bp[0]][bp[1]] = fee_w - fee_wo
tmp[unpaired] = 0
bpp_pp += np.transpose(bpp_pp)
paired_prob = np.sum(bpp_pp, axis=0)
paired_pp = paired_prob >= 0.99
unpaired_pp = paired_prob == 0
paired_prob[paired_pp] = np.NaN
paired_prob[unpaired_pp] = np.NaN
pos_paired_pp = np.argwhere(paired_pp==True)
tmp_pp = np.maximum(0, RT*np.log(paired_prob/(1-paired_prob)))
for pos in pos_paired_pp:
fc.hc_init()
fc.hc_add_bp_nonspecific(int(pos), 0, RNA.CONSTRAINT_CONTEXT_ALL_LOOPS | RNA.CONSTRAINT_CONTEXT_ENFORCE)
_, fee_p = fc.pf()
fc.hc_init()
fc.hc_add_up(int(pos), RNA.CONSTRAINT_CONTEXT_ALL_LOOPS | RNA.CONSTRAINT_CONTEXT_ENFORCE)
_, fee_up = fc.pf()
tmp_pp[pos] = -(fee_p - fee_up)
tmp_pp[unpaired_pp] = 0
return tmp, tmp_pp
def get_energy_of_aligned_seq(self, aligned, up_e):
"""Return pseudo energy matrix for aligned sequence
"""
ungap = np.array(['X']+list(aligned)) != '-'
l = np.sum(ungap)
ungap_bps = np.outer(ungap, ungap)
if up_e:
res = self.ps_energy_up.copy()
return res[ungap].reshape(l)
else:
res = self.ps_energy.copy()
return res[ungap_bps].reshape((l,l))
def add_sc(self, fc, aligned, up_e):
"""Add soft constraint to fc
"""
to_add = self.get_energy_of_aligned_seq(aligned, up_e)
if up_e:
fc.sc_add_up(to_add.tolist())
else:
fc.sc_add_bp(to_add.tolist())
def fc_of_aligned_seq(self, aligned, up_e=False):
"""Create fold compound for given aligned sequence with soft constraint
"""
seq = aligned.replace('-', '')
fc = RNA.fold_compound(seq, self.md)
self.add_sc(fc, aligned, up_e)
return fc