#python 3
#2019-2020
#CC-By-SA
#Old code from Carron Leopold
#Modified by Damien Legros, Cédric Cornede, Arnaud Quelin, Rouquaya Mouss, Hamid Hachemi
#####
#Imports
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import sparse
from scipy import stats
from scipy.spatial import distance
from tqdm import tqdm
from hmmlearn import hmm
import seaborn as sns
np.seterr(divide = 'ignore')
#####
#ADDITIONAL LOADER
def EpiGbyres(EpiGfilename,res,achr,sizeatres,NbEpi):
    """
    Generate a matrix giving the repartition of the epigenetic marks of a chromosome at the given resolution
    Done the hard way : the file is parsed directly at the desired resolution
    Entries : EpiGfilename -> String
              res -> int
              achr -> String (must match the first column of the file exactly)
              sizeatres -> int
              NbEpi -> int
    Return : Epimat -> numpy array of shape (NbEpi, sizeatres)
    """
    Epimat=np.zeros((NbEpi,sizeatres))
    o=open(EpiGfilename,'r')
    l=o.readline()
    while l:
        ls=l.split()
        if ls[0]==achr:
            #convert the bp cover into bin cover
            EpiV=int(ls[3])-1
            b=float(ls[1])
            e=float(ls[2])
            begin=int(np.floor(b/res)) #cast to int to avoid indexing errors
            end=int(np.floor(e/res)) #cast to int to avoid indexing errors
            eC=end*res
            bC=begin*res
            val=1
            if begin==end: #1 bin
                Epimat[EpiV,begin]+=val*(e-b)
            else:
                if (end-begin)==1: #2 bins
                    Epimat[EpiV,begin]+=val*(eC-b)
                    Epimat[EpiV,end]+=val*(e-eC)
                else: #more than 2 bins
                    Epimat[EpiV,begin]+=val*(res-(b-bC)) #partial cover of the first bin
                    begin+=1
                    while begin<end: #full cover of the intermediate bins
                        Epimat[EpiV,begin]+=val*res
                        begin+=1
                    if (e-eC)>0:
                        Epimat[EpiV,begin]+=val*(e-eC) #partial cover of the last bin (here begin==end)
        l=o.readline()
    o.close()
    return Epimat
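#Hedged usage sketch for EpiGbyres (the file name and values below are hypothetical) :
#the parser expects one mark interval per line in a whitespace separated file,
#  <chr>  <start_bp>  <end_bp>  <state_number>
#and spreads every interval over the bins it overlaps, weighted by bp coverage.
#  Epimat = EpiGbyres("epigenetic_marks.txt", 100000, "chr16", sizeatres, 15)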
#####
#Building tools
def buildMatrix(HiCfilename, printer=True):
    """
    Build the sparse contact matrix from the HiC dump file
    Entries : HiCfilename -> String
              printer -> boolean
    Return : A -> scipy sparse COO matrix
    """
    if printer:
        bar = tqdm(range(1), desc="Loading Matrix ")
    else:
        bar = range(1)
    for i in bar:
        pass
    try:
        A=np.loadtxt(HiCfilename)
        A=np.int_(A)
    except Exception as err:
        raise RuntimeError("Data was not correctly downloaded, please delete the folder and relaunch main.py") from err
    #Build the array at bp resolution and symmetrise it
    A=np.concatenate((A,np.transpose(np.array([A[:,1],A[:,0],A[:,2]]))), axis=0)
    A = sparse.coo_matrix( (A[:,2], (A[:,0],A[:,1])))
    return A
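#Hedged usage sketch (the file name is hypothetical) : the loader expects the
#3-column sparse text dump  <pos_i>  <pos_j>  <count> ; the list is symmetrised
#and returned as a scipy COO matrix, to be rebinned afterwards with bin2d.
#  A = buildMatrix("chr16.RAWobserved")
#  binned_map = bin2d(A, R, R)   #R = target resolution in bp, see bin2d below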
def buildColors(EpiGfilename, chromosome_number, LENTEST, printer=True):
"""
Build the colors for the epigenetic marks
    Entries : EpiGfilename -> String
chromosome_number -> int
LENTEST -> int
printer -> boolean
Return : color_vec -> list
"""
if printer:
bar = tqdm(range(1), desc="Loading Colors ")
else:
bar = range(1)
for i in bar:
pass
color=pd.read_csv(EpiGfilename,delimiter='\t',header=None,names=[1,2,3,4])
#Take only chr of interest
color=color[color[1]==chromosome_number]
    #Number of colors (epigenetic states) in the file
number=color[4].max()
    #Build array at bp resolution : LENchr * number of colors
color_vec=np.zeros((LENTEST,number+1), dtype='uint8')
i=0
while i<np.shape(color)[0]:
color_vec[color[2].iloc[i]:color[3].iloc[i],color[4].iloc[i]]=1
i+=1
return color_vec
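#Hedged usage sketch : chromosome_number has to match the dtype pandas infers for
#the first column of the file (an int such as 16 in the simplest case), and the
#bp-resolution output is usually rebinned with bin2d like the contact map.
#  color_vec = buildColors("epigenetic_marks.txt", 16, LENTEST)
#  color_bins = bin2d(color_vec, R, 1)   #bin rows to resolution R, keep one column per mark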
#####
#Plotting tools
def plotter(data2D, nameFig="Plot", plotType="Matrix", cmap="hot_r", vmin=None, vmax=None, jupyter=False, nameFile="Plot.pdf", centro_start=None, centro_end=None, Data_type='Contact'):
"""
Plotter used in the analysis
Entries : data2D -> list
nameFig -> String
plotType -> String
cmap -> String
vmin -> float
vmax -> float
jupyter -> boolean
nameFile -> String
centro_start -> int
centro_end -> int
Data_type -> String
    Make one of the following plots (selected with plotType) :
    - "Matrix" : contact/correlation matrix plot
    - "AB" : A/B compartments plot
    - "Density" : gene density plot
    - "HMMScore" : HMM score against the number of compartments
    - "Barcode" : compartment barcode plot
    - "Visualization" : combined barcode / correlation matrix / density plot
"""
if plotType == "Matrix":
custom_params = {"axes.spines.right": False,
"axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)
fig = plt.figure()
fig.set_dpi(300)
im = plt.imshow(data2D, cmap=cmap, vmin=vmin, vmax=vmax)
plt.colorbar(im, drawedges=False)
plt.xlabel("Position")
plt.ylabel("Position")
plt.title(nameFig)
elif plotType == "AB":
custom_params = {"axes.spines.right": False,
"axes.spines.top": False,}
sns.set_theme(style="ticks", rc=custom_params)
x = np.arange(0.0, np.shape(data2D)[0], 1)
y = data2D
fig, ax = plt.subplots()
fig.set_dpi(300)
ax.fill_between(x, y, where=(data2D>0), color='red')
ax.axhline(0, color="black", linewidth=0.75)
ax.fill_between(x, y, where=(data2D<0), color='blue')
plt.axvspan(centro_start, centro_end, facecolor='red', alpha=0.2)
plt.xlabel("Position")
plt.ylabel("Activation")
plt.title(nameFig)
elif plotType == "Density": #Density plot
custom_params = {"axes.spines.right": False,
"axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)
x = np.arange(0,len(data2D))
y = data2D
fig = plt.figure()
fig.set_dpi(300)
plt.plot(x, y, color='blue', alpha=1, linewidth=0.1)
plt.axvspan(centro_start, centro_end, facecolor='red', alpha=0.2)
plt.xlabel("Position")
plt.ylabel("Density")
plt.title(nameFig)
elif plotType == "HMMScore":
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)
x = np.arange(2,16)
y = data2D
fig = plt.figure()
fig.set_dpi(300)
plt.plot(x, y, color='blue', alpha=0.4)
best = 2
if (Data_type=='Contact'):
threshold = 1
elif Data_type=='Epigenetic':
threshold = 0.5
for i in range(1,len(data2D)):
temp = ((data2D[i]-data2D[i-1])/np.abs(data2D[i]))*100
if temp >= threshold:
best += 1
else:
break
plt.scatter(best, data2D[best-2], alpha=1, color='red', label="predicted number of compartments : " + str(best))
plt.xlabel("Number of compartments")
plt.ylabel("HMM Score")
plt.legend(loc='upper left')
plt.title(nameFig)
elif plotType == "Barcode":
custom_params = {"axes.spines.left": False,
"axes.spines.right": False,
"axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)
pixel_per_bar = 1
dpi = 100
fig = plt.figure(figsize=(len(data2D) * pixel_per_bar / dpi, 2))
fig.set_dpi(300)
ax = fig.add_axes([0, 0, 1, 1])
ax.imshow(data2D.reshape(1, -1), cmap='bwr', aspect='auto', interpolation='nearest')
ax.axes.get_yaxis().set_visible(False)
plt.xlabel("Position")
plt.title(nameFig)
elif plotType == "Visualization":
[barcode, corr, density] = data2D
fig = plt.figure(constrained_layout=True)
fig.set_dpi(300)
gs = fig.add_gridspec(ncols=2,
nrows=2,
width_ratios=[10, 6.25],
wspace=0.,
hspace=0.,
height_ratios=[1, 13])
custom_params = {"axes.spines.right": False,
"axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)
#Barcode
f_ax1 = fig.add_subplot(gs[0, 0])
f_ax1.set_xlim([0,np.shape(barcode)[0]])
f_ax1.grid(False)
f_ax1.set(yticklabels=[], xticklabels=[])
f_ax1.tick_params(left=False, right=False)
x = np.arange(0, np.shape(barcode)[0], 1)
f_ax1.set_title(nameFig, x=0.5, y=1.2)
f_ax1.imshow(barcode.reshape(1, -1), cmap='bwr', aspect='auto', interpolation='nearest')
#Correlation Matrix
f_ax2 = fig.add_subplot(gs[1, 0])
im = f_ax2.imshow(corr, cmap='seismic', vmin=-np.amax(corr), vmax=np.amax(corr))
f_ax2.grid(False)
plt.colorbar(im, location='left')
#Gene density
f_ax3 = fig.add_subplot(gs[1, 1])
f_ax3.axis('off')
x = density
y = np.flip(np.arange(0,np.shape(density)[0]))
f_ax3.axhspan(np.shape(density)[0]-centro_start, np.shape(density)[0]-centro_end, facecolor='red', alpha=0.2)
f_ax3.plot(x, y, color='blue', alpha=1, linewidth=0.1)
if jupyter:
plt.show()
else:
plt.savefig(nameFile)
plt.clf()
#####
#Filtering tools
def filtering(Hicmat,factor=1.5, printer=True):
"""
    Filter the matrix and return the filtered matrix together with the list of bin indices kept
Entries : Hicmat -> list
factor -> float
printer -> boolean
Return : Hicmatreduce -> list
segmenter1 -> list
"""
if printer:
bar = tqdm(range(1), desc="Filtering ")
else:
bar = range(1)
for i in bar:
pass
Filterextremum=True
Hicmatreduce=Hicmat
#first step : filter empty bin
sumHicmat=Hicmat.sum(axis = 0)
segmenter1=sumHicmat>0
A=np.where(segmenter1)
Hicmatreduce=Hicmatreduce[A[1],:]
Hicmatreduce=Hicmatreduce[:,A[1]]
if Filterextremum:
#second step : filter lower bin
sumHicmat=np.sum(Hicmatreduce,0)
msum=np.mean(sumHicmat)
mstd=np.std(sumHicmat)
mini = msum-mstd*factor
maxi = msum+mstd*factor
        #Make the boolean condition
newcond=mini < sumHicmat
newcond2=sumHicmat < maxi
newcond=np.logical_and(newcond,newcond2)
B=np.where(newcond)
#Filter
Hicmatreduce=Hicmatreduce[B[1],:]
Hicmatreduce=Hicmatreduce[:,B[1]]
segmenter1=A[1][B[1]] #Create the binsaved index
return Hicmatreduce,segmenter1
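#Hedged usage sketch : filtering drops empty bins and bins whose coverage lies
#more than `factor` standard deviations away from the mean ; `binsaved` holds the
#indices that were kept, so any per-bin track has to be subset the same way.
#  filtered_map, binsaved = filtering(binned_map.todense(), factor=1.5)
#  filtered_colors = color_bins[binsaved, :]   #keep the epigenetic track in sync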
def unfiltering(binsaved, contact_map, shape, printer=True):
    """
    Unfilter the matrix and return it : empty rows/columns are re-inserted
    at the positions that were removed by the filtering step
    Entries : binsaved -> list
              contact_map -> list
              shape -> list
              printer -> boolean
    Return : unfiltered_map -> list
    """
    if printer:
        bar = tqdm(range(1), desc="Unfiltering ")
    else:
        bar = range(1)
    for i in bar:
        pass
    unfiltered_map = contact_map
    binsaved_set = set(np.ravel(binsaved))
    binunsaved = [i for i in range(shape[0]) if i not in binsaved_set]
    for i in binunsaved:
        #insert an empty row and an empty column at each filtered position
        unfiltered_map = np.insert(unfiltered_map, i, 0, axis=0)
        unfiltered_map = np.insert(unfiltered_map, i, 0, axis=1)
    return unfiltered_map
#####
#Binning tools
def bin2d(Data,p,q, printer=True):
"""
Data = input matrix
p,q rescaling factors
Written for sparse
"""
if printer:
bar = tqdm(range(1), desc="Changing Resolution ")
else:
bar = range(1)
for i in bar:
pass
n,m=np.shape(Data);
s=(int(np.ceil(n/p)),int(np.ceil(m/q)))
i,j,d = sparse.find(Data);
i=np.int_(np.ceil(i/p))
j=np.int_(np.ceil(j/q))
M=sparse.csr_matrix((d,(i,j)))
return M
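#Hedged usage sketch : p and q are the rescaling factors along rows and columns,
#so binning a bp-resolution contact map to a resolution R uses p = q = R, while a
#per-bp annotation table keeps its columns with q = 1 (values are examples only).
#  binned_map = bin2d(A, R, R)
#  binned_colors = bin2d(color_vec, R, 1)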
def bin1D(anumpyarray, resolutionfrom, resolutionto):
    """
    in : a 1D numpy array, the resolution it is at and the resolution to bin it to
    out : the binned vector
    """
    print('change of resolution from ',resolutionfrom,' to ',resolutionto)
    if resolutionto>resolutionfrom:
        convertionfactor=np.ceil(resolutionto/resolutionfrom)
        s=anumpyarray.shape
        print("vector size before binning:",s)
        #has to be identical to the result of other functions like chrsizedict
        newsizei=np.ceil(s[0]*resolutionfrom/resolutionto)
        newarray=np.zeros(int(newsizei))
        print("vector size after binning:",newarray.shape)
        i=0
        while i<newsizei:
            ifrom=int(i*convertionfactor)
            ito=int((i+1)*convertionfactor)
            if i==newsizei-1:
                asum=np.sum(anumpyarray[ifrom:])
            else:
                asum=np.sum(anumpyarray[ifrom:ito])
            newarray[i]=asum
            i+=1
        return newarray
    elif resolutionto==resolutionfrom:
        print("no binning")
        return anumpyarray
    else:
        print("wrong resolution parameter in bin1D")
def bin2dfullmat(anumpyarray, resolutionfrom, resolutionto):
    """
    in : a full (dense) numpy matrix, the resolution it is at and the resolution to bin it to
    out : the binned matrix
    Written for full matrices
    """
    print('change of resolution from ',resolutionfrom,' to ',resolutionto)
    if resolutionto>resolutionfrom:
        convertionfactor=np.ceil(resolutionto/resolutionfrom)
        s=anumpyarray.shape
        print("Initial HiC size before binning:",s)
        #has to be identical to the result of other functions like chrsizedict
        newsizei=np.ceil(s[0]*resolutionfrom/resolutionto)
        newsizej=np.ceil(s[1]*resolutionfrom/resolutionto)
        newarray=np.zeros((int(newsizei),int(newsizej)))
        print("HiC size after binning :",newarray.shape)
        i=0
        j=0
        while i<newsizei:
            while j<newsizej:
                ifrom=int(i*convertionfactor)
                ito=int((i+1)*convertionfactor)
                jfrom=int(j*convertionfactor)
                jto=int((j+1)*convertionfactor)
                #the last row/column bins take everything that is left
                if i==newsizei-1 and j==newsizej-1:
                    asum=np.sum(anumpyarray[ifrom:,jfrom:])
                elif i==newsizei-1:
                    asum=np.sum(anumpyarray[ifrom:,jfrom:jto])
                elif j==newsizej-1:
                    asum=np.sum(anumpyarray[ifrom:ito,jfrom:])
                else:
                    asum=np.sum(anumpyarray[ifrom:ito,jfrom:jto])
                newarray[i,j]=asum
                newarray[j,i]=asum
                j+=1
            i+=1
            j=0
        return newarray
    elif resolutionto==resolutionfrom:
        print("No binning")
        return anumpyarray
    else:
        print("Wrong resolution parameter")
#####
#Operations tools
def SVD(D):
    """
    Eigendecompose D and return the eigenvector associated with its largest
    eigenvalue (the first principal component used for A/B compartment calling)
    out : SVD(D)
    """
    eigens_values,eigens_vectors = np.linalg.eig(D)
    #np.linalg.eig does not guarantee any ordering of the eigenvalues,
    #so the leading eigenvector is selected explicitly
    return np.real(eigens_vectors[:,np.argmax(np.real(eigens_values))])
def Corr(D, printer=True):
"""
Calculate the pearson correlation and return the matrix
out : Corr(D)
"""
lines, columns = np.shape(D)
C = np.zeros((lines, columns))
if printer:
bar = tqdm(range(lines), desc="Pearson Correlation ")
else:
bar = range(lines)
for i in bar:
for j in range(columns):
C[i, j] = stats.pearsonr(D[i, :], D[:, j])[0]
return C
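#Hedged note : the sign of the leading eigenvector of the correlation map is
#arbitrary, so it is usually oriented against an external track (gene density or
#an active mark) before calling positive bins "A" ; a possible orientation step :
#  pc1 = SVD(corr)
#  if np.corrcoef(pc1, density_filtered)[0, 1] < 0:   #density_filtered is hypothetical
#      pc1 = -pc1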
def OE(D, printer=True):
"""
Calculate the OE matrix and return it
out : OE(D)
"""
i=0
j=0
L=len(D)
if printer:
bar = tqdm(range(1), desc="O/E ")
else:
bar = range(1)
for i in bar:
pass
while j<L:
thediag=np.diag(D,k=j)
mtg=np.mean(thediag)
if mtg == 0:
mtg = 1e-8
while i<(L-j):
v=D[i,i+j]/mtg
D[i,i+j]=v
D[i+j,i]=v
i+=1
i=0
j+=1
return D
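#Note on OE : each diagonal k (all bin pairs at genomic distance k) is divided by
#its own mean, turning raw counts into an observed/expected enrichment. A single
#diagonal, done by hand for illustration :
#  k = 5
#  expected_k = np.mean(np.diag(D, k=k))
#  enrichment_k = np.diag(D, k=k) / expected_k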
def SCN(D, max_iter = 10, printer=True):
"""
Calculate the SCN and return it
Out : SCN(D)
Code version from Vincent Matthys
"""
# Iteration over max_iter
if printer:
bar = tqdm(range(max_iter), desc="SCN ")
else:
bar = range(max_iter)
for i in bar:
D /= np.maximum(1, D.sum(axis = 0))
D /= np.maximum(1, D.sum(axis = 1))
    # To make the matrix symmetric again
return (D + D.T)/2
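#Hedged sketch of the usual compartment chain built from the helpers above (the
#ordering is the conventional one, it is not enforced by this module) :
#  D = np.float64(filtered_map)   #work on a float copy, SCN/OE modify their input
#  normed = SCN(D)
#  oe = OE(normed)
#  corr = Corr(oe)
#  pc1 = SVD(corr)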
def fastFloyd(contact, printer=True):
"""
out : FF(contact)
Code version from Vincent Matthys
"""
n = contact.shape[0]
shortest = contact
if printer:
bar = tqdm(range(n), desc="Fast Floyd ")
else:
bar = range(n)
for k in bar:
i2k = np.tile(shortest[k,:], (n, 1))
k2j = np.tile(shortest[:, k], (n, 1)).T
shortest = np.minimum(shortest, i2k + k2j)
return shortest
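#Hedged sketch : for 3D reconstruction the balanced contact map is usually turned
#into a distance-like matrix first (the exponent alpha is a modelling choice, not
#something fixed by this module), then completed with shortest paths :
#  alpha = 0.227
#  dist = fastFloyd(1 / (np.asarray(normed) ** alpha))
#  dist = (dist + dist.T) / 2
#  np.fill_diagonal(dist, 0)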
def filteramat(Hicmat, Filterextremum=True, factor=1.5, printer=True):
"""
    in : a HiCmat without any transformation, and the reduction factor
    out : the reduced HiCmatreduce and the vector of kept bin indices
    The filtering part of the main in one function
"""
if printer:
bar = tqdm(range(1), desc="Matrix Filtering ")
else:
bar = range(1)
for i in bar:
pass
Hicmatreduce=Hicmat
#first step : filter empty bin
sumHicmat=Hicmat.sum(axis = 0)
segmenter1=sumHicmat>0
A=np.where(segmenter1)
Hicmatreduce=Hicmatreduce[A[1],:]
Hicmatreduce=Hicmatreduce[:,A[1]]
if Filterextremum:
#second step : filter lower bin
sumHicmat=np.sum(Hicmatreduce,0)
msum=np.mean(sumHicmat)
mstd=np.std(sumHicmat)
mini = msum-mstd*factor
maxi = msum+mstd*factor
        #Make the boolean condition
newcond=mini < sumHicmat
newcond2=sumHicmat < maxi
newcond=np.logical_and(newcond,newcond2)
B=np.where(newcond)
#Filter
Hicmatreduce=Hicmatreduce[B[1],:]
Hicmatreduce=Hicmatreduce[:,B[1]]
segmenter1=A[1][B[1]] #Create the binsaved index
return Hicmatreduce,segmenter1
def sammon(x, n, display = 2, inputdist = 'raw', maxhalves = 20, maxiter = 500, tolfun = 1e-9, init = 'default', printer = True):
"""Perform Sammon mapping on dataset x
y = sammon(x) applies the Sammon nonlinear mapping procedure on
multivariate data x, where each row represents a pattern and each column
represents a feature. On completion, y contains the corresponding
co-ordinates of each point on the map. By default, a two-dimensional
map is created. Note if x contains any duplicated rows, SAMMON will
fail (ungracefully).
[y,E] = sammon(x) also returns the value of the cost function in E (i.e.
the stress of the mapping).
An N-dimensional output map is generated by y = sammon(x,n) .
A set of optimisation options can be specified using optional
arguments, y = sammon(x,n,[OPTS]):
maxiter - maximum number of iterations
tolfun - relative tolerance on objective function
maxhalves - maximum number of step halvings
    inputdist - {'raw','distance'} if set to 'distance', X is
                interpreted as a matrix of pairwise distances.
    display   - 0 to 2. 0 least verbose, 2 max verbose.
    init      - {'pca', 'cmdscale', 'random', 'default'}
                default is 'pca' if inputdist is 'raw',
                'cmdscale' if inputdist is 'distance'
The default options are retrieved by calling sammon(x) with no
parameters.
File : sammon.py
Date : 18 April 2014
Authors : Tom J. Pollard (tom.pollard.11@ucl.ac.uk)
: Ported from MATLAB implementation by
Gavin C. Cawley and Nicola L. C. Talbot
Description : Simple python implementation of Sammon's non-linear
mapping algorithm [1].
References : [1] Sammon, John W. Jr., "A Nonlinear Mapping for Data
Structure Analysis", IEEE Transactions on Computers,
vol. C-18, no. 5, pp 401-409, May 1969.
Copyright : (c) Dr Gavin C. Cawley, November 2007.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
# Create distance matrix unless given by parameters
if inputdist == 'distance':
D = x
if init == 'default':
init = 'cmdscale'
else:
D = distance.cdist(x, x)
if init == 'default':
init = 'pca'
if inputdist == 'distance' and init == 'pca':
raise ValueError("Cannot use init == 'pca' when inputdist == 'distance'")
if np.count_nonzero(np.diagonal(D)) > 0:
raise ValueError("The diagonal of the dissimilarity matrix must be zero")
# Remaining initialisation
N = x.shape[0]
scale = 0.5 / D.sum()
D = D + np.eye(N)
if np.count_nonzero(D<=0) > 0:
raise ValueError("Off-diagonal dissimilarities must be strictly positive")
Dinv = 1 / D
if init == 'pca':
[UU,DD,_] = np.linalg.svd(x)
y = UU[:,:n]*DD[:n]
elif init == 'cmdscale':
from cmdscale import cmdscale
y,e = cmdscale(D)
y = y[:,:n]
else:
y = np.random.normal(0.0,1.0,[N,n])
one = np.ones([N,n])
d = distance.cdist(y,y) + np.eye(N)
dinv = 1. / d
delta = D-d
E = ((delta**2)*Dinv).sum()
# Get on with it
for i in range(maxiter):
# Compute gradient, Hessian and search direction (note it is actually
# 1/4 of the gradient and Hessian, but the step size is just the ratio
# of the gradient and the diagonal of the Hessian so it doesn't
# matter).
delta = dinv - Dinv
deltaone = np.dot(delta,one)
g = np.dot(delta,y) - (y * deltaone)
dinv3 = dinv ** 3
y2 = y ** 2
H = np.dot(dinv3,y2) - deltaone - np.dot(2,y) * np.dot(dinv3,y) + y2 * np.dot(dinv3,one)
s = -g.flatten(order='F') / np.abs(H.flatten(order='F'))
y_old = y
# Use step-halving procedure to ensure progress is made
for j in range(maxhalves):
s_reshape = np.reshape(s, (-1,n),order='F')
y = y_old + s_reshape
d = distance.cdist(y, y) + np.eye(N)
dinv = 1 / d
delta = D - d
E_new = ((delta**2)*Dinv).sum()
if E_new < E:
break
else:
s = 0.5*s
# Bomb out if too many halving steps are required
if j == maxhalves-1:
print('Warning: maxhalves exceeded. Sammon mapping may not converge...')
# Evaluate termination criterion
if abs((E - E_new) / E) < tolfun:
if display:
print('TolFun exceeded: Optimisation terminated')
break
# Report progress
E = E_new
if display > 1 and printer:
print('epoch = %d : E = %12.10f'% (i+1, E * scale))
if i == maxiter-1:
print('Warning: maxiter exceeded. Sammon mapping may not have converged...')
# Fiddle stress to match the original Sammon paper
E = E * scale
return [y,E]
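#Hedged usage sketch (init='random' avoids the optional cmdscale dependency, and
#3 is the embedding dimension for a 3D structure) :
#  XYZ, stress = sammon(dist, 3, display=0, inputdist='distance', init='random')
#  writePDB("structure.pdb", XYZ, EpiValue_filtered)   #both names are hypothetical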
#####
#Predicting sub-compartments tools
def gaussianHMM(vector, nb_comp=2, n_iter=100):
    """
    Fit a simple Gaussian HMM to call compartments on the correlation map
    Return the predicted states and the score of the model for the given number of components
    Entries : vector -> list (2D array of shape (n_samples, n_features))
              nb_comp -> int
              n_iter -> int
    Return : labels -> list
             score -> float
    """
    # Run Gaussian HMM
    model = hmm.GaussianHMM(n_components=nb_comp, covariance_type="tied", n_iter=n_iter)
    model.fit(vector)
    labels = model.predict(vector) #states/compartments
    score = model.score(vector)
    return labels, score
def multiplegaussianHMM(vector, nb_comp_max=16, n_iter=100):
"""
    Fit one Gaussian HMM for each number of compartments from 2 to nb_comp_max-1
    Return the predicted states and the score of each model
Entries : vector -> list
nb_comp_max -> int
n_iter -> int
Return : labels_list -> list
scores_list -> list
"""
labels_list = []
scores_list = []
for i in tqdm (range(2, nb_comp_max), desc="Finding Compartments with HMM "):
labels, score = gaussianHMM(vector, i, n_iter=n_iter)
labels_list.append(labels)
scores_list.append(score)
return labels_list, scores_list
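#Hedged usage sketch : hmmlearn expects a 2D array of shape (n_samples, n_features),
#so a 1D signal such as the correlation PC1 has to be reshaped first ; the scores
#can then be passed to plotter(..., plotType="HMMScore") to pick the best model.
#  obs = np.real(pc1).reshape(-1, 1)
#  labels_list, scores_list = multiplegaussianHMM(obs, nb_comp_max=16)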
def expr_repr_scoring(color_bins, marks, scores):
"""
Calculate the expression repression score and return it
Entries : color_bins -> list
marks -> list
scores -> list
Return : float
"""
output = color_bins[:,0].toarray() #only zeros
for i in range(len(marks)):
mark = marks[i]
score = scores[i]
X = color_bins[:,mark].toarray()
output+=X*score
return output
def similarity_score(val_data, data):
"""
Calculate the similarity score and return it
Entries : val_data -> list
data -> list
Return : float
"""
cnt = 0
for i in range(np.shape(data)[0]):
if val_data[i] == data[i]:
cnt += 1
similarity = cnt/np.shape(data)[0] * 100
return round(similarity, 3)
#####
#PDB maker tool
def writePDB(fnameout, value, EpiValue, printer=True):
"""
Write a PDB file from values
Entries : fnameout -> String
value -> int
EpiValue -> list
"""
if printer:
bar = tqdm(range(1), desc="Writing PDB ")
else:
bar = range(1)
for i in bar:
pass
Sh=np.shape(value)
#out
fout=open(fnameout,'w')
i=1
while i<=Sh[0]:
S="{:6s}{:5d} {:^4s}{:1s}{:3s} {:1s}{:4d}{:1s} {:8.3f}{:8.3f}{:8.3f}{:6.2f}{:6.2f} {:>2s}{:2s}" #PDB format in python
#print("here",value[i-1,0],value[i-1,1],EpiValue[i-1])
S=S.format('ATOM',i,'CA','','ALA','A',i,'',float(value[i-1,0]),float(value[i-1,1]),float(value[i-1,2]),1,float(EpiValue[i-1]),'C','')
#print(S)
fout.write(S+"\n")
i+=1
fout.close()
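#####
#Minimal smoke test (synthetic data, illustration only) : runs the normalisation
#and compartment chain on a random symmetric matrix, so the module can be
#sanity-checked without downloading any Hi-C data.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n = 60
    toy = rng.random((n, n))
    toy = (toy + toy.T) / 2                          #symmetric fake contact map
    normed = SCN(toy.copy(), printer=False)
    oe = OE(normed, printer=False)
    corr = Corr(oe, printer=False)
    pc1 = SVD(corr)
    labels, score = gaussianHMM(np.real(pc1).reshape(-1, 1), nb_comp=2)
    print("toy run :", corr.shape[0], "bins,", len(set(labels)), "HMM states, score", round(score, 2))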