-
Notifications
You must be signed in to change notification settings - Fork 2
/
data_clustering.py
73 lines (61 loc) · 2.87 KB
/
data_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import argparse
import numpy as np
import pandas as pd
from scipy.spatial.distance import jensenshannon
def jsd_correlation_matrix(source_data):
    """Build the pairwise Jensen-Shannon distance matrix of the columns.

    Despite the name, the result is a distance matrix rather than a
    correlation matrix: entry ``[i, j]`` is
    ``scipy.spatial.distance.jensenshannon`` applied to columns ``i`` and
    ``j`` of ``source_data``.

    Args:
        source_data: 2-D array-like; every column is compared with every
            other column.

    Returns:
        A new square ``numpy.ndarray`` with one row/column per input column.
        It is symmetric and its diagonal is left at 0.
    """
    # Accept nested lists and other array-likes, not only ndarrays.
    source_data = np.asarray(source_data)
    num = source_data.shape[1]
    jsd = np.zeros([num, num])
    # Only the upper triangle is computed; each value is mirrored below.
    for i in range(num - 1):
        for j in range(i + 1, num):
            value = jensenshannon(source_data[:, i], source_data[:, j])
            jsd[i, j] = value
            jsd[j, i] = value
    return jsd
def min_value(corr_matrix):
    """Locate the smallest entry of a 2-D matrix.

    The scan is row-major and uses ``<=``, so when several entries tie for
    the minimum the LAST one visited is reported. For a symmetric matrix
    this means the reported position lies in the lower triangle
    (``row_index > col_index``).

    Args:
        corr_matrix: non-empty 2-D ``numpy.ndarray``.

    Returns:
        Tuple ``(min_val, row_index, col_index)``: the minimum value and the
        row/column position at which it was found.
    """
    min_val = corr_matrix[0][0]
    row_index, col_index = 0, 0
    for i in range(corr_matrix.shape[0]):
        for j in range(corr_matrix.shape[1]):
            # Compare and store the same element, so the value and the
            # reported position always agree.
            candidate = corr_matrix[i][j]
            if candidate <= min_val:
                min_val = candidate
                row_index = i
                col_index = j
    return min_val, row_index, col_index
def index_update(source_index, row_index, col_index):
    """Merge two entries of the cluster bookkeeping list, in place.

    The entries at ``row_index`` and ``col_index`` are removed and a single
    merged entry is appended to the end of the list:

    * if the ``row_index`` entry is a plain ``int``, the merged entry is the
      new list ``[row_entry, col_entry]``;
    * otherwise the ``row_index`` entry is treated as a list and the
      ``col_index`` entry is appended to it as one element, so absorbing
      another list nests it.

    Args:
        source_index: list of ints and/or lists; modified in place.
        row_index: position of the entry that absorbs the other one.
        col_index: position of the entry being absorbed. Must differ from
            ``row_index``.

    Returns:
        The same ``source_index`` list object, after the merge.
    """
    row_entry = source_index[row_index]
    col_entry = source_index[col_index]
    if isinstance(row_entry, int):
        merged = [row_entry, col_entry]
    else:
        row_entry.append(col_entry)
        merged = row_entry
    # Pop the higher position first: popping the lower one first would shift
    # the other entry and remove the wrong element (or raise IndexError).
    for position in sorted((row_index, col_index), reverse=True):
        source_index.pop(position)
    source_index.append(merged)
    return source_index
def update_matrix(source_data, row_index, col_index):
    """Collapse two columns into one and recompute the distance matrix.

    The columns at ``row_index`` and ``col_index`` are summed element-wise,
    removed from ``source_data``, and the summed column is appended as the
    last column. The pairwise matrix is then rebuilt with
    ``jsd_correlation_matrix`` and its diagonal is set to 1.

    Args:
        source_data: 2-D ``numpy.ndarray``; not modified.
        row_index: index of the first column to merge.
        col_index: index of the second column to merge.

    Returns:
        Tuple ``(source_data_update, corr_matrix_update)``: the array with
        the merged column last, and its pairwise matrix.
    """
    merged_pair = [row_index, col_index]
    # keepdims keeps the sum as a single column so it can be concatenated.
    merged_column = np.sum(
        source_data.take(merged_pair, axis=1), axis=1, keepdims=True
    )
    remaining = np.delete(source_data, merged_pair, axis=1)
    merged_sources = np.concatenate((remaining, merged_column), axis=1)

    distances = jsd_correlation_matrix(merged_sources)
    np.fill_diagonal(distances, 1)
    return merged_sources, distances
def update_proportion(sink_abundance, row_index, col_index):
    """Collapse two columns of ``sink_abundance`` into one summed column.

    The columns at ``row_index`` and ``col_index`` are added element-wise,
    both are dropped, and the summed column is appended as the last column.

    Args:
        sink_abundance: 2-D ``numpy.ndarray``; not modified.
        row_index: index of the first column to merge.
        col_index: index of the second column to merge.

    Returns:
        A new array with one column fewer than the input, the merged column
        being the last one.
    """
    merged_pair = [row_index, col_index]
    # keepdims keeps the sum as a single column so it can be concatenated.
    merged_column = np.sum(
        sink_abundance.take(merged_pair, axis=1), axis=1, keepdims=True
    )
    remaining = np.delete(sink_abundance, merged_pair, axis=1)
    return np.concatenate((remaining, merged_column), axis=1)
def data_cluster(sources, corr_matrix, jsd_value):
    """Greedily merge the closest columns of ``sources`` up to a threshold.

    On each pass the smallest entry of the pairwise matrix is located with
    ``min_value``. If it exceeds ``jsd_value`` the clustering stops;
    otherwise the two columns are merged (``update_matrix``) and the
    bookkeeping list is updated to match (``index_update``). At most
    ``sources.shape[1] - 1`` merges are performed.

    Args:
        sources: 2-D ``numpy.ndarray`` with one column per source.
        corr_matrix: square pairwise matrix for the columns of ``sources``.
            Its diagonal is overwritten with 1 in place.
        jsd_value: largest pairwise value that still triggers a merge.

    Returns:
        Tuple ``(source_index, sources)``: the list describing which
        original column indices ended up in each remaining column, and the
        merged data array.
    """
    source_index = list(range(sources.shape[1]))
    # Put 1 on the diagonal so self-comparisons are not the smallest entry.
    np.fill_diagonal(corr_matrix, 1)

    max_merges = sources.shape[1] - 1
    for _ in range(max_merges):
        closest, row_index, col_index = min_value(corr_matrix)
        if closest > jsd_value:
            break
        source_index = index_update(source_index, row_index, col_index)
        sources, corr_matrix = update_matrix(sources, row_index, col_index)
        # NOTE: update_proportion performs the same column merge on another
        # table; it is not called from here.
    return source_index, sources