Nmetrics.py
import numpy as np
from sklearn.metrics import (normalized_mutual_info_score, adjusted_rand_score,
                             v_measure_score, accuracy_score)
nmi = normalized_mutual_info_score
vmeasure = v_measure_score
ari = adjusted_rand_score


def acc(y_true, y_pred):
    """
    Calculate clustering accuracy. Requires scikit-learn and scipy installed.
    # Arguments
        y_true: true labels, numpy.array with shape `(n_samples,)`
        y_pred: predicted labels, numpy.array with shape `(n_samples,)`
    # Return
        accuracy, in [0, 1]
    """
    y_true = y_true.astype(np.int64)
    y_pred = y_pred.astype(np.int64)
    assert y_pred.size == y_true.size
    # Confusion matrix between predicted clusters (rows) and true classes (columns).
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    # sklearn.utils.linear_assignment_ was removed in scikit-learn 0.23;
    # scipy's Hungarian solver is the drop-in replacement.
    from scipy.optimize import linear_sum_assignment
    row_ind, col_ind = linear_sum_assignment(w.max() - w)
    return w[row_ind, col_ind].sum() * 1.0 / y_pred.size
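
# Worked example (illustrative, not part of the original module): acc is
# invariant to how clusters are numbered. For y_true = [0, 0, 1, 1] and
# y_pred = [1, 1, 0, 0] the confusion matrix is
#     w = [[0, 2],
#          [2, 0]]
# and the optimal assignment maps predicted cluster 0 -> class 1 and
# cluster 1 -> class 0, giving acc = (2 + 2) / 4 = 1.0.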


def purity(y_true, y_pred):
    """Purity score
    Args:
        y_true(np.ndarray): shape `(n_samples,)`, ground-truth labels
        y_pred(np.ndarray): shape `(n_samples,)`, predicted cluster assignments
    Returns:
        float: Purity score
    """
    # Work on a copy so the caller's label array is not modified in place.
    y_true = y_true.copy()
    # Array that will hold the majority-voted label for each sample.
    y_voted_labels = np.zeros(y_true.shape)
    # Ordering labels:
    ## labels might be missing, e.g. a label set like {0, 2} where 1 is absent.
    ## First find the unique labels, then map them to a contiguous range,
    ## so {0, 2} becomes {0, 1}.
    labels = np.unique(y_true)
    ordered_labels = np.arange(labels.shape[0])
    for k in range(labels.shape[0]):
        y_true[y_true == labels[k]] = ordered_labels[k]
    # Update unique labels
    labels = np.unique(y_true)
    # Use n_classes + 1 bin edges so that each class gets its own bin;
    # histogram bins are half-open intervals [bin_i, bin_i+1).
    bins = np.concatenate((labels, [np.max(labels) + 1]), axis=0)
    for cluster in np.unique(y_pred):
        hist, _ = np.histogram(y_true[y_pred == cluster], bins=bins)
        # Find the most frequent true label in this cluster.
        winner = np.argmax(hist)
        y_voted_labels[y_pred == cluster] = winner
    return accuracy_score(y_true, y_voted_labels)
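
# Worked example (illustrative, not part of the original module): for
# y_true = [0, 0, 1, 1, 1, 2] and y_pred = [0, 0, 0, 1, 1, 1], cluster 0 is
# majority-voted to class 0 and cluster 1 to class 1, so
# y_voted_labels = [0, 0, 0, 1, 1, 1] and purity = 4 / 6 = 0.6667.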


def test(y_true, y_pred):
    # Compute each metric once, print a summary, and return all scores.
    results = (acc(y_true, y_pred),
               nmi(y_true, y_pred),
               vmeasure(y_true, y_pred),
               ari(y_true, y_pred),
               purity(y_true, y_pred))
    print("ACC:%.4f, NMI:%.4f, VME:%.4f, ARI:%.4f, PUR:%.4f" % results)
    return results
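

if __name__ == "__main__":
    # Minimal usage sketch with synthetic labels; the values below are
    # illustrative assumptions, not part of the original module.
    rng = np.random.RandomState(0)
    y_true = rng.randint(0, 3, size=100)
    # Simulate a clustering that recovers the true classes under a permuted
    # labelling, then corrupt about 10% of the assignments with noise.
    y_pred = (y_true + 1) % 3
    noisy = rng.rand(100) < 0.1
    y_pred[noisy] = rng.randint(0, 3, size=noisy.sum())
    test(y_true, y_pred)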