-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathClusteringAccuracy.m
365 lines (290 loc) · 13.2 KB
/
ClusteringAccuracy.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
function accuracy = ClusteringAccuracy(clustNums, trueClustNums, varargin)
% accuracy = ClusteringAccuracy(clustNums, trueClustNums, ...
% compareConfusionMat, numRandom)
% Calculate the accuracy in a clustering algorithm operating on
% known true clusters.
% Accuracy is reported as the adjusted mutual information.
% INPUTS:
% -clustNums list of indices to found clusters
% -trueClustNums list of indices to true clusters
% OPTIONS:
% -calcAdjustedMi true
% computed the adjusted mutual information
% -calcPValue true
% compute the probability of the null hypothesis that this clustering
% is the same as the one specified by compareConfusionMat
% -compareConfusionMat []
% when computing p-values, compare the cluster results to those
% specified by compareConfusionMat. If empty, compare to random numbers
% -numRandom 10000
% when computing a distribution of clusterings, make this many
% OUTPUTS:
% -Accuracy: structure containing information about accuracy/results of
% the clustering. Depending on the input options, some
% fields may not be computed
% mutualInfo mutual information between found and true clusters
% maxMi mutual information of perfect agreement
% meanMi (only if calcAdjustedMi) mean of distribution of
% random clusterings
% adjustedMi (only if calcAdjustedMi) mutual information scaled
% so that random ~ 0 and perfect = 1
% pValue (only if calcPValue) probability of the null
% hypothesis that this clustering is the same as the
% one specified by compareConfusionMat
% clustNums remapped cluster numbers (organized so that the
% maximum number of points are assigned to the
% "correct" group)
% confusionMat confusion matrix of remapped clustNums
% proportionCorrect fraction of points correctly assigned after the
% remapping proceedure
if length(trueClustNums) ~= length(clustNums)
error(['clustered group numbers and true group numbers have ' ...
'different lengths'])
end
% get the options to control how accuracy is computed
defaultOptions = {...
'calcAdjustedMi', true, ...
'calcPValue', true, ...
'compareConfusionMat', [], ...
'numRandom', 10000 ...
};
options = GetOptions(defaultOptions, varargin, true);
% convert the information in clustNums and trueClustNums into a confusion
% matrix, a format that is more convenient for analyzing accuracy
confusionMat = getConfusionMatrix(clustNums, trueClustNums);
% compute the accuracy in terms of mutual information
accuracy = getAccuracy(confusionMat, options);
% remap the cluster assignments (swap rows) so as to achieve the largest
% possible fraction correct (maximize the main diagonal of confusionMat)
[accuracy.confusionMat, accuracy.clustNums] = ...
remapClusterAssignments(confusionMat, clustNums);
% compute the fraction of points correctly clustered
accuracy.proportionCorrect = ...
sum(diag(accuracy.confusionMat)) / sum(accuracy.confusionMat(:));
return
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function mat = getConfusionMatrix(clustList, trueClustList)
% make sure that we're dealing with integers from 1 to numTypes
[cNames, ~, cNums] = unique(clustList);
[trueCNames, ~, trueCNums] = unique(trueClustList);
numC = length(cNames);
numTrueC = length(trueCNames);
mat = zeros(numC, numTrueC);
for ind = 1:numC
for trueInd = 1:numTrueC
mat(ind,trueInd) = sum(cNums == ind & trueCNums == trueInd);
end
end
return
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function accuracy = getAccuracy(confusionMat, options)
% calculate the adjusted mutual information, and p-value that the
% clustering is no better than random (or the clustering described by the
% given confusion matrix compareConfusionMat.
% calculate mutual information as well as entropy of each separate
% clustering
[mutualInfo, entropyClust, entropyTrueClust] = getMutualInfo(confusionMat);
% this is the maximum mutual information: when the clustering with the most
% entropy tells you *everything* about the other clustering
maxMutualInfo = max(entropyClust, entropyTrueClust);
% organize info computed so far into struct
accuracy = struct( ...
'mutualInfo', mutualInfo, ...
'maxMi', maxMutualInfo ...
);
if ~options.calcAdjustedMi && ~options.calcPValue
% calculated all that was requested, so return
return
end
% Adjusted Mutual Info (AMI) or p-value was requested, so we need to
% compute the probability distribution of mutual information.
% -For AMI, we need to compute the mean value of mutual info for random
% clusterings
% -For p-values we need to compute the full distribution associated with
% compareConfusionMat (assumed to be random if compareConfusionMat is
% empty)
% fix the number of true clusters:
nTrueClust = sum(confusionMat, 1);
% when forming the distribution of clustering outcomes, measure mutual info
measureFunc = @getMutualInfo;
if options.calcAdjustedMi || isempty(options.compareConfusionMat)
% need to compute the distribution of mutual info for random clusterings
% generate a random confusion matrix with
% 1. the correct number of points in each true cluster (column)
% 2. equal probability of assignment to each found cluster (row)
compareConfusionMat = repmat(nTrueClust, size(confusionMat, 1), 1);
% compute the distribution of mutual information associated with this matrix
mi = getRandomClusteringDist(options.numRandom, measureFunc, ...
nTrueClust, compareConfusionMat);
if options.calcAdjustedMi
% compute the mean value of mutual info for random clusterings
accuracy.meanMi = mean(mi);
% compute AMI
accuracy.adjustedMi = ...
(mutualInfo - accuracy.meanMi) / (maxMutualInfo - accuracy.meanMi);
end
if options.calcPValue && isempty(options.compareConfusionMat)
% compute the p-value of this cluster being better than random
% if you assume p has a uniform prior on [0,1] and compute the expectation
% value of p given the distribution of mi observed, this is what you obtain
accuracy.pValue = (1 + sum(mi >= mutualInfo)) / (1 + length(mi));
end
end
if options.calcPValue && ~isempty(options.compareConfusionMat)
% compute the p-value of this cluster being different from another cluster,
% specified by options.compareConfusionMat
compareConfusionMat = options.compareConfusionMat;
% get the mutual info of the comparision cluster to see which is better
compareMi = getMutualInfo(compareConfusionMat);
% form the distribution of mutual info for whichever clustering is worse
% then compute the probability that the better clustering was drawn from the
% distribution of mutual info of the worse one
if mutualInfo < compareMi
% form the distribution of mutual info by drawing the number of true
% clusters specified by the columns of compareConfusionMat, but the
% assignement probabilities specified by confusionMat
% if the two clusterings were performed on the same data, then
% nCompareClust == nTrueClust
nCompareClust = sum(compareConfusionMat, 1);
mi = getRandomClusteringDist(options.numRandom, measureFunc, ...
nCompareClust, confusionMat);
accuracy.pValue = (1 + sum(mi >= compareMi)) / (1 + length(mi));
else
% form the distribution of mutual info by drawing the number of true
% clusters specified by the columns of confusionMat, but the assignement
% probabilities specified by compareConfusionMat
mi = getRandomClusteringDist(options.numRandom, measureFunc, ...
nTrueClust, compareConfusionMat);
accuracy.pValue = (1 + sum(mi >= mutualInfo)) / (1 + length(mi));
end
end
return
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function [mutualInfo, varargout] = getMutualInfo(confusionMat)
% compute the mutual information from a confusion matrix
% INPUTS
% confusionMat a confusion matrix for a given clustering
% OUTPUTS
% mutualInfo mutual information between rows and columns of the matrix
% (decrease in entropy by knowing both true and found
% clusters)
% OPTIONAL
% entropyClust entropy of the rows (found clusters)
% entropyTrueClust entropy of the columns (true clusters)
% turn the integer counts in confusionMat into a matrix of probabilities
probMat = confusionMat ./ sum(confusionMat(:));
% calculate the entropy of the clusters, true clusters, and the joint
% partition:
% 1. found clusters are the rows
pClust = sum(probMat, 2);
entropyClust = getEntropy(pClust);
% 2. true clusters are the columns
pTrueClust = sum(probMat, 1);
entropyTrueClust = getEntropy(pTrueClust);
% 3. the joint entropy is considering each matrix element separately
entropyJoint = getEntropy(probMat(:));
% calculate the mutual information: the decrease in entropy by knowing both
% clusterings:
mutualInfo = entropyClust + entropyTrueClust - entropyJoint;
if nargout > 1
varargout = {entropyClust, entropyTrueClust};
end
return
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function entropy = getEntropy(probList)
% compute the entropy of a set of probabilities (in this case, they are derived
% from the probability of certain outcomes in clustering, as specified by a
% confusion matrix, although the formula for entropy is general
% get only the non-zero probabilities
probList = probList(probList > 0);
% scale to make sure they're probabilities
probList = probList / sum(probList);
% compute the entropy
entropy = -sum(probList .* log(probList));
return
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function qualityDist = ...
getRandomClusteringDist(numRandom, measureFunc, nTrueClust, seedProbMat)
% get the distribution of some quality (calculated by measureFunc) given a
% contingency matrix of probabilities
qualityDist = NaN(1, numRandom);
% make sure seedProbMat is a list of probabilities
for n = 1:size(seedProbMat, 2);
seedProbMat(:,n) = seedProbMat(:,n) / sum(seedProbMat(:,n));
end
parfor n = 1:numRandom
confusionMat = generateRandomConfusionMat(nTrueClust, seedProbMat);
qualityDist(n) = measureFunc(confusionMat); %#ok<PFBNS>
end
return
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function confusionMat = generateRandomConfusionMat(nTrueClust, seedProbMat)
% randomly generate a confusion matrix (output of a clustering algorithm)
% INPUTS
% nTrueClust 1 x numTrueClusters array of number of points in each
% true cluster
% seedProbMat numFoundClusters x numTrueClusters matrix whose columns
% specify the probability that a point in the
% corresponding true cluster will be found to be in the
% corresponding row.
% OUTPUTS
% confusionMat numFoundClusters x numTrueClusters matrix specifying the
% number of points in true cluster (column) that were
% classified in a found cluster (row)
confusionMat = zeros(size(seedProbMat));
while true
% keep trying until a confusion matrix is successfully generated
for trueInd = 1:size(seedProbMat, 2)
% loop for each column (each true cluster)
% assign numAssign points to random clusters
numAssign = nTrueClust(trueInd);
% assign them weighted by seedProbMat
pAssign = cumsum(seedProbMat(:,trueInd));
for n = 1:numAssign
% calculate the random index
ind = find(rand(1) <= pAssign, 1, 'first');
% add a point in that joint true cluster / assigned cluster location
confusionMat(ind, trueInd) = confusionMat(ind, trueInd) + 1;
end
end
% nClust is the sum across columns for each row
nClust = sum(confusionMat, 2);
% clustering algorithms always assign at least one point to each cluster,
% so we're only successful if each cluster has some non-zero probability
if any(nClust == 0)
% zero out the confusion matrix and try again
confusionMat(:) = 0;
else
% successfully generated a valid confusion matrix, return
break
end
end
return
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function [confusionMat, groupNums] = ...
remapClusterAssignments(confusionMat, groupNums)
% remap the cluster assignments (swap rows) so as to achieve the largest
% possible fraction correct (maximize the main diagonal of confusionMat)
labels = unique(groupNums);
numTrueClust = size(confusionMat, 2);
for n = 1:numTrueClust
[maxVal, maxRow] = max(confusionMat(:,n));
if confusionMat(n,n) == maxVal
continue
end
if confusionMat(n,n) + confusionMat(maxRow,maxRow) > ...
maxVal + confusionMat(n,maxRow)
% not worth swapping
continue
end
%swap rows
temp = confusionMat(n,:);
confusionMat(n,:) = confusionMat(maxRow,:);
confusionMat(maxRow,:) = temp;
%swap group assignments
ind_n = (groupNums == labels(n));
ind_maxRow = (groupNums == labels(maxRow));
groupNums(ind_n) = labels(maxRow);
groupNums(ind_maxRow) = labels(n);
end
return