forked from ilarinieminen/SOM-Toolbox
-
Notifications
You must be signed in to change notification settings - Fork 0
/
som_drmake.m
167 lines (143 loc) · 5.39 KB
/
som_drmake.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
function [sR,best,sig,Cm] = som_drmake(D,inds1,inds2,sigmea,nanis)

% SOM_DRMAKE Make descriptive rules for given group within the given data.
%
% sR = som_drmake(D,[inds1],[inds2],[sigmea],[nanis])
%
%  D        (struct) map or data struct
%           (matrix) the data, of size [dlen x dim]
%  [inds1]  (vector) indices belonging to the group
%                    (the whole data set by default)
%  [inds2]  (vector) indices belonging to the contrast group
%                    (the rest of the data set by default)
%  [sigmea] (string) significance measure: 'accuracy',
%                    'mutuconf' (default), or 'accuracyI'.
%                    (See definitions below).
%  [nanis]  (scalar) value given for NaNs: 0 (=FALSE, default),
%                    1 (=TRUE) or NaN (=ignored)
%
%  sR      (struct array) best rule for each component. Each
%                   struct has the following fields:
%    .type     (string) 'som_rule'
%    .name     (string) name of the component
%    .low      (scalar) the low end of the rule range
%    .high     (scalar) the high end of the rule range
%    .nanis    (scalar) how NaNs are handled: NaN, 0 or 1
%    .lowstr   (string) initialized to '' here
%    .highstr  (string) initialized to '' here
%
%  best    (vector) indices of rules which make the best combined rule
%  sig     (vector) significance measure values for each rule, and
%                   for the combined rule (last element)
%  Cm      (matrix) A matrix of vectorized confusion matrices for each
%                   rule, and for the combined rule (last row):
%                   [a, c, b, d] (see below).
%
% For each component, such a rule sR.low <= x < sR.high is found
% which optimizes the given significance measure. The confusion
% matrix below between the given grouping (G: group - not G: contrast
% group) and rule (R: true or false) is used to determine the
% significance values:
%
%          G    not G
%       ---------------    accuracy  = (a+d) / (a+b+c+d)
% true  |  a  |   b   |
%       |--------------    mutuconf  = a*a / ((a+b)(a+c))
% false |  c  |   d   |
%       ---------------    accuracyI = a / (a+b+c)
%
% An empty value ([]) for any optional argument selects its default.
%
% See also  SOM_DREVAL, SOM_DRTABLE.

% Contributed to SOM Toolbox 2.0, January 7th, 2002 by Juha Vesanto
% Copyright (c) by Juha Vesanto
% http://www.cis.hut.fi/projects/somtoolbox/

% Version 2.0beta juuso 070102

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% input arguments

if isstruct(D),
  switch D.type,
   case 'som_data', cn = D.comp_names; D = D.data;
   case 'som_map',  cn = D.comp_names; D = D.codebook;
   otherwise
    % fail here rather than later with an unrelated "undefined cn" error
    error('Unsupported struct type for D: expected ''som_data'' or ''som_map''.');
  end
else
  cn = cell(size(D,2),1);
  for i=1:size(D,2), cn{i} = sprintf('Variable%d',i); end
end

[dlen,dim] = size(D);
if nargin<2 || isempty(inds1), inds1 = 1:dlen; end
if nargin<3 || isempty(inds2),
  % contrast group defaults to everything not in the group
  mask = ones(dlen,1); mask(inds1) = 0; inds2 = find(mask);
end
if nargin<4 || isempty(sigmea), sigmea = 'mutuconf'; end
if nargin<5 || isempty(nanis), nanis = 0; end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% action

% one slot per component plus a final slot for the combined rule
sig = zeros(dim+1,1);
Cm  = zeros(dim+1,4);

sR1tmp = struct('type','som_rule','name','','low',-Inf,'high',Inf,'nanis',nanis,'lowstr','','highstr','');
sR = sR1tmp;

% single variable rules
for i=1:dim,

  % bin edges: hist on the two extremes yields 10 equally spaced bin
  % centers over [mi,ma]; midpoints between neighbouring centers are
  % used as the interior edges, with -Inf and Inf as the outer edges
  mi = min(D(:,i));
  ma = max(D(:,i));
  [histcount,bins] = hist([mi,ma],10);
  if size(bins,1)>1, bins = bins'; end
  edges = [-Inf, (bins(1:end-1)+bins(2:end))/2, Inf];

  % find the rule for this variable
  [low,high,s,cm] = onevar_descrule(D(inds1,i),D(inds2,i),sigmea,nanis,edges);
  sR1 = sR1tmp;
  sR1.name = cn{i};
  sR1.low = low;
  sR1.high = high;
  sR(i) = sR1;
  sig(i) = s;
  Cm(i,:) = cm;

end

% find combined rule: starting from the single best rule, try adding
% the others in decreasing order of significance and keep each
% addition that improves the significance of the conjunction.
% Only the first dim entries are sorted: the last entry of sig is the
% placeholder for the combined rule and must never end up in order.
[dummy,order] = sort(-sig(1:dim));
maxsig = sig(order(1)); bestcm = Cm(order(1),:);
best = order(1);
for i=2:dim,
  com = [best, order(i)];
  [s,cm,truex,truey] = som_dreval(sR(com),D(:,com),sigmea,inds1,inds2,'and');
  if s>maxsig, best = com; maxsig = s; bestcm = cm; end
end
sig(end) = maxsig;
Cm(end,:) = bestcm;  % confusion matrix of the best combination, not of the last one tried

return;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%55
%% descriptive rules
function [low,high,sig,cm] = onevar_descrule(x,y,sigmea,nanis,edges)

% ONEVAR_DESCRULE Find the contiguous range of bins with best significance.
%
%  x       (vector) data values in the group
%  y       (vector) data values in the contrast group
%  sigmea  (string) significance measure, passed on to SOM_DRSIGNIF
%  nanis   (scalar) how to handle NaNs: 0 (rule false), 1 (rule true)
%                   or NaN (NaN values are ignored altogether)
%  edges   (vector) bin edges; bin k covers edges(k) <= value < edges(k+1)
%
%  low     (scalar) lower edge of the selected range
%  high    (scalar) upper edge of the selected range
%  sig     (scalar) significance value of the selected range
%  cm      (vector) confusion matrix of the selected range as
%                   [true in x, false in x, true in y, false in y]

% Weight of one NaN value in the "rule is true" counts.
if isnan(nanis),
  % NaNs are ignored: drop them. The weight has to be 0 here and not
  % nanis itself, because NaN*0 is NaN and would turn every count,
  % and thereby every significance value, into NaN.
  x = x(~isnan(x)); y = y(~isnan(y));
  nanweight = 0;
else
  nanweight = nanis;
end

% histogram counts
xcount = bin_counts(x,edges);
ycount = bin_counts(y,edges);
xnan = sum(isnan(x));
ynan = sum(isnan(y));

% number of true items in both groups in all possible bin ranges:
% T(i,j) is the number of items in bins i..j (zero for i>j)
n = length(xcount);
Tx = range_counts(xcount);
Ty = range_counts(ycount);

% candidate ranges are those which contain at least one item
[i,j] = find(Tx+Ty);
if isempty(i),
  % no countable values at all: fall back to the full range so that
  % the outputs are still well defined scalars
  i = 1; j = n;
end
k = sub2ind(size(Tx),i,j);
V = [i, j, Tx(k), Ty(k)];
tix = V(:,3) + nanweight*xnan;
tiy = V(:,4) + nanweight*ynan;

% select the best range
nix = length(x);
niy = length(y);
Cm = [tix,nix-tix,tiy,niy-tiy];
[s,kbest] = max(som_drsignif(sigmea,Cm));

% output
low  = edges(V(kbest,1));
high = edges(V(kbest,2)+1);
sig  = s;
cm   = Cm(kbest,:);

return;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function c = bin_counts(v,edges)

% BIN_COUNTS Per-bin counts of v as a column vector, one per bin.

c = histc(v,edges);
c = c(:);                                          % orientation independent of v and edges
if isempty(c), c = zeros(length(edges),1); end     % guard for empty v
c = c(1:end-1);                                    % last histc entry only counts v==edges(end)

return;

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function T = range_counts(c)

% RANGE_COUNTS Upper triangular matrix with T(i,j) = sum(c(i:j)) for i<=j.

n = length(c);
upto   = cumsum(c);       % items in bins 1..j
before = upto - c;        % items in bins 1..i-1
% plain integer arithmetic: exact, and cannot overflow the way an
% exp/log formulation of the same pairwise sum can
T = triu(repmat(upto',[n 1]) - repmat(before,[1 n]),0);

return;