-
Notifications
You must be signed in to change notification settings - Fork 1
/
kde2d.m
220 lines (193 loc) · 7.12 KB
/
kde2d.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
function [bandwidth,density,X,Y]=kde2d(data,n,MIN_XY,MAX_XY)
% fast and accurate state-of-the-art
% bivariate kernel density estimator
% with diagonal bandwidth matrix.
% The kernel is assumed to be Gaussian.
% The two bandwidth parameters are
% chosen optimally without ever
% using/assuming a parametric model for the data or any "rules of thumb".
% Unlike many other procedures, this one
% is immune to accuracy failures in the estimation of
% multimodal densities with widely separated modes (see examples).
% INPUTS: data - an N by 2 array with continuous data
% n - size of the n by n grid over which the density is computed
% n has to be a power of 2, otherwise n=2^ceil(log2(n));
% the default value is 2^8;
% MIN_XY,MAX_XY- limits of the bounding box over which the density is computed;
% the format is:
% MIN_XY=[lower_Xlim,lower_Ylim]
% MAX_XY=[upper_Xlim,upper_Ylim].
% The dafault limits are computed as:
% MAX=max(data,[],1); MIN=min(data,[],1); Range=MAX-MIN;
% MAX_XY=MAX+Range/4; MIN_XY=MIN-Range/4;
% OUTPUT: bandwidth - a row vector with the two optimal
% bandwidths for a bivaroate Gaussian kernel;
% the format is:
% bandwidth=[bandwidth_X, bandwidth_Y];
% density - an n by n matrix containing the density values over the n by n grid;
% density is not computed unless the function is asked for such an output;
% X,Y - the meshgrid over which the variable "density" has been computed;
% the intended usage is as follows:
% surf(X,Y,density)
% Example (simple Gaussian mixture)
% clear all
% % generate a Gaussian mixture with distant modes
% data=[randn(500,2);
% randn(500,1)+3.5, randn(500,1);];
% % call the routine
% [bandwidth,density,X,Y]=kde2d(data);
% % plot the data and the density estimate
% contour3(X,Y,density,50), hold on
% plot(data(:,1),data(:,2),'r.','MarkerSize',5)
%
% Example (Gaussian mixture with distant modes):
%
% clear all
% % generate a Gaussian mixture with distant modes
% data=[randn(100,1), randn(100,1)/4;
% randn(100,1)+18, randn(100,1);
% randn(100,1)+15, randn(100,1)/2-18;];
% % call the routine
% [bandwidth,density,X,Y]=kde2d(data);
% % plot the data and the density estimate
% surf(X,Y,density,'LineStyle','none'), view([0,60])
% colormap hot, hold on, alpha(.8)
% set(gca, 'color', 'blue');
% plot(data(:,1),data(:,2),'w.','MarkerSize',5)
%
% Example (Sinusoidal density):
%
% clear all
% X=rand(1000,1); Y=sin(X*10*pi)+randn(size(X))/3; data=[X,Y];
% % apply routine
% [bandwidth,density,X,Y]=kde2d(data);
% % plot the data and the density estimate
% surf(X,Y,density,'LineStyle','none'), view([0,70])
% colormap hot, hold on, alpha(.8)
% set(gca, 'color', 'blue');
% plot(data(:,1),data(:,2),'w.','MarkerSize',5)
%
% Notes: If you have a more accurate density estimator
% (as measured by which routine attains the smallest
% L_2 distance between the estimate and the true density) or you have
% problems running this code, please email me at botev@maths.uq.edu.au
% Reference: Z. I. Botev, J. F. Grotowski and D. P. Kroese
% "KERNEL DENSITY ESTIMATION VIA DIFFUSION" ,Submitted to the
% Annals of Statistics, 2009
global N A2 I
if nargin<2
n=2^8;
end
n=2^ceil(log2(n)); % round up n to the next power of 2;
N=size(data,1);
if nargin<3
MAX=max(data,[],1); MIN=min(data,[],1); Range=MAX-MIN;
MAX_XY=MAX+Range/4; MIN_XY=MIN-Range/4;
end
scaling=MAX_XY-MIN_XY;
if N<=size(data,2)
error('data has to be an N by 2 array where each row represents a two dimensional observation')
end
transformed_data=(data-repmat(MIN_XY,N,1))./repmat(scaling,N,1);
%bin the data uniformly using regular grid;
initial_data=ndhist(transformed_data,n);
% discrete cosine transform of initial data
a= dct2d(initial_data);
% now compute the optimal bandwidth^2
I=(0:n-1).^2; A2=a.^2;
t_star=fzero( @(t)(t-evolve(t)),[0,0.1]);
p_02=func([0,2],t_star);p_20=func([2,0],t_star); p_11=func([1,1],t_star);
t_y=(p_02^(3/4)/(4*pi*N*p_20^(3/4)*(p_11+sqrt(p_20*p_02))))^(1/3);
t_x=(p_20^(3/4)/(4*pi*N*p_02^(3/4)*(p_11+sqrt(p_20*p_02))))^(1/3);
% smooth the discrete cosine transform of initial data using t_star
a_t=exp(-(0:n-1)'.^2*pi^2*t_x/2)*exp(-(0:n-1).^2*pi^2*t_y/2).*a;
% now apply the inverse discrete cosine transform
if nargout>1
density=idct2d(a_t)*(numel(a_t)/prod(scaling));
[X,Y]=meshgrid(MIN_XY(1):scaling(1)/(n-1):MAX_XY(1),MIN_XY(2):scaling(2)/(n-1):MAX_XY(2));
end
bandwidth=sqrt([t_x,t_y]).*scaling;
end
%#######################################
function [out,time]=evolve(t)
global N
Sum_func = func([0,2],t) + func([2,0],t) + 2*func([1,1],t);
time=(2*pi*N*Sum_func)^(-1/3);
out=(t-time)/time;
end
%#######################################
function out=func(s,t)
global N
if sum(s)<=4
Sum_func=func([s(1)+1,s(2)],t)+func([s(1),s(2)+1],t); const=(1+1/2^(sum(s)+1))/3;
time=(-2*const*K(s(1))*K(s(2))/N/Sum_func)^(1/(2+sum(s)));
out=psi(s,time);
else
out=psi(s,t);
end
end
%#######################################
function out=psi(s,Time)
global I A2
% s is a vector
w=exp(-I*pi^2*Time).*[1,.5*ones(1,length(I)-1)];
wx=w.*(I.^s(1));
wy=w.*(I.^s(2));
out=(-1)^sum(s)*(wy*A2*wx')*pi^(2*sum(s));
end
%#######################################
function out=K(s)
out=(-1)^s*prod((1:2:2*s-1))/sqrt(2*pi);
end
%#######################################
function data=dct2d(data)
% computes the 2 dimensional discrete cosine transform of data
% data is an nd cube
[nrows,ncols]= size(data);
if nrows~=ncols
error('data is not a square array!')
end
% Compute weights to multiply DFT coefficients
w = [1;2*(exp(-i*(1:nrows-1)*pi/(2*nrows))).'];
weight=w(:,ones(1,ncols));
data=dct1d(dct1d(data)')';
function transform1d=dct1d(x)
% Re-order the elements of the columns of x
x = [ x(1:2:end,:); x(end:-2:2,:) ];
% Multiply FFT by weights:
transform1d = real(weight.* fft(x));
end
end
%#######################################
function data = idct2d(data)
% computes the 2 dimensional inverse discrete cosine transform
[nrows,ncols]=size(data);
% Compute wieghts
w = exp(i*(0:nrows-1)*pi/(2*nrows)).';
weights=w(:,ones(1,ncols));
data=idct1d(idct1d(data)');
function out=idct1d(x)
y = real(ifft(weights.*x));
out = zeros(nrows,ncols);
out(1:2:nrows,:) = y(1:nrows/2,:);
out(2:2:nrows,:) = y(nrows:-1:nrows/2+1,:);
end
end
%#######################################
function binned_data=ndhist(data,M)
% this function computes the histogram
% of an n-dimensional data set;
% 'data' is nrows by n columns
% M is the number of bins used in each dimension
% so that 'binned_data' is a hypercube with
% size length equal to M;
[nrows,ncols]=size(data);
bins=zeros(nrows,ncols);
for i=1:ncols
[dum,bins(:,i)] = histc(data(:,i),[0:1/M:1],1);
bins(:,i) = min(bins(:,i),M);
end
% Combine the vectors of 1D bin counts into a grid of nD bin
% counts.
binned_data = accumarray(bins(all(bins>0,2),:),1/nrows,M(ones(1,ncols)));
end