# EM_for_high_dimension.py
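# The function below calls three helpers -- diag, log_sum_exp, and
# logpdf_diagonal_gaussian -- that are not defined in this file. What follows
# is a minimal sketch of plausible definitions, assuming `data` is a scipy
# sparse matrix and that scikit-learn is available for pairwise distances;
# the original project may define these differently.

import sys
from copy import deepcopy

import numpy as np
from scipy.sparse import spdiags
from sklearn.metrics import pairwise_distances


def diag(array):
    # Sparse (n, n) diagonal matrix with `array` on the main diagonal.
    n = len(array)
    return spdiags(array, 0, n, n)


def log_sum_exp(x, axis):
    # Numerically stable log(sum(exp(x), axis)): subtract the max before
    # exponentiating so that large log-probabilities do not overflow.
    x_max = np.max(x, axis=axis)
    if axis == 1:
        return x_max + np.log(np.sum(np.exp(x - np.vstack(x_max)), axis=1))
    else:
        return x_max + np.log(np.sum(np.exp(x - x_max), axis=axis))


def logpdf_diagonal_gaussian(x, mean, cov):
    # Log-density of each row of x under N(mean, diag(cov)).
    # x: sparse (n, dim) matrix; mean, cov: 1-D arrays of length dim.
    # Scaling each coordinate by 1/sqrt(2 * cov_i) turns the exponent
    # sum_i (x_i - mean_i)^2 / (2 * cov_i) into a plain squared Euclidean
    # distance in the scaled space, which stays cheap for sparse x.
    scaled_x = x.dot(diag(1. / np.sqrt(2 * cov)))
    scaled_mean = mean / np.sqrt(2 * cov)
    return (-np.sum(np.log(np.sqrt(2 * np.pi * cov)))
            - pairwise_distances(scaled_x, [scaled_mean], 'euclidean').flatten() ** 2)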
def EM_for_high_dimension(data, means, covs, weights, cov_smoothing=1e-5,
                          maxiter=int(1e3), thresh=1e-4, verbose=False):
    # cov_smoothing: the default variance assigned to absent features in a
    # cluster. If we assigned zero variance to absent features, we would be
    # overconfident, hastily concluding that those features could NEVER
    # appear in the cluster. We'd like to leave a little bit of possibility
    # for absent features to show up later.
    n = data.shape[0]
    dim = data.shape[1]
    mu = deepcopy(means)
    Sigma = deepcopy(covs)
    K = len(mu)
    weights = np.array(weights)

    ll = None
    ll_trace = []

    for i in range(maxiter):
        # E-step: compute responsibilities in the log domain for stability.
        logresp = np.zeros((n, K))
        for k in range(K):
            logresp[:, k] = np.log(weights[k]) + logpdf_diagonal_gaussian(data, mu[k], Sigma[k])
        ll_new = np.sum(log_sum_exp(logresp, axis=1))
        if verbose:
            print(ll_new)
            sys.stdout.flush()
        # Normalize: subtract each row's log-sum so the rows sum to one.
        logresp -= np.vstack(log_sum_exp(logresp, axis=1))
        resp = np.exp(logresp)
        counts = np.sum(resp, axis=0)

        # M-step: update weights, means, covariances.
        weights = counts / np.sum(counts)
        for k in range(K):
            # Responsibility-weighted mean of the data points.
            mu[k] = (diag(resp[:, k]).dot(data)).sum(axis=0) / counts[k]
            mu[k] = mu[k].A1
            # Weighted variance sum_i r_ik * (x_i - mu_k)^2, expanded as
            # sum_i r_ik * (x_i^2 - 2 * x_i * mu_k) + counts_k * mu_k^2
            # so the computation stays sparse.
            Sigma[k] = diag(resp[:, k]).dot(data.multiply(data) - 2 * data.dot(diag(mu[k]))).sum(axis=0) \
                + (mu[k] ** 2) * counts[k]
            Sigma[k] = Sigma[k].A1 / counts[k] + cov_smoothing * np.ones(dim)

        # Check for convergence in log-likelihood.
        ll_trace.append(ll_new)
        if ll is not None and (ll_new - ll) < thresh and ll_new > -np.inf:
            ll = ll_new
            break
        else:
            ll = ll_new

    out = {'weights': weights, 'means': mu, 'covs': Sigma,
           'loglik': ll_trace, 'resp': resp}
    return out
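
# A minimal usage sketch, not part of the original file: the synthetic sparse
# data and the random initialization below are assumptions for illustration
# (in practice the initial means often come from a k-means run).
if __name__ == '__main__':
    from scipy.sparse import csr_matrix

    rng = np.random.RandomState(0)
    n, dim, K = 300, 50, 3

    # Sparse nonnegative data, roughly TF-IDF-like: ~10% of entries nonzero.
    dense = rng.rand(n, dim) * (rng.rand(n, dim) < 0.1)
    data = csr_matrix(dense)

    # Initialize each mean at a slightly perturbed random data point,
    # with unit variances and uniform mixture weights.
    init_means = [dense[rng.randint(n)] + 1e-3 * rng.rand(dim) for _ in range(K)]
    init_covs = [np.ones(dim) for _ in range(K)]
    init_weights = [1.0 / K] * K

    result = EM_for_high_dimension(data, init_means, init_covs, init_weights,
                                   verbose=True)
    print('final mixture weights:', result['weights'])
    print('iterations until convergence:', len(result['loglik']))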