-
Notifications
You must be signed in to change notification settings - Fork 2
/
simulated_data.py
143 lines (123 loc) · 5.04 KB
/
simulated_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import numpy as np
import scipy.linalg as la
from scipy.stats import multinomial
def random_multivar_normal(n, d, k, sigma=.1):
'''
Generate random samples from a random multivariate normal distribution
with covariance A A^T + sigma^2 I.
Input:
n: int, number of samples
d: int, dimension of samples
k: int, number of samples approximated
sigma: optional float > 0, default .1, the standard deviation
of the sample noise.
Output:
cov: d x d array, the covariance matrix for the distibution
A0: d x d array, the eigenvectors we want to estimate (note the
eigenvectors are columns of the array, in descending order)
X: n x d array of n d-dimensional samples.
'''
A0 = la.qr(np.random.rand(d, k), mode='economic')[0]
cov = A0 @ A0.T + sigma**2 * np.eye(d)
X = np.random.multivariate_normal(np.zeros(d), cov, size=n)
return cov, A0, X
def spiked_covariance(n, d, k, sigma=.1):
'''
Generate random samples from a random multivariate normal distribution
with covariance A D A^T + sigma^2 I.
Here A is a set of k orthogonal vectors and D is a diagonal matrix with
random, uniform entries, sorted and scaled so that the first entry = 1.
Input:
n: int, number of samples
d: int, dimension of samples
k: int, number of samples approximated
sigma: optional float > 0, default .1, the standard deviation
of the sample noise.
Output:
cov: d x d array, the true covariance matrix for the distibution
w: d vector of the diagonal values from matrix D.
A0: d x k array, the eigenvectors we want to estimate (note the
eigenvectors are columns of the array, in descending order)
X: n x d array of n d-dimensional samples.
'''
A0 = la.qr(np.random.rand(d, k), mode='economic')[0]
w = np.sort(np.random.rand(k, 1), axis=0)[::-1]
w /= w.max()
cov = A0 @ (w**2 * A0.T) + sigma**2 * np.eye(d)
X = np.random.multivariate_normal(np.zeros(d), cov, size=n)
return cov, w, A0, X
def random_multinomial(n, d, trials=100, mean0 = True, scale=1):
'''
Generate random samples from a random multinomial distribution with p_i ~ U(0,1).
Input:
n: int, number of samples
d: int, dimension of samples
trials: optional int, the number of trials for each sample from the
multinomial distribution default is 100.
mean0: optional boolean, default True. Indicates whether to normalize
the samples so they are mean 0.
Output:
cov: d x d array, the true covariance matrix for the distribution
e: d-dimensional array, the eigenvalues of the covariance matrix
v: d x d array, the eigenvectors of the covariance matrix
X: n x d array of n d-dimensional samples from the random_dirichlet
distribution with covariance cov.
'''
# Initialize p values
p = np.random.rand(d)
p /= p.sum()
# Calculate the covariance matrix for the multinomial distribution
# For large d > 10000, use multinomial.cov(d,p)
if d >= 10000:
cov = multinomial.cov(trials, p)
else:
cov = -np.outer(p, p) * trials
cov[np.diag_indices(d)] = trials * p * (1-p)
cov *= scale**2
# Obtain the eigenvectors of the covariance matrix.
e, v = la.eigh(cov)
e = e[::-1]
v = v[:,::-1]
# Obtain a sample from the multinomial distribution of size n
X = np.random.multinomial(trials, p, n).astype(float)
if mean0:
# Let X have mean 0
X -= trials * p
X *= scale
return cov, e, v, X
def random_dirichlet(n, d, mean0=True, scale=1):
'''
Generate random samples from a random dirichlet distribution with a_i ~ U(0,1).
Input:
n: int, number of samples
d: int, dimension of samples
mean0: optional boolean, default True. Indicates whether to normalize
the samples so they are mean 0.
Output:
cov: d x d array, the true covariance matrix for the distribution
e: d-dimensional array, the eigenvalues of the covariance matrix
v: d x d array, the eigenvectors of the covariance matrix
(note the eigenvectors are columns of the array, in descending order)
X: n x d array of n d-dimensional samples from the random_dirichlet
distribution with covariance cov.
'''
# Initialize a random set of parameters a drawn from the
# uniform distribution.
a = np.random.rand(d)
a0 = a.sum()
a_denom = a0**2 * (a0 + 1)
# Obtain the covariance matrix for the dirichlet distribution.
# Note that scipy doesn't currently have a builtin method for this
# (I may add one myself)
cov = -np.outer(a, a) / a_denom # i neq j case
cov[np.diag_indices(d)] = a * (a0 - a) / a_denom # i = j case
cov *= scale**2
# Obtain the eigenvectors of the covariance matrix.
e, v = la.eigh(cov)
e = e[::-1]
v = v[:,::-1]
X = np.random.dirichlet(a, n)
if mean0:
X -= a / a0
X *= scale
return cov, e, v, X