-
Notifications
You must be signed in to change notification settings - Fork 2
/
random_data.py
84 lines (75 loc) · 2.23 KB
/
random_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
from itertools import product
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Pin NumPy's global RNG so repeated runs draw the same random matrices.
np.random.seed(0)
# Seaborn theme in one call: white-grid background, fonts scaled by 1.4.
sns.set(style="whitegrid", font_scale=1.4)
# Experiment grid: every (num_data, num_features) shape is sampled
# `len(num_repeat_list)` times with an independent Gaussian matrix.
num_data_list = [10, 100, 1000]
num_features_list = [10, 100, 1000]
num_repeat_list = np.arange(100).tolist()

# One DataFrame per (shape, repeat); concatenated once after the loop.
df_results = []
for num_data, num_features, repeat_idx in product(
    num_data_list, num_features_list, num_repeat_list
):
    print(
        "num_data:", num_data, "num_features:", num_features, "repeat_idx:", repeat_idx
    )
    X = np.random.normal(size=(num_data, num_features))
    # Squared singular values of X; only the values are requested
    # (compute_uv=False), so no singular vectors are computed.
    eigenvalues = np.square(np.linalg.svd(X, full_matrices=False, compute_uv=False))
    # The SVD yields min(num_data, num_features) values; pad with explicit
    # zeros so every spectrum has max(num_data, num_features) entries.
    num_missing = max(num_data, num_features) - eigenvalues.shape[0]
    eigenvalues = np.concatenate([eigenvalues, np.zeros(num_missing)])
    num_rows = eigenvalues.shape[0]
    df_results.append(
        pd.DataFrame(
            {
                # Bookkeeping columns are created as integers up front.
                # Building them with np.full_like(eigenvalues, ...) made them
                # float64 and left "repeat_idx" as float after the later casts.
                "num_data": np.full(num_rows, num_data, dtype=int),
                "num_features": np.full(num_rows, num_features, dtype=int),
                "repeat_idx": np.full(num_rows, repeat_idx, dtype=int),
                "Eigenvalue": eigenvalues,
            }
        )
    )

# Long-format table: one row per eigenvalue, tagged with its configuration.
df = pd.concat(df_results).reset_index(drop=True)
# Output directory for the rendered figure; created if it does not exist.
results_dir = "results/random_data"
os.makedirs(results_dir, exist_ok=True)

# Close whatever figure pyplot currently considers active before plotting.
plt.close()

# One panel per (num_features row, num_data column). Axes are not shared, so
# each panel gets its own x/y ranges.
g = sns.FacetGrid(
    data=df,
    col="num_data",
    row="num_features",
    sharey=False,
    sharex=False,
    margin_titles=True,
)
# NOTE(review): per the seaborn docs `line_kws` styles the KDE curve, which is
# not drawn here (no kde=True) -- confirm whether bar-edge `linewidth=0` was
# the intent. Kept unchanged to preserve the current output.
g.map_dataframe(
    sns.histplot,
    x="Eigenvalue",
    stat="probability",
    bins=100,
    line_kws={"linewidth": 0},
)
g.set_titles(
    col_template="Num Data: {col_name}", row_template="Num Features: {row_name}"
)
# Log-scale the probability axis on every panel.
g.set(yscale="log")

# Save through the grid itself so the written file is always this grid's
# figure, independent of pyplot's notion of the "current" figure.
g.savefig(
    os.path.join(results_dir, "random_data_eigenvalue_distribution.png"),
    bbox_inches="tight",
    dpi=300,
)
plt.show()
plt.close()