-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate-graph-data.py
187 lines (151 loc) · 6.2 KB
/
generate-graph-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/usr/bin/env python3
# **IMportant!** This is a development script to quickly get lists and then
# a matrix of packages. It is far from perfect, and I suspect buggy because
# the spack.yaml files come in so many different flavors. If you have an idea
# for how to clean this up, please submit a PR! It will be greatly appreciated!
from sklearn import manifold
import pandas
import logging
import fnmatch
import numpy
import os
import yaml
from jinja2 import Environment, FileSystemLoader, select_autoescape
logging.basicConfig(level=logging.INFO)
# We want the root
here = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
env = Environment(
autoescape=select_autoescape(["html"]), loader=FileSystemLoader("templates")
)
def read_yaml(filename):
with open(filename, "r") as fd:
content = yaml.load(fd, Loader=yaml.FullLoader)
return content
def recursive_find(base, pattern="*.yaml"):
"""recursive find will yield python files in all directory levels
below a base path.
Arguments:
- base (str) : the base directory to search
- pattern: a pattern to match, defaults to *.py
"""
for root, _, filenames in os.walk(base):
for filename in fnmatch.filter(filenames, pattern):
yield os.path.join(root, filename)
def get_packages(key, content):
"""
Return a list of packages found in a spack.yaml
"""
found = set()
if key not in content.get("spack", {}):
return []
specs = content.get("spack", {}).get(key, {})
def get_definition(name):
defs = content.get("spack", {}).get("definitions")
specs = []
if not defs:
return []
for definition in defs:
if name in definition:
specs = definition[name]
return specs
if not specs:
specs = get_definition(key)
if isinstance(specs, dict):
specs = list(specs.keys())
while specs:
spec = specs.pop(0)
# if it's a matrix
if isinstance(spec, dict) and "matrix" in spec:
for key in spec["matrix"][0]:
specs += get_definition(key.strip("$"))
continue
# This is a reference to another variable
if spec.startswith("$"):
group = get_definition(spec.strip("$"))
while group:
spec = group.pop(0)
if isinstance(spec, dict) and "matrix" in spec:
for key in spec["matrix"][0]:
group += get_definition(key.strip("$"))
else:
for char in ["@", " ", "%", "~", "+"]:
spec = spec.split(char)[0]
found.add(spec.strip())
else:
for char in ["@", " ", "%", "~", "+"]:
spec = spec.split(char)[0]
found.add(spec.strip())
if "all" in found:
found.remove("all")
return list(found)
def main():
"""
Entrypoint to generation of graph data
"""
# Keep track of envs and spack keys
spack_keys = 0
env_keys = 0
# Get list of all spack.yaml files
stacks = os.path.join(here, "_stacks")
spackyamls = list(recursive_find(stacks))
# First count the keys, and get unique package names
packages = set()
for spackyaml in spackyamls:
# Try to get specs by parsing spack.yaml
content = read_yaml(spackyaml)
if not isinstance(content, dict):
continue
if "spack" in content:
spack_keys += 1
elif "env" in content:
env_keys += 1
# Try parsing both specs and packages. This is imperfect, but best effort
[packages.add(x) for x in get_packages("specs", content)]
[packages.add(x) for x in get_packages("packages", content)]
# Create a data frame
print("Found %s spack.yaml env keys and %s spack keys." % (env_keys, spack_keys))
packages = sorted(list(packages))
df = pandas.DataFrame(columns=packages, index=packages)
df.fillna(0, inplace=True)
# Now populate the data frame with counts
print("Found %s spack.yaml env keys and %s spack keys." % (env_keys, spack_keys))
for i, spackyaml in enumerate(spackyamls):
print("Counting spack.yaml %s of %s" % (i, len(spackyamls)))
# Try to get specs by parsing spack.yaml
content = read_yaml(spackyaml)
if isinstance(content, list):
continue
# Try parsing both specs and packages. This is imperfect, but best effort
packageset = get_packages("specs", content) + get_packages("packages", content)
for pkg1 in packageset:
for pkg2 in packageset:
df.loc[pkg1, pkg2] += 1
# Save the counts (intermediate data)
counts_df = os.path.join(here, "data", "package-pair-counts.csv")
df.to_csv(counts_df)
# We need to normalize by packages that appear across a lot of packages. So we want to divide by the square root of the row sums and the columns sums. We do this so it's still symmetric, and we use a square root because we want the units to go away.
rowsums = numpy.sqrt(df.sum(axis=0)) # row and column sums are the same
distinct = (df != 0).sum()
# This is a symmetric matrix, min 0 and max 1.0 normalized counts
normalized = df.divide(rowsums).transpose().divide(rowsums)
counts_df = os.path.join(here, "data", "package-normalized-counts.csv")
normalized.to_csv(counts_df)
# If counts is high, distance is close. If counts is low, distance is far
# 1 == packages only appear together, 0 == never appear together
# now smaller is closer in distance
distance = 1 / normalized # this has inf
distance.replace([numpy.inf, -numpy.inf], 0, inplace=True)
counts_df = os.path.join(here, "data", "package-distances.csv")
distance.to_csv(counts_df)
# Make the tsne!
fit = manifold.TSNE(n_components=2)
embedding = fit.fit_transform(distance)
df = pandas.DataFrame(embedding, index=distance.index, columns=["x", "y"])
df.index.name = "name"
# use normalization values for the radius (color)
df["norm"] = rowsums
df["distinct"] = distinct
counts_df = os.path.join(here, "_data", "packages-embeddings.csv")
df.to_csv(counts_df)
if __name__ == "__main__":
main()