# main executable
import json
import sys

import datacommons_pandas as dc
import numpy as np
import pandas as pd
from absl import flags
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

FLAGS = flags.FLAGS
flags.DEFINE_boolean('create_training_data_set', False, 'Outputs a training data set.')
flags.DEFINE_string('training_data_set_filename', 'names_to_topics', 'Filename to use for training data.')
flags.DEFINE_boolean('evaluate_model', False, 'Outputs predictions for test data.')
flags.DEFINE_boolean('test_accuracy', False, 'Whether to test accuracy with train_test_split.')
flags.DEFINE_string('test_data_set_path', 'dc_sample_data.csv', 'Path to the csv file to use for test data.')
flags.DEFINE_string('prediction_filename', 'prediction.txt', 'Filename for the prediction.')
FLAGS(sys.argv)


def filter_topics(variable):
    """Returns True if the variable is a topic node."""
    return variable.startswith("dc/topic")


def filter_statVars(variable):
    """Returns True if the variable is a plain statVar, i.e. neither a topic
    nor a StatVarPeerGroup."""
    # TODO(gmechali): Confirm there are no StatVarGroups associated with topics.
    return not filter_topics(variable) and not filter_statVarPeerGroups(variable)


def filter_statVarPeerGroups(variable):
    """Returns True if the variable is a StatVarPeerGroup node."""
    return variable.startswith("dc/svpg")
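
# Quick illustration of the filters above (the svpg and statVar ids are
# assumed examples, not fetched from the graph):
#   filter_topics("dc/topic/Demographics")        -> True
#   filter_statVarPeerGroups("dc/svpg/SomeGroup") -> True
#   filter_statVars("Count_Person")               -> True (matches neither prefix)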


def get_topics():
    """Fetches all topics in the Data Commons graph, starting at dc/topic/Root
    and iterating through it using BFS. Topics have subtopics via the
    relevantVariable property.
    """
    dc_topics = []
    next_level_topics = ['dc/topic/Root']
    while next_level_topics:
        response = dc.get_property_values(next_level_topics, 'relevantVariable')
        next_level_topics.clear()
        for topic in response:
            next_level_topics.extend(list(filter(filter_topics, response[topic])))
        if next_level_topics:
            dc_topics.extend(next_level_topics)
    return dc_topics
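
# A minimal sketch of the response shape the BFS relies on (values assumed,
# not real data): dc.get_property_values returns a dict keyed by each queried
# node, e.g.
#   dc.get_property_values(['dc/topic/Root'], 'relevantVariable')
#   -> {'dc/topic/Root': ['dc/topic/Demographics', 'dc/topic/Economy', ...]}
# Each BFS level keeps only the dc/topic/* values as the next frontier.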


def get_names_from_topics(dc_topics):
    """Fetches the names of all statVars associated with the list of topics
    passed in. We rely on the statVars directly associated with topics, or
    associated indirectly via a StatVarPeerGroup.
    """
    response = dc.get_property_values(dc_topics, 'relevantVariable')
    topics_to_stat_vars = {}
    for topic in response:
        # Get all statVars and StatVarPeerGroups from the response.
        stat_vars = list(filter(filter_statVars, response[topic]))
        stat_var_peer_groups = list(filter(filter_statVarPeerGroups, response[topic]))
        if stat_var_peer_groups:
            # Fetch statVars associated with the StatVarPeerGroups found.
            svpg_resp = dc.get_property_values(stat_var_peer_groups, 'member')
            for svpg in svpg_resp:
                stat_vars.extend(svpg_resp[svpg])
            # TODO(gmechali): Confirm whether peer groups can be doubly nested.
        topics_to_stat_vars[topic] = stat_vars
    print("Done fetching topic to stat vars")
    # topics_to_stat_vars is a dict keyed by topic, with the list of all
    # associated statVars as the value.

    # Fetch the test data statVars to ensure they are excluded from the
    # training data.
    _, _, test_stat_vars = fetch_testing_data()
    names_to_topics = {}
    skip_count = 0
    for topic in topics_to_stat_vars:
        if topics_to_stat_vars[topic]:
            # Fetch the names of all statVars.
            response = dc.get_property_values(topics_to_stat_vars[topic], 'name')
            stat_var_names = []
            for stat_var in response:
                if not FLAGS.evaluate_model or not np.any(test_stat_vars == stat_var):
                    stat_var_names.extend(response[stat_var])
                else:
                    # Skip statVars that appear in the test data.
                    skip_count += 1
            for name in stat_var_names:
                names_to_topics[name] = topic
    # Formatted as: "Total Population": "dc/topic/Demographics"
    print("Skip count =", skip_count)
    return names_to_topics


def fetch_training_data():
    print("Fetching training data")
    print("Fetching all topics")
    dc_topics = get_topics()
    print("Found all Data Commons topics, for a total topic count of", len(dc_topics))
    # The DC API allows up to 500 nodes per call, so we should be able to get
    # all the topic metadata in one. Note that this duplicates calls already
    # made in get_topics(), for simplicity.
    names_to_topics = get_names_from_topics(dc_topics)
    # Write the final mapping of statVar names to topics into a file, so we
    # don't have to regenerate it every time.
    with open(FLAGS.training_data_set_filename + '.json', 'w') as convert_file:
        convert_file.write(json.dumps(names_to_topics))
    # Also keep a .txt version, formatted as "Total Population:dc/topic/Demographics",
    # since it's more human-readable.
    with open(FLAGS.training_data_set_filename + '.txt', 'w') as f:
        for key, value in names_to_topics.items():
            f.write('%s:%s\n' % (key, value))


def train_model():
    """Trains the model using the json output from fetch_training_data().

    Returns the model and the CountVectorizer.
    """
    with open(FLAGS.training_data_set_filename + '.json') as f:
        data = json.load(f)
    X = []
    y = []
    for name in data:
        X.append(name)
        y.append(data[name])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    # After a few iterations, we noticed this ngram range yields the best results.
    vectorizer = CountVectorizer(stop_words="english", analyzer="word", ngram_range=(1, 2))
    X_train = vectorizer.fit_transform(X_train)
    # Either use LogisticRegression() or MultinomialNB() - they yield different results.
    model = LogisticRegression()
    model.fit(X_train, y_train)
    if FLAGS.test_accuracy:
        X_test = vectorizer.transform(X_test)
        test_prediction = model.predict(X_test)
        print("Accuracy:", accuracy_score(y_test, test_prediction))
    return model, vectorizer
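
# A minimal usage sketch (the query string is illustrative):
#   model, vectorizer = train_model()
#   topic = model.predict(vectorizer.transform(["Total Population"]))[0]
# To try the Naive Bayes alternative mentioned in the comment above, swap in:
#   model = MultinomialNB()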


def fetch_testing_data():
    """Reads the test csv and returns its Name, Chart Title and StatVar
    columns as unicode arrays."""
    test_df = pd.read_csv(FLAGS.test_data_set_path, usecols=["Name", "Chart Title", "StatVar"])
    name_samples = test_df['Name'].values.astype('U')
    chart_title_samples = test_df['Chart Title'].values.astype('U')
    stat_var_samples = test_df['StatVar'].values.astype('U')
    return name_samples, chart_title_samples, stat_var_samples
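
# The test csv is expected to provide at least these columns; the sample row
# is an assumed illustration, not real data:
#   Name,Chart Title,StatVar
#   Total Population,Population over time,Count_Person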


def evaluate():
    """Evaluates the model using test data. We first classify based on the
    `Name` test data, and fall back to `Chart Title`. Outputs the predictions
    to a txt file.
    """
    model, vectorizer = train_model()
    name_samples, chart_title_samples, _ = fetch_testing_data()
    # Classify based on Name.
    name_prediction = model.predict(vectorizer.transform(name_samples))
    # Classify based on Chart Title.
    title_prediction = model.predict(vectorizer.transform(chart_title_samples))
    # Store the predictions in a dictionary. Keep the Name prediction when
    # available; fall back to Chart Title.
    predictions = {}
    for name, title, name_pr, title_pr in zip(
            name_samples, chart_title_samples, name_prediction, title_prediction):
        if name != "nan":
            predictions[name] = name_pr
        elif title != "nan":
            predictions[title] = title_pr
    # Output the predictions to a txt file.
    with open(FLAGS.prediction_filename, 'w') as f:
        for key, value in predictions.items():
            f.write('%s:%s\n' % (key, value))


def main():
    if FLAGS.create_training_data_set:
        fetch_training_data()
    if FLAGS.evaluate_model:
        evaluate()


if __name__ == "__main__":
    main()
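
# Example invocations (a sketch based on the flags defined above):
#   python main.py --create_training_data_set
#   python main.py --evaluate_model --test_accuracy
#   python main.py --evaluate_model --test_data_set_path=dc_sample_data.csv \
#       --prediction_filename=prediction.txt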