-
Notifications
You must be signed in to change notification settings - Fork 12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
check in notebook code #23
Changes from 1 commit
5cfc44c
8cdf71e
ceccf02
109932b
dd7eb30
0d3fcd4
8b211f4
cd25861
0bcbf5b
a5fe5c6
102b9bd
4d0cbdd
6eb56b1
1fba74a
bfd6ba6
a6851c6
3d2dff0
22a427b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,8 @@ | |
|
||
|
||
def loadModelStage(filename): | ||
import emission.analysis.classification.inference.mode.seed.pipeline as seedp | ||
import jsonpickle.ext.numpy as jsonpickle_numpy | ||
jsonpickle_numpy.register_handlers() | ||
model = loadModel(filename) | ||
return model | ||
|
||
|
@@ -44,13 +45,29 @@ def predict_labels(trip): | |
trip_loc_feat = trip_feat[0:4] | ||
try: | ||
# load locations of bins(1st round of clustering) | ||
# e.g.{'0': [[start lon1, start lat1, end lon1, end lat1],[start lon, start lat, end lon, end lat]]} | ||
# another explanation: -'0': label from the 1st round | ||
# - the value of key '0': all trips that are in this bin | ||
# - for every trip: the coordinates of start/end locations | ||
bin_locations = loadModelStage('locations_' + str(user))[0] | ||
|
||
# load models from the 2nd round of clustering | ||
# we use Kmeans to build the model in the previous model building step | ||
# assume that we have 2 clusters from the 1st round(that means 2 bins), | ||
# the following is an example of the saved models. | ||
# e.g. {'0': KMeans(n_clusters=2, random_state=0), '1': KMeans(n_clusters=5, random_state=0)} | ||
models = loadModelStage('models_' + str(user))[0] | ||
|
||
# load user labels in all clusters | ||
# assume that we have 1 cluster(bin) from the 1st round of clustering, which has label '0', | ||
# and we have 1 cluster from the 2nd round, which has label '1' | ||
# the value of key '0' contains all 2nd round clusters | ||
# the value of key '1' contains all user labels and probabilities in this cluster | ||
# e.g. {'0': [{'1': [{'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'home', 'replaced_mode': 'drove_alone'}}]}]} | ||
shankari marked this conversation as resolved.
Show resolved
Hide resolved
|
||
user_labels = loadModelStage('user_labels_' + str(user))[0] | ||
|
||
except IOError: | ||
return {} | ||
return [] | ||
|
||
first_round_label_set = list(bin_locations.keys()) | ||
sel_fl = None | ||
|
@@ -64,18 +81,28 @@ def predict_labels(trip): | |
sel_fl = fl | ||
break | ||
if not sel_fl: | ||
return {} | ||
return [] | ||
# choose selected model | ||
sel_model = models[sel_fl] | ||
# predict 2nd label for the new trip | ||
# the value of sel_model.predict([trip_feat]) by default is numpy.ndarray, e.g.[1] | ||
# so we need to turn it into '1' so that it has the same type as the dict keys in user_labels | ||
sel_sl = str(sel_model.predict([trip_feat])[0]) | ||
shankari marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# values of the selected 1st round label | ||
sel_1st_round_val = user_labels[sel_fl][0] | ||
second_label_ls = list(sel_1st_round_val.keys()) | ||
|
||
# - seccond_round_result: values of the key of selected 1st round label | ||
# e.g.{'0': [{'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'home', 'replaced_mode': 'drove_alone'}, | ||
# 'p': 1.0}], | ||
# '1': [{'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'church', 'replaced_mode': 'drove_alone'}, | ||
# 'p': 0.9333333333333333},{'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'entertainment', | ||
# 'replaced_mode': 'drove_alone'},'p': 0.06666666666666667}]} | ||
# more explanation: '0' and '1' are 2nd round clusters from a bin(1st round cluster) | ||
# cluster '0' contains all user label combinations and probabilities in this cluster | ||
seccond_round_result = user_labels[sel_fl][0] | ||
second_label_ls = list(seccond_round_result.keys()) | ||
if sel_sl not in second_label_ls: | ||
return {} | ||
return [] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. why do you need this? I think you can directly use There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Because Gabriel's function needs a list, I follow the result from There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. this is not about the return type. It is about the need for There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh ok, I see your point. |
||
# values of the selected 2nd round label, wrapped in a list | ||
sel_2nd_round_val = sel_1st_round_val[sel_sl] | ||
sel_2nd_round_val = seccond_round_result[sel_sl] | ||
|
||
return sel_2nd_round_val | ||
|
||
|
@@ -84,9 +111,36 @@ def predict_labels(trip): | |
if __name__ == '__main__': | ||
participant_uuid_obj = list(edb.get_profile_db().find({"install_group": "participant"}, {"user_id": 1, "_id": 0})) | ||
all_users = [u["user_id"] for u in participant_uuid_obj] | ||
|
||
# case 1: the new trip matches a bin from the 1st round and a cluster from the 2nd round | ||
user = all_users[0] | ||
radius = 100 | ||
trips = preprocess.read_data(user) | ||
filter_trips = preprocess.filter_data(trips, radius) | ||
new_trip = [filter_trips[4]] | ||
# result is [{'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'church', 'replaced_mode': 'drove_alone'}, | ||
# 'p': 0.9333333333333333}, {'labels': {'mode_confirm': 'shared_ride', 'purpose_confirm': 'entertainment', | ||
# 'replaced_mode': 'drove_alone'}, 'p': 0.06666666666666667}] | ||
print(predict_labels(new_trip)) | ||
|
||
# case 2: no existing files for the user who has the new trip: | ||
# 1. the user is invalid (< 10 existing fully labeled trips, or < 50% of trips that are fully labeled) | ||
# 2. the user doesn't have common trips | ||
user = all_users[1] | ||
trips = preprocess.read_data(user) | ||
new_trip = [trips[0]] | ||
# result is [] | ||
print(predict_labels(new_trip)) | ||
|
||
# case 3: the new trip is a novel trip (doesn't fall in any 1st round bin) | ||
user = all_users[0] | ||
radius = 100 | ||
trips = preprocess.read_data(user) | ||
filter_trips = preprocess.filter_data(trips, radius) | ||
new_trip = [filter_trips[0]] | ||
# result is [] | ||
print(predict_labels(new_trip)) | ||
|
||
# case 4: the new trip falls in a 1st round bin, but is predicted to be a new cluster in the 2nd round | ||
# result is [] | ||
# no example for now | ||
shankari marked this conversation as resolved.
Show resolved
Hide resolved
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is going to be completely changed in the next commit, so ignoring for now.