check in notebook code #23
evaluation_pipeline.py (new file)

@@ -0,0 +1,180 @@
import emission.analysis.modelling.tour_model.similarity as similarity
import numpy as np
import get_request_percentage as grp
import get_scores as gs
import label_processing as lp
import data_preprocessing as preprocess
import get_tuning_score as tuning
def second_round(first_label_set, first_labels, bin_trips, filter_trips, low, dist_pct, sim, new_labels, track):
    for l in first_label_set:
        # store second round trips data
        second_round_trips = []
        # create a track to store indices and labels for the second round
        second_round_idx_labels = []
        for index, first_label in enumerate(first_labels):
            if first_label == l:
                second_round_trips.append(bin_trips[index])
                second_round_idx_labels.append([index, first_label])
        x = preprocess.extract_features(second_round_trips)

        # We choose single-linkage clustering (see the sketch after this function).
        # See examples and explanations at https://en.wikipedia.org/wiki/Single-linkage_clustering
        # It groups clusters in a bottom-up fashion (agglomerative clustering):
        # at each step it combines the two clusters that contain the closest pair of elements
        # not yet belonging to the same cluster.
        method = 'single'
        # get the second label from the second round of clustering using hierarchical clustering
        second_labels = lp.get_second_labels(x, method, low, dist_pct)
        # concatenate the first label (from the first round) and the second label (from the
        # second round), e.g. first labels [1,1,1] and second labels [1,2,3] give
        # new_labels [11,12,13] (worked example after this function)
        new_labels = lp.get_new_labels(second_labels, second_round_idx_labels, new_labels)
        # change the labels in track with new_labels
        track = lp.change_track_labels(track, new_labels)

    # get request percentage for the subset for the second round
    percentage_second = grp.get_req_pct(new_labels, track, filter_trips, sim)

    # get homogeneity score for the second round
    homo_second = gs.score(bin_trips, new_labels)
    return percentage_second, homo_second
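
One plausible shape for the single-linkage step, written with SciPy. This is a sketch, not the actual lp.get_second_labels implementation: how low and dist_pct combine into a distance cutoff is an assumption here.

# Hypothetical sketch of single-linkage clustering with a distance cutoff.
# The cutoff rule below is an assumption; the real logic lives in
# label_processing.get_second_labels, which is not part of this diff.
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

def single_linkage_sketch(x, low, dist_pct):
    # x: feature matrix, one row per trip
    z = linkage(x, method='single')   # bottom-up merge tree
    merge_dists = z[:, 2]             # distance at each merge step
    # assumed cutoff: a percentile of the merge distances, floored at `low`
    cutoff = max(low, np.percentile(merge_dists, dist_pct * 100))
    # flat cluster labels: trips merged below the cutoff share a label
    return fcluster(z, cutoff, criterion='distance')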
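
And a worked illustration of the label concatenation described in the comments above. The real logic is in lp.get_new_labels; digit-string concatenation is an assumption that matches the [1,1,1] + [1,2,3] -> [11,12,13] example in the comment.

first = [1, 1, 1]   # labels from the first round
second = [1, 2, 3]  # labels from the second round
combined = [int(str(f) + str(s)) for f, s in zip(first, second)]
print(combined)     # [11, 12, 13]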
# we use functions in similarity to build the first round of clustering
def first_round(data, radius):
    sim = similarity.similarity(data, radius)
    filter_trips = sim.data
    sim.bin_data()
    sim.delete_bins()
    bins = sim.bins
    bin_trips = sim.newdata
    return sim, bins, bin_trips, filter_trips
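
A hedged reading of the similarity calls above, inferred only from how they are used in this file (the module internals are not part of this diff):

# Assumed semantics, inferred from usage here, not from the module source:
# - similarity.similarity(data, radius): wraps the trips, exposing .data
# - bin_data(): groups trips whose start/end points fall within `radius` meters
# - delete_bins(): drops bins too small to count as common trips
# - afterwards, .bins holds trip indices per bin and .newdata the surviving trips
sim, bins, bin_trips, filter_trips = first_round(data, radius)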
def get_first_label(bins):
    # get first round labels
    # the labels from the first round are the indices of the bins
    # e.g. for bin 0 [trip1, trip2, trip3], the labels of this bin are [0,0,0]
    first_labels = []
    for b in range(len(bins)):
        for trip in bins[b]:
            first_labels.append(b)
    return first_labels
def get_track(bins, first_labels):
    # create a list idx_labels_track to store indices and labels
    # the indices of the items will stay the same in the new label list after
    # the second round of clustering (worked example after this function)
    # item[0] is the original index of the trip in filter_trips
    # item[1] is the label from the first round of clustering
    idx_labels_track = []
    for curr_bin in bins:  # avoid shadowing the builtin bin()
        for ori_idx in curr_bin:
            idx_labels_track.append([ori_idx])
    # store first round labels in idx_labels_track
    for i in range(len(first_labels)):
        idx_labels_track[i].append(first_labels[i])

    return idx_labels_track
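
As a worked example, with made-up trip indices:

bins = [[4, 7], [2]]                   # two bins of original filter_trips indices
first_labels = get_first_label(bins)   # [0, 0, 1]
track = get_track(bins, first_labels)  # [[4, 0], [7, 0], [2, 1]]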
def init_score():
    # collect request percentage for a user for the first round
    pct_collect_first = []
    # collect homogeneity score for a user for the first round
    homo_collect_first = []
    # collect request percentage for a user for the second round
    pct_collect_second = []
    # collect homogeneity score for a user for the second round
    homo_collect_second = []
    return pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second
def tuning_test(data, radius, pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second,
                coll_tradeoffs, coll_tune_score, tune=None, test=None):

    # run every subset
    for j in range(len(data)):
        sim, bins, bin_trips, filter_trips = first_round(data[j], radius)
        # it is possible that we don't have common trips for tuning or testing
        # bins contain common trip indices
        if len(bins) != 0:
            gs.compare_trip_orders(bins, bin_trips, filter_trips)
            first_labels = get_first_label(bins)
            # new_labels temporarily stores the labels from the first round; later the labels
            # in new_labels will be updated with the labels after two rounds of clustering
            new_labels = first_labels.copy()
            first_label_set = list(set(first_labels))
            track = get_track(bins, first_labels)
            # get request percentage for the subset for the first round
            percentage_first = grp.get_req_pct(new_labels, track, filter_trips, sim)
            # get homogeneity score for the subset for the first round
            homo_first = gs.score(bin_trips, first_labels)
            pct_collect_first.append(percentage_first)
            homo_collect_first.append(homo_first)
[Review comment] the code for each subset can be pulled out.
            if tune:
                # collect tuning scores and parameters
                tune_score = {}

                # grid search over the two trade-off parameters
                # (see the cost estimate after this function)
                for dist_pct in np.arange(0.15, 0.6, 0.02):
                    for low in range(250, 600):
                        percentage_second, homo_second = second_round(first_label_set, first_labels, bin_trips,
                                                                      filter_trips, low, dist_pct, sim,
                                                                      new_labels, track)

                        curr_score = tuning.get_tuning_score(homo_second, percentage_second)
                        # keyed by score, so only the first parameter pair seen
                        # for a given score is kept
                        if curr_score not in tune_score:
                            tune_score[curr_score] = (low, dist_pct, homo_second, percentage_second)

                # max over the dict keys selects the highest tuning score
                best_score = max(tune_score)
                coll_tune_score.append(best_score)
                sel_tradeoffs = tune_score[best_score]

                coll_tradeoffs.append(sel_tradeoffs[0:2])
                homo_collect_second.append(sel_tradeoffs[2])
                pct_collect_second.append(sel_tradeoffs[3])

            if test:
                # reuse the trade-offs selected for this subset during tuning
                low = coll_tradeoffs[j][0]
                dist_pct = coll_tradeoffs[j][1]
                percentage_second, homo_second = second_round(first_label_set, first_labels, bin_trips,
                                                              filter_trips, low, dist_pct, sim,
                                                              new_labels, track)
                homo_collect_second.append(homo_second)
                pct_collect_second.append(percentage_second)
                coll_tune_score = []
        else:
            # no common trips in this subset: record placeholder scores so the lists stay aligned
            percentage_first = 1
            homo_first = 1
            pct_collect_first.append(percentage_first)
            homo_collect_first.append(homo_first)
            coll_tradeoffs.append((0, 0))
            homo_collect_second.append(1)
            pct_collect_second.append(1)
            coll_tune_score.append(None)

    return pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second, coll_tradeoffs, coll_tune_score
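
For a sense of the tuning cost: the nested grid above calls second_round once per (dist_pct, low) pair for each subset, i.e.

import numpy as np
n_calls = len(np.arange(0.15, 0.6, 0.02)) * len(range(250, 600))
print(n_calls)  # 23 * 350 = 8050 second_round calls per subset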
def main(uuid=None):
    user = uuid
    radius = 100
    trips = preprocess.read_data(user)
    filter_trips = preprocess.filter_data(trips, radius)
    tune_idx, test_idx = preprocess.split_data(filter_trips)
    # the indices are deliberately swapped: the bigger part of the data is used
    # for testing and the smaller part for tuning
    tune_data = preprocess.get_subdata(filter_trips, test_idx)
    test_data = preprocess.get_subdata(filter_trips, tune_idx)
    pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second = init_score()
    coll_tune_score = []
    coll_tradeoffs = []
    # tuning run
    pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second, coll_tradeoffs, \
        coll_tune_score = tuning_test(tune_data, radius, pct_collect_first, homo_collect_first, pct_collect_second,
                                      homo_collect_second, coll_tradeoffs, coll_tune_score, tune=True)
    pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second = init_score()
    # test run, reusing the trade-offs selected during tuning
    pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second, coll_tradeoffs, \
        coll_tune_score = tuning_test(test_data, radius, pct_collect_first, homo_collect_first, pct_collect_second,
                                      homo_collect_second, coll_tradeoffs, coll_tune_score, test=True)
    return pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second


if __name__ == '__main__':
    main(uuid=None)
New notebook file

@@ -0,0 +1,161 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "mighty-ukraine",
   "metadata": {},
   "outputs": [],
   "source": [
    "import emission.core.get_database as edb\n",
    "import emission.analysis.modelling.tour_model.similarity as similarity\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import get_request_percentage as grp\n",
    "import get_scores as gs\n",
    "import label_processing as lp\n",
    "import get_users as gu\n",
    "import data_preprocessing as preprocess\n",
    "import get_tuning_score as tuning\n",
    "import evaluation_pipeline as ep\n",
    "import matplotlib.pyplot as plt\n",
    "import get_plot as plot\n",
    "import emission.core.common as ecc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cathedral-pointer",
   "metadata": {},
   "outputs": [],
   "source": [
    "participant_uuid_obj = list(edb.get_profile_db().find({\"install_group\": \"participant\"}, {\"user_id\": 1, \"_id\": 0}))\n",
    "all_users = [u[\"user_id\"] for u in participant_uuid_obj]"
[Review comment on lines +33 to +34] @corinne-hcr The participant UUID check was specific to the CanBikeCO mini-pilot - it is not true for other programs or datasets. I have created a new mongodump of only the participant data; you should load that instead and change this to the call from
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "exotic-livestock",
   "metadata": {},
   "outputs": [],
   "source": [
    "radius = 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "powered-airfare",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# get all/valid user list\n",
    "user_ls, valid_users = gu.get_user_ls(all_users, radius)\n",
    "\n",
    "# collect request percentage for the first or second round (requested trips / total trips) for all users\n",
    "all_percentage_first_tune = []\n",
    "all_percentage_first_test = []\n",
    "all_percentage_second_tune = []\n",
    "all_percentage_second_test = []\n",
    "\n",
    "# collect homogeneity score for the first/second round for all users\n",
    "all_homogeneity_score_first_tune = []\n",
    "all_homogeneity_score_first_test = []\n",
    "all_homogeneity_score_second_tune = []\n",
    "all_homogeneity_score_second_test = []\n",
    "\n",
    "for a in range(len(all_users)):\n",
    "    user = all_users[a]\n",
    "    trips = preprocess.read_data(user)\n",
    "    filter_trips = preprocess.filter_data(trips, radius)\n",
    "    print('user', a + 1, 'filter_trips len', len(filter_trips))\n",
    "\n",
    "    # filter out users that don't have enough valid labeled trips\n",
    "    if not gu.valid_user(filter_trips, trips):\n",
    "        continue\n",
    "    tune_idx, test_idx = preprocess.split_data(filter_trips)\n",
    "\n",
    "    # choose tuning/test set to run the model\n",
    "    # this step will use KFold (5 splits) to split the data into different subsets\n",
    "    # - tune: tuning set\n",
    "    # - test: test set\n",
    "    # Here we use a bigger part of the data for testing and a smaller part for tuning\n",
    "    tune_data = preprocess.get_subdata(filter_trips, test_idx)\n",
    "    test_data = preprocess.get_subdata(filter_trips, tune_idx)\n",
    "\n",
    "    pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second = ep.init_score()\n",
[Review comment] If you have so much state, it seems like it would be better to have a class that encapsulates it.
" \n", | ||
" # collect tuning parameters\n", | ||
" coll_tune_score = []\n", | ||
" coll_tradeoffs = []\n", | ||
" # tuning\n", | ||
" pct_collect_first,homo_collect_first,pct_collect_second,homo_collect_second,coll_tradeoffs, coll_tune_score= ep.tuning_test(tune_data,radius,pct_collect_first,homo_collect_first,pct_collect_second,homo_collect_second,coll_tradeoffs,coll_tune_score,tune = True)\n", | ||
shankari marked this conversation as resolved.
Show resolved
Hide resolved
|
    "\n",
    "    pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second = ep.init_score()\n",
    "    # testing\n",
    "    pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second, coll_tradeoffs, coll_tune_score = ep.tuning_test(test_data, radius, pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second, coll_tradeoffs, coll_tune_score, test=True)\n",
[Review comment] why do we return …
[Reply] At this point, I am not sure if we still need the tradeoffs after the clustering, so I keep it.
"\n", | ||
" print('colle_tune_score ', coll_tune_score)\n", | ||
" print('coll_tradeoffs',coll_tradeoffs)\n", | ||
"\n", | ||
" # collect request percentage for the first round for all users\n", | ||
" all_percentage_first_test.append(pct_collect_first)\n", | ||
"\n", | ||
" # collect homogeneity score for the first round for all users\n", | ||
" all_homogeneity_score_first_test.append(homo_collect_first)\n", | ||
"\n", | ||
" # collect request percentage for the second round for all users\n", | ||
" all_percentage_second_test.append(pct_collect_second)\n", | ||
"\n", | ||
" # collect homogeneity score for the second round for all users\n", | ||
" all_homogeneity_score_second_test.append(homo_collect_second)\n", | ||
"\n", | ||
"print('all_percentage_first_test', all_percentage_first_test)\n", | ||
"print('all_homogeneity_score_first_test', all_homogeneity_score_first_test)\n", | ||
"print('all_percentage_second_test', all_percentage_second_test)\n", | ||
"print('all_homogeneity_score_second_test', all_homogeneity_score_second_test)\n", | ||
"\n", | ||
"# plot evaluation scatter for the first round\n", | ||
"plt.figure()\n", | ||
"plot.get_scatter(all_percentage_first_test, all_homogeneity_score_first_test, valid_users)\n", | ||
"\n", | ||
"# plot evaluation scatter for the second round\n", | ||
"plt.figure()\n", | ||
"plot.get_scatter(all_percentage_second_test, all_homogeneity_score_second_test, valid_users)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "variable-faculty", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
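
The notebook comment above says preprocess.split_data uses KFold with 5 splits, and tuning_test then iterates over the resulting subsets. A rough sketch of what such a split could look like; the actual helper's logic and return order are assumptions:

# Hypothetical sketch of a KFold-based split; the real logic lives in
# data_preprocessing.split_data, which is not part of this diff.
from sklearn.model_selection import KFold

def split_data_sketch(filter_trips):
    kf = KFold(n_splits=5, shuffle=True)
    big_sides, small_sides = [], []
    for big_idx, small_idx in kf.split(filter_trips):
        big_sides.append(big_idx)      # ~4/5 of the trips per fold
        small_sides.append(small_idx)  # ~1/5 of the trips per fold
    # hypothetical (tune_idx, test_idx); the callers above swap them so the
    # bigger part of the data ends up being used for testing
    return big_sides, small_sides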
get_tuning_score.py (new file)

@@ -0,0 +1,8 @@
# This function is used for tuning.
# It aims to find the best pair of trade-offs:
# - homo_second: the homogeneity score after the second round of clustering
# - percentage_second: the user label request percentage
def get_tuning_score(homo_second, percentage_second):
    # equal weight on homogeneity and on avoiding label requests
    curr_score = 0.5 * homo_second + 0.5 * (1 - percentage_second)
    curr_score = round(curr_score, 3)  # keep three decimal places
    return curr_score
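
A quick worked example with made-up values:

get_tuning_score(homo_second=0.9, percentage_second=0.3)
# 0.5 * 0.9 + 0.5 * (1 - 0.3) = 0.45 + 0.35 -> 0.8
# higher homogeneity and a lower request percentage both push the score up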
[Review comment] Do you really need 9 mandatory arguments to this function? What do you need sim for, for example, if you already have filter_trips and track?

Do you really need both first_label_set and first_labels? Why can you not get first_label_set from first_labels in this function?

At a high level, if you get to so many arguments, I will recommend that you create a class to pass them in. But given our time constraints, I am not going to push for this now.
[Reply] sim is used to find out the trips below the cutoff in the 1st round. Those trips didn't go through further analysis except the 1st round, so I have to use sim.

I don't understand this. Maybe I can reduce 1 or 2 arguments. But given that there are still many arguments I have to pass in, what is the difference between writing a pure function to accept them and writing a class?
[Review comment] You can write a class/struct to encapsulate them so that people don't have to write really long lines passing them back and forth. You can have a single class that you both pass in and return and that has the current state of the algorithm. Makes everything a lot less verbose.
[Reply] I have created a class for the second round. See second_round_of_clustering.py. Although it is not a perfect one, it works.
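
To illustrate the reviewer's suggestion (this is not code from the PR; all names here are hypothetical except those mirrored from tuning_test):

from dataclasses import dataclass, field

# Hypothetical state holder: one object passed in and returned instead of
# threading 9 arguments through second_round by hand.
@dataclass
class ClusteringState:
    sim: object = None  # similarity object from the first round
    bins: list = field(default_factory=list)
    bin_trips: list = field(default_factory=list)
    filter_trips: list = field(default_factory=list)
    first_labels: list = field(default_factory=list)
    new_labels: list = field(default_factory=list)
    track: list = field(default_factory=list)

# second_round(state, low, dist_pct) could then replace the long call,
# updating state.new_labels and state.track in place and keeping call
# sites short.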