Skip to content

Commit

Permalink
check in the changes in the notebook, have put the original clusterin…
Browse files Browse the repository at this point in the history
…g code in evaluation_pipeline
  • Loading branch information
corinne-hcr committed Jul 19, 2021
1 parent 1fba74a commit bfd6ba6
Show file tree
Hide file tree
Showing 2 changed files with 157 additions and 114 deletions.
104 changes: 66 additions & 38 deletions tour_model_eval/evaluation_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import emission.analysis.modelling.tour_model.similarity as similarity
import numpy as np
import emission.core.get_database as edb
import emission.analysis.modelling.tour_model.get_request_percentage as grp
import emission.analysis.modelling.tour_model.get_scores as gs
import emission.analysis.modelling.tour_model.label_processing as lp
import emission.analysis.modelling.tour_model.data_preprocessing as preprocess
import second_round_of_clustering as sr
import emission.analysis.modelling.tour_model.second_round_of_clustering as sr
import emission.analysis.modelling.tour_model.get_users as gu
import pandas as pd
import jsonpickle as jpickle

def second_round(bin_trips,filter_trips,first_labels,track,low,dist_pct,sim,kmeans):
sec = sr.SecondRoundOfClustering(bin_trips,first_labels)
Expand Down Expand Up @@ -122,43 +125,68 @@ def test(data,radius,low,dist_pct,kmeans):
return homo_first,percentage_first,homo_second,percentage_second,scores


def main(uuid=None):
user = uuid
def main(all_users):
radius = 100
df = pd.DataFrame(columns=['user','user_id','percentage of 1st round','homogeneity socre of 1st round','percentage of 2nd round',
'homogeneity socre of 2nd roun','scores','lower boundary','distance percentage'])
trips = preprocess.read_data(user)
filter_trips = preprocess.filter_data(trips, radius)
tune_idx, test_idx = preprocess.split_data(filter_trips)
tune_data = preprocess.get_subdata(filter_trips, test_idx)
test_data = preprocess.get_subdata(filter_trips, tune_idx)

# tune data
for j in range(len(tune_data)):
low, dist_pct = tune(tune_data[j], radius, kmeans=False)
df.loc[j,'lower boundary']=low
df.loc[j,'distance percentage']=dist_pct


# testing
for k in range(len(test_data)):
low = df.loc[k,'lower boundary']
dist_pct = df.loc[k,'distance percentage']

# for testing, we add kmeans to re-build the model
homo_first, percentage_first, homo_second, percentage_second, scores = test(test_data[k],radius,low,
dist_pct,kmeans=True)

df.loc[k, 'percentage of 1st round'] = percentage_first
df.loc[k, 'homogeneity socre of 1st round'] = homo_first
df.loc[k, 'percentage of 2nd round'] = percentage_second
df.loc[k, 'homogeneity socre of 2nd round'] = homo_second
df.loc[k, 'scores'] = scores
df['user_id'] = user
df['user'] = 'user0'

filename = "user_" + str(user) + ".csv"
df.to_csv(filename, index=True, index_label='split')
# get all/valid user list
user_ls, valid_users = gu.get_user_ls(all_users, radius)
all_filename = []
for a in range(len(all_users)):
user = all_users[a]
df = pd.DataFrame(columns=['user','user_id','percentage of 1st round','homogeneity socre of 1st round',
'percentage of 2nd round','homogeneity socre of 2nd roun','scores','lower boundary',
'distance percentage'])
trips = preprocess.read_data(user)
filter_trips = preprocess.filter_data(trips, radius)
# filter out users that don't have enough valid labeled trips
if not gu.valid_user(filter_trips, trips):
continue
tune_idx, test_idx = preprocess.split_data(filter_trips)
# choose tuning/test set to run the model
# this step will use KFold (5 splits) to split the data into different subsets
# - tune: tuning set
# - test: test set
# Here we user a bigger part of the data for testing and a smaller part for tuning
tune_data = preprocess.get_subdata(filter_trips, test_idx)
test_data = preprocess.get_subdata(filter_trips, tune_idx)

# tune data
for j in range(len(tune_data)):
# for tuning, we don't add kmeans for re-clustering. We just need to get tuning parameters
# - low: the lower boundary of the dendrogram. If the final distance of the dendrogram is lower than "low",
# this bin no need to be re-clutered.
# - dist_pct: the higher boundary of the dendrogram. If the final distance is higher than "low",
# the cutoff of the dendrogram is (the final distance of the dendrogram * dist_pct)
low, dist_pct = tune(tune_data[j], radius, kmeans=False)
df.loc[j,'lower boundary']=low
df.loc[j,'distance percentage']=dist_pct

# testing
for k in range(len(test_data)):
low = df.loc[k,'lower boundary']
dist_pct = df.loc[k,'distance percentage']

# for testing, we add kmeans to re-build the model
homo_first, percentage_first, homo_second, percentage_second, scores = test(test_data[k],radius,low,
dist_pct,kmeans=True)
df.loc[k, 'percentage of 1st round'] = percentage_first
df.loc[k, 'homogeneity socre of 1st round'] = homo_first
df.loc[k, 'percentage of 2nd round'] = percentage_second
df.loc[k, 'homogeneity socre of 2nd round'] = homo_second
df.loc[k, 'scores'] = scores
df['user_id'] = user
df['user']='user'+str(a+1)

filename = "user_" + str(user) + ".csv"
all_filename.append(filename)
df.to_csv(filename, index=True, index_label='split')

# collect filename in a file, use it to plot the scatter
collect_filename = jpickle.dumps(all_filename)
with open("collect_filename", "w") as fd:
fd.write(collect_filename)


if __name__ == '__main__':
main(uuid=None)
participant_uuid_obj = list(edb.get_profile_db().find({"install_group": "participant"}, {"user_id": 1, "_id": 0}))
all_users = [u["user_id"] for u in participant_uuid_obj]
main(all_users)
Loading

0 comments on commit bfd6ba6

Please sign in to comment.