check in the changes in the notebook, have put the original clusterin…

…g code in evaluation_pipeline
e-mission · Jul 19, 2021 · bfd6ba6 · bfd6ba6
1 parent 1fba74a
commit bfd6ba6
Show file tree

Hide file tree

Showing 2 changed files with 157 additions and 114 deletions.
diff --git a/tour_model_eval/evaluation_pipeline.py b/tour_model_eval/evaluation_pipeline.py
@@ -1,11 +1,14 @@
 import emission.analysis.modelling.tour_model.similarity as similarity
 import numpy as np
+import emission.core.get_database as edb
 import emission.analysis.modelling.tour_model.get_request_percentage as grp
 import emission.analysis.modelling.tour_model.get_scores as gs
 import emission.analysis.modelling.tour_model.label_processing as lp
 import emission.analysis.modelling.tour_model.data_preprocessing as preprocess
-import second_round_of_clustering as sr
+import emission.analysis.modelling.tour_model.second_round_of_clustering as sr
+import emission.analysis.modelling.tour_model.get_users as gu
 import pandas as pd
+import jsonpickle as jpickle
 
 def second_round(bin_trips,filter_trips,first_labels,track,low,dist_pct,sim,kmeans):
     sec = sr.SecondRoundOfClustering(bin_trips,first_labels)
@@ -122,43 +125,68 @@ def test(data,radius,low,dist_pct,kmeans):
     return homo_first,percentage_first,homo_second,percentage_second,scores
 
 
-def main(uuid=None):
-    user = uuid
+def main(all_users):
     radius = 100
-    df = pd.DataFrame(columns=['user','user_id','percentage of 1st round','homogeneity socre of 1st round','percentage of 2nd round',
-                              'homogeneity socre of 2nd roun','scores','lower boundary','distance percentage'])
-    trips = preprocess.read_data(user)
-    filter_trips = preprocess.filter_data(trips, radius)
-    tune_idx, test_idx = preprocess.split_data(filter_trips)
-    tune_data = preprocess.get_subdata(filter_trips, test_idx)
-    test_data = preprocess.get_subdata(filter_trips, tune_idx)
-
-    # tune data
-    for j in range(len(tune_data)):
-        low, dist_pct = tune(tune_data[j], radius, kmeans=False)
-        df.loc[j,'lower boundary']=low
-        df.loc[j,'distance percentage']=dist_pct
-
-
-    # testing
-    for k in range(len(test_data)):
-        low = df.loc[k,'lower boundary']
-        dist_pct = df.loc[k,'distance percentage']
-
-        # for testing, we add kmeans to re-build the model
-        homo_first, percentage_first, homo_second, percentage_second, scores = test(test_data[k],radius,low,
-                                                                                    dist_pct,kmeans=True)
-
-        df.loc[k, 'percentage of 1st round'] = percentage_first
-        df.loc[k, 'homogeneity socre of 1st round'] = homo_first
-        df.loc[k, 'percentage of 2nd round'] = percentage_second
-        df.loc[k, 'homogeneity socre of 2nd round'] = homo_second
-        df.loc[k, 'scores'] = scores
-        df['user_id'] = user
-        df['user'] = 'user0'
-
-    filename = "user_" + str(user) + ".csv"
-    df.to_csv(filename, index=True, index_label='split')
+    # get all/valid user list
+    user_ls, valid_users = gu.get_user_ls(all_users, radius)
+    all_filename = []
+    for a in range(len(all_users)):
+        user = all_users[a]
+        df = pd.DataFrame(columns=['user','user_id','percentage of 1st round','homogeneity socre of 1st round',
+                                   'percentage of 2nd round','homogeneity socre of 2nd roun','scores','lower boundary',
+                                   'distance percentage'])
+        trips = preprocess.read_data(user)
+        filter_trips = preprocess.filter_data(trips, radius)
+        # filter out users that don't have enough valid labeled trips
+        if not gu.valid_user(filter_trips, trips):
+            continue
+        tune_idx, test_idx = preprocess.split_data(filter_trips)
+        # choose tuning/test set to run the model
+        # this step will use KFold (5 splits) to split the data into different subsets
+        # - tune: tuning set
+        # - test: test set
+        # Here we user a bigger part of the data for testing and a smaller part for tuning
+        tune_data = preprocess.get_subdata(filter_trips, test_idx)
+        test_data = preprocess.get_subdata(filter_trips, tune_idx)
+
+        # tune data
+        for j in range(len(tune_data)):
+            # for tuning, we don't add kmeans for re-clustering. We just need to get tuning parameters
+            # - low: the lower boundary of the dendrogram. If the final distance of the dendrogram is lower than "low",
+            # this bin no need to be re-clutered.
+            # - dist_pct: the higher boundary of the dendrogram. If the final distance is higher than "low",
+            # the cutoff of the dendrogram is (the final distance of the dendrogram * dist_pct)
+            low, dist_pct = tune(tune_data[j], radius, kmeans=False)
+            df.loc[j,'lower boundary']=low
+            df.loc[j,'distance percentage']=dist_pct
+
+        # testing
+        for k in range(len(test_data)):
+            low = df.loc[k,'lower boundary']
+            dist_pct = df.loc[k,'distance percentage']
+
+            # for testing, we add kmeans to re-build the model
+            homo_first, percentage_first, homo_second, percentage_second, scores = test(test_data[k],radius,low,
+                                                                                        dist_pct,kmeans=True)
+            df.loc[k, 'percentage of 1st round'] = percentage_first
+            df.loc[k, 'homogeneity socre of 1st round'] = homo_first
+            df.loc[k, 'percentage of 2nd round'] = percentage_second
+            df.loc[k, 'homogeneity socre of 2nd round'] = homo_second
+            df.loc[k, 'scores'] = scores
+            df['user_id'] = user
+            df['user']='user'+str(a+1)
+
+        filename = "user_" + str(user) + ".csv"
+        all_filename.append(filename)
+        df.to_csv(filename, index=True, index_label='split')
+
+    # collect filename in a file, use it to plot the scatter
+    collect_filename = jpickle.dumps(all_filename)
+    with open("collect_filename", "w") as fd:
+        fd.write(collect_filename)
+
 
 if __name__ == '__main__':
-    main(uuid=None)
+    participant_uuid_obj = list(edb.get_profile_db().find({"install_group": "participant"}, {"user_id": 1, "_id": 0}))
+    all_users = [u["user_id"] for u in participant_uuid_obj]
+    main(all_users)