check in notebook code #23
evaluation_pipeline.py (new file)

@@ -0,0 +1,180 @@
import emission.analysis.modelling.tour_model.similarity as similarity
import numpy as np
import get_request_percentage as grp
import get_scores as gs
import label_processing as lp
import data_preprocessing as preprocess
import get_tuning_score as tuning
def second_round(first_label_set, first_labels, bin_trips, filter_trips, low, dist_pct, sim, new_labels, track):
    for l in first_label_set:
        # store second round trips data
        second_round_trips = []
        # create a track to store indices and labels for the second round
        second_round_idx_labels = []
        for index, first_label in enumerate(first_labels):
            if first_label == l:
                second_round_trips.append(bin_trips[index])
                second_round_idx_labels.append([index, first_label])
        x = preprocess.extract_features(second_round_trips)

        # We choose single-linkage clustering (see the sketch after this function).
        # See examples and explanations at https://en.wikipedia.org/wiki/Single-linkage_clustering
        # It groups clusters in a bottom-up fashion (agglomerative clustering):
        # at each step it combines the two clusters that contain the closest pair of elements
        # not yet belonging to the same cluster.
        method = 'single'
        # get the second label from the second round of clustering using hierarchical clustering
        second_labels = lp.get_second_labels(x, method, low, dist_pct)
        # concatenate the first label (from the first round) and the second label (from the
        # second round), e.g. first labels [1,1,1] and second labels [1,2,3] give
        # new_labels [11,12,13] (worked example after this function)
        new_labels = lp.get_new_labels(second_labels, second_round_idx_labels, new_labels)
        # change the labels in track with new_labels
        track = lp.change_track_labels(track, new_labels)

    # get request percentage for the subset for the second round
    percentage_second = grp.get_req_pct(new_labels, track, filter_trips, sim)

    # get homogeneity score for the second round
    homo_second = gs.score(bin_trips, new_labels)
    return percentage_second, homo_second
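
One plausible shape for the single-linkage step, written with SciPy. This is a sketch, not the actual lp.get_second_labels implementation: how low and dist_pct combine into a distance cutoff is an assumption here.

# Hypothetical sketch of single-linkage clustering with a distance cutoff.
# The cutoff rule below is an assumption; the real logic lives in
# label_processing.get_second_labels, which is not part of this diff.
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

def single_linkage_sketch(x, low, dist_pct):
    # x: feature matrix, one row per trip
    z = linkage(x, method='single')   # bottom-up merge tree
    merge_dists = z[:, 2]             # distance at each merge step
    # assumed cutoff: a percentile of the merge distances, floored at `low`
    cutoff = max(low, np.percentile(merge_dists, dist_pct * 100))
    # flat cluster labels: trips merged below the cutoff share a label
    return fcluster(z, cutoff, criterion='distance')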
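
And a worked illustration of the label concatenation described in the comments above. The real logic is in lp.get_new_labels; digit-string concatenation is an assumption that matches the [1,1,1] + [1,2,3] -> [11,12,13] example in the comment.

first = [1, 1, 1]   # labels from the first round
second = [1, 2, 3]  # labels from the second round
combined = [int(str(f) + str(s)) for f, s in zip(first, second)]
print(combined)     # [11, 12, 13]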
# we use functions in similarity to build the first round of clustering
def first_round(data, radius):
    sim = similarity.similarity(data, radius)
    filter_trips = sim.data
    sim.bin_data()
    sim.delete_bins()
    bins = sim.bins
    bin_trips = sim.newdata
    return sim, bins, bin_trips, filter_trips
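
A hedged reading of the similarity calls above, inferred only from how they are used in this file (the module internals are not part of this diff):

# Assumed semantics, inferred from usage here, not from the module source:
# - similarity.similarity(data, radius): wraps the trips, exposing .data
# - bin_data(): groups trips whose start/end points fall within `radius` meters
# - delete_bins(): drops bins too small to count as common trips
# - afterwards, .bins holds trip indices per bin and .newdata the surviving trips
sim, bins, bin_trips, filter_trips = first_round(data, radius)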
def get_first_label(bins):
    # get first round labels
    # the labels from the first round are the indices of the bins
    # e.g. for bin 0 [trip1, trip2, trip3], the labels of this bin are [0,0,0]
    first_labels = []
    for b in range(len(bins)):
        for trip in bins[b]:
            first_labels.append(b)
    return first_labels
def get_track(bins, first_labels):
    # create a list idx_labels_track to store indices and labels
    # the indices of the items will stay the same in the new label list after
    # the second round of clustering (worked example after this function)
    # item[0] is the original index of the trip in filter_trips
    # item[1] is the label from the first round of clustering
    idx_labels_track = []
    for curr_bin in bins:  # avoid shadowing the builtin bin()
        for ori_idx in curr_bin:
            idx_labels_track.append([ori_idx])
    # store first round labels in idx_labels_track
    for i in range(len(first_labels)):
        idx_labels_track[i].append(first_labels[i])

    return idx_labels_track
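
As a worked example, with made-up trip indices:

bins = [[4, 7], [2]]                   # two bins of original filter_trips indices
first_labels = get_first_label(bins)   # [0, 0, 1]
track = get_track(bins, first_labels)  # [[4, 0], [7, 0], [2, 1]]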
def init_score():
    # collect request percentage for a user for the first round
    pct_collect_first = []
    # collect homogeneity score for a user for the first round
    homo_collect_first = []
    # collect request percentage for a user for the second round
    pct_collect_second = []
    # collect homogeneity score for a user for the second round
    homo_collect_second = []
    return pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second
def tuning_test(data, radius, pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second,
                coll_tradeoffs, coll_tune_score, tune=None, test=None):

    # run every subset
    for j in range(len(data)):
        sim, bins, bin_trips, filter_trips = first_round(data[j], radius)
        # it is possible that we don't have common trips for tuning or testing
        # bins contain common trip indices
        if len(bins) != 0:
            gs.compare_trip_orders(bins, bin_trips, filter_trips)
            first_labels = get_first_label(bins)
            # new_labels temporarily stores the labels from the first round; later the labels
            # in new_labels will be updated with the labels after two rounds of clustering
            new_labels = first_labels.copy()
            first_label_set = list(set(first_labels))
            track = get_track(bins, first_labels)
            # get request percentage for the subset for the first round
            percentage_first = grp.get_req_pct(new_labels, track, filter_trips, sim)
            # get homogeneity score for the subset for the first round
            homo_first = gs.score(bin_trips, first_labels)
            pct_collect_first.append(percentage_first)
            homo_collect_first.append(homo_first)
[Review comment] the code for each subset can be pulled out.
            if tune:
                # collect tuning scores and parameters
                tune_score = {}

                # grid search over the two trade-off parameters
                # (see the cost estimate after this function)
                for dist_pct in np.arange(0.15, 0.6, 0.02):
                    for low in range(250, 600):
                        percentage_second, homo_second = second_round(first_label_set, first_labels, bin_trips,
                                                                      filter_trips, low, dist_pct, sim,
                                                                      new_labels, track)

                        curr_score = tuning.get_tuning_score(homo_second, percentage_second)
                        # keyed by score, so only the first parameter pair seen
                        # for a given score is kept
                        if curr_score not in tune_score:
                            tune_score[curr_score] = (low, dist_pct, homo_second, percentage_second)

                # max over the dict keys selects the highest tuning score
                best_score = max(tune_score)
                coll_tune_score.append(best_score)
                sel_tradeoffs = tune_score[best_score]

                coll_tradeoffs.append(sel_tradeoffs[0:2])
                homo_collect_second.append(sel_tradeoffs[2])
                pct_collect_second.append(sel_tradeoffs[3])

            if test:
                # reuse the trade-offs selected for this subset during tuning
                low = coll_tradeoffs[j][0]
                dist_pct = coll_tradeoffs[j][1]
                percentage_second, homo_second = second_round(first_label_set, first_labels, bin_trips,
                                                              filter_trips, low, dist_pct, sim,
                                                              new_labels, track)
                homo_collect_second.append(homo_second)
                pct_collect_second.append(percentage_second)
                coll_tune_score = []
        else:
            # no common trips in this subset: record placeholder scores so the lists stay aligned
            percentage_first = 1
            homo_first = 1
            pct_collect_first.append(percentage_first)
            homo_collect_first.append(homo_first)
            coll_tradeoffs.append((0, 0))
            homo_collect_second.append(1)
            pct_collect_second.append(1)
            coll_tune_score.append(None)

    return pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second, coll_tradeoffs, coll_tune_score
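
For a sense of the tuning cost: the nested grid above calls second_round once per (dist_pct, low) pair for each subset, i.e.

import numpy as np
n_calls = len(np.arange(0.15, 0.6, 0.02)) * len(range(250, 600))
print(n_calls)  # 23 * 350 = 8050 second_round calls per subset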
def main(uuid=None):
    user = uuid
    radius = 100
    trips = preprocess.read_data(user)
    filter_trips = preprocess.filter_data(trips, radius)
    tune_idx, test_idx = preprocess.split_data(filter_trips)
    # the indices are deliberately swapped: the bigger part of the data is used
    # for testing and the smaller part for tuning
    tune_data = preprocess.get_subdata(filter_trips, test_idx)
    test_data = preprocess.get_subdata(filter_trips, tune_idx)
    pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second = init_score()
    coll_tune_score = []
    coll_tradeoffs = []
    # tuning run
    pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second, coll_tradeoffs, \
        coll_tune_score = tuning_test(tune_data, radius, pct_collect_first, homo_collect_first, pct_collect_second,
                                      homo_collect_second, coll_tradeoffs, coll_tune_score, tune=True)
    pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second = init_score()
    # test run, reusing the trade-offs selected during tuning
    pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second, coll_tradeoffs, \
        coll_tune_score = tuning_test(test_data, radius, pct_collect_first, homo_collect_first, pct_collect_second,
                                      homo_collect_second, coll_tradeoffs, coll_tune_score, test=True)
    return pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second


if __name__ == '__main__':
    main(uuid=None)
New notebook file

@@ -0,0 +1,161 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "mighty-ukraine",
   "metadata": {},
   "outputs": [],
   "source": [
    "import emission.core.get_database as edb\n",
    "import emission.analysis.modelling.tour_model.similarity as similarity\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import get_request_percentage as grp\n",
    "import get_scores as gs\n",
    "import label_processing as lp\n",
    "import get_users as gu\n",
    "import data_preprocessing as preprocess\n",
    "import get_tuning_score as tuning\n",
    "import evaluation_pipeline as ep\n",
    "import matplotlib.pyplot as plt\n",
    "import get_plot as plot\n",
    "import emission.core.common as ecc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cathedral-pointer",
   "metadata": {},
   "outputs": [],
   "source": [
    "participant_uuid_obj = list(edb.get_profile_db().find({\"install_group\": \"participant\"}, {\"user_id\": 1, \"_id\": 0}))\n",
    "all_users = [u[\"user_id\"] for u in participant_uuid_obj]"
[Review comment on lines +33 to +34] @corinne-hcr The participant UUID check was specific to the CanBikeCO mini-pilot - it is not true for other programs or datasets. I have created a new mongodump of only the participant data; you should load that instead and change this to the call from
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "exotic-livestock",
   "metadata": {},
   "outputs": [],
   "source": [
    "radius = 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "powered-airfare",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# get all/valid user list\n",
    "user_ls, valid_users = gu.get_user_ls(all_users, radius)\n",
    "\n",
    "# collect request percentage for the first or second round (requested trips / total trips) for all users\n",
    "all_percentage_first_tune = []\n",
    "all_percentage_first_test = []\n",
    "all_percentage_second_tune = []\n",
    "all_percentage_second_test = []\n",
    "\n",
    "# collect homogeneity score for the first/second round for all users\n",
    "all_homogeneity_score_first_tune = []\n",
    "all_homogeneity_score_first_test = []\n",
    "all_homogeneity_score_second_tune = []\n",
    "all_homogeneity_score_second_test = []\n",
    "\n",
    "for a in range(len(all_users)):\n",
    "    user = all_users[a]\n",
    "    trips = preprocess.read_data(user)\n",
    "    filter_trips = preprocess.filter_data(trips, radius)\n",
    "    print('user', a + 1, 'filter_trips len', len(filter_trips))\n",
    "\n",
    "    # filter out users that don't have enough valid labeled trips\n",
    "    if not gu.valid_user(filter_trips, trips):\n",
    "        continue\n",
    "    tune_idx, test_idx = preprocess.split_data(filter_trips)\n",
    "\n",
    "    # choose tuning/test set to run the model\n",
    "    # this step will use KFold (5 splits) to split the data into different subsets\n",
    "    # - tune: tuning set\n",
    "    # - test: test set\n",
    "    # Here we use a bigger part of the data for testing and a smaller part for tuning\n",
    "    tune_data = preprocess.get_subdata(filter_trips, test_idx)\n",
    "    test_data = preprocess.get_subdata(filter_trips, tune_idx)\n",
    "\n",
    "    pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second = ep.init_score()\n",
[Review comment] If you have so much state, it seems like it would be better to have a class that encapsulates it.
" \n", | ||
" # collect tuning parameters\n", | ||
" coll_tune_score = []\n", | ||
" coll_tradeoffs = []\n", | ||
" # tuning\n", | ||
" pct_collect_first,homo_collect_first,pct_collect_second,homo_collect_second,coll_tradeoffs, coll_tune_score= ep.tuning_test(tune_data,radius,pct_collect_first,homo_collect_first,pct_collect_second,homo_collect_second,coll_tradeoffs,coll_tune_score,tune = True)\n", | ||
shankari marked this conversation as resolved.
Show resolved
Hide resolved
|
    "\n",
    "    pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second = ep.init_score()\n",
    "    # testing\n",
    "    pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second, coll_tradeoffs, coll_tune_score = ep.tuning_test(test_data, radius, pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second, coll_tradeoffs, coll_tune_score, test=True)\n",
[Review comment] why do we return …
[Reply] At this point, I am not sure if we still need the tradeoffs after the clustering, so I keep it.
"\n", | ||
" print('colle_tune_score ', coll_tune_score)\n", | ||
" print('coll_tradeoffs',coll_tradeoffs)\n", | ||
"\n", | ||
" # collect request percentage for the first round for all users\n", | ||
" all_percentage_first_test.append(pct_collect_first)\n", | ||
"\n", | ||
" # collect homogeneity score for the first round for all users\n", | ||
" all_homogeneity_score_first_test.append(homo_collect_first)\n", | ||
"\n", | ||
" # collect request percentage for the second round for all users\n", | ||
" all_percentage_second_test.append(pct_collect_second)\n", | ||
"\n", | ||
" # collect homogeneity score for the second round for all users\n", | ||
" all_homogeneity_score_second_test.append(homo_collect_second)\n", | ||
"\n", | ||
"print('all_percentage_first_test', all_percentage_first_test)\n", | ||
"print('all_homogeneity_score_first_test', all_homogeneity_score_first_test)\n", | ||
"print('all_percentage_second_test', all_percentage_second_test)\n", | ||
"print('all_homogeneity_score_second_test', all_homogeneity_score_second_test)\n", | ||
"\n", | ||
"# plot evaluation scatter for the first round\n", | ||
"plt.figure()\n", | ||
"plot.get_scatter(all_percentage_first_test, all_homogeneity_score_first_test, valid_users)\n", | ||
"\n", | ||
"# plot evaluation scatter for the second round\n", | ||
"plt.figure()\n", | ||
"plot.get_scatter(all_percentage_second_test, all_homogeneity_score_second_test, valid_users)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "variable-faculty", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
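
The notebook comment above says preprocess.split_data uses KFold with 5 splits, and tuning_test then iterates over the resulting subsets. A rough sketch of what such a split could look like; the actual helper's logic and return order are assumptions:

# Hypothetical sketch of a KFold-based split; the real logic lives in
# data_preprocessing.split_data, which is not part of this diff.
from sklearn.model_selection import KFold

def split_data_sketch(filter_trips):
    kf = KFold(n_splits=5, shuffle=True)
    big_sides, small_sides = [], []
    for big_idx, small_idx in kf.split(filter_trips):
        big_sides.append(big_idx)      # ~4/5 of the trips per fold
        small_sides.append(small_idx)  # ~1/5 of the trips per fold
    # hypothetical (tune_idx, test_idx); the callers above swap them so the
    # bigger part of the data ends up being used for testing
    return big_sides, small_sides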
get_tuning_score.py (new file)

@@ -0,0 +1,8 @@
# This function is used for tuning.
# It aims to find the best pair of trade-offs:
# - homo_second: the homogeneity score after the second round of clustering
# - percentage_second: the user label request percentage
def get_tuning_score(homo_second, percentage_second):
    # equal weight on homogeneity and on avoiding label requests
    curr_score = 0.5 * homo_second + 0.5 * (1 - percentage_second)
    curr_score = round(curr_score, 3)  # keep three decimal places
    return curr_score
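
A quick worked example with made-up values:

get_tuning_score(homo_second=0.9, percentage_second=0.3)
# 0.5 * 0.9 + 0.5 * (1 - 0.3) = 0.45 + 0.35 -> 0.8
# higher homogeneity and a lower request percentage both push the score up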
[Review comment] Do you really need 9 mandatory arguments to this function? What do you need sim for, for example, if you already have filter_trips and track?

Do you really need both first_label_set and first_labels? Why can you not get first_label_set from first_labels in this function?

At a high level, if you get to so many arguments, I will recommend that you create a class to pass them in. But given our time constraints, I am not going to push for this now.
[Reply] sim is used to find out the trips below the cutoff in the 1st round. Those trips didn't go through further analysis except the 1st round, so I have to use sim.

I don't understand this. Maybe I can reduce 1 or 2 arguments. But given that there are still many arguments I have to pass in, what is the difference between writing a pure function to accept them and writing a class?
[Review comment] You can write a class/struct to encapsulate them so that people don't have to write really long lines passing them back and forth. You can have a single class that you both pass in and return and that has the current state of the algorithm. Makes everything a lot less verbose.
[Reply] I have created a class for the second round. See second_round_of_clustering.py. Although it is not a perfect one, it works.
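
To illustrate the reviewer's suggestion (this is not code from the PR; all names here are hypothetical except those mirrored from tuning_test):

from dataclasses import dataclass, field

# Hypothetical state holder: one object passed in and returned instead of
# threading 9 arguments through second_round by hand.
@dataclass
class ClusteringState:
    sim: object = None  # similarity object from the first round
    bins: list = field(default_factory=list)
    bin_trips: list = field(default_factory=list)
    filter_trips: list = field(default_factory=list)
    first_labels: list = field(default_factory=list)
    new_labels: list = field(default_factory=list)
    track: list = field(default_factory=list)

# second_round(state, low, dist_pct) could then replace the long call,
# updating state.new_labels and state.track in place and keeping call
# sites short.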