Merge pull request #28 from shankari/tune_clustering_params

Finally compare dataset characteristics against each other
e-mission · Jun 7, 2022 · b8368d9 · b8368d9
2 parents 37839c8 + 93590d8
commit b8368d9
Show file tree

Hide file tree

Showing 17 changed files with 9,666 additions and 6 deletions.
diff --git a/tour_model_eval/Compare user mode mapping effect with outputs.ipynb b/tour_model_eval/Compare user mode mapping effect with outputs.ipynb
diff --git a/tour_model_eval/Evaluate sim wrt filtration and different radii - unrolled.ipynb b/tour_model_eval/Evaluate sim wrt filtration and different radii - unrolled.ipynb
diff --git a/tour_model_eval/Explore multiple datasets.ipynb b/tour_model_eval/Explore multiple datasets.ipynb
diff --git a/tour_model_eval/Explore sim usage (common trips -> labeling) unrolled-outputs.ipynb b/tour_model_eval/Explore sim usage (common trips -> labeling) unrolled-outputs.ipynb
diff --git a/tour_model_eval/Explore sim usage (common trips -> labeling) unrolled.ipynb b/tour_model_eval/Explore sim usage (common trips -> labeling) unrolled.ipynb
diff --git a/tour_model_eval/Explore trip clustering using DBSCAN unrolled.ipynb b/tour_model_eval/Explore trip clustering using DBSCAN unrolled.ipynb
diff --git a/tour_model_eval/Exploring basic datasets for model building validity-outputs.ipynb b/tour_model_eval/Exploring basic datasets for model building validity-outputs.ipynb
diff --git a/tour_model_eval/Exploring basic datasets for model building validity.ipynb b/tour_model_eval/Exploring basic datasets for model building validity.ipynb
diff --git a/tour_model_eval/Federating and saving multiple datasets.ipynb b/tour_model_eval/Federating and saving multiple datasets.ipynb
@@ -169,20 +169,60 @@
     "      len(all_expanded_df[all_expanded_df.program == \"stage\"].user_id.unique()))"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "brown-poison",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_expanded_df.reset_index(inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fabulous-aruba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_expanded_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "limiting-gazette",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_expanded_df.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "impaired-growing",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import bson.json_util as bju"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "civil-strike",
    "metadata": {},
    "outputs": [],
    "source": [
-    "all_expanded_df.to_csv(\"/tmp/federated_trip_only_dataset.csv\")"
+    "all_expanded_df.to_json(\"/tmp/federated_trip_only_dataset.json\", orient=\"records\", default_handler=bju.default)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "critical-shepherd",
+   "id": "contained-banner",
    "metadata": {},
    "outputs": [],
    "source": []

diff --git a/tour_model_eval/README.md b/tour_model_eval/README.md
@@ -0,0 +1,10 @@
+This directory contains the IPython notebooks used to tune and evaluate the
+first-round clustering algorithm. The algorithm is O(n^2): it iterates over a
+set of n trips and clusters them into bins based on the proximity of their
+start and end points.
+
+To understand the evolution of this process, including a comparison of this
+algorithm with DBSCAN, please see
+https://github.com/e-mission/e-mission-eval-private-data/pull/28
+
+That pull request includes explanations and intermediate results.
diff --git a/tour_model_eval/Radius selection exploration unrolled.ipynb b/tour_model_eval/Radius selection exploration unrolled.ipynb
@@ -832,7 +832,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "closed-azerbaijan",
    "metadata": {},
@@ -843,7 +842,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "naval-assignment",
    "metadata": {},
@@ -852,7 +850,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "designed-capture",
    "metadata": {},

diff --git a/..._eval/confirmed_trips_eval_all_bins.ipynb → ...lysis/confirmed_trips_eval_all_bins.ipynb b/..._eval/confirmed_trips_eval_all_bins.ipynb → ...lysis/confirmed_trips_eval_all_bins.ipynb
@@ -39,7 +39,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "participant_uuid_obj = list(edb.get_profile_db().find({\"install_group\": \"participant\"}, {\"user_id\": 1, \"_id\": 0}))\n",
+    "participant_uuid_obj = list(edb.get_profile_db().find({}, {\"user_id\": 1, \"_id\": 0}))\n",
     "all_users = [u[\"user_id\"] for u in participant_uuid_obj]"
    ]
   },

diff --git a/..._eval/first_second_round_evaluation.ipynb → ...lysis/first_second_round_evaluation.ipynb b/..._eval/first_second_round_evaluation.ipynb → ...lysis/first_second_round_evaluation.ipynb
diff --git a/.../first_second_round_evaluation_test.ipynb → .../first_second_round_evaluation_test.ipynb b/.../first_second_round_evaluation_test.ipynb → .../first_second_round_evaluation_test.ipynb
diff --git a/...eval/viz_bins_clusters_above_cutoff.ipynb → ...ysis/viz_bins_clusters_above_cutoff.ipynb b/...eval/viz_bins_clusters_above_cutoff.ipynb → ...ysis/viz_bins_clusters_above_cutoff.ipynb
diff --git a/...del_eval/viz_bins_clusters_all_data.ipynb → ...analysis/viz_bins_clusters_all_data.ipynb b/...del_eval/viz_bins_clusters_all_data.ipynb → ...analysis/viz_bins_clusters_all_data.ipynb
diff --git a/...model_eval/viz_similarity_unlabeled.ipynb → ...e_analysis/viz_similarity_unlabeled.ipynb b/...model_eval/viz_similarity_unlabeled.ipynb → ...e_analysis/viz_similarity_unlabeled.ipynb