From 4f0461e1818bdde39793a8c7f9871b0672da1952 Mon Sep 17 00:00:00 2001
From: Shankari
Date: Thu, 22 Jul 2021 11:51:36 -0700
Subject: [PATCH] Add simple script to federate multiple datasets and save them to a csv

We were going to do the analysis here too, but loading the databases at the
same time takes too much memory and disk for regular docker settings. I had
to bump up the settings to get everything to work. This two-stage process
helps others who may not have such robust computers to also experiment with
the combined dataset.
---
 ...erating and saving multiple datasets.ipynb | 212 ++++++++++++++++++
 1 file changed, 212 insertions(+)
 create mode 100644 tour_model_eval/Federating and saving multiple datasets.ipynb

diff --git a/tour_model_eval/Federating and saving multiple datasets.ipynb b/tour_model_eval/Federating and saving multiple datasets.ipynb
new file mode 100644
index 0000000..f06ccd2
--- /dev/null
+++ b/tour_model_eval/Federating and saving multiple datasets.ipynb
@@ -0,0 +1,212 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "mechanical-hamburg",
+   "metadata": {},
+   "source": [
+    "In this notebook, we will attempt to federate multiple datasets, so that we can put the results into context.\n",
+    "The datasets used are:\n",
+    "- CanBikeCO mini-pilot\n",
+    "- NREL location history\n",
+    "- CanBikeCO staging\n",
+    "\n",
+    "This notebook assumes that the datasets are loaded into separate docker containers with ports exposed at 27017, 27018 and 27019. It relies on a new commit that allows for reloading the database connection.\n",
+    "\n",
+    "Note that I had to bump up my docker resource limits to 200GB of disk space and 20GB of RAM to get this to work.\n",
+    "With the previous 50GB and 2GB limits, the containers crashed consistently.\n",
+    "\n",
+    "Because of the high resource requirements for this notebook, and the fact that we are not currently using trajectories for this analysis, we will simply save a csv dataframe for now. The real analysis can read the csv dataframe and move on from there (a sketch of that read-back step appears after the patch). This will make it easier for others (aka interns) to run the analysis scripts, improve the outputs and generate results.\n",
+    "\n",
+    "This doesn't need to be a notebook, but we will leave it as one for now since all the other top level scripts here are notebooks."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "moderate-january",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "charged-metro",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import emission.core.get_database as edb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "editorial-yukon",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import emission.storage.timeseries.abstract_timeseries as esta\n",
+    "import emission.storage.decorations.trip_queries as esdtq"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fitted-medicaid",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_expanded_df = pd.DataFrame()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "indoor-creature",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_expanded_df_list(uuid_list):\n",
+    "    expanded_df_list = []\n",
+    "    valid_lambda = lambda u: edb.get_analysis_timeseries_db().count_documents({\"user_id\": u,\n",
+    "                                                                               \"metadata.key\": \"analysis/confirmed_trip\"}) > 0\n",
+    "    valid_user_list = list(filter(valid_lambda, uuid_list))\n",
+    "    print(f\"After filtering, went from {len(uuid_list)} -> {len(valid_user_list)}\")\n",
+    "    for u in valid_user_list:\n",
+    "        ts = esta.TimeSeries.get_time_series(u)\n",
+    "        ct_df = ts.get_data_df(\"analysis/confirmed_trip\")\n",
+    "        print(u, len(ct_df))\n",
+    "        lt_df = esdtq.filter_labeled_trips(ct_df)\n",
+    "        expanded_df_list.append(esdtq.expand_userinputs(lt_df))\n",
+    "    return expanded_df_list"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "isolated-serial",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_program_df(program, uuid_list):\n",
+    "    program_expanded_df_list = pd.concat(get_expanded_df_list(uuid_list))\n",
+    "    program_expanded_df_list[\"program\"] = pd.Categorical([program] * len(program_expanded_df_list))\n",
+    "    return program_expanded_df_list"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "guilty-louis",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_expanded_df = pd.concat([all_expanded_df, get_program_df(\"minipilot\", esta.TimeSeries.get_uuid_list())]); all_expanded_df.tail()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "angry-subsection",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "esta.TimeSeries._reset_url(\"localhost:27018\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dependent-garage",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_expanded_df = pd.concat([all_expanded_df, get_program_df(\"nrel_lh\", esta.TimeSeries.get_uuid_list())]); all_expanded_df.tail()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "black-sunday",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "edb.get_profile_db().distinct(\"client\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bridal-custom",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "esta.TimeSeries._reset_url(\"localhost:27019\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "split-draft",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_expanded_df = pd.concat([all_expanded_df, get_program_df(\"stage\", esta.TimeSeries.get_uuid_list())]); all_expanded_df.tail()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "improved-deficit",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(len(all_expanded_df[all_expanded_df.program == \"minipilot\"].user_id.unique()),\n",
+    "      len(all_expanded_df[all_expanded_df.program == \"nrel_lh\"].user_id.unique()),\n",
+    "      len(all_expanded_df[all_expanded_df.program == \"stage\"].user_id.unique()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "civil-strike",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_expanded_df.to_csv(\"/tmp/federated_trip_only_dataset.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "critical-shepherd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}