check in notebook code #23

Merged: 18 commits, Jul 24, 2021

Commits (18)
5cfc44c
check in notebook code
corinne-hcr Jun 2, 2021
8cdf71e
update notebook
corinne-hcr Jun 13, 2021
ceccf02
update notebook code, the previous one was not the latest one
corinne-hcr Jun 13, 2021
109932b
add some functions
corinne-hcr Jun 13, 2021
dd7eb30
refactor evaluation_pipeline code, pull out get_socre function from g…
corinne-hcr Jul 8, 2021
0d3fcd4
add kmeans after running hierarchical clustering for re-building the …
corinne-hcr Jul 10, 2021
8b211f4
adding kmeans only in the test step
corinne-hcr Jul 10, 2021
cd25861
check in codes for generating result for Gabriel's function, will mov…
corinne-hcr Jul 16, 2021
0bcbf5b
change the line of importing label_processing
corinne-hcr Jul 16, 2021
a5fe5c6
address the problems from the previous commit, but has not yet done …
corinne-hcr Jul 16, 2021
102b9bd
Update tour_model_eval/build_save_model.py
corinne-hcr Jul 19, 2021
4d0cbdd
refactored notebook code, not done with plot
corinne-hcr Jul 19, 2021
6eb56b1
update build_save_model according to notebook refactoring
corinne-hcr Jul 19, 2021
1fba74a
Merge branch 'notebook_code' of https://github.com/corinne-hcr/e-miss…
corinne-hcr Jul 19, 2021
bfd6ba6
check in the changes in the notebook, have put the original clusterin…
corinne-hcr Jul 19, 2021
a6851c6
delete output from the notebook
corinne-hcr Jul 19, 2021
3d2dff0
modify test notebook and add comments on it, remove extraneous files
corinne-hcr Jul 22, 2021
22a427b
add plot code, read filename directly from user id, add another way t…
corinne-hcr Jul 24, 2021
179 changes: 179 additions & 0 deletions tour_model_eval/first_second_round_evaluation.ipynb
@@ -0,0 +1,179 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "mighty-ukraine",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"storage not configured, falling back to sample, default configuration\n",
"Connecting to database URL localhost\n"
]
}
],
"source": [
"# import logging\n",
"import emission.core.get_database as edb\n",
"import emission.analysis.modelling.tour_model.similarity as similarity\n",
"import pandas as pd\n",
"import numpy as np\n",
"import get_request_percentage as grp\n",
"import get_scores as gs\n",
"import label_processing as lp\n",
"import get_users as gu\n",
"import data_preprocessing as preprocess\n",
"import get_tuning_score as tuning\n",
"import evaluation_pipeline as ep\n",
"import matplotlib.pyplot as plt\n",
"import get_plot as plot\n",
"import emission.core.common as ecc"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cathedral-pointer",
"metadata": {},
"outputs": [],
"source": [
"participant_uuid_obj = list(edb.get_profile_db().find({\"install_group\": \"participant\"}, {\"user_id\": 1, \"_id\": 0}))\n",
"all_users = [u[\"user_id\"] for u in participant_uuid_obj]"
Contributor comment on lines +33 to +34:

@corinne-hcr The participant UUID check was specific to the CanBikeCO mini-pilot - it is not true for other programs or datasets. I have created a new mongodump of only the participant data, you should load that instead and change this to the call from emission.storage.timeseries.abstract_timeseries
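
A minimal sketch of the suggested change, assuming `emission.storage.timeseries.abstract_timeseries` exposes a `TimeSeries.get_uuid_list()` helper (confirm against the actual e-mission-server API before using):

```python
# Hedged sketch: replace the profile-db "install_group": "participant" query,
# which is specific to the CanBikeCO mini-pilot, with the generic UUID listing
# from the abstract timeseries so the notebook works for other datasets.
# get_uuid_list() is assumed here; adjust if the helper is named differently.
import emission.storage.timeseries.abstract_timeseries as esta

all_users = esta.TimeSeries.get_uuid_list()
```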

]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "exotic-livestock",
"metadata": {},
"outputs": [],
"source": [
"radius = 100"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "powered-airfare",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"user 1 filter_trips len 207\n"
]
}
],
"source": [
"# get all/valid user list\n",
"user_ls, valid_users = gu.get_user_ls(all_users, radius)\n",
"\n",
"# collect request percentage for the first or second round (requested trips / total trips) for all users\n",
"all_percentage_first_tune = []\n",
"all_percentage_first_test = []\n",
"all_percentage_second_tune = []\n",
"all_percentage_second_test = []\n",
"\n",
"# collect homogeneity score for the first/second round for all users\n",
"all_homogeneity_score_first_tune = []\n",
"all_homogeneity_score_first_test = []\n",
"all_homogeneity_score_second_tune = []\n",
"all_homogeneity_score_second_test = []\n",
"\n",
"for a in range(len(all_users)):\n",
" user = all_users[a]\n",
" trips = preprocess.read_data(user)\n",
" filter_trips = preprocess.filter_data(trips, radius)\n",
" print('user', a + 1, 'filter_trips len', len(filter_trips))\n",
"\n",
" # filter out users that don't have enough valid labeled trips\n",
" if not gu.valid_user(filter_trips, trips):\n",
" continue\n",
" tune_idx, test_idx = preprocess.split_data(filter_trips)\n",
"\n",
" # choose tuning/test set to run the model\n",
" # this step will use KFold (5 splits) to split the data into different subsets\n",
" # - tune: tuning set\n",
" # - test: test set\n",
" # Here we user a bigger part of the data for testing and a smaller part for tuning\n",
" tune_data = preprocess.get_subdata(filter_trips, test_idx)\n",
" test_data = preprocess.get_subdata(filter_trips, tune_idx)\n",
"\n",
" pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second = ep.init_score()\n",
Contributor comment:

If you have so much state, it seems like it would be better to have a class that encapsulates it.
But it also seems like overkill to create a function just to initialize the score.
It seems like you have not finished the more fundamental refactoring.
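
One possible shape for such a class (a sketch with hypothetical names like `RoundScores` and `EvaluationScores`, not the actual refactor):

```python
# Hedged sketch: bundle the per-round percentage/homogeneity collections into
# one object instead of passing eight parallel lists through every call.
class RoundScores:
    def __init__(self):
        self.request_pct = []   # requested trips / total trips per split
        self.homogeneity = []   # homogeneity score per split

class EvaluationScores:
    def __init__(self):
        self.first = RoundScores()    # first round of clustering
        self.second = RoundScores()   # second round of clustering

# Example usage:
scores = EvaluationScores()
scores.first.request_pct.append(0.4)
scores.first.homogeneity.append(0.9)
```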

" \n",
" # collect tuning parameters\n",
" coll_tune_score = []\n",
" coll_tradeoffs = []\n",
" # tuning\n",
" pct_collect_first,homo_collect_first,pct_collect_second,homo_collect_second,coll_tradeoffs, coll_tune_score= ep.tuning_test(tune_data,radius,pct_collect_first,homo_collect_first,pct_collect_second,homo_collect_second,coll_tradeoffs,coll_tune_score,tune = True)\n",
shankari marked this conversation as resolved.
"\n",
" pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second = ep.init_score()\n",
" # testing\n",
" pct_collect_first,homo_collect_first,pct_collect_second,homo_collect_second,coll_tradeoffs,coll_tune_score = ep.tuning_test(test_data,radius, pct_collect_first, homo_collect_first,pct_collect_second,homo_collect_second, coll_tradeoffs,coll_tune_score,test=True)\n",
Contributor comment:

Why do we return coll_tradeoffs and coll_tune_score from the testing invocation?

Contributor Author reply:

At this point, I am not sure if we still need the tradeoffs after the clustering, so I am keeping them.
I am still figuring out how to predict the labels.

"\n",
" print('colle_tune_score ', coll_tune_score)\n",
" print('coll_tradeoffs',coll_tradeoffs)\n",
"\n",
" # collect request percentage for the first round for all users\n",
" all_percentage_first_test.append(pct_collect_first)\n",
"\n",
" # collect homogeneity score for the first round for all users\n",
" all_homogeneity_score_first_test.append(homo_collect_first)\n",
"\n",
" # collect request percentage for the second round for all users\n",
" all_percentage_second_test.append(pct_collect_second)\n",
"\n",
" # collect homogeneity score for the second round for all users\n",
" all_homogeneity_score_second_test.append(homo_collect_second)\n",
"\n",
"print('all_percentage_first_test', all_percentage_first_test)\n",
"print('all_homogeneity_score_first_test', all_homogeneity_score_first_test)\n",
"print('all_percentage_second_test', all_percentage_second_test)\n",
"print('all_homogeneity_score_second_test', all_homogeneity_score_second_test)\n",
"\n",
"# plot evaluation scatter for the first round\n",
"plt.figure()\n",
"plot.get_scatter(all_percentage_first_test, all_homogeneity_score_first_test, valid_users)\n",
"\n",
"# plot evaluation scatter for the second round\n",
"plt.figure()\n",
"plot.get_scatter(all_percentage_second_test, all_homogeneity_score_second_test, valid_users)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "variable-faculty",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}