-
Notifications
You must be signed in to change notification settings - Fork 13
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
check in notebook code #23
Changes from 2 commits
5cfc44c
8cdf71e
ceccf02
109932b
dd7eb30
0d3fcd4
8b211f4
cd25861
0bcbf5b
a5fe5c6
102b9bd
4d0cbdd
6eb56b1
1fba74a
bfd6ba6
a6851c6
3d2dff0
22a427b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,179 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "mighty-ukraine", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"storage not configured, falling back to sample, default configuration\n", | ||
"Connecting to database URL localhost\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# import logging\n", | ||
"import emission.core.get_database as edb\n", | ||
"import emission.analysis.modelling.tour_model.similarity as similarity\n", | ||
"import pandas as pd\n", | ||
"import numpy as np\n", | ||
"import get_request_percentage as grp\n", | ||
"import get_scores as gs\n", | ||
"import label_processing as lp\n", | ||
"import get_users as gu\n", | ||
"import data_preprocessing as preprocess\n", | ||
"import get_tuning_score as tuning\n", | ||
"import evaluation_pipeline as ep\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"import get_plot as plot\n", | ||
"import emission.core.common as ecc" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "cathedral-pointer", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"participant_uuid_obj = list(edb.get_profile_db().find({\"install_group\": \"participant\"}, {\"user_id\": 1, \"_id\": 0}))\n", | ||
"all_users = [u[\"user_id\"] for u in participant_uuid_obj]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "exotic-livestock", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"radius = 100" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "powered-airfare", | ||
"metadata": { | ||
"scrolled": true | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"user 1 filter_trips len 207\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# get all/valid user list\n", | ||
"user_ls, valid_users = gu.get_user_ls(all_users, radius)\n", | ||
"\n", | ||
"# collect request percentage for the first or second round (requested trips / total trips) for all users\n", | ||
"all_percentage_first_tune = []\n", | ||
"all_percentage_first_test = []\n", | ||
"all_percentage_second_tune = []\n", | ||
"all_percentage_second_test = []\n", | ||
"\n", | ||
"# collect homogeneity score for the first/second round for all users\n", | ||
"all_homogeneity_score_first_tune = []\n", | ||
"all_homogeneity_score_first_test = []\n", | ||
"all_homogeneity_score_second_tune = []\n", | ||
"all_homogeneity_score_second_test = []\n", | ||
"\n", | ||
"for a in range(len(all_users)):\n", | ||
" user = all_users[a]\n", | ||
" trips = preprocess.read_data(user)\n", | ||
" filter_trips = preprocess.filter_data(trips, radius)\n", | ||
" print('user', a + 1, 'filter_trips len', len(filter_trips))\n", | ||
"\n", | ||
" # filter out users that don't have enough valid labeled trips\n", | ||
" if not gu.valid_user(filter_trips, trips):\n", | ||
" continue\n", | ||
" tune_idx, test_idx = preprocess.split_data(filter_trips)\n", | ||
"\n", | ||
" # choose tuning/test set to run the model\n", | ||
" # this step will use KFold (5 splits) to split the data into different subsets\n", | ||
" # - tune: tuning set\n", | ||
" # - test: test set\n", | ||
" # Here we user a bigger part of the data for testing and a smaller part for tuning\n", | ||
" tune_data = preprocess.get_subdata(filter_trips, test_idx)\n", | ||
" test_data = preprocess.get_subdata(filter_trips, tune_idx)\n", | ||
"\n", | ||
" pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second = ep.init_score()\n", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you have so much state, it seems like it would be better to have a class that encapsulates it. |
||
" \n", | ||
" # collect tuning parameters\n", | ||
" coll_tune_score = []\n", | ||
" coll_tradeoffs = []\n", | ||
" # tuning\n", | ||
" pct_collect_first,homo_collect_first,pct_collect_second,homo_collect_second,coll_tradeoffs, coll_tune_score= ep.tuning_test(tune_data,radius,pct_collect_first,homo_collect_first,pct_collect_second,homo_collect_second,coll_tradeoffs,coll_tune_score,tune = True)\n", | ||
shankari marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"\n", | ||
" pct_collect_first, homo_collect_first, pct_collect_second, homo_collect_second = ep.init_score()\n", | ||
" # testing\n", | ||
" pct_collect_first,homo_collect_first,pct_collect_second,homo_collect_second,coll_tradeoffs,coll_tune_score = ep.tuning_test(test_data,radius, pct_collect_first, homo_collect_first,pct_collect_second,homo_collect_second, coll_tradeoffs,coll_tune_score,test=True)\n", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do we return the tradeoffs? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. At this point, I am not sure whether we still need the tradeoffs after the clustering, so I kept them. |
||
"\n", | ||
" print('colle_tune_score ', coll_tune_score)\n", | ||
" print('coll_tradeoffs',coll_tradeoffs)\n", | ||
"\n", | ||
" # collect request percentage for the first round for all users\n", | ||
" all_percentage_first_test.append(pct_collect_first)\n", | ||
"\n", | ||
" # collect homogeneity score for the first round for all users\n", | ||
" all_homogeneity_score_first_test.append(homo_collect_first)\n", | ||
"\n", | ||
" # collect request percentage for the second round for all users\n", | ||
" all_percentage_second_test.append(pct_collect_second)\n", | ||
"\n", | ||
" # collect homogeneity score for the second round for all users\n", | ||
" all_homogeneity_score_second_test.append(homo_collect_second)\n", | ||
"\n", | ||
"print('all_percentage_first_test', all_percentage_first_test)\n", | ||
"print('all_homogeneity_score_first_test', all_homogeneity_score_first_test)\n", | ||
"print('all_percentage_second_test', all_percentage_second_test)\n", | ||
"print('all_homogeneity_score_second_test', all_homogeneity_score_second_test)\n", | ||
"\n", | ||
"# plot evaluation scatter for the first round\n", | ||
"plt.figure()\n", | ||
"plot.get_scatter(all_percentage_first_test, all_homogeneity_score_first_test, valid_users)\n", | ||
"\n", | ||
"# plot evaluation scatter for the second round\n", | ||
"plt.figure()\n", | ||
"plot.get_scatter(all_percentage_second_test, all_homogeneity_score_second_test, valid_users)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "variable-faculty", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.9" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@corinne-hcr The participant UUID check was specific to the CanBikeCO mini-pilot; it does not hold for other programs or datasets. I have created a new mongodump containing only the participant data. You should load that instead and change this to the call from
emission.storage.timeseries.abstract_timeseries