diff --git a/.gitignore b/.gitignore index 8a96515..56f8495 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +data/ + # Jupyter checkpoints **/.ipynb_checkpoints .pytest_cache/* diff --git a/CHANGELOG.md b/CHANGELOG.md index 218c2b5..b856e43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # ChoiceModels change log +### 0.2 (2019-01-25) + +- production release + ### 0.2.dev10 (2019-01-25) - moves the `choicemodels.tools.distancematrix` functions directly into `choicemodels.tools` diff --git a/choicemodels/__init__.py b/choicemodels/__init__.py index 59adfbf..c439460 100644 --- a/choicemodels/__init__.py +++ b/choicemodels/__init__.py @@ -3,4 +3,4 @@ from .mnl import MultinomialLogit, MultinomialLogitResults -version = __version__ = '0.2.dev10' +version = __version__ = '0.2' diff --git a/choicemodels/mnl.py b/choicemodels/mnl.py index e750ea3..f671c66 100644 --- a/choicemodels/mnl.py +++ b/choicemodels/mnl.py @@ -76,9 +76,6 @@ class MultinomialLogit(object): and the alternatives. Attributes of a particular alternative may vary for different choosers (distance, for example), but this must be set up manually in the input data. - [TO DO: comparison of the estimation engines] - [TO DO: testing and input validation] - Note that prediction methods are in a separate class: see MultinomialLogitResults(). Parameters @@ -250,7 +247,7 @@ class MultinomialLogitResults(object): If not provided, these will be extracted from the raw results. estimation_engine : str, optional - 'ChoiceModels' (default) or 'PyLogit'. # TO DO - infer from model_expression? + 'ChoiceModels' (default) or 'PyLogit'. """ def __init__(self, model_expression, results=None, fitted_parameters=None, @@ -287,11 +284,6 @@ def probabilities(self, data): Generate predicted probabilities for a table of choice scenarios, using the fitted parameters stored in the results object. 
- TO DO - make sure this handles pylogit case - - TO DO - does MergedChoiceTable guarantee that alternatives for a single scenario - are consecutive? seems like a requirement here; should document it - Parameters ---------- data : choicemodels.tools.MergedChoiceTable @@ -307,6 +299,11 @@ def probabilities(self, data): pandas.Series with indexes matching the input """ + # TO DO - make sure this handles pylogit case + + # TO DO - does MergedChoiceTable guarantee that alternatives for a single scenario + # are consecutive? seems like a requirement here; should document it + df = data.to_frame() numalts = data.sample_size # TO DO - make this an official MCT param diff --git a/choicemodels/tools/simulation.py b/choicemodels/tools/simulation.py index b5e3755..b5a1130 100644 --- a/choicemodels/tools/simulation.py +++ b/choicemodels/tools/simulation.py @@ -139,9 +139,9 @@ def iterative_lottery_choices(choosers, alternatives, mct_callable, probs_callab all choosers are matched or no alternatives remain. chooser_batch_size : int or None, optional - Size of the batches for processing smaller groups of choosers one at a time. Useful - when the anticipated size of the merged choice tables (choosers X alternatives - X covariates) will be too large for python/pandas to handle. + Size of the batches for processing smaller groups of choosers one at a time. + Useful when the anticipated size of the merged choice tables (choosers X + alternatives X covariates) will be too large for python/pandas to handle. Returns ------- diff --git a/data/.gitignore b/data/.gitignore deleted file mode 100644 index ca893bc..0000000 --- a/data/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -*.zip -*.csv -*.pdf -tl_2010_06_tract10 \ No newline at end of file diff --git a/data/README.md b/data/README.md deleted file mode 100644 index 44c779a..0000000 --- a/data/README.md +++ /dev/null @@ -1,9 +0,0 @@ -The demo notebooks use data from the 2010-2012 California Household Travel Survey. 
- -Information about the survey: http://www.dot.ca.gov/hq/tpp/offices/omsp/statewide_travel_analysis/chts.html - -Data download: https://www.nrel.gov/transportation/secure-transportation-data.html - -The data is open access, but you will need to fill out a registration form. Download the file named `caltrans_full_survey.zip`, 233.2 MB, and place it in this directory. - -You can download the California census tracts shapefile `tl_2010_06_tract10` from: https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2010&layergroup=Census+Tracts diff --git a/docs/source/index.rst b/docs/source/index.rst index ae27ebd..8d83c77 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -8,7 +8,7 @@ ChoiceModels ChoiceModels is a Python library for discrete choice modeling, with utilities for sampling, simulation, and other ancillary tasks. It's part of the `Urban Data Science Toolkit `__ (UDST). -v0.2.dev10, released January 25, 2019 +v0.2, released January 25, 2019 Contents diff --git a/notebooks/CHTS-exploration-02.ipynb b/notebooks/CHTS-exploration-02.ipynb deleted file mode 100644 index 41bd82a..0000000 --- a/notebooks/CHTS-exploration-02.ipynb +++ /dev/null @@ -1,2164 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exploring the public CHTS data\n", - "\n", - "Sam Maurer, August 2017 | Python 3.6\n", - "\n", - "Original version June 2017 (v01) \n", - "Updated Aug 2017 (v02) to look into data type issues in the raw data" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "\n", - "import matplotlib\n", - "import numpy as np\n", - "import pandas as pd\n", - "import zipfile" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# See ../data/README.md for instructions about how to get the data\n", - "\n", - "z = 
zipfile.ZipFile('../data/caltrans_full_survey.zip')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Households\n", - "\n", - "Households that participated in the travel diary survey" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "42426" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households = pd.read_csv(z.open('caltrans_full_survey/survey_households.csv'), low_memory=False)\n", - "\n", - "len(households)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "9715" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Limit to the Bay Area\n", - "\n", - "households_ba = households[households.home_county_id.isin([1, 13, 41, 55, 75, 81, 85, 95, 97])]\n", - "\n", - "len(households_ba)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "SAN FRANCISCO 1076\n", - "SAN JOSE 939\n", - "OAKLAND 459\n", - "SANTA ROSA 321\n", - "BERKELEY 251\n", - "NAPA 228\n", - "PALO ALTO 218\n", - "SUNNYVALE 200\n", - "SAN MATEO 197\n", - "FREMONT 177\n", - "WALNUT CREEK 173\n", - "REDWOOD CITY 170\n", - "FAIRFIELD 159\n", - "CONCORD 158\n", - "SAN RAFAEL 158\n", - "Name: home_city, dtype: int64" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Top home locations\n", - "\n", - "households_ba.home_city.value_counts()[:15]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 42421.000000\n", - "mean 2.571462\n", - "std 1.373733\n", - "min 1.000000\n", - "25% 
2.000000\n", - "50% 2.000000\n", - "75% 3.000000\n", - "max 8.000000\n", - "Name: persons_count, dtype: float64" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households.persons_count.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 42421.000000\n", - "mean 0.999955\n", - "std 0.704667\n", - "min 0.003498\n", - "25% 0.447392\n", - "50% 0.915924\n", - "75% 1.376790\n", - "max 5.400840\n", - "Name: hhwgt, dtype: float64" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households.hhwgt.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 42421.000000\n", - "mean 293.007784\n", - "std 206.482227\n", - "min 1.025146\n", - "25% 131.095416\n", - "50% 268.385115\n", - "75% 403.428487\n", - "max 1582.559559\n", - "Name: exphhwgt, dtype: float64" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households.exphhwgt.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 4.242100e+04\n", - "mean 6.056293e+09\n", - "std 2.944557e+07\n", - "min 6.001400e+09\n", - "25% 6.037207e+09\n", - "50% 6.059042e+09\n", - "75% 6.079011e+09\n", - "max 6.115041e+09\n", - "Name: home_tract_id, dtype: float64" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households.home_tract_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ 
- "## Persons" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "109113" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons = pd.read_csv(z.open('caltrans_full_survey/survey_person.csv'), low_memory=False)\n", - "\n", - "len(persons)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " sampno perno travel_date gender relation education race1\n", - "0 7128119 1 2013-01-27 1 1 6 1.0\n", - "1 7128119 3 2013-01-27 2 3 1 1.0\n", - "2 7128138 1 2012-11-05 2 1 5 1.0\n", - "3 7128262 1 2012-12-21 2 1 1 1.0\n", - "4 7128262 3 2012-12-21 2 3 2 1.0\n", - "5 7128262 2 2012-12-21 1 2 1 1.0\n", - "6 7128288 2 2013-01-22 1 3 3 1.0\n", - "7 7128288 1 2013-01-22 2 1 5 1.0\n", - "8 7128316 1 2012-12-29 2 1 4 1.0\n", - "9 7128372 1 2012-12-29 2 1 6 1.0\n" - ] - } - ], - "source": [ - "print(persons[['sampno', 'perno', 'travel_date', 'gender', 'relation', \n", - " 'education', 'race1']].head(10))" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 108776.000000\n", - "mean 3.233838\n", - "std 2.954577\n", - "min 0.000000\n", - "25% 1.000000\n", - "50% 2.000000\n", - "75% 5.000000\n", - "max 33.000000\n", - "Name: person_trips, dtype: float64" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# What is `person_trips`? 
-- not sure, but it looks related to the `tripno` field\n", - "\n", - "persons.person_trips.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 4.311100e+04\n", - "mean 6.241008e+09\n", - "std 3.120183e+09\n", - "min 2.614000e+03\n", - "25% 6.037238e+09\n", - "50% 6.059064e+09\n", - "75% 6.079011e+09\n", - "max 1.000000e+11\n", - "Name: empl_tract_id, dtype: float64" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.empl_tract_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 109113\n", - "unique 2\n", - "top False\n", - "freq 66002\n", - "Name: empl_tract_id, dtype: object" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.empl_tract_id.notnull().describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 2.543800e+04\n", - "mean 6.342777e+09\n", - "std 3.678070e+09\n", - "min 4.005001e+09\n", - "25% 6.037233e+09\n", - "50% 6.059063e+09\n", - "75% 6.079010e+09\n", - "max 1.000000e+11\n", - "Name: school_tract_id, dtype: float64" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.school_tract_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 109113\n", - "unique 2\n", - "top False\n", - "freq 83675\n", - "Name: school_tract_id, dtype: object" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - 
"persons.school_tract_id.notnull().describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 109113.000000\n", - "mean 0.999999\n", - "std 0.962373\n", - "min 0.000568\n", - "25% 0.322230\n", - "50% 0.717519\n", - "75% 1.329846\n", - "max 5.060089\n", - "Name: perwgt, dtype: float64" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.perwgt.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Places\n", - "\n", - "Each record represents a single visit to a place" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "460524" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places = pd.read_csv(z.open('caltrans_full_survey/survey_place.csv'), low_memory=False)\n", - "\n", - "len(places)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['sampno', 'perno', 'plano', 'vehno', 'tripno', 'place_name',\n", - " 'travel_date', 'arr_time', 'dep_time', 'mode', 'trip_distance_miles',\n", - " 'air_trip_distance_miles', 'tripdistanceflag', 'prev_trip_duration_min',\n", - " 'act_dur', 'act_cnt', 'block_id', 'tract_id', 'county_id', 'state_id',\n", - " 'place_primarycity', 'city', 'zipcode', 'state', 'parked_loc_type',\n", - " 'parked_other_loc_type', 'parked_address', 'parked_minutes',\n", - " 'parked_payed', 'parked_amount_payed', 'parked_unit', 'parked_pay_type',\n", - " 'parked_other_pay_type', 'parked_paymen_ne', 
'got_out_vehicle',\n", - " 'transit_system', 'transit_system_other', 'perwgt', 'expperwgt', 'tcf',\n", - " 'tcfperwgt', 'exptcfperwgt', 'tottr', 'hhmem', 'lon', 'lat',\n", - " 'non_hh_members', 'route', 'per1', 'per2', 'per3', 'per4', 'per5',\n", - " 'geom'],\n", - " dtype='object')\n" - ] - } - ], - "source": [ - "print(places.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "b'1031985,1,1,,,REDACTED,2012-05-01,03:00:00,08:00:00,,,,,,340.0,3.0,REDACTED,252202.0,95.0,6.0,VALLEJO,VALLEJO,94591,CA,,,REDACTED,,,,,,,,,,,0.052086,17.647568,,,,,,REDACTED,REDACTED,,,,,,,,REDACTED\\r\\n'\n", - "b'1031985,1,2,97.0,1.0,REDACTED,2012-05-01,09:00:00,12:00:00,6.0,13.428271,7.647539,,22.0,231.0,1.0,REDACTED,252108.0,95.0,6.0,BENICIA,BENICIA,94510,CA,,,REDACTED,,,,,,,,1.0,,,0.052086,17.647568,0.969788,0.050512,17.114408,2.0,0.0,REDACTED,REDACTED,1.0,,,,,,,REDACTED\\r\\n'\n" - ] - } - ], - "source": [ - "# Print some raw data to make sure the pandas type inferences are reasonable\n", - "\n", - "with z.open('caltrans_full_survey/survey_place.csv', 'r') as f:\n", - " _ = f.readline() # discard column headers\n", - " print(f.readline())\n", - " print(f.readline())" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "117345" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Filter for places visited by people who live in the Bay Area (may want to do use a\n", - "# different filter depending on the application)\n", - "\n", - "places_ba = places[places.sampno.isin(households_ba.sampno)]\n", - "\n", - "len(places_ba)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - " sampno perno plano tripno\n", - "0 1031985 1 1 NaN\n", - "1 1031985 1 2 1.0\n", - "2 1031985 1 3 2.0\n", - "3 1031985 2 1 NaN\n", - "4 1031985 2 2 1.0\n", - "5 1031985 2 3 2.0\n", - "118 1033944 1 1 NaN\n", - "119 1033944 1 2 1.0\n", - "120 1033944 1 3 2.0\n", - "121 1033944 1 4 3.0\n" - ] - } - ], - "source": [ - "# Is there a unique identifier?\n", - "\n", - "# Might need to use combination of `sampno` (household), `perno` (person within hh),\n", - "# `plano` (place within person's travel diary)\n", - "\n", - "# What's `tripno`? (\"unlinked trip ID\" - maybe representing transfer between modes)\n", - "\n", - "print(places_ba[['sampno', 'perno', 'plano', 'tripno']].head(10))" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "117345" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Is every combination of `sampno`, `perno`, `plano` unique? -- Yes\n", - "\n", - "len(places_ba.groupby(['sampno', 'perno', 'plano']))" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "93406" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# How many places have a `tripno`? -- about 80%\n", - "\n", - "places_ba.tripno.count()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "117345" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Is the `tripno` ever repeated? 
-- No\n", - "\n", - "len(places_ba.groupby(['sampno', 'perno', 'plano', 'tripno']))" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 93406.000000\n", - "mean 3.817185\n", - "std 2.841705\n", - "min 1.000000\n", - "25% 2.000000\n", - "50% 3.000000\n", - "75% 5.000000\n", - "max 32.000000\n", - "Name: tripno, dtype: float64" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places_ba.tripno.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0 REDACTED\n", - "1 REDACTED\n", - "2 REDACTED\n", - "3 REDACTED\n", - "4 REDACTED\n", - "Name: place_name, dtype: object" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Can we see the place names? -- No\n", - "\n", - "places_ba.place_name.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "SAN FRANCISCO 15680\n", - "SAN JOSE 11414\n", - "OAKLAND 5455\n", - "SANTA ROSA 3441\n", - "BERKELEY 3185\n", - "PALO ALTO 2664\n", - "SUNNYVALE 2440\n", - "SAN MATEO 2190\n", - "NAPA 2160\n", - "FREMONT 2126\n", - "REDWOOD CITY 2067\n", - "MOUNTAIN VIEW 1948\n", - "WALNUT CREEK 1896\n", - "SANTA CLARA 1816\n", - "CONCORD 1800\n", - "Name: city, dtype: int64" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places_ba.city.value_counts().head(15)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZUAAAD8CAYAAAC/1zkdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGVlJREFUeJzt3X+0XWV95/H3xyAKKkMoFxrzowFXpAJLA9xBZhgdKwIB\nHQJdtYXpSGqZRhyYwoyzxqBdhaWli04VKqsONmpKsAgiPyTVUAwZldW1QAg/5FeguUAKl2SSSKhQ\ncaDBz/yxn6ubm3NvTm72ufse8nmtddbd+7uffc53B26+eZ5nn/3INhEREU14XdsJRETEa0eKSkRE\nNCZFJSIiGpOiEhERjUlRiYiIxqSoREREY1JUIiKiMSkqERHRmBSViIhozB5tJzDZ9t9/f8+dO7ft\nNCIi+so999zzY9sDO2q32xWVuXPnsmbNmrbTiIjoK5L+sZt2Gf6KiIjGpKhERERjUlQiIqIxKSoR\nEdGYFJWIiGhMikpERDQmRSUiIhqTohIREY1JUYmIiMbsdt+o3xVzl3xnwueuv+SDDWYSETE1pacS\nERGNSVGJiIjGpKhERERjUlQiIqIxKSoREdGYFJWIiGhMz4qKpNmSvidpraSHJZ1X4vtJWiVpXfk5\nvcQl6XJJQ5IekHRk7b0WlfbrJC2qxY+S9GA553JJ6tX1RETEjvWyp7IN+ITtdwDHAOdIOhRYAqy2\nPQ9YXfYBTgLmlddi4AqoihBwIfBu4GjgwpFCVNosrp23oIfXExERO9CzomJ7o+17y/YLwFpgJrAQ\nWF6aLQdOLdsLgatcuRPYV9IM4ERgle2ttp8DVgELyrF9bN9h28BVtfeKiIgWTMqciqS5wBHAD4ED\nbW+EqvAAB5RmM4Gna6cNl9h48eEO8U6fv1jSGklrtmzZsquXExERY+h5UZH0ZuAG4Hzbz4/XtEPM\nE4hvH7SX2h60PTgwMLCjlCMiYoJ6WlQkvZ6qoFxt+8YS3lSGrig/N5f4MDC7dvosYMMO4rM6xCMi\noiW9vPtLwFeBtbYvrR1aAYzcwbUIuLkWP7PcBXYM8JMyPHYrcIKk6WWC/gTg1nLsBUnHlM86s/Ze\nERHRgl4+pfhY4CPAg5LuL7FPAZcA10k6C3gK+HA5thI4GRgCXgQ+CmB7q6TPAneXdp+xvbVsfxy4\nEtgLuKW8IiKiJT0rKrb/ns7zHgDHdWhv4Jwx3msZsKxDfA1w+C6kGRERDco36iMiojEpKhER0ZgU\nlYiIaEyKSkRENCZFJSIiGpOiEhERjUlRiYiIxqSoREREY1JUIiKiMSkqERHRmBSViIhoTIpKREQ0\nJkUlIiIak6ISERGNSVGJiIjG9HLlx2WSNkt6qBb7hqT7y2v9yOJdkuZK+lnt2Jdq5xwl6UFJQ5Iu\nL6s8Imk/SaskrSs/p/fqWiIioju97KlcCSyoB2z/ju35tudTrV1/Y+3w4yPHbJ9di18BLAbmldfI\ney4BVtueB6wu+xER0aKeFRXbtwNbOx0rvY3fBq4Z7z0kzQD2sX1HWRnyKuDUcnghsLxsL6/FIyKi\nJW3NqbwH2GR7XS12kKT7JP1A0ntKbCYwXGszXGIAB9reCFB+HtDrpCMiYnw9W6N+B87g1b2UjcAc\n289KOgr4lqTD6LzGvXf2wyQtphpCY86cORNINyIiujHpPRVJewC/CXxjJGb7JdvPlu17gMeBt1P1\nTGbVTp8FbCjbm8rw2Mgw2eaxPtP2UtuDtgcHBgaavJyIiKhpY/jrA8Cjtn8xrCVpQNK0sn0w1YT8\nE2VY6wVJx5R5mDOBm8tpK4BFZXtRLR4RES3p5S3F1wB3AIdIGpZ0Vjl0OttP0L8XeEDSj4DrgbNt\nj0zyfxz4CjBE1YO5pcQvAY6XtA44vuxHRESLejanYvuMMeK/1yF2A9Utxp3arwEO7xB/Fjhu17KM\niIgm5Rv1ERHRmBSViIhoTIpKREQ0JkUlIiIak6ISERGNSVGJi
IjGpKhERERjUlQiIqIxKSoREdGY\nFJWIiGhMikpERDQmRSUiIhqTohIREY1JUYmIiMakqERERGNSVCIiojG9XPlxmaTNkh6qxS6S9Iyk\n+8vr5NqxCyQNSXpM0om1+IISG5K0pBY/SNIPJa2T9A1Je/bqWiIioju97KlcCSzoEL/M9vzyWgkg\n6VCqZYYPK+f8b0nTyrr1XwROAg4FzihtAf6svNc84DngrNEfFBERk6tnRcX27cDWHTasLASutf2S\n7Sep1qM/uryGbD9h+2XgWmChJAHvp1rPHmA5cGqjFxARETutjTmVcyU9UIbHppfYTODpWpvhEhsr\n/ivAP9neNirekaTFktZIWrNly5amriMiIkaZ7KJyBfA2YD6wEfh8iatDW08g3pHtpbYHbQ8ODAzs\nXMYREdG1PSbzw2xvGtmW9GXg22V3GJhdazoL2FC2O8V/DOwraY/SW6m3j4iIlkxqT0XSjNruacDI\nnWErgNMlvUHSQcA84C7gbmBeudNrT6rJ/BW2DXwP+K1y/iLg5sm4hoiIGFvPeiqSrgHeB+wvaRi4\nEHifpPlUQ1XrgY8B2H5Y0nXAI8A24Bzbr5T3ORe4FZgGLLP9cPmITwLXSvoT4D7gq726loiI6E7P\niortMzqEx/yL3/bFwMUd4iuBlR3iT1DdHRYREVNEvlEfERGNSVGJiIjGpKhERERjUlQiIqIxKSoR\nEdGYFJWIiGhMikpERDQmRSUiIhozqc/+2p3NXfKdCZ+7/pIPNphJRETvdNVTkXR4rxOJiIj+1+3w\n15ck3SXpv0jat6cZRURE3+qqqNj+d8DvUj2Gfo2kr0s6vqeZRURE3+l6ot72OuCPqJ4O/O+ByyU9\nKuk3e5VcRET0l27nVN4p6TJgLdXa8P/B9jvK9mU9zC8iIvpIt3d//SXwZeBTtn82ErS9QdIf9SSz\niIjoO90Of50MfH2koEh6naS9AWx/rdMJkpZJ2izpoVrsz8uQ2QOSbhqZ9Jc0V9LPJN1fXl+qnXOU\npAclDUm6XJJKfD9JqyStKz+nT+yPICIimtJtUbkN2Ku2v3eJjedKYMGo2CrgcNvvBP4BuKB27HHb\n88vr7Fr8CmAx1RLD82rvuQRYbXsesLrsR0REi7otKm+0/c8jO2V77/FOsH07sHVU7Lu2t5XdO4FZ\n471HWdN+H9t3lHXprwJOLYcXAsvL9vJaPCIiWtJtUfmppCNHdiQdBfxsnPbd+H3gltr+QZLuk/QD\nSe8psZnAcK3NcIkBHGh7I0D5ecAu5hMREbuo24n684FvStpQ9mcAvzPRD5X0aWAbcHUJbQTm2H62\nFKxvSToMUIfTPYHPW0w1hMacOXMmlnREROxQV0XF9t2Sfh04hOov+kdt/8tEPlDSIuBDwHFlSAvb\nLwEvle17JD0OvJ2qZ1IfIpsFjBS2TZJm2N5Yhsk2j5P/UmApwODg4E4XpYiI6M7OPKX4XwPvBI4A\nzpB05s5+mKQFVF+ePMX2i7X4gKRpZftgqgn5J8qw1guSjil3fZ0J3FxOWwEsKtuLavGIiGhJVz0V\nSV8D3gbcD7xSwiMT52Odcw3wPmB/ScPAhVR3e70BWFXuDL6z3On1XuAzkraV9z/b9sgk/8ep7iTb\ni2oOZmQe5hLgOklnAU8BH+7mWiIione6nVMZBA4dGa7qhu0zOoS/OkbbG4Abxji2BtjuKcm2nwWO\n6zafiIjovW6Hvx4CfrWXiURERP/rtqeyP/CIpLsoE+oAtk/pSVYREdGXui0qF/UyiYiIeG3o9pbi\nH0j6NWCe7dvKc7+m9Ta1iIjoN90++v4PgOuBvyqhmcC3epVURET0p24n6s8BjgWeh18s2JXHokRE\nxKt0W1Resv3yyI6kPZjA41IiIuK1rdui8gNJnwL2KmvTfxP4296lFRER/ajborIE2AI8CHwMWEm1\nXn1ERMQvdHv318+plhP+c
m/TiYiIftbts7+epMMciu2DG88oIiL61s48+2vEG6ke3rhf8+lEREQ/\n62pOxfaztdcztv8CeH+Pc4uIiD7T7fDXkbXd11H1XN7Sk4wiIqJvdTv89fna9jZgPfDbjWcTERF9\nrdu7v36j14lERET/63b467+Pd9z2pWOct4xqPfrNtg8vsf2AbwBzKT0e28+V5YK/AJwMvAj8nu17\nyzmL+OX3Yv7E9vISP4pfrgq5EjhvZxYSi4iIZnX75cdBqmV9Z5bX2cChVPMq482tXAksGBVbAqy2\nPQ9YXfYBTqJam34esBi4An5RhC4E3g0cDVwoaXo554rSduS80Z8VERGTaGcW6TrS9gsAki4Cvmn7\nP493ku3bJc0dFV5ItXY9wHLg+8AnS/yq0tO4U9K+kmaUtqtG1qyXtApYIOn7wD627yjxq4BT+eUa\n9hERMcm67anMAV6u7b9MNXw1EQfa3ghQfo487Xgm8HSt3TC/7BmNFR/uEI+IiJZ021P5GnCXpJuo\nvll/GnBVw7moQ8wTiG//xtJiqmEy5syZM9H8IiJiB7q9++tiSbcA7ymhj9q+b4KfuUnSDNsby/DW\n5hIfBmbX2s0CNpT4+0bFv1/iszq075T/UmApwODgYN9N5M9d8p1dOn/9JR9sKJOIiPF1O/wFsDfw\nvO0vAMOSDprgZ64AFpXtRcDNtfiZqhwD/KQMj90KnCBpepmgPwG4tRx7QdIx5c6xM2vvFRERLej2\nluILqe4AOwT4a+D1wN9QrQY53nnXUPUy9pc0THUX1yXAdZLOAp6ieo4YVLcEnwwMUd1S/FEA21sl\nfRa4u7T7zMikPdUdaVdS3VJ8C5mkj4hoVbdzKqcBRwD3AtjeIGmHj2mxfcYYh47r0NZUyxZ3ep9l\nwLIO8TXA4TvKIyIiJke3w18vl7/0DSDpTb1LKSIi+lW3ReU6SX8F7CvpD4DbyIJdERExSrd3f32u\nrE3/PNW8yh/bXtXTzCIiou/ssKhImkZ1t9UHgBSSiIgY0w6Hv2y/Arwo6V9NQj4REdHHur376/8B\nD5bnbv10JGj7D3uSVURE9KVui8p3yisiImJM4xYVSXNsPzWyfklERMR4djSn8q2RDUk39DiXiIjo\nczsqKvUnAR/cy0QiIqL/7aioeIztiIiI7exoov5dkp6n6rHsVbYp+7a9T0+zi4iIvjJuUbE9bbIS\niYiI/rcz66lERESMK0UlIiIaM+lFRdIhku6vvZ6XdL6kiyQ9U4ufXDvnAklDkh6TdGItvqDEhiQt\nmexriYiIV+v2G/WNsf0YMB9+8bDKZ4CbqFZ6vMz25+rtJR0KnA4cBrwVuE3S28vhLwLHU61Xf7ek\nFbYfmZQLiYiI7Ux6URnlOOBx2/9YLTPf0ULgWtsvAU9KGgKOLseGbD8BIOna0jZFJSKiJW3PqZwO\nXFPbP1fSA5KWSZpeYjOBp2tthktsrHhERLSktaIiaU/gFOCbJXQF8DaqobGNwOdHmnY43ePEO33W\nYklrJK3ZsmXLLuUdERFja7OnchJwr+1NALY32X7F9s+plioeGeIaBmbXzpsFbBgnvh3bS20P2h4c\nGBho+DIiImJEm3MqZ1Ab+pI0w/bGsnsa8FDZXgF8XdKlVBP184C7qHoq8yQdRDXZfzrwHycp974y\nd8nEVy1Yf8kHG8wkIl7rWikqkvamumvrY7Xw/5I0n2oIa/3IMdsPS7qOagJ+G3BOWY0SSecCtwLT\ngGW2H560i4iIiO20UlRsvwj8yqjYR8ZpfzFwcYf4SmBl4wlGRMSEtH33V0REvIakqERERGNSVCIi\nojEpKhER0ZgUlYiIaEyKSkRENCZFJSIiGpOiEhERjUlRiYiIxqSoREREY1JUIiKiMSkqERHRmBSV\niIhoTIpKREQ0JkUlIiIa0+bKj9EHsmpkROyM1noqktZLelDS/ZLWlNh+klZJWld+Ti9xSbp
c0pCk\nByQdWXufRaX9OkmL2rqeiIhof/jrN2zPtz1Y9pcAq23PA1aXfYCTqNamnwcsBq6AqggBFwLvBo4G\nLhwpRBERMfnaLiqjLQSWl+3lwKm1+FWu3AnsK2kGcCKwyvZW288Bq4AFk510RERU2iwqBr4r6R5J\ni0vsQNsbAcrPA0p8JvB07dzhEhsrHhERLWhzov5Y2xskHQCskvToOG3VIeZx4q8+uSpaiwHmzJkz\nkVwjIqILrfVUbG8oPzcDN1HNiWwqw1qUn5tL82Fgdu30WcCGceKjP2up7UHbgwMDA01fSkREFK0U\nFUlvkvSWkW3gBOAhYAUwcgfXIuDmsr0COLPcBXYM8JMyPHYrcIKk6WWC/oQSi4iIFrQ1/HUgcJOk\nkRy+bvvvJN0NXCfpLOAp4MOl/UrgZGAIeBH4KIDtrZI+C9xd2n3G9tbJu4yIiKhrpajYfgJ4V4f4\ns8BxHeIGzhnjvZYBy5rOMSIidt5Uu6U4IiL6WIpKREQ0Js/+ip7Jc8Midj/pqURERGNSVCIiojEp\nKhER0ZgUlYiIaEyKSkRENCZFJSIiGpOiEhERjUlRiYiIxqSoREREY1JUIiKiMSkqERHRmDz7K6ak\nXXluGOTZYRFtSU8lIiIaM+lFRdJsSd+TtFbSw5LOK/GLJD0j6f7yOrl2zgWShiQ9JunEWnxBiQ1J\nWjLZ1xIREa/WxvDXNuATtu8t69TfI2lVOXaZ7c/VG0s6FDgdOAx4K3CbpLeXw18EjgeGgbslrbD9\nyKRcRUREbGfSi4rtjcDGsv2CpLXAzHFOWQhca/sl4ElJQ8DR5dhQWZoYSdeWtikqEREtaXWiXtJc\n4Ajgh8CxwLmSzgTWUPVmnqMqOHfWThvml0Xo6VHxd4/xOYuBxQBz5sxp7gJiysoCYRHtaG2iXtKb\ngRuA820/D1wBvA2YT9WT+fxI0w6ne5z49kF7qe1B24MDAwO7nHtERHTWSk9F0uupCsrVtm8EsL2p\ndvzLwLfL7jAwu3b6LGBD2R4rHhERLWjj7i8BXwXW2r60Fp9Ra3Ya8FDZXgGcLukNkg4C5gF3AXcD\n8yQdJGlPqsn8FZNxDRER0VkbPZVjgY8AD0q6v8Q+BZwhaT7VENZ64GMAth+WdB3VBPw24BzbrwBI\nOhe4FZgGLLP98GReSLw2ZT4mYuLauPvr7+k8H7JynHMuBi7uEF853nkRETG58o36iIhoTJ79FdGg\nDJ3F7i49lYiIaEx6KhFTRHo58VqQohLxGpClAmKqyPBXREQ0Jj2ViMjQWzQmRSUidkkKUtSlqERE\na3Z1LmhXpKD1RuZUIiKiMempRMRuKcN2vZGiEhGxk1KQxpaiEhExidqaR5qsYpY5lYiIaEyKSkRE\nNKbvi4qkBZIekzQkaUnb+URE7M76uqhImgZ8ETgJOJRq9chD280qImL31ddFBTgaGLL9hO2XgWuB\nhS3nFBGx2+r3ojITeLq2P1xiERHRgn6/pbjTWvferpG0GFhcdv9Z0mMT/Lz9gR9P8NzJMtVznOr5\nwdTPcarnB8mxCY3mpz/b5bf4tW4a9XtRGQZm1/ZnARtGN7K9FFi6qx8maY3twV19n16a6jlO9fxg\n6uc41fOD5NiEqZ7fWPp9+OtuYJ6kgyTtCZwOrGg5p4iI3VZf91Rsb5N0LnArMA1YZvvhltOKiNht\n9XVRAbC9Elg5SR+3y0Nok2Cq5zjV84Opn+NUzw+SYxOmen4dyd5uXjsiImJC+n1OJSIippAUlS5N\n5cfBSJot6XuS1kp6WNJ5bec0FknTJN0n6dtt5zKapH0lXS/p0fJn+W/azmk0Sf+t/Dd+SNI1kt44\nBXJaJmmzpIdqsf0krZK0rvycPsXy+/Py3/kBSTdJ2ret/MbKsXbsf0iypP3byG1npah0oQ8eB7MN\n+ITtdwDHAOdMsfzqzgPWtp3EGL4A/J3tXwfexRTLU9J
M4A+BQduHU92ccnq7WQFwJbBgVGwJsNr2\nPGB12W/LlWyf3yrgcNvvBP4BuGCykxrlSrbPEUmzgeOBpyY7oYlKUenOlH4cjO2Ntu8t2y9Q/WU4\n5Z4sIGkW8EHgK23nMpqkfYD3Al8FsP2y7X9qN6uO9gD2krQHsDcdvpc12WzfDmwdFV4ILC/by4FT\nJzWpmk752f6u7W1l906q77i1Zow/Q4DLgP9Jhy91T1UpKt3pm8fBSJoLHAH8sN1MOvoLql+Qn7ed\nSAcHA1uAvy7Dc1+R9Ka2k6qz/QzwOap/tW4EfmL7u+1mNaYDbW+E6h89wAEt5zOe3wduaTuJ0SSd\nAjxj+0dt57IzUlS609XjYNom6c3ADcD5tp9vO586SR8CNtu+p+1cxrAHcCRwhe0jgJ/S7pDNdsq8\nxELgIOCtwJsk/ad2s+pvkj5NNXx8ddu51EnaG/g08Mdt57KzUlS609XjYNok6fVUBeVq2ze2nU8H\nxwKnSFpPNXz4fkl/025KrzIMDNse6eFdT1VkppIPAE/a3mL7X4AbgX/bck5j2SRpBkD5ubnlfLYj\naRHwIeB3PfW+W/E2qn88/Kj8zswC7pX0q61m1YUUle5M6cfBSBLVXMBa25e2nU8nti+wPcv2XKo/\nv/9je8r8K9v2/wWelnRICR0HPNJiSp08BRwjae/y3/w4ptjNBDUrgEVlexFwc4u5bEfSAuCTwCm2\nX2w7n9FsP2j7ANtzy+/MMHBk+f90SktR6UKZ0Bt5HMxa4Lop9jiYY4GPUP3r//7yOrntpPrQfwWu\nlvQAMB/405bzeZXSi7oeuBd4kOr3t/VvXUu6BrgDOETSsKSzgEuA4yWto7p76ZIplt9fAm8BVpXf\nly+1ld84OfalfKM+IiIak55KREQ0JkUlIiIak6ISERGNSVGJiIjGpKhERERjUlQiIqIxKSoREdGY\nFJWIiGjM/wfQ3LAf3y77kwAAAABJRU5ErkJggg==\n", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "places_ba.trip_distance_miles.plot.hist(bins=20, range=(0,15));" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "2296" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Most detailed spatial identifier in public data is tract_id\n", - "\n", - "# How many different tracts are visited?\n", - "places_ba.tract_id.unique().shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "9715" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# How many different households?\n", - "places_ba.sampno.unique().shape[0]" - ] - }, - { - 
"cell_type": "code", - "execution_count": 32, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "23939" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# How many different people?\n", - "len(places_ba.groupby(['sampno','perno']))" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " sampno perno plano tract_id county_id state_id\n", - "0 1031985 1 1 252202.0 95.0 6.0\n", - "1 1031985 1 2 252108.0 95.0 6.0\n", - "2 1031985 1 3 252202.0 95.0 6.0\n" - ] - } - ], - "source": [ - "# How are the ID's encoded?\n", - "\n", - "print(places_ba[['sampno','perno','plano','tract_id','county_id','state_id']].head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " tract_id county_id state_id\n", - "count 117344.0 117344.0 117344.0\n", - "mean 0.0 0.0 0.0\n", - "std 0.0 0.0 0.0\n", - "min 0.0 0.0 0.0\n", - "25% 0.0 0.0 0.0\n", - "50% 0.0 0.0 0.0\n", - "75% 0.0 0.0 0.0\n", - "max 0.0 0.0 0.0\n" - ] - } - ], - "source": [ - "# Do the floating point decimals encode anything, or is it just a mistake in the source\n", - "# data that they're not all stored as ints? 
(Looks like a mistake, and my guess is it\n", - "# happened because int columns can't have missing values in certain database systems.)\n", - "\n", - "asfloat = places_ba[['tract_id','county_id','state_id']].dropna()\n", - "asint = places_ba[['tract_id','county_id','state_id']].dropna().astype(int)\n", - "\n", - "print((asfloat - asint).describe())" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mode 23939\n", - "dtype: int64\n", - " mode\n", - "count 93406.000000\n", - "mean 5.279147\n", - "std 4.039473\n", - "min 1.000000\n", - "25% 5.000000\n", - "50% 5.000000\n", - "75% 6.000000\n", - "max 29.000000\n" - ] - } - ], - "source": [ - "# What does the travel mode data look like? We can replace null values with zero\n", - "\n", - "print(places_ba[['mode']].isnull().sum())\n", - "print(places_ba[['mode']].describe())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Census identifiers" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Suppress scientific notation\n", - "\n", - "pd.set_option('display.float_format', lambda x: '%.0f' % x)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "6912\n", - "14388\n" - ] - } - ], - "source": [ - "# Is the mapping between census tracts and city names consistent? 
-- No\n", - "\n", - "print(places.tract_id.drop_duplicates().shape[0])\n", - "print(places[['tract_id', 'city']].drop_duplicates().shape[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 460519\n", - "mean 191724\n", - "std 242716\n", - "min 100\n", - "25% 5911\n", - "50% 43317\n", - "75% 402800\n", - "max 999999\n", - "Name: tract_id, dtype: float64" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places.tract_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 460519\n", - "mean 58\n", - "std 50\n", - "min 1\n", - "25% 37\n", - "50% 59\n", - "75% 79\n", - "max 999\n", - "Name: county_id, dtype: float64" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places.county_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 460523\n", - "mean 6\n", - "std 5\n", - "min 1\n", - "25% 6\n", - "50% 6\n", - "75% 6\n", - "max 99\n", - "Name: state_id, dtype: float64" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places.state_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "6 455641\n", - "99 1064\n", - "32 957\n", - "41 454\n", - "4 412\n", - "Name: state_id, dtype: int64" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places.state_id.value_counts().head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "collapsed": 
false - }, - "outputs": [], - "source": [ - "# How to deal with this? I think `tract_id` is an integer representation\n", - "# of the 4-digit tract ID within the couty plus the 2 digit suffix. \n", - "\n", - "# So the full unique identifier is `state_id` + `county_id` (3 digits) + `tract_id` (6 digits)\n", - "\n", - "places['_full_tract_id'] = places.state_id * 1e9 + places.county_id * 1e6 + places.tract_id\n", - "\n", - "# Presumably the all-9 entries reflect missing data, but documentation doesn't specify\n", - "\n", - "places.loc[(places.tract_id == 999999) |\n", - " (places.county_id == 999) |\n", - " (places.state_id == 99), '_full_tract_id'] = np.nan" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9098\n", - "14194\n" - ] - } - ], - "source": [ - "print(places._full_tract_id.drop_duplicates().shape[0])\n", - "print(places[['_full_tract_id', 'city']].drop_duplicates().shape[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "6115041100 14\n", - "6091010000 12\n", - "6027000800 11\n", - "6107000100 10\n", - "6097154303 10\n", - "Name: _full_tract_id, dtype: int64" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places[['_full_tract_id', 'city']].drop_duplicates().\\\n", - " _full_tract_id.value_counts().head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " _full_tract_id city\n", - "3238 6115041100 BROWNSVILLE\n", - "18952 6115041100 MARYSVILLE\n", - "33913 6115041100 NORTH SAN JUAN\n", - "44697 6115041100 DOBBINS\n", - "44705 6115041100 YUBA\n", - "100194 6115041100 BANGOR\n", - "160254 6115041100 CAMPTONVILLE\n", - 
"178724 6115041100 STRAWBERRY VALLEY\n", - "271235 6115041100 CHALLENGE-BROWNSVILLE\n", - "271250 6115041100 OREGON HOUSE\n", - "300021 6115041100 FORBESTOWN\n", - "317626 6115041100 CHALLENGE-BROWNSVILL\n", - "402446 6115041100 BROWNS VALLEY\n", - "403959 6115041100 RACKERBY\n" - ] - } - ], - "source": [ - "print(places[['_full_tract_id', 'city']].drop_duplicates().\\\n", - " loc[places._full_tract_id == 6115041100])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "So, there are still many census tracts that correspond to more than one city. I think we probably just want to use the census tracts as our unit of analysis. \n", - "\n", - "For descriptive purposes we can map each census tract to its most common corresponding city." - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " city\n", - "_full_tract_id \n", - "1015000800 ANNISTON\n", - "1101001500 MONTGOMERY\n", - "1161400100 SEVILLA\n", - "2020001000 ANCHORAGE\n", - "2020001100 ANCHORAGE\n" - ] - } - ], - "source": [ - "# Map each tract to its most common corresponding city\n", - "\n", - "tracts = places[['_full_tract_id', 'city']].groupby('_full_tract_id').\\\n", - " agg(lambda x:x.value_counts().index[0])\n", - " \n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9098\n", - "9097\n" - ] - } - ], - "source": [ - "print(places._full_tract_id.drop_duplicates().shape[0])\n", - "print(tracts.shape[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { 
- "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Activities\n", - "\n", - "\"The activity reported is for a single travel day and contains the highest level of detail about the survey participants' travel purpose\" (data dictionary)\n", - "\n", - "So, there can be multiple \"activities\" at each \"place\" visited as part of a trip." - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "604711" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities = pd.read_csv(z.open('caltrans_full_survey/survey_activity.csv'), low_memory=False)\n", - "\n", - "len(activities)" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "157011" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# TO DO - fix to reflect households\n", - "\n", - "activities_ba = activities[activities.county_id.isin([1, 13, 41, 55, 75, 81, 85, 95, 97])]\n", - "\n", - "len(activities_ba)" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " sampno perno plano actno tripno\n", - "1 1041766 3 1 1 nan\n", - "4 1051203 1 9 1 8\n", - "8 1065929 1 1 1 nan\n", - "14 1097949 1 1 1 nan\n", - "22 1124271 1 5 1 4\n", - "27 1126030 2 1 1 nan\n", - "30 1127449 2 1 1 nan\n", - "32 1127626 1 1 1 nan\n", - "35 1128657 1 1 1 nan\n", - "37 1129482 1 1 1 nan\n" - ] - } - ], - "source": [ - "# What do the identifiers look like? 
\n", - "\n", - "print(activities_ba[['sampno', 'perno', 'plano', 'actno', 'tripno']].head(10))" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "118271\n", - "118271\n", - "117345\n" - ] - } - ], - "source": [ - "# Each place occurs in the activities table at least once\n", - "\n", - "print((activities_ba.actno == 1).sum()) # number of activities with id 1\n", - "\n", - "print(len(activities_ba.groupby(['sampno', 'perno', 'plano']))) # unique places referenced\n", - "\n", - "print(len(places_ba)) # records in places table" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 604711\n", - "mean 2624572\n", - "std 1695612\n", - "min 1031985\n", - "25% 1662824\n", - "50% 1979173\n", - "75% 2797238\n", - "max 7212388\n", - "Name: sampno, dtype: float64" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities.sampno.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 604711\n", - "mean 2\n", - "std 1\n", - "min 1\n", - "25% 1\n", - "50% 2\n", - "75% 3\n", - "max 8\n", - "Name: perno, dtype: float64" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities.perno.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 604711\n", - "mean 3\n", - "std 3\n", - "min 1\n", - "25% 1\n", - "50% 3\n", - "75% 5\n", - "max 34\n", - "Name: plano, dtype: float64" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - 
"activities.plano.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Trips\n", - "\n", - "What's the correct way to aggregate places into trips?\n", - "\n", - "It seems like each person recorded their travel for a single day as a sequence of places visited, without explicit classification into trips or tours. So that's up to us to do by applying whatever rules seem appropriate. \n", - "\n", - "Probably it's not even possible to identify tours with certainty from the anonymized data, because the place names and precise locations are redacted." - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "sampno perno\n", - "1031985 1 3\n", - " 2 3\n", - "1033944 1 16\n", - "1035274 1 8\n", - " 2 6\n", - "1037952 1 3\n", - " 2 1\n", - "1039620 1 5\n", - " 2 5\n", - "1041076 1 4\n", - "Name: plano, dtype: int64" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Dig into `tripno` some more\n", - "\n", - "places_ba.groupby(['sampno', 'perno']).plano.max().head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 19512\n", - "mean 1\n", - "std 0\n", - "min 1\n", - "25% 1\n", - "50% 1\n", - "75% 1\n", - "max 1\n", - "dtype: float64" - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Do any respondents have multiple trip sequences? 
-- No!\n", - "\n", - "plano_counts = places_ba.groupby(['sampno', 'perno']).plano.max()\n", - "tripno_counts = places_ba.groupby(['sampno', 'perno']).tripno.max()\n", - "\n", - "(plano_counts - tripno_counts).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 93406\n", - "mean 1\n", - "std 0\n", - "min 1\n", - "25% 1\n", - "50% 1\n", - "75% 1\n", - "max 1\n", - "dtype: float64" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(places_ba.plano - places_ba.tripno).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
travel_datearr_timedep_timetract_idcitymodetrip_distance_milesprev_trip_duration_minact_dur
1522012-07-1703:00:0010:00:00509000SUNNYVALEnannannan425
1532012-07-1710:00:0010:00:00509000SUNNYVALE511030
1542012-07-1711:00:0011:00:00508504SUNNYVALE52151
1552012-07-1711:00:0011:00:00508504SUNNYVALE1059
1562012-07-1711:00:0013:00:00508504SUNNYVALE105105
1572012-07-1713:00:0014:00:00509000SUNNYVALE521060
1582012-07-1714:00:0015:00:00500100SAN JOSE582025
1592012-07-1715:00:0002:00:00509000SUNNYVALE5920699
\n", - "
" - ], - "text/plain": [ - " travel_date arr_time dep_time tract_id city mode \\\n", - "152 2012-07-17 03:00:00 10:00:00 509000 SUNNYVALE nan \n", - "153 2012-07-17 10:00:00 10:00:00 509000 SUNNYVALE 5 \n", - "154 2012-07-17 11:00:00 11:00:00 508504 SUNNYVALE 5 \n", - "155 2012-07-17 11:00:00 11:00:00 508504 SUNNYVALE 1 \n", - "156 2012-07-17 11:00:00 13:00:00 508504 SUNNYVALE 1 \n", - "157 2012-07-17 13:00:00 14:00:00 509000 SUNNYVALE 5 \n", - "158 2012-07-17 14:00:00 15:00:00 500100 SAN JOSE 5 \n", - "159 2012-07-17 15:00:00 02:00:00 509000 SUNNYVALE 5 \n", - "\n", - " trip_distance_miles prev_trip_duration_min act_dur \n", - "152 nan nan 425 \n", - "153 1 10 30 \n", - "154 2 15 1 \n", - "155 0 5 9 \n", - "156 0 5 105 \n", - "157 2 10 60 \n", - "158 8 20 25 \n", - "159 9 20 699 " - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# What does a sequence of places look like?\n", - "\n", - "varlist = ['travel_date', 'arr_time', 'dep_time', 'tract_id', 'city', 'mode', \n", - " 'trip_distance_miles', 'prev_trip_duration_min', 'act_dur']\n", - "\n", - "places_ba.loc[(places_ba.sampno == 1035274) & (places_ba.perno == 1), varlist]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So, it looks like the key to identifying trip/tour semantics involves looking at the trip purposes in the activities table. Transfers are noted as a particular purpose, and those trip legs need to be aggregated together. \n", - "\n", - "The first and last activities of the day probably take place at home, but we can't verify using the public data.\n", - "\n", - "It looks like the arrival and departure times, and trip durations, are approximate based on people's recollections, but distances are precise because they come from the Google Maps interface." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Travel modes" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "5 50139\n", - "6 18632\n", - "1 15924\n", - "2 2244\n", - "15 1635\n", - "24 1444\n", - "7 566\n", - "26 459\n", - "8 299\n", - "25 293\n", - "Name: mode, dtype: int64" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# What are the travel modes?\n", - "\n", - "places_ba['mode'].value_counts().head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "FROM DATA DICTIONARY\n", - "\n", - "Travel mode:\n", - "\n", - "- 1- Walk; \n", - "- 2- Bike; \n", - "- 3- Wheelchair/mobility scooter; \n", - "- 4- Other non-motorized; \n", - "- 5- Auto/van/truck driver; \n", - "- 6- Auto/van/truck passenger; \n", - "- 7- Carpool/vanpool; \n", - "- 8- Motorcycle/scooter/moped; \n", - "- 9- Taxi/hired car/limo; \n", - "- 10- Rental car/vehicle; \n", - "- 11- Private shuttle (Super shuttle, employer, hotel, etc.); \n", - "- 12- Greyhound bus; \n", - "- 13- Plane; \n", - "- 14- Other private transit; \n", - "- 15- Local bus, rapid bus; \n", - "- 16- Express bus/commuter bus (AC Transbay, Golden Gate Transit, etc.); \n", - "- 17- Premium bus (Metro Orange/Silver Line); \n", - "- 18- School bus; \n", - "- 19- Public transit shuttle (DASH, Emery Go Round, etc.); \n", - "- 20- AirBART/LAX FlyAway; \n", - "- 21- Dial-a-ride/paratransit (access services, etc.); \n", - "- 22- Amtrak bus; \n", - "- 23- Other bus; \n", - "- 24- BART, Metro Red/Purple Line; \n", - "- 25- ACE, Amtrak, Cal- train, Coaster, Metrolink; \n", - "- 26- Metro Blue/Green/Gold Line, Muni Metro, Sacramento Light Rail, San Diego Sprinter/Trolley/Orange/ Blue/Green, 
VTA light rail; \n", - "- 27- Streetcar/cable car, \n", - "- 28- Other rail; \n", - "- 29- Ferry/boat; \n", - "- 99- RF" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Trip purposes" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "1 47241\n", - "2 16700\n", - "21 9523\n", - "9 9151\n", - "27 8583\n", - "22 7250\n", - "8 6151\n", - "7 5792\n", - "37 5040\n", - "31 4737\n", - "39 3484\n", - "17 3105\n", - "25 3039\n", - "34 2701\n", - "29 2541\n", - "Name: purpose, dtype: int64" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# What are the trip purposes?\n", - "\n", - "activities_ba.purpose.value_counts().head(15)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "FROM DATA DICTIONARY\n", - "\n", - "[Somewhere there's a `ptype` key indicating categories of purposes, probably based on the home/ work/ school locations, but I can't find it in these data tables.]\n", - "\n", - "Activity purpose: \n", - "\n", - "[These look like activities at home]\n", - "\n", - "- 1- Personal activities (sleeping, personal care, leisure, chores); \n", - "- 2- Preparing meals/eating; \n", - "- 3- Hosting visitors/entertaining guests; \n", - "- 4- Exercise (with or without equipment)/playing sports; \n", - "- 5- Study/schoolwork; \n", - "- 6- Work for pay at home using telecommunications equipment; \n", - "- 7- Using computer/telephone/cell or smart phone, or other communications device for personal activities; \n", - "- 8- All other activities at home; \n", - "\n", - "[These look like activites at work]\n", - "\n", - "- 9- Work/job duties; \n", - "- 10- Training; \n", - "- 11- Meals at work; \n", - "- 12- 
Work-sponsored social activities (holiday/birthday celebrations, etc.); \n", - "- 13- Non-work-related activities (social clubs, etc.); \n", - "- 14- Exercise/sports; \n", - "- 15- Volunteer work/activities, \n", - "- 16- All other work- related activities at work; \n", - "\n", - "[These look like activities at school]\n", - "\n", - "- 17- School/classroom/ laboratory; \n", - "- 18- Meals at school/college; \n", - "- 19- After-school or non-class-related sports/physical activities; \n", - "- 20- All other after-school or non-class-related activities (library, music rehearsal, clubs, etc.); \n", - "\n", - "[These look like transport-related]\n", - "\n", - "- 21- Change type of transportation/transfer (walk to bus, walk to/from parked car); \n", - "- 22- pick up/drop off passenger(s); \n", - "\n", - "[These look like activities at non-home, non-work, non-school locations]\n", - "\n", - "- 23- Drive-through meals (snacks, coffee, etc.) (show if PTYPE <> 1 [Home]); \n", - "- 24- Drive-through other (ATM, bank, etc.) 
(show if PTYPE <> 1); \n", - "- 25- Work-related (meetings, sales calls, deliveries); \n", - "- 26- Service private vehicle (gas, oil, lubes, repairs), \n", - "- 27- Routine shopping (groceries, clothing, convenience store, household maintenance, etc.); \n", - "- 28- Shopping for major purchases or specialty items (appliance, electronics, new vehicles, major household repairs, etc.); \n", - "- 29- Household errands (bank, dry cleaning, etc.); \n", - "- 30- Personal business (visit government office, attorney, accountant, etc.); \n", - "- 31- Eat meal at restaurant/diner; \n", - "- 32- Health care (doctor, dentist, eye care, chiropractor, veterinarian, etc.); \n", - "- 33- Civic/ religious activities; \n", - "- 34- Outdoor exercise (outdoor sports, jogging, bicycling, walking the dog, etc.); \n", - "- 35- Indoor exercise (gym, yoga, etc.); \n", - "- 36- Entertainment (movies, sporting events, etc.); \n", - "- 37- Social/visiting friends and relatives; \n", - "- 38- Other (specify), \n", - "\n", - "[Misc]\n", - "\n", - "- 39- Loop trip (for interviewer only- not listed on diary), \n", - "- 99- DK/RF" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/ChoiceModels progress update 
2017-06-09.pdf b/notebooks/ChoiceModels progress update 2017-06-09.pdf deleted file mode 100644 index d298540..0000000 Binary files a/notebooks/ChoiceModels progress update 2017-06-09.pdf and /dev/null differ diff --git a/notebooks/Data-prep-02.ipynb b/notebooks/Data-prep-02.ipynb deleted file mode 100644 index 32bf1e2..0000000 --- a/notebooks/Data-prep-02.ipynb +++ /dev/null @@ -1,779 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data prep for estimating models\n", - "\n", - "Sam Maurer, August 2017 | Python 3.6\n", - "\n", - "Original version June 2017 (v01) \n", - "Updated Aug 2017 (v02) to fix int/float issues\n", - "\n", - "This notebook generates the data tables that are used the model estimation demos. For more about the California Household Travel Survey source data, you can refer to the \"CHTS-exploration\" notebook." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import zipfile" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load raw CHTS tables\n", - "\n", - "This requires the file named caltrans_full_survey.zip. You can download it by following the instructions in the \"data\" directory." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "z = zipfile.ZipFile('../data/caltrans_full_survey.zip')" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "42426" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households = pd.read_csv(z.open('caltrans_full_survey/survey_households.csv'), low_memory=False)\n", - "len(households)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "109113" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons = pd.read_csv(z.open('caltrans_full_survey/survey_person.csv'), low_memory=False)\n", - "len(persons)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "460524" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places = pd.read_csv(z.open('caltrans_full_survey/survey_place.csv'), low_memory=False)\n", - "len(places)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "604711" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities = pd.read_csv(z.open('caltrans_full_survey/survey_activity.csv'), low_memory=False)\n", - "len(activities)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Clean up the places table and generate tract 
identifiers" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "460516\n" - ] - } - ], - "source": [ - "# Discard places with missing identifiers, and convert ID components to ints. \n", - "# (Some identifiers are stored as floats in the source table, but the \n", - "# \"CHTS-exploration\" notebook confirms that the decimal vlaues don't encode anything.)\n", - "\n", - "places.dropna(subset=['state_id','county_id','tract_id','city'], inplace=True)\n", - "\n", - "places['state_id'] = places.state_id.astype(int)\n", - "places['county_id'] = places.county_id.astype(int)\n", - "places['tract_id'] = places.tract_id.astype(int)\n", - "\n", - "print(len(places))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "460514\n" - ] - } - ], - "source": [ - "# Other missing values are encoded as nines; discard those as well\n", - "\n", - "places.drop((places.tract_id == 999999) | \n", - " (places.county_id == 999) | \n", - " (places.state_id == 99), inplace=True)\n", - "\n", - "print(len(places))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2 6\n", - "3 0\n", - "4 5\n", - "Name: mode, dtype: int64\n" - ] - } - ], - "source": [ - "# Clean up other data fields\n", - "\n", - "# Replace null travel mode with zero and encode as int (mode seems to be a protected\n", - "# keyword, so we have to use places['mode'] rather than places.mode)\n", - "\n", - "places['mode'] = places['mode'].fillna(0).astype(int)\n", - "\n", - "print(places['mode'].head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", 
- "output_type": "stream", - "text": [ - "6098941414\n", - "6\n", - "98\n" - ] - } - ], - "source": [ - "# Define functions to move back and forth between full numerical tract ID and its components\n", - "\n", - "def full_tract_id(state_id, county_id, tract_id):\n", - " return state_id * 10**9 + county_id * 10**6 + tract_id\n", - "\n", - "def state_id(full_tract_id):\n", - " return full_tract_id // 10**9\n", - "\n", - "def county_id(full_tract_id):\n", - " _county_tract = np.fmod(full_tract_id, 10**9)\n", - " return _county_tract // 10**6\n", - "\n", - "print(full_tract_id(6, 98, 941414))\n", - "print(state_id(6098141414))\n", - "print(county_id(6098141414))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " state_id county_id tract_id city full_tract_id\n", - "2 6 95 252202 VALLEJO 6095252202\n", - "3 6 95 252202 VALLEJO 6095252202\n", - "4 6 95 251902 VALLEJO 6095251902\n" - ] - } - ], - "source": [ - "# Generate full tract identifiers\n", - "\n", - "places['full_tract_id'] = full_tract_id(places.state_id, places.county_id, places.tract_id)\n", - "\n", - "print(places[['state_id','county_id','tract_id','city','full_tract_id']].head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Build a master table of census tracts" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9310\n", - " city\n", - "full_tract_id \n", - "1015000800 ANNISTON\n", - "1101001500 MONTGOMERY\n", - "1161400100 SEVILLA\n", - "2020001000 ANCHORAGE\n", - "2020001100 ANCHORAGE\n" - ] - } - ], - "source": [ - "# Generate a master list of census tracts, keeping the city 
name most commonly \n", - "# associated with each tract\n", - "\n", - "tracts = places[['full_tract_id','city']].groupby('full_tract_id').\\\n", - " agg(lambda x:x.value_counts().index[0])\n", - "\n", - "print(tracts.shape[0])\n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1583\n", - " city\n", - "full_tract_id \n", - "6001008309 TIJUANA\n", - "6001400100 BERKELEY\n", - "6001400200 OAKLAND\n", - "6001400300 OAKLAND\n", - "6001400400 OAKLAND\n" - ] - } - ], - "source": [ - "# Limit to the 9-county San Francisco Bay Area\n", - "\n", - "tracts = tracts[(state_id(tracts.index).isin([6])) & \n", - " (county_id(tracts.index).\\\n", - " isin([1, 13, 41, 55, 75, 81, 85, 95, 97]))].copy()\n", - "\n", - "print(tracts.shape[0])\n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "85 371\n", - "1 360\n", - "13 207\n", - "75 195\n", - "81 158\n", - "97 99\n", - "95 97\n", - "41 55\n", - "55 41\n", - "Name: full_tract_id, dtype: int64\n" - ] - } - ], - "source": [ - "print(county_id(tracts.index).value_counts())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Calculate some tract-level covariates\n", - "\n", - "Residential density, school/employment density" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Note: the `home_tract_id` in the households table is already a full 11-digit\n", - "# identifier, with the same format that we generated for the places table.\n", - "# Same with `empl_tract_id` and 
`school_tract_id` in the persons table." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Residential density = sum of weighted household sizes by census tract of home\n", - "\n", - "households['_weighted_persons_count'] = households.persons_count * households.hhwgt\n", - "\n", - "home_density = households.groupby('home_tract_id')._weighted_persons_count.sum().\\\n", - " rename('home_density').to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Employment density = sum of person weights by census tract of work location\n", - "\n", - "work_density = persons.groupby('empl_tract_id').perwgt.sum().\\\n", - " rename('work_density').to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# School density = sum of person weights by census tract of school location\n", - "\n", - "school_density = persons.groupby('school_tract_id').perwgt.sum().\\\n", - " rename('school_density').to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " city home_density work_density school_density\n", - "full_tract_id \n", - "6001008309 TIJUANA 0.000000 0.000000 0.000000\n", - "6001400100 BERKELEY 13.437961 13.130867 13.511570\n", - "6001400200 OAKLAND 11.089638 4.248928 0.894794\n", - "6001400300 OAKLAND 28.878399 7.671554 0.000000\n", - "6001400400 OAKLAND 16.884910 4.063805 8.150402\n" - ] - } - ], - "source": [ - "# Merge these into the census tracts table, only keeping Bay Area tracts\n", - "\n", - "tracts = pd.merge(tracts, home_density, how='left', left_index=True, right_index=True)\n", - "tracts = pd.merge(tracts, work_density, how='left', left_index=True, 
right_index=True)\n", - "tracts = pd.merge(tracts, school_density, how='left', left_index=True, right_index=True)\n", - "tracts = tracts.fillna(0) # fill missing values with zero\n", - "\n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Generate a table of trips\n", - "\n", - "For now, this is a table of places visited for non-school, non-work activities" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# - trip destinations are in `places.full_tract_id` (sometimes missing)\n", - "# - trip purposes are in `activities.purpose`, and we want 23 thru 38\n", - "# - places and acitivities are linked by `sampno`, `perno`, `plano`, and there \n", - "# can be multiple activities per place" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10417660312\n" - ] - } - ], - "source": [ - "# Function to generate a single unique ID for places\n", - "\n", - "def place_id(sampno, perno, plano):\n", - " return sampno * 10**4 + perno * 10**2 + plano\n", - "\n", - "print(place_id(1041766, 3, 12))" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Add place_id to places table and activities table\n", - "\n", - "places['place_id'] = place_id(places.sampno, places.perno, places.plano)\n", - "activities['place_id'] = place_id(activities.sampno, activities.perno, activities.plano)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Get list of places that have a secondary activity\n", - "\n", - 
"_secondary_activity_places = activities.loc[activities.purpose.isin(range(23, 38+1)),\n", - " 'place_id'].drop_duplicates()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "147004\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "10319850202 6095251902 5 5.125960\n", - "10320360102 6073017051 5 3.619056\n", - "10320360104 6073009304 5 19.351620\n", - "10320360105 6073008511 5 6.451126\n", - "10320360202 6073020211 6 10.466616\n" - ] - } - ], - "source": [ - "# Generate a table of those places with some covariates\n", - "\n", - "trips = places.loc[places.place_id.isin(_secondary_activity_places) &\n", - " places.full_tract_id.notnull(),\n", - " ['place_id', 'full_tract_id', 'mode', \n", - " 'trip_distance_miles']].set_index('place_id')\n", - "\n", - "print(trips.shape[0])\n", - "print(trips.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "36764\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "10319850202 6095251902 5 5.125960\n", - "10335860102 6085511915 6 156.370628\n", - "10335860103 6085512027 6 1.615535\n", - "10335860104 6085512027 6 0.375708\n", - "10335860105 6085511915 6 0.894730\n" - ] - } - ], - "source": [ - "# Limit to destinations in the 9-county San Francisco Bay Area\n", - "\n", - "trips = trips[(state_id(trips.full_tract_id).isin([6])) & \n", - " (county_id(trips.full_tract_id).\\\n", - " isin([1, 13, 41, 55, 75, 81, 85, 95, 97]))].copy()\n", - "\n", - "print(trips.shape[0])\n", - "print(trips.head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save estimaton data to 
disk" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "tracts.to_csv('../data/tracts_v02.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "trips.to_csv('../data/trips_v02.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# TO DO\n", - "# - for a mode choice model, could probably generate average travel times between\n", - "# tracts just from the observed data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/Destination-choice-models-02.ipynb b/notebooks/Destination-choice-models-02.ipynb deleted file mode 100644 index 861a722..0000000 --- a/notebooks/Destination-choice-models-02.ipynb +++ /dev/null @@ -1,652 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exploring destination choice models\n", - "\n", - "Sam Maurer, August 2017 | Python 3.6\n", - "\n", - "Original version June 2017 (v01) \n", - "Updated Aug 2017 (v02) to use new version of the estimation data (see 
\"Data-prep-02\" notebook)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/maurer/anaconda/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n", - " from pandas.core import datetools\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from patsy import dmatrix\n", - "from urbansim.urbanchoice import interaction, mnl\n", - "\n", - "from choicemodels import MultinomialLogit" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load estimation data from disk" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1583\n", - " city home_density work_density school_density\n", - "full_tract_id \n", - "6001008309 TIJUANA 0.000000 0.000000 0.000000\n", - "6001400100 BERKELEY 13.437961 13.130867 13.511570\n", - "6001400200 OAKLAND 11.089638 4.248928 0.894794\n", - "6001400300 OAKLAND 28.878399 7.671554 0.000000\n", - "6001400400 OAKLAND 16.884910 4.063805 8.150402\n" - ] - } - ], - "source": [ - "tracts = pd.read_csv('../data/tracts_v02.csv').set_index('full_tract_id')\n", - "\n", - "print(tracts.shape[0])\n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "36764\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "10319850202 6095251902 5 5.125960\n", - "10335860102 6085511915 6 
156.370628\n", - "10335860103 6085512027 6 1.615535\n", - "10335860104 6085512027 6 0.375708\n", - "10335860105 6085511915 6 0.894730\n" - ] - } - ], - "source": [ - "trips = pd.read_csv('../data/trips_v02.csv').set_index('place_id')\n", - "\n", - "print(trips.shape[0])\n", - "print(trips.head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## MNL destination choice using urbansim.urbanchoice" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# - each trip is a realized choice of a particular census tract\n", - "# - we can randomly sample alternative census tracts and build a model\n", - "# of destination choice" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# `interaction.mnl_interaction_dataset()` is not documented very well, but \n", - "# this is how it seems to work\n", - "\n", - "# Takes following input:\n", - "# - choosers: pandas.DataFrame with unique index\n", - "# - alternatives: pandas.DataFrame with unique index\n", - "# - SAMPLE_SIZE: number of alternatives for each choice scenario\n", - "# - chosenalts: list containing the alternative id chosen by each chooser?\n", - "\n", - "# Returns following output:\n", - "# - full list of alternatives that were sampled\n", - "# - long-format DataFrame merging the two tables\n", - "# - numchoosers X SAMPLE_SIZE matrix representing chosen alternatives" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Start with a sample of ~500 trips for easier computation" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "483\n", - " 
full_tract_id mode trip_distance_miles\n", - "place_id \n", - "72006700102 6085503326 5 0.574042\n", - "71863140102 6085503108 5 1.718151\n", - "24974540206 6013307201 6 2.446018\n", - "70163300110 6075017102 1 0.038407\n", - "71669940202 6001403100 5 3.793155\n" - ] - } - ], - "source": [ - "choosers = trips.loc[np.random.choice(trips.index, 500, replace=False)]\n", - "choosers = choosers.loc[choosers.trip_distance_miles.notnull()]\n", - "\n", - "print(choosers.shape[0])\n", - "print(choosers.head())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sample 100 alternatives for each and set up a long-format data table" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "48300\n", - "(483, 100)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/maurer/Dropbox/Git-imac/udst/urbansim/urbansim/urbanchoice/interaction.py:83: SettingWithCopyWarning: \n", - "A value is trying to be set on a copy of a slice from a DataFrame.\n", - "Try using .loc[row_indexer,col_indexer] = value instead\n", - "\n", - "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", - " alts_sample['join_index'] = np.repeat(choosers.index.values, SAMPLE_SIZE)\n" - ] - } - ], - "source": [ - "numalts = 100\n", - "\n", - "_, merged, chosen = interaction.mnl_interaction_dataset(\n", - " choosers=choosers, alternatives=tracts, SAMPLE_SIZE=numalts, \n", - " chosenalts=choosers.full_tract_id)\n", - "\n", - "print(merged.shape[0])\n", - "print(chosen.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Use Patsy to generate the design matrix" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - 
"text": [ - " Intercept home_density work_density school_density\n", - "full_tract_id \n", - "6085503326 1.0 33.103403 3.018663 13.646608\n", - "6041104102 1.0 17.376936 4.465194 3.304285\n", - "6001440304 1.0 3.324621 0.672532 0.000000\n", - "6013364002 1.0 12.594876 0.788063 0.762573\n", - "6095253107 1.0 26.588450 0.425587 4.469490\n" - ] - } - ], - "source": [ - "model_expression = \"home_density + work_density + school_density\"\n", - "\n", - "model_design = dmatrix(model_expression, data=merged, return_type='dataframe')\n", - "\n", - "print(model_design.head())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fit the model using mnl_estimate()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'null': -2224.297199832249, 'convergence': -2183.4783133690826, 'ratio': 0.018351363507648655}\n", - " Coefficient Std. Error T-Score\n", - "0 -2.539330e-18 0.084172 -3.016817e-17\n", - "1 1.486461e-02 0.004156 3.576700e+00\n", - "2 1.106214e-02 0.001507 7.340532e+00\n", - "3 1.349303e-02 0.003850 3.504669e+00\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/maurer/Dropbox/Git-imac/udst/urbansim/urbansim/urbanchoice/pmat.py:48: RuntimeWarning: overflow encountered in exp\n", - " return PMAT(np.exp(self.mat))\n" - ] - } - ], - "source": [ - "log_likelihoods, fit_parameters = mnl.mnl_estimate(\n", - " model_design.as_matrix(), chosen, numalts=numalts)\n", - "\n", - "print(log_likelihoods)\n", - "print(fit_parameters)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## NEW -- Same process in ChoiceModels" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": true - }, - "outputs": 
[], - "source": [ - "from choicemodels import MultinomialLogit\n", - "from choicemodels.tools import MergedChoiceTable" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "483\n" - ] - } - ], - "source": [ - "# Start with the same sample of trips\n", - "\n", - "print(choosers.shape[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Merge choosers and alternatives using a new ChoiceModels interface" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "48300\n" - ] - } - ], - "source": [ - "merged = MergedChoiceTable(observations = choosers, \n", - " alternatives = tracts, \n", - " chosen_alternatives = choosers.full_tract_id, \n", - " sample_size = numalts)\n", - "\n", - "print(type(merged))\n", - "print(merged.to_frame().shape[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Estimate a model using the ChoiceModels engine" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/maurer/Dropbox/Git-imac/udst/choicemodels/choicemodels/tools/pmat.py:48: RuntimeWarning: overflow encountered in exp\n", - " return PMAT(np.exp(self.mat))\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " CHOICEMODELS ESTIMATION RESULTS \n", - "===================================================================\n", - "Dep. Var.: chosen No. 
Observations: \n", - "Model: Multinomial Logit Df Residuals: \n", - "Method: Maximum Likelihood Df Model: \n", - "Date: Pseudo R-squ.: \n", - "Time: Pseudo R-bar-squ.: \n", - "AIC: Log-Likelihood: -2,187.181\n", - "BIC: LL-Null: -2,224.297\n", - "===================================================================\n", - " coef std err z P>|z| Conf. Int.\n", - "-------------------------------------------------------------------\n", - "home_density 0.0139 0.003 5.298 \n", - "work_density 0.0094 0.001 6.361 \n", - "school_density 0.0151 0.004 3.963 \n", - "===================================================================\n", - "CPU times: user 219 ms, sys: 14 ms, total: 233 ms\n", - "Wall time: 70.9 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "model_expression = \"home_density + work_density + school_density - 1\"\n", - "\n", - "model = MultinomialLogit(data = merged.to_frame(), \n", - " observation_id_col = merged.observation_id_col, \n", - " choice_col = merged.choice_col,\n", - " model_expression = model_expression)\n", - "\n", - "results = model.fit()\n", - "print(results)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "print(type(results))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Estimate a model using the PyLogit engine\n", - "\n", - "Usage is the same, but with an OrderedDict model expression" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from collections import OrderedDict" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -2,224.2972\n", - "Initial Log-likelihood: -2,224.2972\n", - "Estimation Time: 0.06 
seconds.\n", - "Final log-likelihood: -2,187.1807\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/maurer/anaconda/lib/python3.6/site-packages/scipy/optimize/_minimize.py:385: RuntimeWarning: Method BFGS does not use Hessian information (hess).\n", - " RuntimeWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: chosen No. Observations: 483\n", - "Model: Multinomial Logit Model Df Residuals: 480\n", - "Method: MLE Df Model: 3\n", - "Date: Thu, 10 Aug 2017 Pseudo R-squ.: 0.017\n", - "Time: 13:39:29 Pseudo R-bar-squ.: 0.015\n", - "AIC: 4,380.361 Log-Likelihood: -2,187.181\n", - "BIC: 4,392.901 LL-Null: -2,224.297\n", - "==================================================================================\n", - " coef std err z P>|z| [0.025 0.975]\n", - "----------------------------------------------------------------------------------\n", - "home_density 0.0139 0.004 3.330 0.001 0.006 0.022\n", - "work_density 0.0094 0.001 7.850 0.000 0.007 0.012\n", - "school_density 0.0151 0.004 3.818 0.000 0.007 0.023\n", - "==================================================================================\n", - "CPU times: user 12.6 s, sys: 12.4 s, total: 25.1 s\n", - "Wall time: 10.5 s\n" - ] - } - ], - "source": [ - "%%time\n", - "model_expression = OrderedDict([('home_density', 'all_same'),\n", - " ('work_density', 'all_same'),\n", - " ('school_density', 'all_same')])\n", - "\n", - "model = MultinomialLogit(data = merged.to_frame(),\n", - " observation_id_col = merged.observation_id_col,\n", - " alternative_id_col = merged.alternative_id_col,\n", - " choice_col = merged.choice_col,\n", - " model_expression = model_expression)\n", - "\n", - "results = model.fit()\n", - "print(results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/MNL-prediction-demo-02.ipynb b/notebooks/MNL-prediction-demo-02.ipynb deleted file mode 100644 index 2b3a783..0000000 --- a/notebooks/MNL-prediction-demo-02.ipynb +++ /dev/null @@ -1,766 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## MNL prediction demo\n", - "\n", - "Sam Maurer, August 2017 | Python 3.6\n", - "\n", - "Original version July 2017 (v01) \n", - "Updated July 2017 (v02) to include probabilities \n", - "Updated Aug 2017 (v02) to fix int/float problems \n", - "\n", - "### Summary\n", - "\n", - "This notebook demonstrates how to fit a model using the ChoiceModels interface and then use the UrbanSim MNL functions to generate probabilities and predictions. 
\n", - "\n", - "Eventually, a prediction interface will be incorporated into the `MultinomialLogitResults` object, but it's not there yet!\n", - "\n", - "This demo uses the estimation data that's set up in the `Data-prep-02` notebook." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/maurer/anaconda/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n", - " from pandas.core import datetools\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from patsy import dmatrix\n", - "\n", - "from choicemodels import mnl # could also import form urbansim.urbanchoice\n", - "from choicemodels import MultinomialLogit\n", - "from choicemodels.tools import MergedChoiceTable" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load data from disk" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1566\n", - " city home_density work_density school_density\n", - "full_tract_id \n", - "6001400100 BERKELEY 13.437961 13.130867 13.511570\n", - "6001400200 OAKLAND 11.089638 4.248928 0.894794\n", - "6001400300 OAKLAND 28.878399 7.671554 0.000000\n" - ] - } - ], - "source": [ - "tracts = pd.read_csv('../data/tracts_v02.csv').set_index('full_tract_id')\n", - "tracts = tracts.loc[(tracts.home_density > 0) | (tracts.work_density > 0) | (tracts.school_density > 0)]\n", - "\n", - "print(tracts.shape[0])\n", - "print(tracts.head(3))" - ] - }, - { - "cell_type": "code", - 
"execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "35786\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "10319850202 6095251902 5 5.125960\n", - "10335860102 6085511915 6 156.370628\n", - "10335860103 6085512027 6 1.615535\n" - ] - } - ], - "source": [ - "trips = pd.read_csv('../data/trips_v02.csv').set_index('place_id')\n", - "trips = trips.loc[trips.trip_distance_miles.notnull()]\n", - "\n", - "print(trips.shape[0])\n", - "print(trips.head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Set up estimation table\n", - "\n", - "Each observed trip is a realized choice of a particular destination census tract. We can randomly sample alternative census tracts to build a model of destination choice.\n", - "\n", - "We'll divide the trips into a training set and a testing set, fit an MNL model using the training data, use it to generate predicted choices for the testing data, and compare the predicted to the actual choices." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(100000, 9)\n", - "(3473300, 9)\n" - ] - } - ], - "source": [ - "training_observations = trips.iloc[:1000]\n", - "training = MergedChoiceTable(observations = training_observations,\n", - " alternatives = tracts,\n", - " chosen_alternatives = training_observations.full_tract_id,\n", - " sample_size = 100)\n", - "\n", - "testing_observations = trips.iloc[1000:]\n", - "testing = MergedChoiceTable(observations = testing_observations,\n", - " alternatives = tracts,\n", - " chosen_alternatives = testing_observations.full_tract_id,\n", - " sample_size = 100)\n", - "\n", - "print(training.to_frame().shape)\n", - "print(testing.to_frame().shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fit a model using the training observations" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " CHOICEMODELS ESTIMATION RESULTS \n", - "===================================================================\n", - "Dep. Var.: chosen No. Observations: \n", - "Model: Multinomial Logit Df Residuals: \n", - "Method: Maximum Likelihood Df Model: \n", - "Date: Pseudo R-squ.: \n", - "Time: Pseudo R-bar-squ.: \n", - "AIC: Log-Likelihood: -4,506.216\n", - "BIC: LL-Null: -4,605.170\n", - "===================================================================\n", - " coef std err z P>|z| Conf. 
Int.\n", - "-------------------------------------------------------------------\n", - "home_density 0.0113 0.002 6.051 \n", - "work_density 0.0119 0.001 14.909 \n", - "school_density 0.0069 0.004 1.916 \n", - "===================================================================\n", - "CPU times: user 96.6 ms, sys: 55.7 ms, total: 152 ms\n", - "Wall time: 145 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "model_expression = \"home_density + work_density + school_density - 1\"\n", - "\n", - "model = MultinomialLogit(data = training.to_frame(), \n", - " observation_id_col = training.observation_id_col, \n", - " choice_col = training.choice_col,\n", - " model_expression = model_expression)\n", - "\n", - "results = model.fit()\n", - "print(results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Predict destination choices for the testing observations\n", - "\n", - "We'll use the UrbanSim MNL functions directly, because this hasn't been integrated into the ChoiceModels results classes yet. 
https://github.com/UDST/choicemodels/blob/master/choicemodels/mnl.py#L536" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 0.011321\n", - "1 0.011928\n", - "2 0.006929\n", - "Name: Coefficient, dtype: float64\n" - ] - } - ], - "source": [ - "# Pull the coefs out of the results object (the PyLogit syntax would be different)\n", - "\n", - "coefs = results.get_raw_results()['fit_parameters']['Coefficient']\n", - "print(coefs)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(3473300, 3)\n", - " home_density work_density school_density\n", - "full_tract_id \n", - "6097150607 10.659461 6.868701 7.16003\n", - "6075020700 20.952573 4.410758 0.00000\n", - "6013319000 21.324330 9.745037 1.26180\n" - ] - } - ], - "source": [ - "# The data columns for prediction need to align with the coefficients; \n", - "# you can do this manually or with patsy, as here\n", - "\n", - "df = testing.to_frame().set_index('full_tract_id')\n", - "\n", - "testing_df = dmatrix(model_expression, data=df, return_type='dataframe')\n", - "print(testing_df.shape)\n", - "print(testing_df.head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "34733\n", - "[28 55 95 61 6]\n" - ] - } - ], - "source": [ - "# Simulate a destination choice for each testing observation\n", - "\n", - "choices = mnl.mnl_simulate(testing_df, coefs, numalts=100, returnprobs=False)\n", - "\n", - "print(len(choices))\n", - "print(choices[:5])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['a', 
'd']\n" - ] - } - ], - "source": [ - "# Annoyingly, that identifies the choices by position rather than by id;\n", - "# here's a function to get the id's\n", - "\n", - "def get_chosen_ids(ids, positions):\n", - " \"\"\"\n", - " We observe N choice scenarios. In each, one of J alternatives is chosen.\n", - " We have a long (len N * J) list of the available alternatives. We have a \n", - " list (len N) of which alternatives were chosen, but it identifies them \n", - " by POSITION and we want their ID. \n", - " \n", - " Parameters\n", - " ----------\n", - " ids : list or list-like\n", - " List of alternative ID's (len N * J).\n", - " \n", - " positions : list or list-like\n", - " List of chosen alternatives by position (len N), where each entry is\n", - " an int in range [0, J)\n", - " \n", - " Returns\n", - " -------\n", - " chosen_ids : list\n", - " List of chosen alternatives by ID (len N)\n", - " \n", - " \"\"\"\n", - " N = len(positions)\n", - " J = len(ids) // N\n", - " \n", - " ids_by_obs = np.reshape(ids, (N,J))\n", - " return [ids_by_obs[i][positions[i]] for i in range(N)]\n", - " \n", - "\n", - "print(get_chosen_ids(['a','b','c','d'], [0,1]))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "34733\n", - "[6013352102, 6081604200, 6075042800, 6001435601, 6001424001]\n" - ] - } - ], - "source": [ - "# Get tract id's for the simulated choices\n", - "\n", - "predicted_tracts = get_chosen_ids(testing_df.index.tolist(), choices)\n", - "\n", - "print(len(predicted_tracts))\n", - "print(predicted_tracts[:5])" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "34733\n", - "[6097150607, 6097153200, 6097151402, 6097151402, 6097151204]\n" - ] - } - ], - "source": [ - "# Get tract id's for observed 
choices\n", - "\n", - "df = testing.to_frame()\n", - "observed_tracts = df.loc[df.chosen == 1, 'full_tract_id'].tolist()\n", - "\n", - "print(len(observed_tracts))\n", - "print(observed_tracts[:5])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Compare the predicted choices to the observed ones\n", - "\n", - "Multinomial models are kind of tricky to validate. We don't expect the actual choices to match, because there are so many alternatives, but we do expect the characteristics of the predicted choices to be similar to the characteristics of the observed choices. \n", - "\n", - "Choose your own metric for this depending on what you're trying to evaluate! It's even plausible that the metric could be something not directly in the model, like the distance between the predicted and actual destination choices." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.016439697118\n" - ] - } - ], - "source": [ - "# What portion of predicted destination choices were a perfect match?\n", - "# With an uninformative model we would expect 0.01, given that the \n", - "# observed choice is included in the 100 available alternatives.\n", - "\n", - "perfect_match = np.equal(predicted_tracts, observed_tracts)\n", - "print(sum(perfect_match)/len(perfect_match))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.130483901201\n" - ] - } - ], - "source": [ - "# What's the correlation between employment density of the predicted and \n", - "# observed destinations? 
With an uninformative model we would expect 0.\n", - "\n", - "density_1 = pd.Series([tracts.loc[t,'work_density'] for t in predicted_tracts])\n", - "density_2 = pd.Series([tracts.loc[t,'work_density'] for t in observed_tracts])\n", - "\n", - "print(density_1.corr(density_2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### How does UrbanSim generate household location choices?\n", - "\n", - "These three class methods collectively set up the choosers and alternatives according to various parameters like the sample size, prediction filters, \"probability mode,\" and \"choice mode\" (aggregate or individual):\n", - "\n", - "- `urbansim.models.MNLDiscreteChocieModel.probabilities()` \n", - "- `urbansim.models.MNLDiscreteChocieModel.summed_probabilities()` \n", - "- `urbansim.models.MNLDiscreteChocieModel.predict()` \n", - "\n", - "https://github.com/UDST/urbansim/blob/master/urbansim/models/dcm.py#L474\n", - "\n", - "Then this lower-level function generates a table of probabilities for each alternative, which is passed back to the `MNLDiscreteChoiceModel` class for further processing:\n", - "\n", - "- `urbansim.urbanchoice.mnl.mnl_simulate()`\n", - "\n", - "https://github.com/UDST/urbansim/blob/master/urbansim/urbanchoice/mnl.py#L121" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "### Generate probabilities instead of predictions" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 0.011688\n", - "1 0.012314\n", - "2 0.007539\n", - "Name: Coefficient, dtype: float64\n", - "(3473300, 3)\n", - " 
home_density work_density school_density\n", - "full_tract_id \n", - "6097150607 10.659461 6.868701 7.16003\n", - "6075020700 20.952573 4.410758 0.00000\n", - "6013319000 21.324330 9.745037 1.26180\n" - ] - } - ], - "source": [ - "# Use coefs and testing dataset from above\n", - "\n", - "print(coefs)\n", - "print(testing_df.shape)\n", - "print(testing_df.head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(34733, 100)\n", - "[[ 0.00971082 0.01006746 0.01090096 0.00783025 0.00947018]\n", - " [ 0.01075279 0.01775576 0.01048519 0.0089396 0.00777349]\n", - " [ 0.01134458 0.00807226 0.00781357 0.00986128 0.00917031]\n", - " [ 0.01152731 0.00816036 0.00913801 0.01034887 0.01152651]\n", - " [ 0.00783021 0.02165223 0.00972678 0.01013919 0.02457639]]\n" - ] - } - ], - "source": [ - "probs = mnl.mnl_simulate(testing_df, coefs, numalts=100, returnprobs=True)\n", - "\n", - "print(probs.shape)\n", - "print(probs[:5,:5])" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "chooser_id alternative_id\n", - "11485050104 6097150607 0.009711\n", - " 6075020700 0.010067\n", - " 6013319000 0.010901\n", - " 6075017902 0.007830\n", - " 6075042800 0.009470\n", - "dtype: float64\n" - ] - } - ], - "source": [ - "# Join probabilities to a multi-index of chooser and alternative id's\n", - "# Code adapted from UrbanSim: \n", - "# https://github.com/UDST/urbansim/blob/master/urbansim/models/dcm.py#L549-L556\n", - "\n", - "mi = pd.MultiIndex.from_arrays(\n", - " [testing.to_frame()[testing.observation_id_col], \n", - " testing.to_frame()[testing.alternative_id_col]],\n", - " names=('chooser_id', 'alternative_id'))\n", - "\n", - "probs_df = pd.Series(probs.flatten(), index=mi)\n", - "\n", - "print(probs_df.head())" - ] - }, - { 
- "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sum the probabilities\n", - "\n", - "Calculate the total probability associated with each alternative. This approach is adapted from UrbanSim. \n", - "\n", - "https://github.com/UDST/urbansim/blob/master/urbansim/models/dcm.py#L562-L597\n", - "\n", - "Conceptually, the fitted model implies a probability density function (PDF) for each agent choosing among a set of alternatives. Here we're summing the densities across agents to get a single density function that can serve as a proxy for the aggregate appeal of the alternatives.\n", - "\n", - "Important note! What we're actually creating here (I think) is PDFs over the alternatives sampled for each chooser. With random sampling, the sum will approximate a PDF over all the alternatives. Non-random sampling will alter the interpretation -- it's still a measure of aggregate appeal, but conditioned on the sampling procedure." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "alternative_id\n", - "6001400100 0.364101\n", - "6001400200 0.213724\n", - "6001400300 0.352288\n", - "6001400400 0.332770\n", - "6001400500 0.258811\n", - "dtype: float64\n" - ] - } - ], - "source": [ - "# Code adapted from UrbanSim - For each chooser, normalize the probabilities so\n", - "# they sum to 1 (is this really necessary?). Then sum the probabilties associated\n", - "# with each alternative. 
I'm using the first 500 choosers for efficiency.\n", - "\n", - "def normalize(s):\n", - " return s / s.sum()\n", - "\n", - "summed_probs = probs_df[:50000].groupby(level=0).apply(normalize).groupby(level=1).sum()\n", - "\n", - "print(summed_probs.head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/Sampling-correction-01.ipynb b/notebooks/Sampling-correction-01.ipynb deleted file mode 100644 index 0c4d60a..0000000 --- a/notebooks/Sampling-correction-01.ipynb +++ /dev/null @@ -1,1110 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Sampling correction for large choice sets" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Sam Maurer, Nov. 21, 2016 (updated Dec. 6 to remove errors)\n", - "\n", - "1. Replicate synthetic data from Guevara & Ben-Akiva 2013\n", - "2. Do MNL with and without sampling correction\n", - "3. Check whether parameter estimates deviate from true values\n", - "4. 
Extend to Mixed Logit" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "## 1. Generate synthetic data set" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- N = 1000 observations\n", - "- J = 1000 alternatives for all observations (C_n = C)\n", - "- X = single attribute distributed Uniform(-2,1) for the first 500 alternatives and Uniform(-1,2) for the second half\n", - "- beta = generic linear taste coefficient, distributed Normal(mu=1.5, sigma=0.8) across the 1000 observations\n", - "- systematic utility = beta * X\n", - "- epsilon = error term distributed ExtremeValue(0,1)\n", - "- random utility = beta * X + epsilon\n", - "\n", - "Utility of alternative i for agent n:\n", - "$$ U_{in} = V_{in} + \\varepsilon_{in} = \\beta_n x_{i} + \\varepsilon_{in} $$\n", - "\n", - "Probability that agent n will choose alternative i:\n", - "$$ L_n(i \\mid \\beta_n, x_n,C_n) = \\frac {e^{V_{in}}} {\\sum_{j \\epsilon C_n} e^{V_{jn}}} $$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 187, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50\n", - "[-1.53751147 0.22014909 -1.21005495 -0.39878182 -1.95627511]\n" - ] - } - ], - "source": [ - "# Generate attribute x for each of J alternatives\n", - "\n", - "# Set a seed for reproducibility\n", - "np.random.seed(12)\n", - "\n", - "# Start with J << 1000 to speed up runtimes\n", - "\n", - "J = 50 # alternatives\n", - "\n", - "Xa = 3 * 
np.random.rand(J/2) - 2 # uniform distribution over [-2, 1]\n", - "Xb = 3 * np.random.rand(J/2) - 1 # uniform distribution over [-1, 2]\n", - "\n", - "X = np.concatenate((Xa, Xb))\n", - "\n", - "print len(X)\n", - "print X[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": 188, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1000\n", - "[ 1.5 1.5 1.5 1.5 1.5]\n" - ] - } - ], - "source": [ - "# Generate taste coefficient beta for each of N agents \n", - "\n", - "# For regular MNL, i think we need to use a single value, instead of a \n", - "# distribution as Guevara & Ben-Akiva used for the mixture model\n", - "\n", - "N = 1000 # agents/observations\n", - "\n", - "beta = np.zeros(1000) + 1.5\n", - "# beta = 0.8 * np.random.randn(N) + 1.5\n", - "\n", - "print len(beta)\n", - "print beta[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print pd.DataFrame(beta).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 190, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(1000, 50)\n" - ] - } - ], - "source": [ - "# Generate probability matrix for N agents choosing among J alternatives\n", - "\n", - "def probs(n):\n", - " ''' \n", - " Return list of J probabilities for agent n\n", - " '''\n", - " b = beta[n]\n", - " exps = [np.exp(b*x) for x in X]\n", - " sum_exps = np.sum(exps)\n", - " return [exp/sum_exps for exp in exps]\n", - "\n", - "P = np.array([probs(n) for n in range(N)])\n", - " \n", - "print P.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 191, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 1. 1. 1. 1. 1. 1. 1. 1. 1. 
1.]\n" - ] - } - ], - "source": [ - "# Check that each row sums to 1\n", - "\n", - "print np.sum(P, axis=1)[:10]" - ] - }, - { - "cell_type": "code", - "execution_count": 192, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1000\n", - "[12, 41, 37, 5, 30, 27, 8, 35, 33, 6]\n" - ] - } - ], - "source": [ - "# Simulate a choice from J alternatives for each of N agents\n", - "\n", - "C = [np.random.choice(range(J), p=p) for p in P]\n", - "\n", - "print len(C)\n", - "print C[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "#### Now we have data:\n", - "\n", - "- N agents/observations with true taste coefficients in array \"beta\"\n", - "- J alternatives with single attributes in array \"X\"\n", - "- N choice outcomes in array \"C\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. 
Estimate beta without sampling, using PyLogit MNL" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import pylogit\n", - "from collections import OrderedDict" - ] - }, - { - "cell_type": "code", - "execution_count": 233, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50000\n" - ] - } - ], - "source": [ - "# Set up an estimation dataset in long format\n", - "\n", - "d = [[n, i, int(C[n]==i), X[i]] for n in range(N) for i in range(J)]\n", - "\n", - "print len(d)" - ] - }, - { - "cell_type": "code", - "execution_count": 234, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " obs_id alt_id choice x\n", - "0 0 0 0 -1.537511\n", - "1 0 1 0 0.220149\n", - "2 0 2 0 -1.210055\n", - "3 0 3 0 -0.398782\n", - "4 0 4 0 -1.956275 \n", - "\n", - " obs_id alt_id choice x\n", - "count 50000.000000 50000.000000 50000.000000 50000.000000\n", - "mean 499.500000 24.500000 0.020000 0.014570\n", - "std 288.677877 14.431014 0.140001 1.116965\n", - "min 0.000000 0.000000 0.000000 -1.993222\n", - "25% 249.750000 12.000000 0.000000 -0.894495\n", - "50% 499.500000 24.500000 0.000000 0.220035\n", - "75% 749.250000 37.000000 0.000000 0.832675\n", - "max 999.000000 49.000000 1.000000 1.985414\n" - ] - } - ], - "source": [ - "df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'choice', 'x'])\n", - "\n", - "print df.head(), '\\n'\n", - "print df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 235, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Set up model spec\n", - "\n", - "spec = OrderedDict([\n", - " ('x', [range(J)])\n", - " ])\n", - "\n", - "labels = OrderedDict([\n", - " ('x', ['beta_x'])\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": 236, - "metadata": { - "collapsed": false - }, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -3,912.0230\n", - "Initial Log-likelihood: -3,912.0230\n", - "Estimation Time: 0.07 seconds.\n", - "Final log-likelihood: -3,065.1983\n", - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: choice No. Observations: 1,000\n", - "Model: Multinomial Logit Model Df Residuals: 999\n", - "Method: MLE Df Model: 1\n", - "Date: Mon, 21 Nov 2016 Pseudo R-squ.: 0.216\n", - "Time: 13:52:41 Pseudo R-bar-squ.: 0.216\n", - "converged: True Log-Likelihood: -3,065.198\n", - " LL-Null: -3,912.023\n", - "==============================================================================\n", - " coef std err z P>|z| [95.0% Conf. Int.]\n", - "------------------------------------------------------------------------------\n", - "beta_x 1.5324 0.046 33.649 0.000 1.443 1.622\n", - "==============================================================================\n", - "CPU times: user 7.7 s, sys: 14.1 s, total: 21.8 s\n", - "Wall time: 14.4 s\n" - ] - } - ], - "source": [ - "%%time\n", - "m = pylogit.create_choice_model(data = df, \n", - " alt_id_col = 'alt_id', \n", - " obs_id_col = 'obs_id', \n", - " choice_col = 'choice', \n", - " specification = spec, \n", - " model_type = \"MNL\", \n", - " names = labels)\n", - "\n", - "m.fit_mle(init_vals = np.array([0]))\n", - "print m.get_statsmodels_summary()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
Try with UrbanSim MNL instead of PyLogit\n", - "\n", - "Model class: https://github.com/UDST/urbansim/blob/master/urbansim/models/dcm.py\n", - "\n", - "Estimation algorithms: https://github.com/UDST/urbansim/blob/master/urbansim/urbanchoice/mnl.py" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "from urbansim.models import MNLDiscreteChoiceModel" - ] - }, - { - "cell_type": "code", - "execution_count": 178, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1000\n" - ] - } - ], - "source": [ - "# Choosers should be a DataFrame of characteristics, with index as identifier\n", - "\n", - "d = [[n, C[n]] for n in range(N)]\n", - "\n", - "choosers = pd.DataFrame(d, columns=['id', 'choice']).set_index('id')\n", - "\n", - "print len(choosers)" - ] - }, - { - "cell_type": "code", - "execution_count": 179, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50\n" - ] - } - ], - "source": [ - "# Alternatives should be a DataFrame of characteristics, with index as identifier\n", - "\n", - "d = [[i, X[i]] for i in range(J)]\n", - "\n", - "alts = pd.DataFrame(d, columns=['id', 'x']).set_index('id')\n", - "\n", - "print len(alts)" - ] - }, - { - "cell_type": "code", - "execution_count": 180, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Null Log-liklihood: -3891.820\n", - "Log-liklihood at convergence: -3077.869\n", - "Log-liklihood Ratio: 0.209\n", - "\n", - "+-----------+-------------+------------+---------+\n", - "| Component | Coefficient | Std. 
Error | T-Score |\n", - "+-----------+-------------+------------+---------+\n", - "| x | 1.527 | 0.022 | 69.267 |\n", - "+-----------+-------------+------------+---------+\n", - "CPU times: user 104 ms, sys: 9.03 ms, total: 113 ms\n", - "Wall time: 89.4 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "# It seems like this implementation *requires* us to sample the alternatives, \n", - "# so here i'm estimating the model with J-1 alts\n", - "\n", - "m = MNLDiscreteChoiceModel(model_expression = 'x',\n", - " sample_size = J-1)\n", - "\n", - "m.fit(choosers = choosers,\n", - " alternatives = alts,\n", - " current_choice = 'choice')\n", - "\n", - "m.report_fit()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "## 4. MNL, sampling alternatives without correction\n", - "\n", - "(NB - with random sampling, no correction is needed)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# In the estimation dataset, for each observation include a row for the\n", - "# chosen alternative, plus K-1 other alternatives sampled randomly\n", - "# without replacement, where K < J." - ] - }, - { - "cell_type": "code", - "execution_count": 194, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[12, 23, 7]\n" - ] - } - ], - "source": [ - "K = 3\n", - "\n", - "def alts(obs_id):\n", - " \"\"\"\n", - " Sample alternatives for observation `obs_id`. Expects `J` total\n", - " alts, `K` sampled alts, list `C` of choice outcomes. 
Returns list \n", - " of K alt id's including the chosen one.\n", - " \"\"\"\n", - " chosen = C[obs_id] # id of chosen alternative\n", - " unchosen = [i for i in range(J) if chosen != i] # id's of J-1 unchosen alts\n", - " sample_unchosen = np.random.choice(unchosen, size=K-1, replace=False)\n", - " return [chosen] + sample_unchosen.tolist()\n", - " \n", - "print alts(0)" - ] - }, - { - "cell_type": "code", - "execution_count": 195, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "3000\n" - ] - } - ], - "source": [ - "# Set up the estimation dataset\n", - "\n", - "d = [[n, i, int(C[n]==i), X[i]] for n in range(N) for i in alts(n)]\n", - "\n", - "print len(d)" - ] - }, - { - "cell_type": "code", - "execution_count": 196, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " obs_id alt_id choice x\n", - "0 0 12 1 0.832675\n", - "1 0 3 0 -0.398782\n", - "2 0 35 0 1.850941\n", - "3 1 41 1 1.985414\n", - "4 1 45 0 0.272157 \n", - "\n", - " obs_id alt_id choice x\n", - "count 3000.000000 3000.000000 3000.000000 3000.000000\n", - "mean 499.500000 26.898000 0.333333 0.446787\n", - "std 288.723115 13.974669 0.471483 1.170677\n", - "min 0.000000 0.000000 0.000000 -1.993222\n", - "25% 249.750000 14.000000 0.000000 -0.181750\n", - "50% 499.500000 29.000000 0.000000 0.413689\n", - "75% 749.250000 38.000000 1.000000 1.448505\n", - "max 999.000000 49.000000 1.000000 1.985414\n" - ] - } - ], - "source": [ - "df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'choice', 'x'])\n", - "\n", - "print df.head(), '\\n'\n", - "print df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 150, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Same model spec as before\n", - "\n", - "spec = OrderedDict([\n", - " ('x', [range(J)])\n", - " ])\n", - "\n", - "labels = OrderedDict([\n", - " ('x', ['beta_x'])\n", - 
" ])" - ] - }, - { - "cell_type": "code", - "execution_count": 151, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -1,098.6123\n", - "Initial Log-likelihood: -1,098.6123\n", - "Estimation Time: 0.02 seconds.\n", - "Final log-likelihood: -585.7551\n", - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: choice No. Observations: 1,000\n", - "Model: Multinomial Logit Model Df Residuals: 999\n", - "Method: MLE Df Model: 1\n", - "Date: Sun, 20 Nov 2016 Pseudo R-squ.: 0.467\n", - "Time: 14:37:24 Pseudo R-bar-squ.: 0.466\n", - "converged: True Log-Likelihood: -585.755\n", - " LL-Null: -1,098.612\n", - "==============================================================================\n", - " coef std err z P>|z| [95.0% Conf. Int.]\n", - "------------------------------------------------------------------------------\n", - "beta_x 1.6151 0.077 20.888 0.000 1.464 1.767\n", - "==============================================================================\n", - "CPU times: user 303 ms, sys: 41.3 ms, total: 344 ms\n", - "Wall time: 226 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "m = pylogit.create_choice_model(data = df, \n", - " alt_id_col = 'alt_id', \n", - " obs_id_col = 'obs_id', \n", - " choice_col = 'choice', \n", - " specification = spec, \n", - " model_type = \"MNL\", \n", - " names = labels)\n", - "\n", - "m.fit_mle(init_vals = np.array([0]))\n", - "print m.get_statsmodels_summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run 1000x with different samples" - ] - }, - { - "cell_type": "code", - "execution_count": 152, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "def estimate_beta():\n", - " d = [[n, i, int(C[n]==i), X[i]] for n in range(N) for i in alts(n)]\n", - " df = pd.DataFrame(d, 
columns=['obs_id', 'alt_id', 'choice', 'x'])\n", - " m = pylogit.create_choice_model(df, 'alt_id', 'obs_id', 'choice', spec, 'MNL', names=labels)\n", - " m.fit_mle(init_vals = np.array([0]))\n", - " return m.params.beta_x" - ] - }, - { - "cell_type": "code", - "execution_count": 218, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "%%capture\n", - "\n", - "beta = []\n", - "for i in range(1000):\n", - " beta.append(estimate_beta())" - ] - }, - { - "cell_type": "code", - "execution_count": 219, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "count 1000.000000\n", - "mean 1.508913\n", - "std 0.052854\n", - "min 1.322523\n", - "25% 1.471155\n", - "50% 1.507724\n", - "75% 1.545232\n", - "max 1.674443\n", - "dtype: float64\n" - ] - } - ], - "source": [ - "print pd.Series(beta).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. MNL with sampling correction" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Utility of alternative j:\n", - "$$ V_{j} = \\beta x_{j} $$\n", - "\n", - "With sampling, we have to account for the restricted choice set (from Eq 6 in Guevara & Ben-Akiva 2013):\n", - "\n", - "$$ V_j = \\beta x_j + \\ln \\pi(D \\mid j) $$\n", - "\n", - "Where pi is the conditional probability that we would construct the choice set D given that alternative j was chosen. This goes into the likelihood function in both the numerator and denominator.\n", - "\n", - "$$ L_n = \\frac {exp(\\beta x_i + \\ln \\pi(D_n \\mid i))} {\\sum_{j \\epsilon D_n} exp(\\beta x_j + \\ln \\pi(D_n \\mid j))} $$\n", - "\n", - "How to calculate pi? 
From the original formulation of this in McFadden 1978: \"Suppose D is comprized of i plus a sample of alternatives from the set C\\\\{i}, obtained by considering each element of this set independently, and including it with probability p. Then, the probability of D will depend solely on the number of elements K it contains.\"\n", - "\n", - "$$ \\pi(D) = p^{K-1} (1 - p)^{J-K} $$\n", - "\n", - "(?? Without replacement, i think it should be the n-choose-k binomial coefficient, where n=J-1 and k=K-1)\n", - "\n", - "$$ \\pi(D) = {n \\choose k} = \\frac {(K-1)!(J-K)!} {(J-1)!} $$\n" - ] - }, - { - "cell_type": "code", - "execution_count": 197, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " obs_id alt_id choice x const\n", - "0 0 12 1 0.832675 1\n", - "1 0 24 0 -1.070307 1\n", - "2 0 4 0 -1.956275 1\n", - "3 1 41 1 1.985414 1\n", - "4 1 26 0 0.413689 1 \n", - "\n", - " obs_id alt_id choice x const\n", - "count 3000.000000 3000.000000 3000.000000 3000.000000 3000.0\n", - "mean 499.500000 26.777667 0.333333 0.438763 1.0\n", - "std 288.723115 13.883149 0.471483 1.180510 0.0\n", - "min 0.000000 0.000000 0.000000 -1.993222 1.0\n", - "25% 249.750000 15.000000 0.000000 -0.343887 1.0\n", - "50% 499.500000 29.000000 0.000000 0.413689 1.0\n", - "75% 749.250000 38.000000 1.000000 1.448505 1.0\n", - "max 999.000000 49.000000 1.000000 1.985414 1.0\n" - ] - } - ], - "source": [ - "# Add a column in the estimation data for the constant\n", - "\n", - "d = [[n, i, int(C[n]==i), X[i], 1] for n in range(N) for i in alts(n)]\n", - "\n", - "df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'choice', 'x', 'const'])\n", - "\n", - "print df.head(), '\\n'\n", - "print df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 198, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "spec2 = OrderedDict([\n", - " ('x', [range(J)]),\n", - " ('const', [range(J)])\n", - " ])\n", - "\n", - 
"labels2 = OrderedDict([\n", - " ('x', ['beta_x']),\n", - " ('const', ['constant'])\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": 232, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.5" - ] - }, - "execution_count": 232, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Try binomial formula\n", - "\n", - "j=3\n", - "k=2\n", - "\n", - "fact = np.math.factorial\n", - "\n", - "float(fact(k-1)*fact(j-k))/fact(j-1)" - ] - }, - { - "cell_type": "code", - "execution_count": 225, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -1,098.6123\n", - "Initial Log-likelihood: -1,098.6123\n", - "Estimation Time: 0.02 seconds.\n", - "Final log-likelihood: -613.3560\n", - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: choice No. Observations: 1,000\n", - "Model: Multinomial Logit Model Df Residuals: 998\n", - "Method: MLE Df Model: 2\n", - "Date: Mon, 21 Nov 2016 Pseudo R-squ.: 0.442\n", - "Time: 13:47:43 Pseudo R-bar-squ.: 0.440\n", - "converged: True Log-Likelihood: -613.356\n", - " LL-Null: -1,098.612\n", - "==============================================================================\n", - " coef std err z P>|z| [95.0% Conf. 
Int.]\n", - "------------------------------------------------------------------------------\n", - "beta_x 1.5376 0.075 20.586 0.000 1.391 1.684\n", - "constant -7.0699 1.31e+07 -5.39e-07 1.000 -2.57e+07 2.57e+07\n", - "==============================================================================\n", - "CPU times: user 325 ms, sys: 29.7 ms, total: 355 ms\n", - "Wall time: 237 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "m = pylogit.create_choice_model(data = df, \n", - " alt_id_col = 'alt_id', \n", - " obs_id_col = 'obs_id', \n", - " choice_col = 'choice', \n", - " specification = spec2, \n", - " model_type = \"MNL\", \n", - " names = labels2)\n", - "\n", - "# p = float(K-1)/(J-1)\n", - "# const = np.log(p**(K-1) * (1-p)**(J-K))\n", - "\n", - "const = np.log(float(fact(K-1)*fact(J-K))/fact(J-1))\n", - "\n", - "# Add an initial value for the constant and constrain it to that\n", - "m.fit_mle(init_vals = np.array([0, const]), \n", - " constrained_pos=[1])\n", - "\n", - "print m.get_statsmodels_summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run 1000x with different samples" - ] - }, - { - "cell_type": "code", - "execution_count": 213, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# try binomial formula\n", - "const = np.log(float(fact(K-1)*fact(J-K))/fact(J-1))\n", - "\n", - "def estimate_beta_with_correction():\n", - " d = [[n, i, int(C[n]==i), X[i], 1] for n in range(N) for i in alts(n)]\n", - " df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'choice', 'x', 'const'])\n", - " m = pylogit.create_choice_model(df, 'alt_id', 'obs_id', 'choice', spec2, 'MNL', names=labels2)\n", - " m.fit_mle(init_vals = np.array([0, const]), constrained_pos=[1])\n", - " return m.params.beta_x" - ] - }, - { - "cell_type": "code", - "execution_count": 216, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "%%capture\n", - "\n", - "beta = []\n", - "for i in range(1000):\n", - " 
beta.append(estimate_beta_with_correction())" - ] - }, - { - "cell_type": "code", - "execution_count": 217, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "count 1000.000000\n", - "mean 1.513490\n", - "std 0.051725\n", - "min 1.354507\n", - "25% 1.477341\n", - "50% 1.512756\n", - "75% 1.548081\n", - "max 1.736557\n", - "dtype: float64\n" - ] - } - ], - "source": [ - "print pd.Series(beta).describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "NB - the correction isn't needed for the random sampling case, but we can adapt this code for stratified sampling later on" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/Sampling-correction-02.ipynb b/notebooks/Sampling-correction-02.ipynb deleted file mode 100644 index 75ee428..0000000 --- a/notebooks/Sampling-correction-02.ipynb +++ /dev/null @@ -1,1141 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Sampling correction for large choice sets" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Sam Maurer, Dec. 1, 2016\n", - "\n", - "1. 
Replicate synthetic data from Guevara & Ben-Akiva 2013\n", - "2. Do MNL with and without sampling correction\n", - "3. Check whether parameter estimates deviate from true values\n", - "4. Extend to Mixed Logit" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "## 1. Generate synthetic data set" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- numobs (N) = 1000 observations\n", - "- numalts (J) = 1000 alternatives for all observations (choiceset_n = choiceset, C_n = C)\n", - "- X = single attribute distributed Uniform(-2,1) for the first 500 alternatives and Uniform(-1,2) for the second half\n", - "- beta = generic linear taste coefficient, distributed Normal(mu=1.5, sigma=0.8) across the 1000 observations\n", - "- systematic utility = beta * X\n", - "- epsilon = error term distributed ExtremeValue(0,1)\n", - "- random utility = beta * X + epsilon\n", - "\n", - "Utility of alternative i for agent n:\n", - "$$ U_{in} = V_{in} + \\varepsilon_{in} = \\beta_n x_{i} + \\varepsilon_{in} $$\n", - "\n", - "Probability that agent n will choose alternative i:\n", - "$$ L_n(i \\mid \\beta_n, x_n,C_n) = \\frac {e^{V_{in}}} {\\sum_{j \\epsilon C_n} e^{V_{jn}}} $$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "# Set a seed so that the random numbers will be reproducible\n", - "np.random.seed(12)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Generate attributes x1, x2 for each of numalts (J) alternatives" - ] - }, - { - "cell_type": "code", - "execution_count": 134, 
- "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# For now, J << 1000 alternatives to speed up runtimes\n", - "numalts = 50\n", - "\n", - "def rand(len, min, max):\n", - " \"\"\" Generate `len` random floats uniformly distributed from `min` to `max` \"\"\"\n", - " return (max - min) * np.random.rand(len) + min\n", - "\n", - "# Attribute x is uniformly distributed over [-2, 1] for half the alternatives\n", - "# and over [-1, 2] for the other half, as in Guevara & Ben-Akiva\n", - "\n", - "# X = np.concatenate((rand(numalts/2, -2, 1), rand(numalts/2, -1, 2)))\n", - "\n", - "# Or, attribute x is uniformly distributed over [0, 10] for half the alternatives\n", - "# and over [100, 110] for the other half, to induce bias in estimation\n", - "\n", - "X = np.concatenate((rand(numalts/2, 0, 10), rand(numalts/2, 100, 110)))" - ] - }, - { - "cell_type": "code", - "execution_count": 135, - "metadata": { - "collapsed": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 0\n", - "count 25.000000\n", - "mean 5.782730\n", - "std 2.903692\n", - "min 0.233499\n", - "25% 3.436553\n", - "50% 6.561771\n", - "75% 7.527760\n", - "max 9.956289\n", - " 0\n", - "count 25.000000\n", - "mean 105.373296\n", - "std 3.202724\n", - "min 100.563383\n", - "25% 102.109451\n", - "50% 105.497276\n", - "75% 108.062617\n", - "max 109.884905\n" - ] - } - ], - "source": [ - "print pd.DataFrame(X[:numalts/2]).describe()\n", - "print pd.DataFrame(X[numalts/2:]).describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Generate taste coefficient beta for each of numobs (N) agents " - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# For regular MNL, use a single value instead of a distribution as \n", - "# Guevara & Ben-Akiva used for the mixture model\n", - "\n", - "numobs = 1000 # agents/observations\n", - "\n", - "beta = 
np.zeros(1000) + 1.5\n", - "# beta = 0.8 * np.random.randn(numobs) + 1.5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print pd.DataFrame(beta).describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Simulate a choice from numalts (J) alternatives for each of numobs (N) agents" - ] - }, - { - "cell_type": "code", - "execution_count": 136, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1000\n", - "50\n" - ] - } - ], - "source": [ - "# Generate a utility matrix for N agents choosing among J alternatives\n", - "\n", - "U = [[beta[n]*x + np.random.gumbel() for x in X] for n in range(numobs)]\n", - " \n", - "print len(U)\n", - "print len(U[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 137, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1000\n", - "[28, 32, 33, 33, 33, 43, 32, 43, 32, 37]\n" - ] - } - ], - "source": [ - "# Each agent chooses the alternative with highest utility\n", - "\n", - "choices = [np.argmax(a) for a in U]\n", - "\n", - "print len(choices)\n", - "print choices[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "#### Now we have data:\n", - "\n", - "- N agents/observations with true taste coefficients in array \"`beta`\"\n", - "- J alternatives with single attributes in array \"`X`\"\n", - "- N choice outcomes in array \"`choices`\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. 
Estimate beta without sampling, using PyLogit MNL" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import pylogit\n", - "from collections import OrderedDict" - ] - }, - { - "cell_type": "code", - "execution_count": 174, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Set up the estimation dataset in long format\n", - "\n", - "d = [[n, i, int(choices[n]==i), X[i]] for n in range(numobs) for i in range(numalts)]\n", - "df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'chosen', 'x'])" - ] - }, - { - "cell_type": "code", - "execution_count": 97, - "metadata": { - "collapsed": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " obs_id alt_id chosen x\n", - "0 0 0 0 1.699728\n", - "1 0 1 0 2.530486\n", - "2 0 2 0 7.104747\n", - "3 0 3 0 5.721117\n", - "4 0 4 0 6.125892 \n", - "\n", - " obs_id alt_id chosen x\n", - "count 50000.000000 50000.000000 50000.000000 50000.000000\n", - "mean 499.500000 24.500000 0.020000 55.555265\n", - "std 288.677877 14.431014 0.140001 50.140919\n", - "min 0.000000 0.000000 0.000000 1.611696\n", - "25% 249.750000 12.000000 0.000000 6.127162\n", - "50% 499.500000 24.500000 0.000000 54.888537\n", - "75% 749.250000 37.000000 0.000000 105.748896\n", - "max 999.000000 49.000000 1.000000 109.778827\n" - ] - } - ], - "source": [ - "print df.head(), '\\n'\n", - "print df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 168, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Set up reusable model spec\n", - "\n", - "spec = OrderedDict([('x', 'all_same')])\n", - "labels = OrderedDict([('x', 'beta_x')])" - ] - }, - { - "cell_type": "code", - "execution_count": 172, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Set up reusable code to estimate a model\n", - "\n", - "def estimate_model(init_val):\n", - " \"\"\"\n", - " 
Initialize and fit a model, returning it as an object. Will use the \n", - " current values of `df`, `spec`, and `labels`.\n", - " \"\"\"\n", - " m = pylogit.create_choice_model(data = df, \n", - " alt_id_col = 'alt_id', \n", - " obs_id_col = 'obs_id', \n", - " choice_col = 'chosen', \n", - " specification = spec, \n", - " model_type = \"MNL\", \n", - " names = labels)\n", - "\n", - " m.fit_mle(init_vals = np.array([init_val]))\n", - " return m" - ] - }, - { - "cell_type": "code", - "execution_count": 175, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -3,912.0230\n", - "Initial Log-likelihood: -1,823.3647\n", - "Estimation Time: 0.17 seconds.\n", - "Final log-likelihood: -1,813.8248\n", - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: chosen No. Observations: 1,000\n", - "Model: Multinomial Logit Model Df Residuals: 999\n", - "Method: MLE Df Model: 1\n", - "Date: Sun, 11 Dec 2016 Pseudo R-squ.: 0.536\n", - "Time: 19:42:20 Pseudo R-bar-squ.: 0.536\n", - "converged: True Log-Likelihood: -1,813.825\n", - " LL-Null: -3,912.023\n", - "==============================================================================\n", - " coef std err z P>|z| [95.0% Conf. 
Int.]\n", - "------------------------------------------------------------------------------\n", - "beta_x 1.4422 0.060 24.067 0.000 1.325 1.560\n", - "==============================================================================\n", - "CPU times: user 8.37 s, sys: 15.4 s, total: 23.7 s\n", - "Wall time: 15.1 s\n" - ] - } - ], - "source": [ - "%%time\n", - "m = estimate_model(init_val = 1.2)\n", - "print m.get_statsmodels_summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This looks good: it's very close to the true beta of 1.5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3a. Estimate beta with random sampling of alternatives\n", - "\n", - "This should produce an unbiased estimate of beta" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# In the estimation dataset, for each observation include a row for the\n", - "# chosen alternative, plus K-1 other alternatives sampled randomly\n", - "# without replacement, where K < J.\n", - "\n", - "# Some more notation:\n", - "# - true choice set C = range(J)\n", - "# - restricted choice set D_n is a subset of C, where len(D_n) = K" - ] - }, - { - "cell_type": "code", - "execution_count": 154, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 3 12 13 28 38]\n" - ] - } - ], - "source": [ - "# TO DO - rewrite to use sampling weights\n", - "\n", - "def alts(obs_id, C, K):\n", - " \"\"\"\n", - " This function generates a restricted choice set D for a particular\n", - " observation. 
Expects list `C` of alternatives to sample from (either\n", - " the full choice set or a stratrum), int `K` alternatives to sample,\n", - " and list `choices` of the alt_id chosen for each obs_id. Returns list \n", - " of K alt_id's including the chosen one.\n", - " \"\"\"\n", - " chosen = choices[obs_id] # id of chosen alternative\n", - " unchosen = [i for i in C if chosen != i] # id's of unchosen alts\n", - " sample_unchosen = np.random.choice(unchosen, size=K-1, replace=False).tolist()\n", - " return np.sort([chosen] + sample_unchosen)\n", - " \n", - "print alts(0, range(numalts), 5)" - ] - }, - { - "cell_type": "code", - "execution_count": 176, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Set up the estimation dataset, which can use the same spec as earlier\n", - "\n", - "C = range(numalts) # choice set to sample from\n", - "K = 10\n", - "\n", - "d = [[n, i, int(choices[n]==i), X[i]] for n in range(numobs) for i in alts(n, C, K)]\n", - "df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'chosen', 'x'])" - ] - }, - { - "cell_type": "code", - "execution_count": 146, - "metadata": { - "collapsed": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " obs_id alt_id chosen x\n", - "0 0 2 0 8.730367\n", - "1 0 11 0 2.855760\n", - "2 0 18 0 9.956289\n", - "3 0 28 1 108.045363\n", - "4 0 30 0 105.386425 \n", - "\n", - " obs_id alt_id chosen x\n", - "count 10000.000000 10000.00000 10000.000000 10000.000000\n", - "mean 499.500000 25.51300 0.100000 60.170736\n", - "std 288.689425 14.29645 0.300015 50.009368\n", - "min 0.000000 0.00000 0.000000 0.233499\n", - "25% 249.750000 13.00000 0.000000 6.797005\n", - "50% 499.500000 27.00000 0.000000 100.970300\n", - "75% 749.250000 37.00000 0.000000 106.140668\n", - "max 999.000000 49.00000 1.000000 109.884905\n" - ] - } - ], - "source": [ - "print df.head(), '\\n'\n", - "print df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 177, - 
"metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -2,302.5851\n", - "Initial Log-likelihood: -585.5314\n", - "Estimation Time: 0.01 seconds.\n", - "Final log-likelihood: -578.0528\n", - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: chosen No. Observations: 1,000\n", - "Model: Multinomial Logit Model Df Residuals: 999\n", - "Method: MLE Df Model: 1\n", - "Date: Sun, 11 Dec 2016 Pseudo R-squ.: 0.749\n", - "Time: 19:42:57 Pseudo R-bar-squ.: 0.749\n", - "converged: True Log-Likelihood: -578.053\n", - " LL-Null: -2,302.585\n", - "==============================================================================\n", - " coef std err z P>|z| [95.0% Conf. Int.]\n", - "------------------------------------------------------------------------------\n", - "beta_x 1.4855 0.081 18.243 0.000 1.326 1.645\n", - "==============================================================================\n", - "CPU times: user 594 ms, sys: 380 ms, total: 974 ms\n", - "Wall time: 657 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "m = estimate_model(init_val = 1.2)\n", - "print m.get_statsmodels_summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run 1000x with different samples of alternatives" - ] - }, - { - "cell_type": "code", - "execution_count": 178, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 1min 33s, sys: 38 s, total: 2min 11s\n", - "Wall time: 1min 14s\n" - ] - } - ], - "source": [ - "%%time\n", - "%%capture\n", - "\n", - "beta = []\n", - "C = range(numalts)\n", - "K = 10\n", - "\n", - "for i in range(100):\n", - " d = [[n, i, int(choices[n]==i), X[i]] for n in range(numobs) for i in alts(n, C, K)]\n", - " df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 
'chosen', 'x'])\n", - " m = estimate_model(init_val = 1.2)\n", - " beta.append(m.params.beta_x)" - ] - }, - { - "cell_type": "code", - "execution_count": 179, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "count 100.000000\n", - "mean 1.453900\n", - "std 0.042923\n", - "min 1.371329\n", - "25% 1.426357\n", - "50% 1.450013\n", - "75% 1.484759\n", - "max 1.557182\n", - "dtype: float64\n" - ] - } - ], - "source": [ - "print pd.Series(beta).describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Looks unbiased, as expected. It's very close to the true beta of 1.5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "## 3b. Estimate beta with over-sampling of *irrelevant* alternatives\n", - "\n", - "This should produce a biased estimate of beta, until we add a correction to the estimation procedure" - ] - }, - { - "cell_type": "code", - "execution_count": 187, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Recall that half the values of x are in the range [0, 10] and half are\n", - "# in the range [100, 110]. The taste coefficient is positive, so the first\n", - "# set of alternatives is much less relevant than the second set. 
\n", - "\n", - "C = range(numalts/2) # alternatives to sample from\n", - "K = 10\n", - "\n", - "d = [[n, i, int(choices[n]==i), X[i]] for n in range(numobs) for i in alts(n, C, K)]\n", - "df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'chosen', 'x'])" - ] - }, - { - "cell_type": "code", - "execution_count": 188, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " obs_id alt_id chosen x\n", - "0 0 1 0 7.427010\n", - "1 0 3 0 9.930585\n", - "2 0 5 0 3.436553\n", - "3 0 9 0 6.558043\n", - "4 0 11 0 2.855760 \n", - "\n", - " obs_id alt_id chosen x\n", - "count 10000.000000 10000.000000 10000.000000 10000.000000\n", - "mean 499.500000 14.463900 0.100000 16.177211\n", - "std 288.689425 10.195033 0.300015 31.248750\n", - "min 0.000000 0.000000 0.000000 0.233499\n", - "25% 249.750000 6.000000 0.000000 3.436553\n", - "50% 499.500000 13.000000 0.000000 6.797005\n", - "75% 749.250000 20.000000 0.000000 8.371036\n", - "max 999.000000 45.000000 1.000000 109.884905\n" - ] - } - ], - "source": [ - "print df.head(), '\\n'\n", - "print df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 189, - "metadata": { - "collapsed": false, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -2,302.5851\n", - "Initial Log-likelihood: 0.0000\n", - "Estimation Time: 0.00 seconds.\n", - "Final log-likelihood: 0.0000\n", - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: chosen No. 
Observations: 1,000\n", - "Model: Multinomial Logit Model Df Residuals: 999\n", - "Method: MLE Df Model: 1\n", - "Date: Sun, 11 Dec 2016 Pseudo R-squ.: 1.000\n", - "Time: 20:00:33 Pseudo R-bar-squ.: 1.000\n", - "converged: True Log-Likelihood: 0.000\n", - " LL-Null: -2,302.585\n", - "==============================================================================\n", - " coef std err z P>|z| [95.0% Conf. Int.]\n", - "------------------------------------------------------------------------------\n", - "beta_x 1.5000 nan nan nan nan nan\n", - "==============================================================================\n", - "CPU times: user 635 ms, sys: 373 ms, total: 1.01 s\n", - "Wall time: 674 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "m = estimate_model(init_val = 1.5)\n", - "print m.get_statsmodels_summary()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - 
"outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. MNL with sampling correction" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Utility of alternative j:\n", - "$$ V_{j} = \\beta x_{j} $$\n", - "\n", - "With sampling, we have to account for the restricted choice set (from Eq 6 in Guevara & Ben-Akiva 2013):\n", - "\n", - "$$ V_j = \\beta x_j + \\ln \\pi(D \\mid j) $$\n", - "\n", - "Where pi is the conditional probability that we would construct the choice set D given that alternative j was chosen. This goes into the likelihood function in both the numerator and denominator.\n", - "\n", - "$$ L_n = \\frac {exp(\\beta x_i + \\ln \\pi(D_n \\mid i))} {\\sum_{j \\epsilon D_n} exp(\\beta x_j + \\ln \\pi(D_n \\mid j))} $$\n", - "\n", - "How to calculate pi? From the original formulation of this in McFadden 1978: \"Suppose D is comprized of i plus a sample of alternatives from the set C\\\\{i}, obtained by considering each element of this set independently, and including it with probability p. Then, the probability of D will depend solely on the number of elements K it contains.\"\n", - "\n", - "$$ \\pi(D) = p^{K-1} (1 - p)^{J-K} $$\n", - "\n", - "(?? 
Without replacement, i think it should be the n-choose-k binomial coefficient, where n=J-1 and k=K-1)\n", - "\n", - "$$ \\pi(D) = {n \\choose k} = \\frac {(K-1)!(J-K)!} {(J-1)!} $$\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Add a column in the estimation data for the constant\n", - "\n", - "d = [[n, i, int(C[n]==i), X[i], 1] for n in range(N) for i in alts(n)]\n", - "\n", - "df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'choice', 'x', 'const'])\n", - "\n", - "print df.head(), '\\n'\n", - "print df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "spec2 = OrderedDict([\n", - " ('x', [range(J)]),\n", - " ('const', [range(J)])\n", - " ])\n", - "\n", - "labels2 = OrderedDict([\n", - " ('x', ['beta_x']),\n", - " ('const', ['constant'])\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Try binomial formula\n", - "\n", - "j=3\n", - "k=2\n", - "\n", - "fact = np.math.factorial\n", - "\n", - "float(fact(k-1)*fact(j-k))/fact(j-1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "%%time\n", - "m = pylogit.create_choice_model(data = df, \n", - " alt_id_col = 'alt_id', \n", - " obs_id_col = 'obs_id', \n", - " choice_col = 'choice', \n", - " specification = spec2, \n", - " model_type = \"MNL\", \n", - " names = labels2)\n", - "\n", - "# p = float(K-1)/(J-1)\n", - "# const = np.log(p**(K-1) * (1-p)**(J-K))\n", - "\n", - "const = np.log(float(fact(K-1)*fact(J-K))/fact(J-1))\n", - "\n", - "# Add an initial value for the constant and constrain it to that\n", - "m.fit_mle(init_vals = np.array([0, const]), \n", - " constrained_pos=[1])\n", - "\n", - "print m.get_statsmodels_summary()" - ] - }, 
- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run 1000x with different samples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# try binomial formula\n", - "const = np.log(float(fact(K-1)*fact(J-K))/fact(J-1))\n", - "\n", - "def estimate_beta_with_correction():\n", - " d = [[n, i, int(C[n]==i), X[i], 1] for n in range(N) for i in alts(n)]\n", - " df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'choice', 'x', 'const'])\n", - " m = pylogit.create_choice_model(df, 'alt_id', 'obs_id', 'choice', spec2, 'MNL', names=labels2)\n", - " m.fit_mle(init_vals = np.array([0, const]), constrained_pos=[1])\n", - " return m.params.beta_x" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "%%time\n", - "%%capture\n", - "\n", - "beta = []\n", - "for i in range(1000):\n", - " beta.append(estimate_beta_with_correction())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print pd.Series(beta).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/_archive/CHTS-exploration-01.ipynb b/notebooks/_archive/CHTS-exploration-01.ipynb deleted file mode 100644 index 652d11a..0000000 --- a/notebooks/_archive/CHTS-exploration-01.ipynb +++ /dev/null @@ -1,2051 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exploring the public CHTS data\n", - "\n", - "Sam Maurer, June 2017\n", - "\n", - "Python 3.6" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "\n", - "import matplotlib\n", - "import numpy as np\n", - "import pandas as pd\n", - "import zipfile" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# See ../data/README.md for instructions about how to get the data\n", - "\n", - "z = zipfile.ZipFile('../data/caltrans_full_survey.zip')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Households\n", - "\n", - "Households that participated in the travel diary survey" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "42426" - ] - }, - "execution_count": 7, - 
"metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households = pd.read_csv(z.open('caltrans_full_survey/survey_households.csv'), low_memory=False)\n", - "\n", - "len(households)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "9715" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Limit to the Bay Area\n", - "\n", - "households_ba = households[households.home_county_id.isin([1, 13, 41, 55, 75, 81, 85, 95, 97])]\n", - "\n", - "len(households_ba)" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "SAN FRANCISCO 1076\n", - "SAN JOSE 939\n", - "OAKLAND 459\n", - "SANTA ROSA 321\n", - "BERKELEY 251\n", - "NAPA 228\n", - "PALO ALTO 218\n", - "SUNNYVALE 200\n", - "SAN MATEO 197\n", - "FREMONT 177\n", - "WALNUT CREEK 173\n", - "REDWOOD CITY 170\n", - "FAIRFIELD 159\n", - "CONCORD 158\n", - "SAN RAFAEL 158\n", - "Name: home_city, dtype: int64" - ] - }, - "execution_count": 75, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Top home locations\n", - "\n", - "households_ba.home_city.value_counts()[:15]" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 42421.000000\n", - "mean 2.571462\n", - "std 1.373733\n", - "min 1.000000\n", - "25% 2.000000\n", - "50% 2.000000\n", - "75% 3.000000\n", - "max 8.000000\n", - "Name: persons_count, dtype: float64" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households.persons_count.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ 
- "count 42421.000000\n", - "mean 0.999955\n", - "std 0.704667\n", - "min 0.003498\n", - "25% 0.447392\n", - "50% 0.915924\n", - "75% 1.376790\n", - "max 5.400840\n", - "Name: hhwgt, dtype: float64" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households.hhwgt.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 42421.000000\n", - "mean 293.007784\n", - "std 206.482227\n", - "min 1.025146\n", - "25% 131.095416\n", - "50% 268.385115\n", - "75% 403.428487\n", - "max 1582.559559\n", - "Name: exphhwgt, dtype: float64" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households.exphhwgt.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 42421\n", - "mean 6056293182\n", - "std 29445570\n", - "min 6001400100\n", - "25% 6037207301\n", - "50% 6059042114\n", - "75% 6079011200\n", - "max 6115041100\n", - "Name: home_tract_id, dtype: float64" - ] - }, - "execution_count": 79, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households.home_tract_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Persons" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "109113" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons = pd.read_csv(z.open('caltrans_full_survey/survey_person.csv'), low_memory=False)\n", - "\n", - "len(persons)" - ] - }, - { - 
"cell_type": "code", - "execution_count": 94, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " sampno perno travel_date gender relation education race1\n", - "0 7128119 1 2013-01-27 1 1 6 1\n", - "1 7128119 3 2013-01-27 2 3 1 1\n", - "2 7128138 1 2012-11-05 2 1 5 1\n", - "3 7128262 1 2012-12-21 2 1 1 1\n", - "4 7128262 3 2012-12-21 2 3 2 1\n", - "5 7128262 2 2012-12-21 1 2 1 1\n", - "6 7128288 2 2013-01-22 1 3 3 1\n", - "7 7128288 1 2013-01-22 2 1 5 1\n", - "8 7128316 1 2012-12-29 2 1 4 1\n", - "9 7128372 1 2012-12-29 2 1 6 1\n" - ] - } - ], - "source": [ - "print(persons[['sampno', 'perno', 'travel_date', 'gender', 'relation', \n", - " 'education', 'race1']].head(10))" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 108776.000000\n", - "mean 3.233838\n", - "std 2.954577\n", - "min 0.000000\n", - "25% 1.000000\n", - "50% 2.000000\n", - "75% 5.000000\n", - "max 33.000000\n", - "Name: person_trips, dtype: float64" - ] - }, - "execution_count": 85, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# What is `person_trips`? 
-- not sure, but it looks related to the `tripno` field\n", - "\n", - "persons.person_trips.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 43111\n", - "mean 6241008094\n", - "std 3120182944\n", - "min 2614\n", - "25% 6037238200\n", - "50% 6059063907\n", - "75% 6079011001\n", - "max 99999999999\n", - "Name: empl_tract_id, dtype: float64" - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.empl_tract_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 109113\n", - "unique 2\n", - "top False\n", - "freq 66002\n", - "Name: empl_tract_id, dtype: object" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.empl_tract_id.notnull().describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 25438\n", - "mean 6342776654\n", - "std 3678070397\n", - "min 4005001000\n", - "25% 6037232875\n", - "50% 6059062642\n", - "75% 6079010205\n", - "max 99999999999\n", - "Name: school_tract_id, dtype: float64" - ] - }, - "execution_count": 81, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.school_tract_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 109113\n", - "unique 2\n", - "top False\n", - "freq 83675\n", - "Name: school_tract_id, dtype: object" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.school_tract_id.notnull().describe()" - ] - }, - { - "cell_type": 
"code", - "execution_count": 48, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 109113.000000\n", - "mean 0.999999\n", - "std 0.962373\n", - "min 0.000568\n", - "25% 0.322230\n", - "50% 0.717519\n", - "75% 1.329846\n", - "max 5.060089\n", - "Name: perwgt, dtype: float64" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons.perwgt.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Places\n", - "\n", - "Each record represents a single visit to a place" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "460524" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places = pd.read_csv(z.open('caltrans_full_survey/survey_place.csv'), low_memory=False)\n", - "\n", - "len(places)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "117345" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Filter for places visited by people who live in the Bay Area (may want to do use a\n", - "# different filter depending on the application)\n", - "\n", - "places_ba = places[places.sampno.isin(households_ba.sampno)]\n", - "\n", - "len(places_ba)" - ] - }, - { - "cell_type": "code", - "execution_count": 93, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " sampno perno plano tripno\n", - "0 1031985 1 1 nan\n", - "1 1031985 1 2 1\n", - "2 1031985 1 3 2\n", - "3 1031985 2 1 nan\n", - "4 1031985 2 2 1\n", - "5 
1031985 2 3 2\n", - "118 1033944 1 1 nan\n", - "119 1033944 1 2 1\n", - "120 1033944 1 3 2\n", - "121 1033944 1 4 3\n" - ] - } - ], - "source": [ - "# Is there a unique identifier?\n", - "\n", - "# Might need to use combination of `sampno` (household), `perno` (person within hh),\n", - "# `plano` (place within person's travel diary)\n", - "\n", - "# What's `tripno`? (\"unlinked trip ID\" - maybe representing transfer between modes)\n", - "\n", - "print(places_ba[['sampno', 'perno', 'plano', 'tripno']].head(10))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "117345" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Is every combination of `sampno`, `perno`, `plano` unique? -- Yes\n", - "\n", - "len(places_ba.groupby(['sampno', 'perno', 'plano']))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "93406" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# How many places have a `tripno`? -- about 80%\n", - "\n", - "places_ba.tripno.count()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "117345" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Is the `tripno` ever repeated? 
-- No\n", - "\n", - "len(places_ba.groupby(['sampno', 'perno', 'plano', 'tripno']))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 93406.000000\n", - "mean 3.817185\n", - "std 2.841705\n", - "min 1.000000\n", - "25% 2.000000\n", - "50% 3.000000\n", - "75% 5.000000\n", - "max 32.000000\n", - "Name: tripno, dtype: float64" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places_ba.tripno.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0 REDACTED\n", - "1 REDACTED\n", - "2 REDACTED\n", - "3 REDACTED\n", - "4 REDACTED\n", - "Name: place_name, dtype: object" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Can we see the place names? -- No\n", - "\n", - "places_ba.place_name.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "SAN FRANCISCO 15680\n", - "SAN JOSE 11414\n", - "OAKLAND 5455\n", - "SANTA ROSA 3441\n", - "BERKELEY 3185\n", - "PALO ALTO 2664\n", - "SUNNYVALE 2440\n", - "SAN MATEO 2190\n", - "NAPA 2160\n", - "FREMONT 2126\n", - "REDWOOD CITY 2067\n", - "MOUNTAIN VIEW 1948\n", - "WALNUT CREEK 1896\n", - "SANTA CLARA 1816\n", - "CONCORD 1800\n", - "Name: city, dtype: int64" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places_ba.city.value_counts().head(15)" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZUAAAD8CAYAAAC/1zkdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGWJJREFUeJzt3X+0XWV95/H3R2IRVJAfkdL86A0l0gkZDeQOKzOM1ja1\npGINdqENqyNxyiIyMFamrNUG6qp21sosmKq0zAyxUWgCRSDyQzIVOgZwyXQtA15oSiBIuUiUew0k\nAkP8AcHAZ/44z7Gby73JSbLP3eeQz2uts85zvns/e38PEL7Z+3nOfmSbiIiIOryh6QQiIuL1I0Ul\nIiJqk6ISERG1SVGJiIjapKhERERtUlQiIqI2KSoREVGbFJWIiKhNikpERNRmStMJTLajjz7aAwMD\nTacREdFX7r///h/anrqn/Q64ojIwMMDQ0FDTaURE9BVJ3+tkv9z+ioiI2qSoREREbVJUIiKiNikq\nERFRmxSViIioTYpKRETUJkUlIiJqk6ISERG1SVGJiIjaHHC/qN8fA8u/tl/9t1x6ek2ZRET0plyp\nREREbVJUIiKiNikqERFRmxSViIioTYpKRETUJkUlIiJqk6ISERG16VpRkTRD0jckbZb0sKRPlviR\nktZLeqy8H1Hpc7GkYUmPSjqtEp8vaVPZdoUklfjBkm4s8XslDXTr+0RExJ5180plF3CR7TnAAuAC\nSXOA5cBdtmcDd5XPlG1LgBOBRcCVkg4qx1oJnAvMLq9FJX4O8Jzt44HLgcu6+H0iImIPulZUbG+1\n/UBp/wh4BJgGLAbWlN3WAGeU9mLgBts7bT8BDAOnSDoWOMz2BtsGrhnTp32sm4CF7auYiIiYfJMy\nplJuS50E3AscY3tr2fQUcExpTwOerHQbKbFppT02/qo+tncBzwNH1f4FIiKiI10vKpLeAtwMXGh7\nR3VbufLwJOSwTNKQpKHt27d3+3QREQesrhYVSW+kVVCus31LCT9dbmlR3reV+Cgwo9J9eomNlvbY\n+Kv6SJoCHA48MzYP26tsD9oenDp1ah1fLSIixtHN2V8CrgIesf35yqZ1wNLSXgrcVokvKTO6ZtEa\nkL+v3CrbIWlBOebZY/q0j3UmcHe5+omIiAZ089H3pwIfBTZJ2lhilwCXAmslnQN8D/gIgO2HJa0F\nNtOaOXaB7ZdLv/OB1cAhwB3lBa2ida2kYeBZWrPHIiKiIV0rKrb/AZhoJtbCCfqsAFaMEx8C5o4T\nfxH48H6kGRERNcov6iMiojYpKhERUZsUlYiIqE2KSkRE1CZFJSIiapOiEhERtUlRiYiI2qSoRERE\nbVJUIiKiNikqERFRmxSViIioTYpKRETUJkUlIiJqk6ISERG1SVGJiIjapKhERERturmc8NWStkl6\nqBK7UdLG8trSXhFS0oCkFyrbvlDpM1/SJknDkq4oSwpTlh2+scTvlTTQre8SERGd6eaVympgUTVg\n+/dsz7M9D7gZuKWy+fH2NtvnVeIrgXNprVk/u3LMc4DnbB8PXA5c1p2vERERnepaUbF9D61141+j\nXG18BLh+d8eQdCxwmO0Ntg1cA5xRNi8G1pT2TcDC9lVMREQ0o6kxlXcDT9t+rBKbVW59fVPSu0ts\nGjBS2WekxNrbngSwvQt4HjhqvJNJWiZpSNLQ9u3b6/weERFR0VRROYtXX6VsBWaW22J/BHxZ0mF1\nncz2KtuDtgenTp1a12EjImKMKZN9QklTgN8F5rdjtncCO0v7fkmPA+8ARoHple7TS4zyPgMYKcc8\nHHim618gIiIm1MSVym8C37H989takqZKOqi0j6M1IP9d21uBHZIWlPGSs4HbSrd1wNLSPhO4u4y7\nREREQ7o5pfh64FvACZJGJJ1TNi3htQP07wEeLFOMbwLOs90e5D8f+BIwDDwO3FHiVwFHSRqmdcts\nebe+S0REdKZrt79snzVB/GPjxG6mNcV4vP2HgLnjxF8EPrx/W
UZERJ3yi/qIiKhNikpERNQmRSUi\nImqTohIREbVJUYmIiNqkqERERG1SVCIiojYpKhERUZsUlYiIqE2KSkRE1CZFJSIiapOiEhERtUlR\niYiI2qSoREREbVJUIiKiNt1cpOtqSdskPVSJfUbSqKSN5fX+yraLJQ1LelTSaZX4fEmbyrYrygqQ\nSDpY0o0lfq+kgW59l4iI6Ew3r1RWA4vGiV9ue1553Q4gaQ6tFSFPLH2ubC8vDKwEzqW1xPDsyjHP\nAZ6zfTxwOXBZt75IRER0pmtFxfY9wLN73LFlMXCD7Z22n6C1dPApko4FDrO9oaw/fw1wRqXPmtK+\nCVjYvoqJiIhmNDGm8glJD5bbY0eU2DTgyco+IyU2rbTHxl/Vx/Yu4HngqG4mHhERuzfZRWUlcBww\nD9gKfG4yTippmaQhSUPbt2+fjFNGRByQJrWo2H7a9su2XwG+CJxSNo0CMyq7Ti+x0dIeG39VH0lT\ngMOBZyY47yrbg7YHp06dWtfXiYiIMSa1qJQxkrYPAe2ZYeuAJWVG1yxaA/L32d4K7JC0oIyXnA3c\nVumztLTPBO4u4y4REdGQKd06sKTrgfcCR0saAT4NvFfSPMDAFuDjALYflrQW2AzsAi6w/XI51Pm0\nZpIdAtxRXgBXAddKGqY1IWBJt75LRER0pmtFxfZZ44Sv2s3+K4AV48SHgLnjxF8EPrw/OUZERL3y\ni/qIiKhNikpERNQmRSUiImqTohIREbVJUYmIiNqkqERERG1SVCIiojZd+51KvNbA8q/tc98tl55e\nYyYREd3R0ZWKpH/d7UQiIqL/dXr760pJ90k6X9LhXc0oIiL6VkdFxfa7gd+n9VTg+yV9WdL7uppZ\nRET0nY4H6m0/BnwK+BPg14ArJH1H0u92K7mIiOgvnY6pvFPS5cAjwG8Av2P7X5X25V3MLyIi+kin\ns7/+B/Al4BLbL7SDtn8g6VNdySwiIvpOp0XldOCF9honkt4AvMn2T21f27XsIiKir3Q6pnInrUWy\n2g4tsQlJulrSNkkPVWJ/UcZhHpR0q6S3lfiApBckbSyvL1T6zJe0SdKwpCvKCpCUVSJvLPF7JQ10\n+F0iIqJLOi0qb7L94/aH0j50D31WA4vGxNYDc22/E/hn4OLKtsdtzyuv8yrxlcC5tJYYnl055jnA\nc7aPpzWuc1mH3yUiIrqk06LyE0kntz9Img+8sJv9sX0PrWV+q7Gv295VPm4Apu/uGGVN+8Nsbyjr\nz18DnFE2LwbWlPZNwML2VUxERDSj0zGVC4GvSPoBIOAXgd/bz3P/AXBj5fMsSRuB54FP2f6/wDRg\npLLPSIlR3p8EsL1L0vPAUcAP9zOviIjYRx0VFdvflvSrwAkl9Kjtn+3rSSX9KbALuK6EtgIzbT9T\nroK+KunEfT3+OOdbBiwDmDlzZl2HjYiIMfbmgZL/BhgofU6WhO1r9vaEkj4GfABYWG5pYXsnsLO0\n75f0OPAOYJRX3yKbXmKU9xnAiKQpwOHAM+Od0/YqYBXA4OCg9zbniIjoTEdFRdK1wK8AG4GXS7g9\nxtExSYuAPwZ+zfZPK/GpwLO2X5Z0HK0B+e/aflbSDkkLgHuBs2n9ZgZgHbAU+BZwJnB3u0hFREQz\nOr1SGQTm7M3/tCVdD7wXOFrSCPBpWrO9DgbWlzH1DWWm13uA/yrpZ8ArwHm224P859OaSXYIcEd5\nAVwFXCtpmNaEgCWd5hYREd3RaVF5iNbg/NZOD2z7rHHCV02w783AzRNsGwLmjhN/Efhwp/lERET3\ndVpUjgY2S7qPMvYBYPuDXckqIiL6UqdF5TPdTCIiIl4fOp1S/E1JvwzMtn2npEOBg7qbWkRE9JtO\nH31/Lq1frf91CU0DvtqtpCIioj91+piWC4BTgR3w8wW73t6tpCIioj91WlR22n6p/aH82DC/CYmI\niFfptKh8U9IlwCFlbfqvA
P+7e2lFREQ/6rSoLAe2A5uAjwO301qvPiIi4uc6nf31CvDF8oqIiBhX\np8/+eoJxxlBsH1d7RhER0bf25tlfbW+i9XiUI+tPJyIi+llHYyq2n6m8Rm3/JXB6l3OLiIg+0+nt\nr5MrH99A68plb9ZiiYiIA0CnheFzlfYuYAvwkdqziYiIvtbp7K9f73YiERHR/zq9/fVHu9tu+/P1\npBMREf2s0x8/DgL/idaDJKcB5wEnA28tr9eQdLWkbZIeqsSOlLRe0mPl/YjKtoslDUt6VNJplfh8\nSZvKtitUloyUdLCkG0v8XkkDe/fVIyKibp0WlenAybYvsn0RMB+YafvPbf/5BH1WA4vGxJYDd9me\nDdxVPiNpDq3lgE8sfa6U1H60/krgXFrr1s+uHPMc4DnbxwOXA5d1+F0iIqJLOi0qxwAvVT6/VGIT\nsn0PrbXjqxYDa0p7DXBGJX6D7Z22nwCGgVMkHQscZnuDbQPXjOnTPtZNwML2VUxERDSj09lf1wD3\nSbq1fD6Df/kf+t44xnZ7nfun+JfCNA3YUNlvpMR+Vtpj4+0+TwLY3iXpeeAo4IdjTyppGbAMYObM\nmfuQdkREdKLT2V8rJN0BvLuE/qPtf9yfE9u2pEl5fL7tVcAqgMHBwb58ZP/A8q/tc98tl+Z3qhEx\nOTq9/QVwKLDD9l8BI5Jm7cP5ni63tCjv20p8FJhR2W96iY2W9tj4q/qU9V0OB57Zh5wiIqImnS4n\n/GngT4CLS+iNwN/uw/nWAUtLeylwWyW+pMzomkVrQP6+cqtsh6QFZbzk7DF92sc6E7i7jLtERERD\nOh1T+RBwEvAAgO0fSBp3KnGbpOuB9wJHSxoBPg1cCqyVdA7wPcqv8m0/LGktsJnWL/YvsP1yOdT5\ntGaSHQLcUV4AVwHXShqmNSFgSYffJSIiuqTTovJSdQxE0pv31MH2WRNsWjjB/iuAFePEh4C548Rf\npPW05IiI6BGdjqmslfTXwNsknQvcSRbsioiIMTqd/fXZsjb9DuAE4M9sr+9qZhER0Xf2WFTKL9vv\nLA+VTCGJiIgJ7fH2Vxkwf0XS4ZOQT0RE9LFOB+p/DGyStB74STto+w+7klVERPSlTovKLeUVEREx\nod0WFUkzbX/f9r485ysiIg4wexpT+Wq7IenmLucSERF9bk9Fpfoo+eO6mUhERPS/PRUVT9COiIh4\njT0N1L9L0g5aVyyHlDbls20f1tXsIiKir+y2qNg+aHfbIyIiqvZmPZWIiIjdSlGJiIjapKhERERt\nJr2oSDpB0sbKa4ekCyV9RtJoJf7+Sp+LJQ1LelTSaZX4fEmbyrYryuqQERHRkEkvKrYftT3P9jxg\nPvBT4Nay+fL2Ntu3A0iaQ2tVxxOBRcCV5cnJACuBc2ktPzy7bI+IiIY0fftrIfC47e/tZp/FwA22\nd9p+AhgGTpF0LHCY7Q1lbfprgDO6n3JEREyk6aKyBLi+8vkTkh6UdLWkI0psGvBkZZ+REptW2mPj\nERHRkMaKiqRfAD4IfKWEVtJ6FMw8YCvwuRrPtUzSkKSh7du313XYiIgYo9NH33fDbwMP2H4aoP0O\nIOmLwN+Vj6PAjEq/6SU2Wtpj469hexWwCmBwcPCAe9zMwPKv7XPfLZeeXmMmEfF61+Ttr7Oo3Poq\nYyRtHwIeKu11wBJJB0uaRWtA/j7bW4EdkhaUWV9nA7dNTuoRETGeRq5UJL0ZeB/w8Ur4v0uaR+vB\nlVva22w/LGktsBnYBVxQljgGOB9YDRwC3FFeERHRkEaKiu2fAEeNiX10N/uvAFaMEx8C5taeYERE\n7JOmZ39FRMTrSIpKRETUJkUlIiJqk6ISERG1SVGJiIjapKhERERtUlQiIqI2KSoREVGbFJWIiKhN\nikpERNQmRSUiImqTohIREbVJUYmIiNqkqERERG2aXPkx+kBWjYyIvdHIlYqkLZI2Sdooaaj
EjpS0\nXtJj5f2Iyv4XSxqW9Kik0yrx+eU4w5KuKCtARkREQ5q8/fXrtufZHiyflwN32Z4N3FU+I2kOsAQ4\nEVgEXCnpoNJnJXAurSWGZ5ftERHRkF4aU1kMrCntNcAZlfgNtnfafgIYBk4pa9ofZnuDbQPXVPpE\nREQDmioqBu6UdL+kZSV2jO2tpf0UcExpTwOerPQdKbFppT02HhERDWlqoP7f2x6V9HZgvaTvVDfa\ntiTXdbJSuJYBzJw5s67DRkTEGI1cqdgeLe/bgFuBU4Cnyy0tyvu2svsoMKPSfXqJjZb22Ph451tl\ne9D24NSpU+v8KhERUTHpRUXSmyW9td0Gfgt4CFgHLC27LQVuK+11wBJJB0uaRWtA/r5yq2yHpAVl\n1tfZlT4REdGAJm5/HQPcWmb/TgG+bPvvJX0bWCvpHOB7wEcAbD8saS2wGdgFXGD75XKs84HVwCHA\nHeUVERENmfSiYvu7wLvGiT8DLJygzwpgxTjxIWBu3TlGRMS+6aUpxRER0edSVCIiojZ59ld0zf48\nNwzy7LCIfpQrlYiIqE2KSkRE1CZFJSIiapOiEhERtUlRiYiI2qSoREREbVJUIiKiNikqERFRmxSV\niIioTYpKRETUJkUlIiJqk2d/Rc/an2eH5blhEc3IlUpERNSmieWEZ0j6hqTNkh6W9MkS/4ykUUkb\ny+v9lT4XSxqW9Kik0yrx+ZI2lW1XlGWFIyKiIU3c/toFXGT7gbJW/f2S1pdtl9v+bHVnSXOAJcCJ\nwC8Bd0p6R1lSeCVwLnAvcDuwiCwpHBHRmEm/UrG91fYDpf0j4BFg2m66LAZusL3T9hPAMHCKpGOB\nw2xvsG3gGuCMLqcfERG70ehAvaQB4CRaVxqnAp+QdDYwROtq5jlaBWdDpdtIif2stMfGxzvPMmAZ\nwMyZM2v9DtGbMsgf0YzGBuolvQW4GbjQ9g5at7KOA+YBW4HP1XUu26tsD9oenDp1al2HjYiIMRop\nKpLeSKugXGf7FgDbT9t+2fYrwBeBU8ruo8CMSvfpJTZa2mPjERHRkCZmfwm4CnjE9ucr8WMru30I\neKi01wFLJB0saRYwG7jP9lZgh6QF5ZhnA7dNypeIiIhxNTGmcirwUWCTpI0ldglwlqR5gIEtwMcB\nbD8saS2wmdbMsQvKzC+A84HVwCG0Zn1l5lfst4zHROy7SS8qtv8BGO/3JLfvps8KYMU48SFgbn3Z\nRUTE/sgv6iMiojZ59ldEjXLrLA50uVKJiIja5Eolokfsz1UO5EonekOKSsTrRG69RS/I7a+IiKhN\nrlQiIlc5UZsUlYjYLylIUZWiEhGN2d/JCfsjBa07MqYSERG1yZVKRByQctuuO1JUIiL2UgrSxFJU\nIiIm0et9HCljKhERUZsUlYiIqE3fFxVJiyQ9KmlY0vKm84mIOJD1dVGRdBDwv4DfBubQWj1yTrNZ\nRUQcuPq6qACnAMO2v2v7JeAGYHHDOUVEHLD6vahMA56sfB4psYiIaMABMaVY0jJgWfn4Y0mP7uOh\njgZ+WE9WXdPrOfZ6fpAc69Dr+UHv51h7frpsv7r/cic79XtRGQVmVD5PL7FXsb0KWLW/J5M0ZHtw\nf4/TTb2eY6/nB8mxDr2eH/R+jr2e30T6/fbXt4HZkmZJ+gVgCbCu4ZwiIg5YfX2lYnuXpP8M/B/g\nIOBq2w83nFZExAGrr4sKgO3bgdsn6XT7fQttEvR6jr2eHyTHOvR6ftD7OfZ6fuOS7aZziIiI14l+\nH1OJiIgekqLSoV5+HIykGZK+IWmzpIclfbLpnCYi6SBJ/yjp75rOZTyS3ibpJknfkfSIpH/bdE5V\nkv5L+Xf8kKTrJb2pB3K6WtI2SQ9VYkdKWi/psfJ+RA/m+Bfl3/ODkm6V9LZeyq+y7SJJlnR0E7nt\nrRSVDvTB42B2ARfZngMsAC7osfyqPgk80nQSu/FXwN/
b/lXgXfRQrpKmAX8IDNqeS2tyypJmswJg\nNbBoTGw5cJft2cBd5XOTVvPaHNcDc22/E/hn4OLJTqpiNa/ND0kzgN8Cvj/ZCe2rFJXO9PTjYGxv\ntf1Aaf+I1v8Ie+7JApKmA6cDX2o6l/FIOhx4D3AVgO2XbP+/ZrN6jSnAIZKmAIcCP2g4H2zfAzw7\nJrwYWFPaa4AzJjWpMcbL0fbXbe8qHzfQ+p1bIyb4ZwhwOfDHQN8MfqeodKZvHgcjaQA4Cbi32UzG\n9Ze0/oC80nQiE5gFbAf+ptyi+5KkNzedVJvtUeCztP7WuhV43vbXm81qQsfY3lraTwHHNJlMB/4A\nuKPpJKokLQZGbf9T07nsjRSV1xFJbwFuBi60vaPpfKokfQDYZvv+pnPZjSnAycBK2ycBP6H52zY/\nV8YlFtMqfr8EvFnSf2g2qz1za4ppz/5NW9Kf0rqFfF3TubRJOhS4BPizpnPZWykqnenocTBNkvRG\nWgXlOtu3NJ3POE4FPihpC63bh78h6W+bTek1RoAR2+2rvJtoFZle8ZvAE7a32/4ZcAvw7xrOaSJP\nSzoWoLxvazifcUn6GPAB4PfdW7+v+BVaf3n4p/JnZjrwgKRfbDSrDqSodKanHwcjSbTGAR6x/fmm\n8xmP7YttT7c9QOuf3922e+pv2bafAp6UdEIJLQQ2N5jSWN8HFkg6tPw7X0gPTSQYYx2wtLSXArc1\nmMu4JC2idTv2g7Z/2nQ+VbY32X677YHyZ2YEOLn8N9rTUlQ6UAbz2o+DeQRY22OPgzkV+Citv/1v\nLK/3N51Un/oEcJ2kB4F5wH9rOJ+fK1dQNwEPAJto/flt/FfXkq4HvgWcIGlE0jnApcD7JD1G6wrr\n0h7M8X8CbwXWlz8zX+ix/PpSflEfERG1yZVKRETUJkUlIiJqk6ISERG1SVGJiIjapKhERERtUlQi\nIqI2KSoREVGbFJWIiKjN/we9J82/i0JuNwAAAABJRU5ErkJggg==\n", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "places_ba.trip_distance_miles.plot.hist(bins=20, range=(0,15));" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "2296" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Most detailed spatial identifier in public data is tract_id\n", - "\n", - "# How many different tracts are visited?\n", - "places_ba.tract_id.unique().shape[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "9715" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# How many different households?\n", - "places_ba.sampno.unique().shape[0]" - ] - }, - { 
- "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "23939" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# How many different people?\n", - "len(places_ba.groupby(['sampno','perno']))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Census identifiers" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Suppress scientific notation\n", - "\n", - "pd.set_option('display.float_format', lambda x: '%.0f' % x)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "6912\n", - "14388\n" - ] - } - ], - "source": [ - "# Is the mapping between census tracts and city names consistent? 
-- No\n", - "\n", - "print(places.tract_id.drop_duplicates().shape[0])\n", - "print(places[['tract_id', 'city']].drop_duplicates().shape[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 460519\n", - "mean 191724\n", - "std 242716\n", - "min 100\n", - "25% 5911\n", - "50% 43317\n", - "75% 402800\n", - "max 999999\n", - "Name: tract_id, dtype: float64" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places.tract_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 460519\n", - "mean 58\n", - "std 50\n", - "min 1\n", - "25% 37\n", - "50% 59\n", - "75% 79\n", - "max 999\n", - "Name: county_id, dtype: float64" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places.county_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 460523\n", - "mean 6\n", - "std 5\n", - "min 1\n", - "25% 6\n", - "50% 6\n", - "75% 6\n", - "max 99\n", - "Name: state_id, dtype: float64" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places.state_id.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "6 455641\n", - "99 1064\n", - "32 957\n", - "41 454\n", - "4 412\n", - "Name: state_id, dtype: int64" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places.state_id.value_counts().head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": { - "collapsed": 
false - }, - "outputs": [], - "source": [ - "# How to deal with this? I think `tract_id` is an integer representation\n", - "# of the 4-digit tract ID within the couty plus the 2 digit suffix. \n", - "\n", - "# So the full unique identifier is `state_id` + `county_id` (3 digits) + `tract_id` (6 digits)\n", - "\n", - "places['_full_tract_id'] = places.state_id * 1e9 + places.county_id * 1e6 + places.tract_id\n", - "\n", - "# Presumably the all-9 entries reflect missing data, but documentation doesn't specify\n", - "\n", - "places.ix[(places.tract_id == 999999) |\n", - " (places.county_id == 999) |\n", - " (places.state_id == 99), '_full_tract_id'] = np.nan" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9098\n", - "14194\n" - ] - } - ], - "source": [ - "print(places._full_tract_id.drop_duplicates().shape[0])\n", - "print(places[['_full_tract_id', 'city']].drop_duplicates().shape[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "6115041100 14\n", - "6091010000 12\n", - "6027000800 11\n", - "6107000100 10\n", - "6097154303 10\n", - "Name: _full_tract_id, dtype: int64" - ] - }, - "execution_count": 73, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places[['_full_tract_id', 'city']].drop_duplicates().\\\n", - " _full_tract_id.value_counts().head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " _full_tract_id city\n", - "3238 6115041100 BROWNSVILLE\n", - "18952 6115041100 MARYSVILLE\n", - "33913 6115041100 NORTH SAN JUAN\n", - "44697 6115041100 DOBBINS\n", - "44705 6115041100 YUBA\n", - "100194 6115041100 BANGOR\n", - "160254 6115041100 CAMPTONVILLE\n", - 
"178724 6115041100 STRAWBERRY VALLEY\n", - "271235 6115041100 CHALLENGE-BROWNSVILLE\n", - "271250 6115041100 OREGON HOUSE\n", - "300021 6115041100 FORBESTOWN\n", - "317626 6115041100 CHALLENGE-BROWNSVILL\n", - "402446 6115041100 BROWNS VALLEY\n", - "403959 6115041100 RACKERBY\n" - ] - } - ], - "source": [ - "print(places[['_full_tract_id', 'city']].drop_duplicates().\\\n", - " loc[places._full_tract_id == 6115041100])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "So, there are still many census tracts that correspond to more than one city. I think we probably just want to use the census tracts as our unit of analysis. \n", - "\n", - "For descriptive purposes we can map each census tract to its most common corresponding city." - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " city\n", - "_full_tract_id \n", - "1015000800 ANNISTON\n", - "1101001500 MONTGOMERY\n", - "1161400100 SEVILLA\n", - "2020001000 ANCHORAGE\n", - "2020001100 ANCHORAGE\n" - ] - } - ], - "source": [ - "# Map each tract to its most common corresponding city\n", - "\n", - "tracts = places[['_full_tract_id', 'city']].groupby('_full_tract_id').\\\n", - " agg(lambda x:x.value_counts().index[0])\n", - " \n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9098\n", - "9097\n" - ] - } - ], - "source": [ - "print(places._full_tract_id.drop_duplicates().shape[0])\n", - "print(tracts.shape[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { 
- "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Activities\n", - "\n", - "\"The activity reported is for a single travel day and contains the highest level of detail about the survey participants' travel purpose\" (data dictionary)\n", - "\n", - "So, there can be multiple \"activities\" at each \"place\" visited as part of a trip." - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "604711" - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities = pd.read_csv(z.open('caltrans_full_survey/survey_activity.csv'), low_memory=False)\n", - "\n", - "len(activities)" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "157011" - ] - }, - "execution_count": 85, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# TO DO - fix to reflect households\n", - "\n", - "activities_ba = activities[activities.county_id.isin([1, 13, 41, 55, 75, 81, 85, 95, 97])]\n", - "\n", - "len(activities_ba)" - ] - }, - { - "cell_type": "code", - "execution_count": 95, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " sampno perno plano actno tripno\n", - "1 1041766 3 1 1 nan\n", - "4 1051203 1 9 1 8\n", - "8 1065929 1 1 1 nan\n", - "14 1097949 1 1 1 nan\n", - "22 1124271 1 5 1 4\n", - "27 1126030 2 1 1 nan\n", - "30 1127449 2 1 1 nan\n", - "32 1127626 1 1 1 nan\n", - "35 1128657 1 1 1 nan\n", - "37 1129482 1 1 1 nan\n" - ] - } - ], - "source": [ - "# What do the identifiers look like? 
\n", - "\n", - "print(activities_ba[['sampno', 'perno', 'plano', 'actno', 'tripno']].head(10))" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "118271\n", - "118271\n", - "118271\n" - ] - } - ], - "source": [ - "# Each place occurs in the activities table at least once\n", - "\n", - "print((activities_ba.actno == 1).sum()) # number of activities with id 1\n", - "\n", - "print(len(activities_ba.groupby(['sampno', 'perno', 'plano']))) # unique places referenced\n", - "\n", - "print(len(places_ba)) # records in places table" - ] - }, - { - "cell_type": "code", - "execution_count": 87, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 604711\n", - "mean 2624572\n", - "std 1695612\n", - "min 1031985\n", - "25% 1662824\n", - "50% 1979173\n", - "75% 2797238\n", - "max 7212388\n", - "Name: sampno, dtype: float64" - ] - }, - "execution_count": 87, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities.sampno.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 88, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 604711\n", - "mean 2\n", - "std 1\n", - "min 1\n", - "25% 1\n", - "50% 2\n", - "75% 3\n", - "max 8\n", - "Name: perno, dtype: float64" - ] - }, - "execution_count": 88, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities.perno.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 604711\n", - "mean 3\n", - "std 3\n", - "min 1\n", - "25% 1\n", - "50% 3\n", - "75% 5\n", - "max 34\n", - "Name: plano, dtype: float64" - ] - }, - "execution_count": 89, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - 
"activities.plano.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Trips\n", - "\n", - "What's the correct way to aggregate places into trips?\n", - "\n", - "It seems like each person recorded their travel for a single day as a sequence of places visited, without explicit classification into trips or tours. So that's up to us to do by applying whatever rules seem appropriate. \n", - "\n", - "Probably it's not even possible to identify tours with certainty from the anonymized data, because the place names and precise locations are redacted." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "sampno perno\n", - "1031985 1 3\n", - " 2 3\n", - "1033944 1 16\n", - "1035274 1 8\n", - " 2 6\n", - "1037952 1 3\n", - " 2 1\n", - "1039620 1 5\n", - " 2 5\n", - "1041076 1 4\n", - "Name: plano, dtype: int64" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Dig into `tripno` some more\n", - "\n", - "places_ba.groupby(['sampno', 'perno']).plano.max().head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 19512.0\n", - "mean 1.0\n", - "std 0.0\n", - "min 1.0\n", - "25% 1.0\n", - "50% 1.0\n", - "75% 1.0\n", - "max 1.0\n", - "dtype: float64" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Do any respondents have multiple trip sequences? 
-- No!\n", - "\n", - "plano_counts = places_ba.groupby(['sampno', 'perno']).plano.max()\n", - "tripno_counts = places_ba.groupby(['sampno', 'perno']).tripno.max()\n", - "\n", - "(plano_counts - tripno_counts).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "count 93406.0\n", - "mean 1.0\n", - "std 0.0\n", - "min 1.0\n", - "25% 1.0\n", - "50% 1.0\n", - "75% 1.0\n", - "max 1.0\n", - "dtype: float64" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(places_ba.plano - places_ba.tripno).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 100, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
travel_datearr_timedep_timetract_idcitymodetrip_distance_milesprev_trip_duration_minact_dur
1522012-07-1703:00:0010:00:00509000SUNNYVALEnannannan425
1532012-07-1710:00:0010:00:00509000SUNNYVALE511030
1542012-07-1711:00:0011:00:00508504SUNNYVALE52151
1552012-07-1711:00:0011:00:00508504SUNNYVALE1059
1562012-07-1711:00:0013:00:00508504SUNNYVALE105105
1572012-07-1713:00:0014:00:00509000SUNNYVALE521060
1582012-07-1714:00:0015:00:00500100SAN JOSE582025
1592012-07-1715:00:0002:00:00509000SUNNYVALE5920699
\n", - "
" - ], - "text/plain": [ - " travel_date arr_time dep_time tract_id city mode \\\n", - "152 2012-07-17 03:00:00 10:00:00 509000 SUNNYVALE nan \n", - "153 2012-07-17 10:00:00 10:00:00 509000 SUNNYVALE 5 \n", - "154 2012-07-17 11:00:00 11:00:00 508504 SUNNYVALE 5 \n", - "155 2012-07-17 11:00:00 11:00:00 508504 SUNNYVALE 1 \n", - "156 2012-07-17 11:00:00 13:00:00 508504 SUNNYVALE 1 \n", - "157 2012-07-17 13:00:00 14:00:00 509000 SUNNYVALE 5 \n", - "158 2012-07-17 14:00:00 15:00:00 500100 SAN JOSE 5 \n", - "159 2012-07-17 15:00:00 02:00:00 509000 SUNNYVALE 5 \n", - "\n", - " trip_distance_miles prev_trip_duration_min act_dur \n", - "152 nan nan 425 \n", - "153 1 10 30 \n", - "154 2 15 1 \n", - "155 0 5 9 \n", - "156 0 5 105 \n", - "157 2 10 60 \n", - "158 8 20 25 \n", - "159 9 20 699 " - ] - }, - "execution_count": 100, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# What does a sequence of places look like?\n", - "\n", - "varlist = ['travel_date', 'arr_time', 'dep_time', 'tract_id', 'city', 'mode', \n", - " 'trip_distance_miles', 'prev_trip_duration_min', 'act_dur']\n", - "\n", - "places_ba.loc[(places_ba.sampno == 1035274) & (places_ba.perno == 1), varlist]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So, it looks like the key to identifying trip/tour semantics involves looking at the trip purposes in the activities table. Transfers are noted as a particular purpose, and those trip legs need to be aggregated together. \n", - "\n", - "The first and last activities of the day probably take place at home, but we can't verify using the public data.\n", - "\n", - "It looks like the arrival and departure times, and trip durations, are approximate based on people's recollections, but distances are precise because they come from the Google Maps interface." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Travel modes" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "5.0 50139\n", - "6.0 18632\n", - "1.0 15924\n", - "2.0 2244\n", - "15.0 1635\n", - "24.0 1444\n", - "7.0 566\n", - "26.0 459\n", - "8.0 299\n", - "25.0 293\n", - "Name: mode, dtype: int64" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# What are the travel modes?\n", - "\n", - "places_ba['mode'].value_counts().head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "FROM DATA DICTIONARY\n", - "\n", - "Travel mode:\n", - "\n", - "- 1- Walk; \n", - "- 2- Bike; \n", - "- 3- Wheelchair/mobility scooter; \n", - "- 4- Other non-motorized; \n", - "- 5- Auto/van/truck driver; \n", - "- 6- Auto/van/truck passenger; \n", - "- 7- Carpool/vanpool; \n", - "- 8- Motorcycle/scooter/moped; \n", - "- 9- Taxi/hired car/limo; \n", - "- 10- Rental car/vehicle; \n", - "- 11- Private shuttle (Super shuttle, employer, hotel, etc.); \n", - "- 12- Greyhound bus; \n", - "- 13- Plane; \n", - "- 14- Other private transit; \n", - "- 15- Local bus, rapid bus; \n", - "- 16- Express bus/commuter bus (AC Transbay, Golden Gate Transit, etc.); \n", - "- 17- Premium bus (Metro Orange/Silver Line); \n", - "- 18- School bus; \n", - "- 19- Public transit shuttle (DASH, Emery Go Round, etc.); \n", - "- 20- AirBART/LAX FlyAway; \n", - "- 21- Dial-a-ride/paratransit (access services, etc.); \n", - "- 22- Amtrak bus; \n", - "- 23- Other bus; \n", - "- 24- BART, Metro Red/Purple Line; \n", - "- 25- ACE, Amtrak, Cal- train, Coaster, Metrolink; \n", - "- 26- Metro Blue/Green/Gold Line, Muni Metro, Sacramento Light Rail, San Diego 
Sprinter/Trolley/Orange/ Blue/Green, VTA light rail; \n", - "- 27- Streetcar/cable car, \n", - "- 28- Other rail; \n", - "- 29- Ferry/boat; \n", - "- 99- RF" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Trip purposes" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "1 47241\n", - "2 16700\n", - "21 9523\n", - "9 9151\n", - "27 8583\n", - "22 7250\n", - "8 6151\n", - "7 5792\n", - "37 5040\n", - "31 4737\n", - "39 3484\n", - "17 3105\n", - "25 3039\n", - "34 2701\n", - "29 2541\n", - "Name: purpose, dtype: int64" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# What are the trip purposes?\n", - "\n", - "activities_ba.purpose.value_counts().head(15)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "FROM DATA DICTIONARY\n", - "\n", - "[Somewhere there's a `ptype` key indicating categories of purposes, probably based on the home/ work/ school locations, but I can't find it in these data tables.]\n", - "\n", - "Activity purpose: \n", - "\n", - "[These look like activities at home]\n", - "\n", - "- 1- Personal activities (sleeping, personal care, leisure, chores); \n", - "- 2- Preparing meals/eating; \n", - "- 3- Hosting visitors/entertaining guests; \n", - "- 4- Exercise (with or without equipment)/playing sports; \n", - "- 5- Study/schoolwork; \n", - "- 6- Work for pay at home using telecommunications equipment; \n", - "- 7- Using computer/telephone/cell or smart phone, or other communications device for personal activities; \n", - "- 8- All other activities at home; \n", - "\n", - "[These look like activites at work]\n", - "\n", - "- 9- Work/job duties; \n", - "- 10- Training; \n", - "- 11- 
Meals at work; \n", - "- 12- Work-sponsored social activities (holiday/birthday celebrations, etc.); \n", - "- 13- Non-work-related activities (social clubs, etc.); \n", - "- 14- Exercise/sports; \n", - "- 15- Volunteer work/activities, \n", - "- 16- All other work- related activities at work; \n", - "\n", - "[These look like activities at school]\n", - "\n", - "- 17- School/classroom/ laboratory; \n", - "- 18- Meals at school/college; \n", - "- 19- After-school or non-class-related sports/physical activities; \n", - "- 20- All other after-school or non-class-related activities (library, music rehearsal, clubs, etc.); \n", - "\n", - "[These look like transport-related]\n", - "\n", - "- 21- Change type of transportation/transfer (walk to bus, walk to/from parked car); \n", - "- 22- pick up/drop off passenger(s); \n", - "\n", - "[These look like activities at non-home, non-work, non-school locations]\n", - "\n", - "- 23- Drive-through meals (snacks, coffee, etc.) (show if PTYPE <> 1 [Home]); \n", - "- 24- Drive-through other (ATM, bank, etc.) 
(show if PTYPE <> 1); \n", - "- 25- Work-related (meetings, sales calls, deliveries); \n", - "- 26- Service private vehicle (gas, oil, lubes, repairs), \n", - "- 27- Routine shopping (groceries, clothing, convenience store, household maintenance, etc.); \n", - "- 28- Shopping for major purchases or specialty items (appliance, electronics, new vehicles, major household repairs, etc.); \n", - "- 29- Household errands (bank, dry cleaning, etc.); \n", - "- 30- Personal business (visit government office, attorney, accountant, etc.); \n", - "- 31- Eat meal at restaurant/diner; \n", - "- 32- Health care (doctor, dentist, eye care, chiropractor, veterinarian, etc.); \n", - "- 33- Civic/ religious activities; \n", - "- 34- Outdoor exercise (outdoor sports, jogging, bicycling, walking the dog, etc.); \n", - "- 35- Indoor exercise (gym, yoga, etc.); \n", - "- 36- Entertainment (movies, sporting events, etc.); \n", - "- 37- Social/visiting friends and relatives; \n", - "- 38- Other (specify), \n", - "\n", - "[Misc]\n", - "\n", - "- 39- Loop trip (for interviewer only- not listed on diary), \n", - "- 99- DK/RF" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# TO DO\n", - "\n", - "# - set up destination choice model\n", - "# - make two tables: (1) trips, (2) destinations\n", - "# - write a function to generate choice set\n", - "\n", - "# - for covariates, calculate home/work/etc density endogenously\n", - "\n", - "# - can probably generate average travel time between tracts, by mode\n", - "# - then can use that to build a mode choice model\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - 
"collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/_archive/ChoiceModels-demo.ipynb b/notebooks/_archive/ChoiceModels-demo.ipynb deleted file mode 100644 index c759ca6..0000000 --- a/notebooks/_archive/ChoiceModels-demo.ipynb +++ /dev/null @@ -1,468 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ChoiceModels usage demo\n", - "\n", - "Sam Maurer, October 10, 2016" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%aimport choicemodels\n", - "%autoreload 1" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import choicemodels\n", - "import numpy as np\n", - "import pandas as pd\n", - "from collections import OrderedDict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Binary Logit" - ] - }, - { - "cell_type": "code", - "execution_count": 
3, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Set up estimation data\n", - "\n", - "endog = np.random.randint(2, size=50) # 50x1 vector of random 0's and 1's\n", - "exog = np.random.rand(50, 5) # 50x5 matrix of random floats" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Optimization terminated successfully.\n", - " Current function value: 0.635509\n", - " Iterations 5\n" - ] - } - ], - "source": [ - "# Estimate a model\n", - "\n", - "m = choicemodels.Logit(endog, exog)\n", - "results = m.fit()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Logit Regression Results \n", - "==============================================================================\n", - "Dep. Variable: y No. Observations: 50\n", - "Model: Logit Df Residuals: 45\n", - "Method: MLE Df Model: 4\n", - "Date: Fri, 07 Oct 2016 Pseudo R-squ.: 0.07890\n", - "Time: 16:31:07 Log-Likelihood: -31.775\n", - "converged: True LL-Null: -34.497\n", - " LLR p-value: 0.2447\n", - "==============================================================================\n", - " coef std err z P>|z| [95.0% Conf. 
Int.]\n", - "------------------------------------------------------------------------------\n", - "x1 0.0305 0.899 0.034 0.973 -1.731 1.792\n", - "x2 1.4040 0.977 1.436 0.151 -0.512 3.320\n", - "x3 -2.2294 1.034 -2.156 0.031 -4.256 -0.202\n", - "x4 0.0607 0.996 0.061 0.951 -1.892 2.013\n", - "x5 0.5010 0.995 0.503 0.615 -1.450 2.452\n", - "==============================================================================\n" - ] - } - ], - "source": [ - "# Show estimation results\n", - "\n", - "print(results.summary())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Multinomial Logit" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Load some real data\n", - "\n", - "path = '../../timothyb0912/pylogit/examples/data/swissmetro.dat'\n", - "swissmetro = pd.read_table(path, sep='\\t')\n", - "\n", - "include = (swissmetro.PURPOSE.isin([1, 3]) & (swissmetro.CHOICE != 0))\n", - "swissmetro = swissmetro.loc[include]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "swissmetro.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/smmaurer/Dropbox/Git-rMBP/timothyb0912/pylogit/pylogit/choice_tools.py:431: UserWarning: Note, there are 29 variables in wide_data but the inputs ind_vars, alt_specific_vars, and subset_specific_vars only account for 28 variables.\n", - " msg_2 + msg_3.format(num_vars_accounted_for))\n" - ] - } - ], - "source": [ - "# Convert to long format\n", - "\n", - "ind_vars = swissmetro.columns.tolist()[:15]\n", - "\n", - "alt_varying_vars = {'travel_time': dict([(1, 'TRAIN_TT'), 
(2, 'SM_TT'), (3, 'CAR_TT')]),\n", - " 'travel_cost': dict([(1, 'TRAIN_CO'), (2, 'SM_CO'), (3, 'CAR_CO')]),\n", - " 'headway': dict([(1, 'TRAIN_HE'), (2, 'SM_HE')])}\n", - "\n", - "availability_vars = {1: 'TRAIN_AV', 2: 'SM_AV', 3: 'CAR_AV'}\n", - "\n", - "alt_id_col = 'mode_id'\n", - "\n", - "swissmetro['custom_id'] = np.arange(swissmetro.shape[0], dtype=int) + 1\n", - "obs_id_col = 'custom_id'\n", - "\n", - "choice_col = 'CHOICE'\n", - "\n", - "data = choicemodels.convert_wide_to_long(swissmetro, ind_vars, alt_varying_vars, \n", - " availability_vars, obs_id_col, choice_col, new_alt_id_name=alt_id_col)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "data.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Rescale variables\n", - "\n", - "data[\"travel_time_hrs\"] = data[\"travel_time\"] / 60.0\n", - "data[\"headway_hrs\"] = data[\"headway\"] / 60.0\n", - "data[\"travel_cost_scaled\"] = data[\"travel_cost\"] / 100.0" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Set up specification\n", - "\n", - "spec = OrderedDict()\n", - "labels = OrderedDict()\n", - "\n", - "spec[\"intercept\"] = [1, 2]\n", - "labels[\"intercept\"] = ['ASC Train', 'ASC Swissmetro']\n", - "\n", - "spec[\"travel_time_hrs\"] = [[1, 2,], 3]\n", - "labels[\"travel_time_hrs\"] = ['Travel Time (Train/SM)', 'Travel Time (Car)']\n", - "\n", - "spec[\"travel_cost_scaled\"] = [1, 2, 3]\n", - "labels[\"travel_cost_scaled\"] = ['Travel Cost (Train)', 'Travel Cost (Swissmetro)', \n", - " 'Travel Cost (Car)']\n", - "\n", - "spec[\"headway_hrs\"] = [1, 2]\n", - "labels[\"headway_hrs\"] = [\"Headway (Train)\", \"Headway (Swissmetro)\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false 
- }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -6,964.6630\n", - "Initial Log-likelihood: -6,964.6630\n", - "Estimation Time: 0.09 seconds.\n", - "Final log-likelihood: -5,359.1984\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/smmaurer/anaconda/lib/python2.7/site-packages/scipy/optimize/_minimize.py:385: RuntimeWarning: Method BFGS does not use Hessian information (hess).\n", - " RuntimeWarning)\n" - ] - } - ], - "source": [ - "# Set up and estimate the model\n", - "\n", - "m = choicemodels.MNLogit(data, alt_id_col, obs_id_col, choice_col, spec, names=labels)\n", - "\n", - "results = m.fit_mle(np.zeros(9))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: CHOICE No. Observations: 6,768\n", - "Model: Multinomial Logit Model Df Residuals: 6,759\n", - "Method: MLE Df Model: 9\n", - "Date: Fri, 07 Oct 2016 Pseudo R-squ.: 0.231\n", - "Time: 16:31:26 Pseudo R-bar-squ.: 0.229\n", - "converged: False Log-Likelihood: -5,359.198\n", - " LL-Null: -6,964.663\n", - "============================================================================================\n", - " coef std err z P>|z| [95.0% Conf. 
Int.]\n", - "--------------------------------------------------------------------------------------------\n", - "ASC Train -0.4710 0.128 -3.674 0.000 -0.722 -0.220\n", - "ASC Swissmetro 0.2597 0.104 2.504 0.012 0.056 0.463\n", - "Travel Time (Train/SM) -0.7459 0.041 -18.011 0.000 -0.827 -0.665\n", - "Travel Time (Car) -0.5572 0.043 -13.065 0.000 -0.641 -0.474\n", - "Travel Cost (Train) 0.0637 0.004 14.386 0.000 0.055 0.072\n", - "Travel Cost (Swissmetro) 0.0096 0.003 2.969 0.003 0.003 0.016\n", - "Travel Cost (Car) -0.2327 0.091 -2.546 0.011 -0.412 -0.054\n", - "Headway (Train) -0.3592 0.064 -5.590 0.000 -0.485 -0.233\n", - "Headway (Swissmetro) -0.4353 0.192 -2.265 0.023 -0.812 -0.059\n", - "============================================================================================\n" - ] - } - ], - "source": [ - "# Show results\n", - "\n", - "print(results.summary())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "### Alternate syntax for setting up a multinomial specification\n", - "\n", - "This section is speculative -- not yet implemented!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# StatsModels allows the following syntax:\n", - "\n", - "spec = 'outcome ~ const + var1 + np.log(var2)'\n", - "\n", - "m = choicemodels.Logit.from_formula(spec, data)\n", - "results = m.fit_mle()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# It would be nice to enable something similar for multinomial models,\n", - "# so that the user interface follows the utility functions more closely\n", - "\n", - "spec = {\n", - " '1': 'choice ~ ASC_t + btt * time_t/60 + bct * cost_t/100 + bht * headway_t/60',\n", - " \n", - " '2': 'choice ~ ASC_sm + btt * time_sm/60 + bcs * cost_sm/100 + bhs * headway_sm/60',\n", - " \n", - " '3': 'choice ~ btc * time_c/60 + bcc * cost_c/100' }\n", - "\n", - "labels: {\n", - " 'ASC_t': \"ASC Train\",\n", - " 'ASC_sm': \"ASC Swissmetro\", \n", - " 'btt': \"Travel Time (Train/SM)\", \n", - " 'btc': \"Travel Time (Car)\", \n", - " 'bct': \"Travel Cost (Train)\", \n", - " 'bcs': \"Travel Cost (Swissmetro)\", \n", - " 'bht': \"Headway (Train)\", \n", - " 'bhs': \"Headway (Swissmetro)\", }\n", - "\n", - "m = choicemodels.MNL.from_formula(spec, data, alt_id_col)\n", - "results = m.fit_mle()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - 
"outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/notebooks/_archive/Data-prep-01.ipynb b/notebooks/_archive/Data-prep-01.ipynb deleted file mode 100644 index 43b5f97..0000000 --- a/notebooks/_archive/Data-prep-01.ipynb +++ /dev/null @@ -1,666 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data prep for estimating models\n", - "\n", - "Sam Maurer, June 2017\n", - "\n", - "Python 3.6" - ] - }, - { - "cell_type": "code", - "execution_count": 88, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import zipfile" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load raw CHTS tables\n", - "\n", - "This requires the file named caltrans_full_survey.zip. You can download it by following the instructions in the \"data\" directory." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "z = zipfile.ZipFile('../data/caltrans_full_survey.zip')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "42426" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "households = pd.read_csv(z.open('caltrans_full_survey/survey_households.csv'), low_memory=False)\n", - "len(households)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "109113" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "persons = pd.read_csv(z.open('caltrans_full_survey/survey_person.csv'), low_memory=False)\n", - "len(persons)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "460524" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "places = pd.read_csv(z.open('caltrans_full_survey/survey_place.csv'), low_memory=False)\n", - "len(places)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "604711" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "activities = pd.read_csv(z.open('caltrans_full_survey/survey_activity.csv'), low_memory=False)\n", - "len(activities)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Build a master table of census tracts\n", - 
"\n", - "Generate a table of census tracts in the 9-county Bay Area, for use in destination choice models." - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Suppress scientific notation in the Pandas display output\n", - "\n", - "pd.set_option('display.float_format', lambda x: '%.0f' % x)" - ] - }, - { - "cell_type": "code", - "execution_count": 141, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "6098141414.0\n", - "6.0\n", - "98.0\n" - ] - } - ], - "source": [ - "# Functions to move back and forth between full numerical tract ID and its components\n", - "\n", - "# TO DO - it would be better to generate ints than floats, but it's not obvious\n", - "# to me how to do this in a way that works smoothly with arrays\n", - "\n", - "def full_tract_id(state_id, county_id, tract_id):\n", - " return state_id * 1e9 + county_id * 1e6 + tract_id\n", - "\n", - "def state_id(full_tract_id):\n", - " return np.floor(full_tract_id / 1e9)\n", - "\n", - "def county_id(full_tract_id):\n", - " _county_tract = np.fmod(full_tract_id, 1e9)\n", - " return np.floor(_county_tract / 1e6)\n", - "\n", - "print(full_tract_id(6, 98, 141414))\n", - "print(state_id(6098141414))\n", - "print(county_id(6098141414))" - ] - }, - { - "cell_type": "code", - "execution_count": 142, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Generate full tract identifiers for the `places` table\n", - "\n", - "places['full_tract_id'] = full_tract_id(places.state_id, places.county_id, places.tract_id)\n", - "\n", - "# Replace missing identifiers with NaN's\n", - "\n", - "places.ix[(places.tract_id == 999999) |\n", - " (places.county_id == 999) |\n", - " (places.state_id == 99), 'full_tract_id'] = np.nan" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - 
"name": "stdout", - "output_type": "stream", - "text": [ - "9097\n", - " city\n", - "full_tract_id \n", - "1015000800 ANNISTON\n", - "1101001500 MONTGOMERY\n", - "1161400100 SEVILLA\n", - "2020001000 ANCHORAGE\n", - "2020001100 ANCHORAGE\n" - ] - } - ], - "source": [ - "# Generate a master list of census tracts from the `places` table, keeping the\n", - "# city name most commonly associated with each tract\n", - "\n", - "tracts = places[['full_tract_id', 'city']].groupby('full_tract_id').\\\n", - " agg(lambda x:x.value_counts().index[0])\n", - "\n", - "print(tracts.shape[0])\n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 123, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1583\n", - " city\n", - "full_tract_id \n", - "6001008309 TIJUANA\n", - "6001400100 BERKELEY\n", - "6001400200 OAKLAND\n", - "6001400300 OAKLAND\n", - "6001400400 OAKLAND\n" - ] - } - ], - "source": [ - "# Limit to the 9-county San Francisco Bay Area\n", - "\n", - "tracts = tracts[(state_id(tracts.index).isin([6])) & \n", - " (county_id(tracts.index).\\\n", - " isin([1, 13, 41, 55, 75, 81, 85, 95, 97]))].copy()\n", - "\n", - "print(tracts.shape[0])\n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 116, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "85 371\n", - "1 360\n", - "13 207\n", - "75 195\n", - "81 158\n", - "97 99\n", - "95 97\n", - "41 55\n", - "55 41\n", - "Name: full_tract_id, dtype: int64\n" - ] - } - ], - "source": [ - "print(county_id(tracts.index).value_counts())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Calculate some tract-level covariates\n", - "\n", - "Residential density, school/employment 
density" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Note: the `home_tract_id` in the households table is already a full 11-digit\n", - "# identifier, with the same format that we generated for the places table.\n", - "# Same with `empl_tract_id` and `school_tract_id` in the persons table." - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Residential density = sum of weighted household sizes by census tract of home\n", - "\n", - "households['_weighted_persons_count'] = households.persons_count * households.hhwgt\n", - "\n", - "home_density = households.groupby('home_tract_id')._weighted_persons_count.sum().\\\n", - " rename('home_density').to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Employment density = sum of person weights by census tract of work location\n", - "\n", - "work_density = persons.groupby('empl_tract_id').perwgt.sum().\\\n", - " rename('work_density').to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": 124, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# School density = sum of person weights by census tract of school location\n", - "\n", - "school_density = persons.groupby('school_tract_id').perwgt.sum().\\\n", - " rename('school_density').to_frame()" - ] - }, - { - "cell_type": "code", - "execution_count": 125, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " city home_density work_density school_density\n", - "full_tract_id \n", - "6001008309 TIJUANA 0 0 0\n", - "6001400100 BERKELEY 13 13 14\n", - "6001400200 OAKLAND 11 4 1\n", - "6001400300 OAKLAND 29 8 0\n", - "6001400400 OAKLAND 17 4 8\n" - ] - } - ], - "source": [ - "# Merge these into 
the census tracts table, only keeping Bay Area tracts\n", - "\n", - "tracts = pd.merge(tracts, home_density, how='left', left_index=True, right_index=True)\n", - "tracts = pd.merge(tracts, work_density, how='left', left_index=True, right_index=True)\n", - "tracts = pd.merge(tracts, school_density, how='left', left_index=True, right_index=True)\n", - "tracts = tracts.fillna(0) # fill missing values with zero\n", - "\n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Generate a table of trips\n", - "\n", - "For now, this is a table of places visited for non-school, non-work activities" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# - trip destinations are in `places.full_tract_id` (sometimes missing)\n", - "# - trip purposes are in `activities.purpose`, and we want 23 thru 38\n", - "# - places and acitivities are linked by `sampno`, `perno`, `plano`, and there \n", - "# can be multiple activities per place" - ] - }, - { - "cell_type": "code", - "execution_count": 126, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10417660312.0\n" - ] - } - ], - "source": [ - "# Function to generate a single unique ID for places\n", - "\n", - "def place_id(sampno, perno, plano):\n", - " return sampno * 1e4 + perno * 1e2 + plano\n", - "\n", - "print(place_id(1041766, 3, 12))" - ] - }, - { - "cell_type": "code", - "execution_count": 127, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Add place_id to places table and activities table\n", - "\n", - "places['place_id'] = place_id(places.sampno, places.perno, places.plano)\n", - "activities['place_id'] = place_id(activities.sampno, activities.perno, 
activities.plano)" - ] - }, - { - "cell_type": "code", - "execution_count": 131, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Get list of places that have a secondary activity\n", - "\n", - "_secondary_activity_places = activities.loc[activities.purpose.isin(range(23, 38+1)),\n", - " 'place_id'].drop_duplicates()" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "145993\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "10319850102 6095252108 6 13\n", - "10319850202 6095251902 5 5\n", - "10320360102 6073017051 5 4\n", - "10320360104 6073009304 5 19\n", - "10320360105 6073008511 5 6\n" - ] - } - ], - "source": [ - "# Generate a table of those places with some covariates\n", - "\n", - "trips = places.loc[places.place_id.isin(_secondary_activity_places) &\n", - " places.full_tract_id.notnull(),\n", - " ['place_id', 'full_tract_id', 'mode', \n", - " 'trip_distance_miles']].set_index('place_id')\n", - "\n", - "print(trips.shape[0])\n", - "print(trips.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 145, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "36765\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "10319850102 6095252108 6 13\n", - "10319850202 6095251902 5 5\n", - "10335860102 6085511915 6 156\n", - "10335860103 6085512027 6 2\n", - "10335860104 6085512027 6 0\n" - ] - } - ], - "source": [ - "# Limit to destinations in the 9-county San Francisco Bay Area\n", - "\n", - "trips = trips[(state_id(trips.full_tract_id).isin([6])) & \n", - " (county_id(trips.full_tract_id).\\\n", - " isin([1, 13, 41, 55, 75, 81, 85, 95, 97]))].copy()\n", - "\n", - "print(trips.shape[0])\n", - "print(trips.head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": 
{ - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save estimaton data to disk" - ] - }, - { - "cell_type": "code", - "execution_count": 134, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "tracts.to_csv('../data/tracts.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 146, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "trips.to_csv('../data/trips.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/_archive/Destination-choice-models-01.ipynb b/notebooks/_archive/Destination-choice-models-01.ipynb deleted file mode 100644 index 45d91bb..0000000 --- a/notebooks/_archive/Destination-choice-models-01.ipynb +++ /dev/null @@ -1,634 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exploring destination choice models\n", - "\n", - "Sam Maurer, June 2017\n", - "\n", - "Python 3.6\n", - "\n", - "## Plan\n", - "\n", - "- Set up a simple MNL destination choice model using the `urbansim.urbanchoice` interface\n", - "\n", - "- Refactor the code, using this notebook for ad-hoc testing\n", - "\n", - "- Set up more complex models as needed" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false 
- }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from patsy import dmatrix\n", - "from urbansim.urbanchoice import interaction, mnl\n", - "\n", - "from choicemodels import MultinomialLogit" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Suppress deprecation warnings\n", - "\n", - "import warnings; warnings.simplefilter('ignore')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load estimation data from disk" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Suppress scientific notation in the Pandas display output\n", - "\n", - "pd.set_option('display.float_format', lambda x: '%.3f' % x)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1583\n", - " city home_density work_density school_density\n", - "full_tract_id \n", - "6001008309.000 TIJUANA 0.000 0.000 0.000\n", - "6001400100.000 BERKELEY 13.438 13.131 13.512\n", - "6001400200.000 OAKLAND 11.090 4.249 0.895\n", - "6001400300.000 OAKLAND 28.878 7.672 0.000\n", - "6001400400.000 OAKLAND 16.885 4.064 8.150\n" - ] - } - ], - "source": [ - "tracts = pd.read_csv('../data/tracts.csv').set_index('full_tract_id')\n", - "\n", - "print(tracts.shape[0])\n", - "print(tracts.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "36765\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "10319850102.000 6095252108.000 6.000 13.428\n", - "10319850202.000 
6095251902.000 5.000 5.126\n", - "10335860102.000 6085511915.000 6.000 156.371\n", - "10335860103.000 6085512027.000 6.000 1.616\n", - "10335860104.000 6085512027.000 6.000 0.376\n" - ] - } - ], - "source": [ - "trips = pd.read_csv('../data/trips.csv').set_index('place_id')\n", - "\n", - "print(trips.shape[0])\n", - "print(trips.head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## MNL destination choice using urbansim.urbanchoice" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# - each trip is a realized choice of a particular census tract\n", - "# - we can randomly sample alternative census tracts and build a model\n", - "# of destination choice" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# `interaction.mnl_interaction_dataset()` is not documented very well, but \n", - "# this is how it seems to work\n", - "\n", - "# Takes following input:\n", - "# - choosers: pandas.DataFrame with unique index\n", - "# - alternatives: pandas.DataFrame with unique index\n", - "# - SAMPLE_SIZE: number of alternatives for each choice scenario\n", - "# - chosenalts: list containing the alternative id chosen by each chooser?\n", - "\n", - "# Returns following output:\n", - "# - full list of alternatives that were sampled\n", - "# - long-format DataFrame merging the two tables\n", - "# - numchoosers X SAMPLE_SIZE matrix representing chosen alternatives" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Start with a sample of ~500 trips for easier computation" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "490\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "71720050203.000 6055201402.000 6.000 3.080\n", - "19678330204.000 6095253404.000 6.000 15.400\n", - "30057980204.000 6001408600.000 6.000 7.070\n", - "30002610307.000 6001433400.000 5.000 1.371\n", - "30208410103.000 6085503601.000 5.000 7.498\n" - ] - } - ], - "source": [ - "choosers = trips.loc[np.random.choice(trips.index, 500, replace=False)]\n", - "choosers = choosers.loc[choosers.trip_distance_miles.notnull()]\n", - "\n", - "print(choosers.shape[0])\n", - "print(choosers.head())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sample 100 alternatives for each and set up a long-format data table" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "49000\n", - "(490, 100)\n" - ] - } - ], - "source": [ - "numalts = 100\n", - "\n", - "_, merged, chosen = interaction.mnl_interaction_dataset(\n", - " choosers=choosers, alternatives=tracts, SAMPLE_SIZE=numalts, \n", - " chosenalts=choosers.full_tract_id)\n", - "\n", - "print(merged.shape[0])\n", - "print(chosen.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Use Patsy to generate the design matrix" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Intercept home_density work_density school_density\n", - "full_tract_id \n", - "6055201402.000 1.000 13.406 1.692 0.000\n", - "6013308001.000 1.000 8.448 0.828 2.252\n", - "6085500901.000 1.000 6.060 32.747 110.417\n", - "6085503712.000 1.000 16.097 6.792 0.000\n", - "6097153801.000 1.000 48.146 3.061 8.313\n" - ] - } - ], - "source": [ - "model_expression = \"home_density + work_density + school_density\"\n", - "\n", - "model_design = 
dmatrix(model_expression, data=merged, return_type='dataframe')\n", - "\n", - "print(model_design.head())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fit the model using mnl_estimate()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'convergence': -2209.5185606064615, 'null': -2256.5333911341672, 'ratio': 0.02083498108755011}\n", - " Coefficient Std. Error T-Score\n", - "0 -0.000 0.084 -0.000\n", - "1 0.013 0.004 3.049\n", - "2 0.012 0.001 9.855\n", - "3 0.011 0.005 2.170\n" - ] - } - ], - "source": [ - "log_likelihoods, fit_parameters = mnl.mnl_estimate(\n", - " model_design.as_matrix(), chosen, numalts=numalts)\n", - "\n", - "print(log_likelihoods)\n", - "print(fit_parameters)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## NEW -- Same process in ChoiceModels" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from choicemodels import MultinomialLogit\n", - "from choicemodels.tools import MergedChoiceTable" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "490\n" - ] - } - ], - "source": [ - "# Start with the same sample of trips\n", - "\n", - "print(choosers.shape[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Merge choosers and alternatives using a new ChoiceModels interface" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "49000\n" - ] - } 
- ], - "source": [ - "merged = MergedChoiceTable(observations = choosers, \n", - " alternatives = tracts, \n", - " chosen_alternatives = choosers.full_tract_id, \n", - " sample_size = numalts)\n", - "\n", - "print(type(merged))\n", - "print(merged.to_frame().shape[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Estimate a model using the ChoiceModels engine" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " CHOICEMODELS ESTIMATION RESULTS \n", - "===================================================================\n", - "Dep. Var.: chosen No. Observations: \n", - "Model: Multinomial Logit Df Residuals: \n", - "Method: Maximum Likelihood Df Model: \n", - "Date: Pseudo R-squ.: \n", - "Time: Pseudo R-bar-squ.: \n", - "AIC: Log-Likelihood: -2,206.414\n", - "BIC: LL-Null: -2,256.533\n", - "===================================================================\n", - " coef std err z P>|z| Conf. 
Int.\n", - "-------------------------------------------------------------------\n", - "home_density 0.0123 0.003 4.574 \n", - "work_density 0.0128 0.001 10.993 \n", - "school_density 0.0097 0.005 2.018 \n", - "===================================================================\n", - "CPU times: user 125 ms, sys: 34.8 ms, total: 160 ms\n", - "Wall time: 110 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "model_expression = \"home_density + work_density + school_density - 1\"\n", - "\n", - "model = MultinomialLogit(data = merged.to_frame(), \n", - " observation_id_col = merged.observation_id_col, \n", - " choice_col = merged.choice_col,\n", - " model_expression = model_expression)\n", - "\n", - "results = model.fit()\n", - "print(results)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "print(type(results))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Estimate a model using the PyLogit engine\n", - "\n", - "Usage is the same, but with an OrderedDict model expression" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from collections import OrderedDict" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Log-likelihood at zero: -2,256.5334\n", - "Initial Log-likelihood: -2,256.5334\n", - "Estimation Time: 0.15 seconds.\n", - "Final log-likelihood: -2,206.4141\n", - " Multinomial Logit Model Regression Results \n", - "===================================================================================\n", - "Dep. Variable: chosen No. 
Observations: 490\n", - "Model: Multinomial Logit Model Df Residuals: 487\n", - "Method: MLE Df Model: 3\n", - "Date: Tue, 27 Jun 2017 Pseudo R-squ.: 0.022\n", - "Time: 19:51:07 Pseudo R-bar-squ.: 0.021\n", - "converged: True Log-Likelihood: -2,206.414\n", - " LL-Null: -2,256.533\n", - "==================================================================================\n", - " coef std err z P>|z| [0.025 0.975]\n", - "----------------------------------------------------------------------------------\n", - "home_density 0.0123 0.004 2.942 0.003 0.004 0.020\n", - "work_density 0.0128 0.001 11.104 0.000 0.011 0.015\n", - "school_density 0.0097 0.004 2.191 0.028 0.001 0.018\n", - "==================================================================================\n", - "CPU times: user 15.5 s, sys: 13.7 s, total: 29.2 s\n", - "Wall time: 21 s\n" - ] - } - ], - "source": [ - "%%time\n", - "model_expression = OrderedDict([('home_density', 'all_same'),\n", - " ('work_density', 'all_same'),\n", - " ('school_density', 'all_same')])\n", - "\n", - "model = MultinomialLogit(data = merged.to_frame(),\n", - " observation_id_col = merged.observation_id_col,\n", - " alternative_id_col = merged.alternative_id_col,\n", - " choice_col = merged.choice_col,\n", - " model_expression = model_expression)\n", - "\n", - "results = model.fit()\n", - "print(results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - 
"outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/_archive/MNL-prediction-demo-01.ipynb b/notebooks/_archive/MNL-prediction-demo-01.ipynb deleted file mode 100644 index 75e3cdc..0000000 --- a/notebooks/_archive/MNL-prediction-demo-01.ipynb +++ /dev/null @@ -1,623 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## MNL prediction demo\n", - "\n", - "Sam Maurer, July 2017\n", - "\n", - "Python 3.6\n", - "\n", - "### Summary\n", - "\n", - "This notebook demonstrates how to fit a model using the ChoiceModels interface and then use the UrbanSim MNL functions to generate predictions. \n", - "\n", - "Eventually, a prediction interface will be incorporated into the `MultinomialLogitResults` object, but it's not there yet!\n", - "\n", - "This demo uses the estimation data that's set up in the `Data-prep-01` notebook." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from patsy import dmatrix\n", - "\n", - "from choicemodels import mnl # could also import form urbansim.urbanchoice\n", - "from choicemodels import MultinomialLogit\n", - "from choicemodels.tools import MergedChoiceTable" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Suppress deprecation warnings\n", - "import warnings; warnings.simplefilter('ignore')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load data from disk" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1566\n", - " city home_density work_density school_density\n", - "full_tract_id \n", - "6.001400e+09 BERKELEY 13.437961 13.130867 13.511570\n", - "6.001400e+09 OAKLAND 11.089638 4.248928 0.894794\n", - "6.001400e+09 OAKLAND 28.878399 7.671554 0.000000\n" - ] - } - ], - "source": [ - "tracts = pd.read_csv('../data/tracts.csv').set_index('full_tract_id')\n", - "tracts = tracts.loc[(tracts.home_density > 0) | (tracts.work_density > 0) | (tracts.school_density > 0)]\n", - "\n", - "print(tracts.shape[0])\n", - "print(tracts.head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "35787\n", - " full_tract_id mode trip_distance_miles\n", - "place_id \n", - "1.031985e+10 6.095252e+09 6.0 13.428271\n", - "1.031985e+10 6.095252e+09 5.0 5.125960\n", - "1.033586e+10 6.085512e+09 6.0 156.370628\n" - ] - } - 
], - "source": [ - "trips = pd.read_csv('../data/trips.csv').set_index('place_id')\n", - "trips = trips.loc[trips.trip_distance_miles.notnull()]\n", - "\n", - "print(trips.shape[0])\n", - "print(trips.head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Set up estimation table\n", - "\n", - "Each observed trip is a realized choice of a particular destination census tract. We can randomly sample alternative census tracts to build a model of destination choice.\n", - "\n", - "We'll divide the trips into a training set and a testing set, fit an MNL model using the training data, use it to generate predicted choices for the testing data, and compare the predicted to the actual choices." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(100000, 9)\n", - "(3473400, 9)\n" - ] - } - ], - "source": [ - "training_observations = trips.iloc[:1000]\n", - "training = MergedChoiceTable(observations = training_observations,\n", - " alternatives = tracts,\n", - " chosen_alternatives = training_observations.full_tract_id,\n", - " sample_size = 100)\n", - "\n", - "testing_observations = trips.iloc[1000:]\n", - "testing = MergedChoiceTable(observations = testing_observations,\n", - " alternatives = tracts,\n", - " chosen_alternatives = testing_observations.full_tract_id,\n", - " sample_size = 100)\n", - "\n", - "print(training.to_frame().shape)\n", - "print(testing.to_frame().shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fit a model using the training observations" - ] - }, - { - "cell_type": "code", - 
"execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " CHOICEMODELS ESTIMATION RESULTS \n", - "===================================================================\n", - "Dep. Var.: chosen No. Observations: \n", - "Model: Multinomial Logit Df Residuals: \n", - "Method: Maximum Likelihood Df Model: \n", - "Date: Pseudo R-squ.: \n", - "Time: Pseudo R-bar-squ.: \n", - "AIC: Log-Likelihood: -4,504.887\n", - "BIC: LL-Null: -4,605.170\n", - "===================================================================\n", - " coef std err z P>|z| Conf. Int.\n", - "-------------------------------------------------------------------\n", - "home_density 0.0109 0.002 5.848 \n", - "work_density 0.0122 0.001 15.221 \n", - "school_density 0.0071 0.004 1.976 \n", - "===================================================================\n", - "CPU times: user 499 ms, sys: 46.8 ms, total: 546 ms\n", - "Wall time: 192 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "model_expression = \"home_density + work_density + school_density - 1\"\n", - "\n", - "model = MultinomialLogit(data = training.to_frame(), \n", - " observation_id_col = training.observation_id_col, \n", - " choice_col = training.choice_col,\n", - " model_expression = model_expression)\n", - "\n", - "results = model.fit()\n", - "print(results)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Predict destination choices for the testing observations\n", - "\n", - "We'll use the UrbanSim MNL functions directly, because this hasn't been integrated into the ChoiceModels results classes yet. 
https://github.com/UDST/choicemodels/blob/master/choicemodels/mnl.py#L536" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 0.010935\n", - "1 0.012232\n", - "2 0.007140\n", - "Name: Coefficient, dtype: float64\n" - ] - } - ], - "source": [ - "# Pull the coefs out of the results object (the PyLogit syntax would be different)\n", - "\n", - "coefs = results.get_raw_results()['fit_parameters']['Coefficient']\n", - "print(coefs)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(3473400, 3)\n", - " home_density work_density school_density\n", - "full_tract_id \n", - "6.097151e+09 10.659461 6.868701 7.160030\n", - "6.085512e+09 34.971081 5.483731 2.181334\n", - "6.013326e+09 21.491132 0.153325 1.326145\n" - ] - } - ], - "source": [ - "# The data columns for prediction need to align with the coefficients; \n", - "# you can do this manually or with patsy, as here\n", - "\n", - "df = testing.to_frame().set_index('full_tract_id')\n", - "\n", - "testing_df = dmatrix(model_expression, data=df, return_type='dataframe')\n", - "print(testing_df.shape)\n", - "print(testing_df.head(3))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "34734\n", - "[90 24 75 80 70]\n" - ] - } - ], - "source": [ - "# Simulate a destination choice for each testing observation\n", - "\n", - "choices = mnl.mnl_simulate(testing_df, coefs, numalts=100, returnprobs=False)\n", - "\n", - "print(len(choices))\n", - "print(choices[:5])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ 
- "['a', 'd']\n" - ] - } - ], - "source": [ - "# Annoyingly, that identifies the choices by position rather than by id;\n", - "# here's a function to get the id's\n", - "\n", - "def get_chosen_ids(ids, positions):\n", - " \"\"\"\n", - " We observe N choice scenarios. In each, one of J alternatives is chosen.\n", - " We have a long (len N * J) list of the available alternatives. We have a \n", - " list (len N) of which alternatives were chosen, but it identifies them \n", - " by POSITION and we want their ID. \n", - " \n", - " Parameters\n", - " ----------\n", - " ids : list or list-like\n", - " List of alternative ID's (len N * J).\n", - " \n", - " positions : list or list-like\n", - " List of chosen alternatives by position (len N), where each entry is\n", - " an int in range [0, J)\n", - " \n", - " Returns\n", - " -------\n", - " chosen_ids : list\n", - " List of chosen alternatives by ID (len N)\n", - " \n", - " \"\"\"\n", - " N = len(positions)\n", - " J = len(ids) / N\n", - " \n", - " ids_by_obs = np.reshape(ids, (N,J))\n", - " return [ids_by_obs[i][positions[i]] for i in range(N)]\n", - " \n", - "\n", - "print(get_chosen_ids(['a','b','c','d'], [0,1]))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "34734\n", - "[6085500400.0, 6085512020.0, 6013355115.0, 6085505008.0, 6075016802.0]\n" - ] - } - ], - "source": [ - "# Get tract id's for the simulated choices\n", - "\n", - "predicted_tracts = get_chosen_ids(testing_df.index.tolist(), choices)\n", - "\n", - "print(len(predicted_tracts))\n", - "print(predicted_tracts[:5])" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "34734\n", - "[6097150607.0, 6097150607.0, 6097153200.0, 6097151402.0, 6097151402.0]\n" - ] - } - ], - "source": [ - "# Get tract 
id's for observed choices\n", - "\n", - "df = testing.to_frame()\n", - "observed_tracts = df.loc[df.chosen == 1, 'full_tract_id'].tolist()\n", - "\n", - "print(len(observed_tracts))\n", - "print(observed_tracts[:5])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Compare the predicted choices to the observed ones\n", - "\n", - "Multinomial models are kind of tricky to validate. We don't expect the actual choices to match, because there are so many alternatives, but we do expect the characteristics of the predicted choices to be similar to the characteristics of the observed choices. \n", - "\n", - "Choose your own metric for this depending on what you're trying to evaluate! It's even plausible that the metric could be something not directly in the model, like the distance between the predicted and actual destination choices." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.0154603558473\n" - ] - } - ], - "source": [ - "# What portion of predicted destination choices were a perfect match?\n", - "# With an uninformative model we would expect 0.01, given that the \n", - "# observed choice is included in the 100 available alternatives.\n", - "\n", - "perfect_match = np.equal(predicted_tracts, observed_tracts)\n", - "print(sum(perfect_match)/len(perfect_match))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.145854426158\n" - ] - } - ], - "source": [ - "# What's the correlation between employment density of the predicted and \n", - "# observed destinations? 
With an uninformative model we would expect 0.\n", - "\n", - "density_1 = pd.Series([tracts.loc[t,'work_density'] for t in predicted_tracts])\n", - "density_2 = pd.Series([tracts.loc[t,'work_density'] for t in observed_tracts])\n", - "\n", - "print(density_1.corr(density_2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### How does UrbanSim generate household location choices?\n", - "\n", - "These three class methods collectively set up the choosers and alternatives according to various parameters like the sample size, prediction filters, \"probability mode,\" and \"choice mode\" (aggregate or individual):\n", - "\n", - "- `urbansim.models.MNLDiscreteChocieModel.probabilities()` \n", - "- `urbansim.models.MNLDiscreteChocieModel.summed_probabilities()` \n", - "- `urbansim.models.MNLDiscreteChocieModel.predict()` \n", - "\n", - "https://github.com/UDST/urbansim/blob/master/urbansim/models/dcm.py#L474\n", - "\n", - "Then this lower-level function generates a table of probabilities for each alternative, which is passed back to the `MNLDiscreteChoiceModel` class for further processing:\n", - "\n", - "- `urbansim.urbanchoice.mnl.mnl_simulate()`\n", - "\n", - "https://github.com/UDST/urbansim/blob/master/urbansim/urbanchoice/mnl.py#L121" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/_archive/Sampling-correction-01--Tim-edits.html b/notebooks/_archive/Sampling-correction-01--Tim-edits.html deleted file mode 100644 index 5f4ec19..0000000 --- a/notebooks/_archive/Sampling-correction-01--Tim-edits.html +++ /dev/null @@ -1,13193 +0,0 @@ - - - -Sampling-correction--Tim-edits - - - - - - - - - - - - - - - - - - - - - -
-
- -
-
-
-
-
-

Sampling correction for large choice sets

-
-
-
-
-
-
-
-
-
    -
  1. Replicate synthetic data from Guevara & Ben-Akiva 2013
  2. -
  3. Do MNL with and without sampling correction
  4. -
  5. Check whether parameter estimates deviate from true values
  6. -
  7. Extend to Mixed Logit
  8. -
- -
-
-
-
-
-
-
-
-

1. Generate synthetic data set

-
-
-
-
-
-
-
-
-
    -
  • N = 1000 observations
  • -
  • J = 1000 alternatives for all observations (C_n = C)
  • -
  • X = single attribute distributed Uniform(-2,1) for the first 500 alternatives and Uniform(-1,2) for the second half
  • -
  • beta = generic linear taste coefficient, distributed Normal(mu=1.5, sigma=0.8) across the 1000 observations
  • -
  • systematic utility = beta * X
  • -
  • epsilon = error term distributed ExtremeValue(0,1)
  • -
  • random utility = beta * X + epsilon
  • -
-

Utility of alternative i for agent n: -$$ U_{in} = V_{in} + \varepsilon_{in} = \beta_n x_{i} + \varepsilon_{in} $$

-

Probability that agent n will choose alternative i: -$$ L_n(i \mid \beta_n, x_n,C_n) = \frac {e^{V_{in}}} {\sum_{j \in C_n} e^{V_{jn}}} $$

- -
-
-
-
-
-
In [1]:
-
-
-
import numpy as np
-import pandas as pd
-
- -
-
-
- -
-
-
-
In [162]:
-
-
-
# Generate attribute x for each of J alternatives
-
-# Start with J << 1000 to speed up runtimes
-
-J = 50  # alternatives
-
-# Set a seed for reproducibility
-np.random.seed(12)
-
-Xa = 3 * np.random.rand(J/2) - 2  # uniform distribution over [-2, 1]
-Xb = 3 * np.random.rand(J/2) - 1  # uniform distribution over [-1, 2]
-
-X = np.concatenate((Xa, Xb))
-
-print len(X)
-print X[:5]
-
- -
-
-
- -
-
- - -
-
-
50
-[-1.53751147  0.22014909 -1.21005495 -0.39878182 -1.95627511]
-
-
-
- -
-
- -
-
-
-
In [163]:
-
-
-
# Generate taste coefficient beta for each of N agents 
-
-# For regular MNL, i think we need to use a single value, instead of a 
-# distribution as Guevara & Ben-Akiva used for the mixture model
-
-N = 1000  # agents/observations
-
-beta = np.zeros(1000) + 1.5
-# beta = 0.8 * np.random.randn(N) + 1.5
-
-print len(beta)
-print beta[:5]
-
- -
-
-
- -
-
- - -
-
-
1000
-[ 1.5  1.5  1.5  1.5  1.5]
-
-
-
- -
-
- -
-
-
-
In [164]:
-
-
-
print pd.DataFrame(beta).describe()
-
- -
-
-
- -
-
- - -
-
-
            0
-count  1000.0
-mean      1.5
-std       0.0
-min       1.5
-25%       1.5
-50%       1.5
-75%       1.5
-max       1.5
-
-
-
- -
-
- -
-
-
-
In [165]:
-
-
-
# Generate probability matrix for N agents choosing among J alternatives
-
-def probs(n):
-    ''' 
-    Return list of J probabilities for agent n
-    '''
-    b = beta[n]
-    exps = [np.exp(b*x) for x in X]
-    sum_exps = np.sum(exps)
-    return [exp/sum_exps for exp in exps]
-
-P = np.array([probs(n) for n in range(N)])
-    
-print P.shape
-
- -
-
-
- -
-
- - -
-
-
(1000, 50)
-
-
-
- -
-
- -
-
-
-
In [166]:
-
-
-
# Check that each row sums to 1
-
-print np.sum(P, axis=1)[:10]
-
- -
-
-
- -
-
- - -
-
-
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
-
-
-
- -
-
- -
-
-
-
In [167]:
-
-
-
# Simulate a choice from J alternatives for each of N agents
-
-C = [np.random.choice(range(J), p=p) for p in P]
-
-print len(C)
-print C[:10]
-
- -
-
-
- -
-
- - -
-
-
1000
-[12, 41, 37, 5, 30, 27, 8, 35, 33, 6]
-
-
-
- -
-
- -
-
-
-
-
-
-

Now we have data:

    -
  • N agents/observations with true taste coefficients in array "beta"
  • -
  • J alternatives with single attributes in array "X"
  • -
  • N choice outcomes in array "C"
  • -
- -
-
-
-
-
-
-
-
-

2. Estimate beta using PyLogit MNL

-
-
-
-
-
-
In [168]:
-
-
-
import pylogit
-from collections import OrderedDict
-
- -
-
-
- -
-
-
-
In [169]:
-
-
-
# Set up an estimation dataset in long format
-
-d = [[n, i, int(C[n]==i), X[i]] for i in range(J) for n in range(N)]
-
-print len(d)
-
- -
-
-
- -
-
- - -
-
-
50000
-
-
-
- -
-
- -
-
-
-
In [170]:
-
-
-
df = pd.DataFrame(d, columns=['obs_id', 'alt_id', 'choice', 'x'])
-
-print df.describe()
-
- -
-
-
- -
-
- - -
-
-
             obs_id        alt_id        choice             x
-count  50000.000000  50000.000000  50000.000000  50000.000000
-mean     499.500000     24.500000      0.020000      0.014570
-std      288.677877     14.431014      0.140001      1.116965
-min        0.000000      0.000000      0.000000     -1.993222
-25%      249.750000     12.000000      0.000000     -0.894495
-50%      499.500000     24.500000      0.000000      0.220035
-75%      749.250000     37.000000      0.000000      0.832675
-max      999.000000     49.000000      1.000000      1.985414
-
-
-
- -
-
- -
-
-
-
In [171]:
-
-
-
# Set up model spec
-
-spec = OrderedDict([
-        ('x', [range(J)])
-    ])
-
-labels = OrderedDict([
-        ('x', ['beta_x'])
-    ])
-
- -
-
-
- -
-
-
-
In [172]:
-
-
-
m = pylogit.create_choice_model(data = df, 
-                                alt_id_col = 'alt_id', 
-                                obs_id_col = 'obs_id', 
-                                choice_col = 'choice', 
-                                specification = spec, 
-                                model_type = "MNL", 
-                                names = labels)
-
-m.fit_mle(init_vals = np.array([0]))
-print m.get_statsmodels_summary()
-
- -
-
-
- -
-
- - -
-
-
Log-likelihood at zero: -3,912.0230
-Initial Log-likelihood: -3,912.0230
-Estimation Time: 0.04 seconds.
-Final log-likelihood: -3,065.1983
-                     Multinomial Logit Model Regression Results                    
-===================================================================================
-Dep. Variable:                      choice   No. Observations:                1,000
-Model:             Multinomial Logit Model   Df Residuals:                      999
-Method:                                MLE   Df Model:                            1
-Date:                     Tue, 15 Nov 2016   Pseudo R-squ.:                   0.216
-Time:                             17:35:26   Pseudo R-bar-squ.:               0.216
-converged:                            True   Log-Likelihood:             -3,065.198
-                                             LL-Null:                    -3,912.023
-==============================================================================
-                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------
-beta_x         1.5324        nan        nan        nan           nan       nan
-==============================================================================
-
-
-
- -
-
- -
-
-
-
In [173]:
-
-
-
m.hessian
-
- -
-
-
- -
-
- - -
Out[173]:
- -
-
- - - - - - - - - - - - - -
beta_x
beta_x5.826086e-13
-
-
- -
- -
-
- -
-
-
-
-
-
-

Notes:

-
    -
  1. Clearly pylogit is struggling to create the hessian. The kernel dies on my computer when it attempts to create the hessian with 100 alternatives.
  2. -
  3. However, when using 50 alternatives per person, the issue of NaN standard errors can still be reproduced. In this setting, one can see that the hessian is positive, albeit very small (i.e. near zero). A positive hessian is indicative of a local minimum instead of a local maximum. However, given that the model has converged, and converged essentially to the true value of 1.5, it is instead likely that the calculation of the hessian is experiencing numerical problems. A hessian near zero indicates a log-likelihood that is essentially flat.
  4. -
  5. When using only 10 alternatives per person, we can still see issues. For one, the standard error is still reported as being huge and the calculated hessian is extremely small. From plotting the log-likelihood function (below) around the estimated value, we can see that the function is not close to being flat. This means that the hessian has been calculated incorrectly.
  6. -
- -
-
-
-
-
-
In [174]:
-
-
-
# Use a more recent version of the mnl code for convenience
-import integrated_mnl_2 as mnl_module
-
-# Import libraries for plotting
-import seaborn
-import matplotlib.pyplot as plt
-
-%matplotlib inline
-
- -
-
-
- -
-
-
-
In [175]:
-
-
-
# Recreate the estimation object to use its convenient log-likelihood function
-mnl_estimator = mnl_module.MNLEstimator(model_obj=m,
-                                        mapping_dict=m.get_mappings_for_fit(),
-                                        ridge=0,
-                                        zero_vector=np.zeros(1),
-                                        split_params=mnl_module.split_param_vec)
-
- -
-
-
- -
-
-
-
In [176]:
-
-
-
# Create a function to plot the second order taylor series based on the
-# estimated gradient and hessian, to see if the estimated values are correct
-def plot_2nd_order_taylor_series(center_value, 
-                                 original_y,
-                                 x_line, 
-                                 first_deriv, 
-                                 second_deriv,
-                                 line_format_string="-k",
-                                 line_label="2nd Order Taylor Series"):
-    # Determine the value of each x minus the value 
-    # around which the series will be centered.
-    diff_from_center = x_line - center_value
-    # Create the "y-values" to be plotted
-    y_vals = (original_y +
-              first_deriv * (diff_from_center) +
-              (second_deriv / 2.0) * np.square(diff_from_center))
-    # Make the plot
-    plt.plot(x_line, y_vals, line_format_string, label=line_label)
-    
-    return None
-    
-
- -
-
-
- -
-
-
-
In [177]:
-
-
-
# Plot the log likelihood as a function of beta, around the estimated value
-estimated_beta = m.params.values[0]
-interval_width = 0.5
-beta_line = np.linspace(estimated_beta - interval_width,
-                        estimated_beta + interval_width,
-                        num=500)
-
-log_likelihoods = [mnl_estimator.convenience_calc_log_likelihood(np.array(test_beta))
-                   for test_beta in beta_line]
-
-plt.plot(beta_line, log_likelihoods, label='log-likelihoods')
-plt.vlines(estimated_beta,
-           min(log_likelihoods),
-           m.log_likelihood,
-           linestyles='dashed',
-           label='max log-likelihood')
-
-# Plot a second order taylor series to see how well we estimated
-# the hessian of the log-likelihood function
-plot_2nd_order_taylor_series(estimated_beta,
-                             m.log_likelihood,
-                             beta_line,
-                             m.gradient.values[0],
-                             m.hessian.values[0, 0],
-                             line_format_string='-r')
-
-plt.legend(loc='best')
-plt.xlabel(r"$\beta$", fontsize=15)
-plt.ylabel("Log-likelihood", fontsize=15)
-plt.title(r"Log-likelihood versus $\beta$", fontsize=15)
-plt.ylim(ymax=m.log_likelihood + 1)
-plt.show()
-
- -
-
-
- -
-
- - -
- - -
- -
- -
- -
-
- -
-
-
-
-
-
-

Clearly we've done a poor job.

- -
-
-
-
-
-
-
-
-

Why is the hessian being calculated incorrectly?

-
-
-
-
-
-
-
-
-

First, how should the Hessian be calculated?

Note that

-
    -
  1. the design matrix only consists of variables that remain constant across the dataset
  2. -
  3. the coefficients are constant across the population
  4. -
  5. the choice set is constant across individuals -As a result of all of this, the estimated probabilities of choosing each alternative will be the same for each person.
  6. -
-

Now, the hessian is the sum of the second derivatives of the log-likelihood for each observation. So we should expect N * hessian_1, where hessian_1 is the second derivative of the log-likelihood for the first observation.

- -
-
-
-
-
-
In [179]:
-
-
-
# Get the estimated probabilities of choosing each alternative
-# This will be constant across individuals
-estimated_probs = [m.long_fitted_probs[N * idx]for idx in range(J)]
-estimated_probs
-
-# Calculate the derivative of the probabilities with respect
-# to s = X*B
-dp_ds = np.diag(estimated_probs) - np.outer(estimated_probs,
-                                            estimated_probs)
-
-# Calculate the hessian for a single observation
-hessian_1 = (-1 * X[None, :]).dot(dp_ds.dot(X[:, None]))
-
-# Calculate the hessian for the entire dataset that we expect
-# to recover. Note this multiplication only works because of the
-# special set up of this simulation/residential choice problem
-expected_hessian = N * hessian_1
-print "The hessian we expect to see is:", expected_hessian
-
-# Calculate the standard error that we expect for this dataset
-expected_std_error = np.diag(np.linalg.inv(-1 *
-                                           expected_hessian))**0.5
-print "The standard error we expect to see is:", expected_std_error
-
- -
-
-
- -
-
- - -
-
-
The hessian we expect to see is: [[-482.17324461]]
-The standard error we expect to see is: [ 0.04554057]
-
-
-
- -
-
- -
-
-
-
-
-
-

Now, what is wrong with the current hessian calculation?

Long story short, the matrix equations I use to calculate the hessian create blocks of dp_ds; therefore, the matrices that multiply and are multiplied by dp_ds should have contiguous columns (rows) for a given observation. The simulation data from Sam does not meet this criterion, and I do not have a warning for the user about this criterion. Essentially, the input data should place all rows for a given observation together.

- -
-
-
-
-
-
In [180]:
-
-
-
# Get the mapping matrices from the first model object
-mapping_matrices = m.get_mappings_for_fit()
-
-# Order the dataframe in order of its observation ids
-sub_dfs = [df.loc[indices] for indices in
-           pylogit.choice_calcs.create_matrix_block_indices(mapping_matrices["rows_to_obs"])]
-ordered_df = pd.concat(sub_dfs, axis=0, ignore_index=True)
-
- -
-
-
- -
-
-
-
In [181]:
-
-
-
m2 = pylogit.create_choice_model(data = ordered_df, 
-                                alt_id_col = 'alt_id', 
-                                obs_id_col = 'obs_id', 
-                                choice_col = 'choice', 
-                                specification = spec, 
-                                model_type = "MNL", 
-                                names = labels)
-
-m2.fit_mle(init_vals = np.array([0]))
-print m2.get_statsmodels_summary()
-
- -
-
-
- -
-
- - -
-
-
Log-likelihood at zero: -3,912.0230
-Initial Log-likelihood: -3,912.0230
-Estimation Time: 0.04 seconds.
-Final log-likelihood: -3,065.1983
-                     Multinomial Logit Model Regression Results                    
-===================================================================================
-Dep. Variable:                      choice   No. Observations:                1,000
-Model:             Multinomial Logit Model   Df Residuals:                      999
-Method:                                MLE   Df Model:                            1
-Date:                     Tue, 15 Nov 2016   Pseudo R-squ.:                   0.216
-Time:                             17:35:41   Pseudo R-bar-squ.:               0.216
-converged:                            True   Log-Likelihood:             -3,065.198
-                                             LL-Null:                    -3,912.023
-==============================================================================
-                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------
-beta_x         1.5324      0.046     33.649      0.000         1.443     1.622
-==============================================================================
-
-
-
- -
-
- -
-
-
-
- -
-
-
-
In [52]:
-
-
-
from urbansim.models import MNLDiscreteChoiceModel
-
- -
-
-
- -
-
-
-
In [97]:
-
-
-
# Choosers should be a DataFrame of characteristics, with index as identifier
-
-d = [[n, C[n]] for n in range(N)]
-
-choosers = pd.DataFrame(d, columns=['id', 'choice']).set_index('id')
-
-print len(choosers)
-
- -
-
-
- -
-
- - -
-
-
1000
-
-
-
- -
-
- -
-
-
-
In [98]:
-
-
-
# Alternatives should be a DataFrame of characteristics, with index as identifier
-
-d = [[i, X[i]] for i in range(J)]
-
-alts = pd.DataFrame(d, columns=['id', 'x']).set_index('id')
-
-print len(alts)
-
- -
-
-
- -
-
- - -
-
-
100
-
-
-
- -
-
- -
-
-
-
In [84]:
-
-
-
# It seems like this implementation *requires* us to sample the alternatives, 
-# so here i'm estimating the model with J-1 alts
-
-m = MNLDiscreteChoiceModel(model_expression = 'x',
-                           sample_size = J-1)
-
-m.fit(choosers = choosers,
-      alternatives = alts,
-      current_choice = 'choice')
-
-m.report_fit()
-
- -
-
-
- -
-
- - -
-
-
Null Log-liklihood: -4595.120
-Log-liklihood at convergence: -3793.079
-Log-liklihood Ratio: 0.175
-
-+-----------+-------------+------------+---------+
-| Component | Coefficient | Std. Error | T-Score |
-+-----------+-------------+------------+---------+
-| x         |    1.544    |   0.023    |  68.242 |
-+-----------+-------------+------------+---------+
-
-
-
- -
-
- -
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
-
In [ ]:
-
-
-
# To do 
-# - look through PyLogit and LCCM code
-# - in many-alternative scenarios, attributes of the alternatives will 
-#   usually be in a separate data table - what helper functions do we need?
-
- -
-
-
- -
-
-
-
In [ ]:
-
-
-
 
-
- -
-
-
- -
-
-
- - diff --git a/notebooks/_archive/mnl_refactoring.py b/notebooks/_archive/mnl_refactoring.py deleted file mode 100644 index 52bb2ee..0000000 --- a/notebooks/_archive/mnl_refactoring.py +++ /dev/null @@ -1,48 +0,0 @@ -import numpy as np -import pandas as pd - -from choicemodels import MultinomialLogit -from choicemodels.tools import MergedChoiceTable -from collections import OrderedDict - - -tracts = pd.read_csv('../data/tracts.csv').set_index('full_tract_id') -trips = pd.read_csv('../data/trips.csv').set_index('place_id') - -pd.set_option('display.float_format', lambda x: '%.3f' % x) - -choosers = trips.loc[np.random.choice(trips.index, 500, replace=False)] -choosers = choosers.loc[choosers.trip_distance_miles.notnull()] - -numalts = 10 - -merged = MergedChoiceTable(observations = choosers, - alternatives = tracts, - chosen_alternatives = choosers.full_tract_id, - sample_size = numalts) - -model_expression = "home_density + work_density + school_density" - -model = MultinomialLogit(merged.to_frame(), - merged.observation_id_col, - merged.choice_col, - model_expression) - -results = model.fit() - -results.report_fit() - -""" -model_expression = OrderedDict([('home_density', 'all_same'), - ('work_density', 'all_same'), - ('school_density', 'all_same')]) - -model = MultinomialLogit(data = merged.to_frame(), - observation_id_col = merged.observation_id_col, - alternative_id_col = merged.alternative_id_col, - choice_col = merged.choice_col, - model_expression = model_expression) - -results = model.fit() -print(results.print_summaries()) -""" diff --git a/notebooks/make_distance_bands.ipynb b/notebooks/make_distance_bands.ipynb deleted file mode 100644 index 046bd5f..0000000 --- a/notebooks/make_distance_bands.ipynb +++ /dev/null @@ -1,162 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - 
"C:\\Anaconda\\envs\\cm\\lib\\site-packages\\statsmodels\\compat\\pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n", - " from pandas.core import datetools\n" - ] - } - ], - "source": [ - "import pandas as pd, numpy as np\n", - "from choicemodels.tools import distancematrix as dm\n", - "\n", - "# define distance bands in meters\n", - "distances = [0, 3000, 10000, 20000, np.inf]\n", - "\n", - "# specify input/output file locations\n", - "distance_matrix_file = '../data/bay_tracts_distance_matrix.csv'\n", - "distance_bands_file = '../data/bay_tracts_distance_bands.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "from to \n", - "06001400100 06001400100 0\n", - " 06001400200 2659\n", - " 06001400300 3595\n", - " 06001400400 3111\n", - " 06001400500 3579\n", - "Name: distance, dtype: int64" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# keep to and from geography IDs as string to preserve preceding zeros in tract IDs\n", - "dtypes = {0:str, 1:str}\n", - "dist_matrix = pd.read_csv(distance_matrix_file, header=None, dtype=dtypes, encoding='utf-8')\n", - "dist_matrix = dist_matrix.rename(columns={0:'from', 1:'to', 2:'distance'})\n", - "dist_matrix = dist_matrix.set_index(['from', 'to'])\n", - "dist_vector = dist_matrix['distance']\n", - "dist_vector.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wall time: 4.17 s\n" - ] - } - ], - "source": [ - "%%time\n", - "db = dm.distance_bands(dist_vector, distances)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "06001400100 0 [06001400100, 06001400200, 06001404300, 
060014...\n", - " 1 [06001400300, 06001400400, 06001400500, 060014...\n", - " 2 [06001406100, 06001407200, 06001407300, 060014...\n", - " 3 [06001430101, 06001430200, 06001430300, 060014...\n", - "06001400200 0 [06001400100, 06001400200, 06001400300, 060014...\n", - " 1 [06001401300, 06001401400, 06001401500, 060014...\n", - " 2 [06001407300, 06001408100, 06001408200, 060014...\n", - " 3 [06001430101, 06001430200, 06001430300, 060014...\n", - "06001400300 0 [06001400200, 06001400300, 06001400400, 060014...\n", - " 1 [06001400100, 06001401500, 06001401600, 060014...\n", - " 2 [06001408100, 06001408200, 06001408300, 060014...\n", - " 3 [06001430101, 06001430200, 06001430300, 060014...\n", - "06001400400 0 [06001400200, 06001400300, 06001400400, 060014...\n", - " 1 [06001400100, 06001401300, 06001401400, 060014...\n", - " 2 [06001407300, 06001407500, 06001408100, 060014...\n", - " 3 [06001430101, 06001430200, 06001430300, 060014...\n", - "06001400500 0 [06001400200, 06001400300, 06001400400, 060014...\n", - " 1 [06001400100, 06001401300, 06001401400, 060014...\n", - " 2 [06001407300, 06001407400, 06001407500, 060014...\n", - " 3 [06001430101, 06001430200, 06001430300, 060014...\n", - "dtype: object" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "db.head(20)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# save to csv for now... 
should store in database\n", - "db.to_csv(distance_bands_file, index=True, encoding='utf-8')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/make_distance_matrix.ipynb b/notebooks/make_distance_matrix.ipynb deleted file mode 100644 index 38bee16..0000000 --- a/notebooks/make_distance_matrix.ipynb +++ /dev/null @@ -1,249 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Anaconda\\envs\\cm\\lib\\site-packages\\statsmodels\\compat\\pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. 
Please use the pandas.tseries module instead.\n", - " from pandas.core import datetools\n" - ] - } - ], - "source": [ - "import pandas as pd, numpy as np\n", - "from choicemodels.tools import distancematrix as dm\n", - "\n", - "tract_centroids_file = '../data/bay_tract_centroids.csv'\n", - "distance_matrix_file = '../data/bay_tracts_distance_matrix.csv'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load the data" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1588" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# use double-precision floating points to ensure sufficient significant digits\n", - "dtypes = {'GEOID10':str, 'lat':np.float64, 'lng':np.float64}\n", - "df = pd.read_csv(tract_centroids_file, dtype=dtypes, encoding='utf-8').sort_values(by='GEOID10')\n", - "len(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# alternatively, create a randomized dataframe of length n to test performance relative to size\n", - "#n = 5000\n", - "#df = pd.DataFrame({'GEOID10':range(n), 'lng':np.random.random(n), 'lat':np.random.random(n)})" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# index the dataframe by place identifier (i.e., census tract ID)\n", - "df = df.set_index('GEOID10')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Calculate distance matrices, reindexed as multi-index vectors\n", - "\n", - "#### First, the euclidean distance vector in units of degrees" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wall time: 72 ms\n" - ] - } - ], - "source": [ - "%%time\n", - 
"df_eu_dm = dm.distance_matrix(df, method='euclidean')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(2521744L,)\n" - ] - }, - { - "data": { - "text/plain": [ - "06001400100 06001400100 0.000000\n", - " 06001400200 0.026261\n", - " 06001400300 0.035165\n", - " 06001400400 0.032078\n", - " 06001400500 0.037980\n", - "dtype: float64" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print(df_eu_dm.shape)\n", - "df_eu_dm.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Next, the great-circle distance vector in units of meters" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "g:\\geoff\\dropbox\\documents\\school\\phd\\work\\2017-summer\\paul\\code\\choicemodels\\choicemodels\\tools\\distancematrix.py:47: RuntimeWarning: invalid value encountered in arccos\n", - " arc = np.arccos(cos)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wall time: 1.68 s\n" - ] - } - ], - "source": [ - "%%time\n", - "df_gc_dm = dm.distance_matrix(df, method='greatcircle')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(2521744L,)\n" - ] - }, - { - "data": { - "text/plain": [ - "06001400100 06001400100 0\n", - " 06001400200 2659\n", - " 06001400300 3595\n", - " 06001400400 3111\n", - " 06001400500 3579\n", - "dtype: int32" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print(df_gc_dm.shape)\n", - "df_gc_dm.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# save distance matrix 
to disk\n", - "df_gc_dm.to_csv(distance_matrix_file, index=True, encoding='utf-8')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/make_tract_centroids.ipynb b/notebooks/make_tract_centroids.ipynb deleted file mode 100644 index 484675d..0000000 --- a/notebooks/make_tract_centroids.ipynb +++ /dev/null @@ -1,155 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import pandas as pd, geopandas as gpd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# specify location of census tracts shapefile and centroids output file\n", - "cal_tracts_file = '../data/tl_2010_06_tract10/tl_2010_06_tract10.shp'\n", - "tract_centroids_file = '../data/bay_tract_centroids.csv'\n", - "\n", - "# identify bay area counties by fips code\n", - "counties = {'Alameda':'001',\n", - " 'Contra Costa':'013',\n", - " 'Marin':'041',\n", - " 'Napa':'055',\n", - " 'San Francisco':'075',\n", - " 'San Mateo':'081',\n", - " 'Santa Clara':'085',\n", - " 'Solano':'095',\n", - " 'Sonoma':'097'}" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "8057" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# load the tracts shapefile\n", - "gdf_cal = 
gpd.read_file(cal_tracts_file)\n", - "len(gdf_cal)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1588" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# retain only those tracts that are in the bay area counties\n", - "gdf_cal['county_fips'] = gdf_cal['GEOID10'].str.slice(start=2, stop=5)\n", - "gdf_bay = gdf_cal[gdf_cal['county_fips'].isin(counties.values())]\n", - "len(gdf_bay)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# calculate the centroid of each tract polygon then extract lat and lng coordinates\n", - "centroids = gdf_bay.centroid\n", - "lng = centroids.apply(lambda point: point.x)\n", - "lat = centroids.apply(lambda point: point.y)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1588" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# assemble into a dataframe to save\n", - "df_save = pd.DataFrame({'GEOID10':gdf_bay['GEOID10'],\n", - " 'lat':lat,\n", - " 'lng':lng})\n", - "len(df_save)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# save to disk\n", - "df_save.to_csv(tract_centroids_file, index=False, encoding='utf-8')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/setup.py b/setup.py index 
0175906..d8afa66 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='choicemodels', - version='0.2.dev10', + version='0.2', description='Tools for discrete choice estimation', long_description=long_description, author='UDST',