From 7becd2d8506e1e57cb08f876bc833abdc873f2e0 Mon Sep 17 00:00:00 2001 From: Bo Wen <56280490+bwentl@users.noreply.github.com> Date: Thu, 11 May 2023 16:10:28 -0700 Subject: [PATCH 1/4] add chunking and caching of alt_values use feather and pickle for faster loading split x_ca processing into idca using chunking_size --- .../estimation/larch/location_choice.py | 66 ++++++++++++++++++- 1 file changed, 64 insertions(+), 2 deletions(-) diff --git a/activitysim/estimation/larch/location_choice.py b/activitysim/estimation/larch/location_choice.py index 74a426e71..d287f334e 100644 --- a/activitysim/estimation/larch/location_choice.py +++ b/activitysim/estimation/larch/location_choice.py @@ -1,6 +1,8 @@ import os from pathlib import Path from typing import Collection +import pickle +from datetime import datetime import numpy as np import pandas as pd @@ -44,6 +46,7 @@ def location_choice_model( settings_file="{name}_model_settings.yaml", landuse_file="{name}_landuse.csv", return_data=False, + chunking_size=None, ): model_selector = name.replace("_location", "") model_selector = model_selector.replace("_destination", "") @@ -57,12 +60,37 @@ def _read_csv(filename, **kwargs): filename = filename.format(name=name) return pd.read_csv(os.path.join(edb_directory, filename), **kwargs) + def _read_feather(filename, **kwargs): + filename = filename.format(name=name) + return pd.read_feather(os.path.join(edb_directory, filename), **kwargs) + + def _to_feather(df, filename, **kwargs): + filename = filename.format(name=name) + return df.to_feather(os.path.join(edb_directory, filename), **kwargs) + + def _read_pickle(filename, **kwargs): + filename = filename.format(name=name) + return pd.read_pickle(os.path.join(edb_directory, filename)) + + def _to_pickle(df, filename, **kwargs): + filename = filename.format(name=name) + return df.to_pickle(os.path.join(edb_directory, filename)) + + def _file_exists(filename): + filename = filename.format(name=name) + return os.path.exists(os.path.join(edb_directory, filename)) + coefficients = _read_csv( coefficients_file, index_col="coefficient_name", ) spec = _read_csv(spec_file, comment="#") - alt_values = _read_csv(alt_values_file) + alt_values_fea_file = alt_values_file.replace(".csv", ".fea") + if os.path.exists(os.path.join(edb_directory, alt_values_fea_file.format(name=name))): + alt_values = _read_feather(alt_values_fea_file) + else: + alt_values = _read_csv(alt_values_file) + _to_feather(df=alt_values, filename=alt_values_fea_file) chooser_data = _read_csv(chooser_file) landuse = _read_csv(landuse_file, index_col="zone_id") master_size_spec = _read_csv(size_spec_file) @@ -148,7 +176,41 @@ def _read_csv(filename, **kwargs): chooser_index_name = chooser_data.columns[0] x_co = chooser_data.set_index(chooser_index_name) - x_ca = cv_to_ca(alt_values.set_index([chooser_index_name, alt_values.columns[1]])) + + def split(a, n): + k, m = divmod(len(a), n) + return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)) + + x_ca_pickle_file = "{name}_x_ca.pkl" + if chunking_size == None: + x_ca = cv_to_ca(alt_values.set_index([chooser_index_name, alt_values.columns[1]])) + elif _file_exists(x_ca_pickle_file): + time_start = datetime.now() + x_ca = _read_pickle(x_ca_pickle_file) + print( + f"x_ca data loaded from {name}_x_ca.fea - time elapsed {(datetime.now() - time_start).total_seconds()}" + ) + else: + time_start = datetime.now() + # calculate num_chunks based on chunking_size (or max number of rows per chunk) + num_chunks = int(len(alt_values) / chunking_size) + all_person_ids = list(alt_values["person_id"].unique()) + split_ids = list(split(all_person_ids, num_chunks)) + x_ca_list = [] + i = 0 + for chunk_ids in split_ids: + alt_values_i = alt_values[alt_values["person_id"].isin(chunk_ids)] + x_ca_i = cv_to_ca( + alt_values_i.set_index([chooser_index_name, alt_values_i.columns[1]])) + x_ca_list.append(x_ca_i) + print( + f"\rx_ca_i compute done for chunk {i}/{num_chunks} - time elapsed {(datetime.now() - time_start).total_seconds()}" + ) + i = i + 1 + x_ca = pd.concat(x_ca_list, axis=0) + _to_pickle(df=x_ca, filename=x_ca_pickle_file) + print( + f"x_ca compute done - time elapsed {(datetime.now() - time_start).total_seconds()}") if CHOOSER_SEGMENT_COLUMN_NAME is not None: # label segments with names From 0e5d3bf56fefc47662955f54ec963cf7f8bcf615 Mon Sep 17 00:00:00 2001 From: Bo Wen <56280490+bwentl@users.noreply.github.com> Date: Fri, 23 Jun 2023 09:57:57 -0700 Subject: [PATCH 2/4] add alt_values_to_feather option and update comments default behavior does not change, to enable use of feather and the use of chunking, do this: # load from original model, data = component_model(modelname, return_data=True, alt_values_to_feather=True, chunking_size=chunking_size) --- activitysim/estimation/larch/location_choice.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/activitysim/estimation/larch/location_choice.py b/activitysim/estimation/larch/location_choice.py index d287f334e..38a304543 100644 --- a/activitysim/estimation/larch/location_choice.py +++ b/activitysim/estimation/larch/location_choice.py @@ -46,6 +46,7 @@ def location_choice_model( settings_file="{name}_model_settings.yaml", landuse_file="{name}_landuse.csv", return_data=False, + alt_values_to_feather=False, chunking_size=None, ): model_selector = name.replace("_location", "") @@ -85,12 +86,15 @@ def _file_exists(filename): index_col="coefficient_name", ) spec = _read_csv(spec_file, comment="#") + + # read alternative values either as csv or feather file alt_values_fea_file = alt_values_file.replace(".csv", ".fea") if os.path.exists(os.path.join(edb_directory, alt_values_fea_file.format(name=name))): alt_values = _read_feather(alt_values_fea_file) else: alt_values = _read_csv(alt_values_file) - _to_feather(df=alt_values, filename=alt_values_fea_file) + if alt_values_to_feather: + _to_feather(df=alt_values, filename=alt_values_fea_file) chooser_data = _read_csv(chooser_file) landuse = _read_csv(landuse_file, index_col="zone_id") master_size_spec = _read_csv(size_spec_file) @@ -181,10 +185,12 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)) + # process x_ca with cv_to_ca with or without chunking x_ca_pickle_file = "{name}_x_ca.pkl" if chunking_size == None: x_ca = cv_to_ca(alt_values.set_index([chooser_index_name, alt_values.columns[1]])) elif _file_exists(x_ca_pickle_file): + # if pickle file from previous x_ca processing exist, load it to save time time_start = datetime.now() x_ca = _read_pickle(x_ca_pickle_file) print( @@ -208,6 +214,7 @@ def split(a, n): ) i = i + 1 x_ca = pd.concat(x_ca_list, axis=0) + # save final x_ca result as pickle file to save time for future data loading _to_pickle(df=x_ca, filename=x_ca_pickle_file) print( f"x_ca compute done - time elapsed {(datetime.now() - time_start).total_seconds()}") From 425467e32bc91cb6b0db32fe546449130f38039d Mon Sep 17 00:00:00 2001 From: Bo Wen <56280490+bwentl@users.noreply.github.com> Date: Fri, 23 Jun 2023 10:18:33 -0700 Subject: [PATCH 3/4] format code with black --- activitysim/estimation/larch/location_choice.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/activitysim/estimation/larch/location_choice.py b/activitysim/estimation/larch/location_choice.py index 38a304543..a93002418 100644 --- a/activitysim/estimation/larch/location_choice.py +++ b/activitysim/estimation/larch/location_choice.py @@ -89,7 +89,9 @@ def _file_exists(filename): # read alternative values either as csv or feather file alt_values_fea_file = alt_values_file.replace(".csv", ".fea") - if os.path.exists(os.path.join(edb_directory, alt_values_fea_file.format(name=name))): + if os.path.exists( + os.path.join(edb_directory, alt_values_fea_file.format(name=name)) + ): alt_values = _read_feather(alt_values_fea_file) else: alt_values = _read_csv(alt_values_file) @@ -183,12 +185,14 @@ def _file_exists(filename): def split(a, n): k, m = divmod(len(a), n) - return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)) + return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n)) # process x_ca with cv_to_ca with or without chunking x_ca_pickle_file = "{name}_x_ca.pkl" if chunking_size == None: - x_ca = cv_to_ca(alt_values.set_index([chooser_index_name, alt_values.columns[1]])) + x_ca = cv_to_ca( + alt_values.set_index([chooser_index_name, alt_values.columns[1]]) + ) elif _file_exists(x_ca_pickle_file): # if pickle file from previous x_ca processing exist, load it to save time time_start = datetime.now() @@ -207,7 +211,8 @@ def split(a, n): for chunk_ids in split_ids: alt_values_i = alt_values[alt_values["person_id"].isin(chunk_ids)] x_ca_i = cv_to_ca( - alt_values_i.set_index([chooser_index_name, alt_values_i.columns[1]])) + alt_values_i.set_index([chooser_index_name, alt_values_i.columns[1]]) + ) x_ca_list.append(x_ca_i) print( f"\rx_ca_i compute done for chunk {i}/{num_chunks} - time elapsed {(datetime.now() - time_start).total_seconds()}" @@ -217,7 +222,8 @@ def split(a, n): # save final x_ca result as pickle file to save time for future data loading _to_pickle(df=x_ca, filename=x_ca_pickle_file) print( - f"x_ca compute done - time elapsed {(datetime.now() - time_start).total_seconds()}") + f"x_ca compute done - time elapsed {(datetime.now() - time_start).total_seconds()}" + ) if CHOOSER_SEGMENT_COLUMN_NAME is not None: # label segments with names From 688539e624afd0f57080f1cfd0a524313197b19a Mon Sep 17 00:00:00 2001 From: Bo Wen <56280490+bwentl@users.noreply.github.com> Date: Tue, 6 Feb 2024 11:45:07 -0800 Subject: [PATCH 4/4] format with black --- activitysim/abm/models/joint_tour_participation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/activitysim/abm/models/joint_tour_participation.py b/activitysim/abm/models/joint_tour_participation.py index 2c5e5fb7b..73825d162 100644 --- a/activitysim/abm/models/joint_tour_participation.py +++ b/activitysim/abm/models/joint_tour_participation.py @@ -432,7 +432,8 @@ def joint_tour_participation( # its value depends on whether the candidate's 'participant_id' is in the joint_tour_participant index survey_participants_df = estimator.get_survey_table("joint_tour_participants") participate = pd.Series( - choices.index.isin(survey_participants_df.participant_id), index=choices.index + choices.index.isin(survey_participants_df.participant_id), + index=choices.index, ) # but estimation software wants to know the choices value (alternative index)