From 7becd2d8506e1e57cb08f876bc833abdc873f2e0 Mon Sep 17 00:00:00 2001
From: Bo Wen <56280490+bwentl@users.noreply.github.com>
Date: Thu, 11 May 2023 16:10:28 -0700
Subject: [PATCH 1/4] add chunking and caching of alt_values

use feather and pickle for faster loading
split x_ca processing into idca using chunking_size
---
 .../estimation/larch/location_choice.py       | 66 ++++++++++++++++++-
 1 file changed, 64 insertions(+), 2 deletions(-)

diff --git a/activitysim/estimation/larch/location_choice.py b/activitysim/estimation/larch/location_choice.py
index 74a426e71..d287f334e 100644
--- a/activitysim/estimation/larch/location_choice.py
+++ b/activitysim/estimation/larch/location_choice.py
@@ -1,6 +1,8 @@
 import os
 from pathlib import Path
 from typing import Collection
+import pickle
+from datetime import datetime
 
 import numpy as np
 import pandas as pd
@@ -44,6 +46,7 @@ def location_choice_model(
     settings_file="{name}_model_settings.yaml",
     landuse_file="{name}_landuse.csv",
     return_data=False,
+    chunking_size=None,
 ):
     model_selector = name.replace("_location", "")
     model_selector = model_selector.replace("_destination", "")
@@ -57,12 +60,37 @@ def _read_csv(filename, **kwargs):
         filename = filename.format(name=name)
         return pd.read_csv(os.path.join(edb_directory, filename), **kwargs)
 
+    def _read_feather(filename, **kwargs):
+        filename = filename.format(name=name)
+        return pd.read_feather(os.path.join(edb_directory, filename), **kwargs)
+
+    def _to_feather(df, filename, **kwargs):
+        filename = filename.format(name=name)
+        return df.to_feather(os.path.join(edb_directory, filename), **kwargs)
+
+    def _read_pickle(filename, **kwargs):
+        filename = filename.format(name=name)
+        return pd.read_pickle(os.path.join(edb_directory, filename))
+
+    def _to_pickle(df, filename, **kwargs):
+        filename = filename.format(name=name)
+        return df.to_pickle(os.path.join(edb_directory, filename))
+
+    def _file_exists(filename):
+        filename = filename.format(name=name)
+        return os.path.exists(os.path.join(edb_directory, filename))
+
     coefficients = _read_csv(
         coefficients_file,
         index_col="coefficient_name",
     )
     spec = _read_csv(spec_file, comment="#")
-    alt_values = _read_csv(alt_values_file)
+    alt_values_fea_file = alt_values_file.replace(".csv", ".fea")
+    if os.path.exists(os.path.join(edb_directory, alt_values_fea_file.format(name=name))):
+        alt_values = _read_feather(alt_values_fea_file)
+    else:
+        alt_values = _read_csv(alt_values_file)
+        _to_feather(df=alt_values, filename=alt_values_fea_file)
     chooser_data = _read_csv(chooser_file)
     landuse = _read_csv(landuse_file, index_col="zone_id")
     master_size_spec = _read_csv(size_spec_file)
@@ -148,7 +176,41 @@ def _read_csv(filename, **kwargs):
 
     chooser_index_name = chooser_data.columns[0]
     x_co = chooser_data.set_index(chooser_index_name)
-    x_ca = cv_to_ca(alt_values.set_index([chooser_index_name, alt_values.columns[1]]))
+
+    def split(a, n):
+        k, m = divmod(len(a), n)
+        return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
+
+    x_ca_pickle_file = "{name}_x_ca.pkl"
+    if chunking_size == None:
+        x_ca = cv_to_ca(alt_values.set_index([chooser_index_name, alt_values.columns[1]]))
+    elif _file_exists(x_ca_pickle_file):
+        time_start = datetime.now()
+        x_ca = _read_pickle(x_ca_pickle_file)
+        print(
+            f"x_ca data loaded from {name}_x_ca.fea - time elapsed {(datetime.now() - time_start).total_seconds()}"
+        )
+    else:
+        time_start = datetime.now()
+        # calculate num_chunks based on chunking_size (or max number of rows per chunk)
+        num_chunks = int(len(alt_values) / chunking_size)
+        all_person_ids = list(alt_values["person_id"].unique())
+        split_ids = list(split(all_person_ids, num_chunks))
+        x_ca_list = []
+        i = 0
+        for chunk_ids in split_ids:
+            alt_values_i = alt_values[alt_values["person_id"].isin(chunk_ids)]
+            x_ca_i = cv_to_ca(
+                alt_values_i.set_index([chooser_index_name, alt_values_i.columns[1]]))
+            x_ca_list.append(x_ca_i)
+            print(
+                f"\rx_ca_i compute done for chunk {i}/{num_chunks} - time elapsed {(datetime.now() - time_start).total_seconds()}"
+            )
+            i = i + 1
+        x_ca = pd.concat(x_ca_list, axis=0)
+        _to_pickle(df=x_ca, filename=x_ca_pickle_file)
+        print(
+            f"x_ca compute done - time elapsed {(datetime.now() - time_start).total_seconds()}")
 
     if CHOOSER_SEGMENT_COLUMN_NAME is not None:
         # label segments with names

From 0e5d3bf56fefc47662955f54ec963cf7f8bcf615 Mon Sep 17 00:00:00 2001
From: Bo Wen <56280490+bwentl@users.noreply.github.com>
Date: Fri, 23 Jun 2023 09:57:57 -0700
Subject: [PATCH 2/4] add alt_values_to_feather option

and update comments
default behavior does not change, to enable use of feather
and the use of chunking, do this:

    # load from original
    model, data = component_model(modelname,
                                  return_data=True,
                                  alt_values_to_feather=True,
                                  chunking_size=chunking_size)
---
 activitysim/estimation/larch/location_choice.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/activitysim/estimation/larch/location_choice.py b/activitysim/estimation/larch/location_choice.py
index d287f334e..38a304543 100644
--- a/activitysim/estimation/larch/location_choice.py
+++ b/activitysim/estimation/larch/location_choice.py
@@ -46,6 +46,7 @@ def location_choice_model(
     settings_file="{name}_model_settings.yaml",
     landuse_file="{name}_landuse.csv",
     return_data=False,
+    alt_values_to_feather=False,
     chunking_size=None,
 ):
     model_selector = name.replace("_location", "")
@@ -85,12 +86,15 @@ def _file_exists(filename):
         index_col="coefficient_name",
     )
     spec = _read_csv(spec_file, comment="#")
+
+    # read alternative values either as csv or feather file
     alt_values_fea_file = alt_values_file.replace(".csv", ".fea")
     if os.path.exists(os.path.join(edb_directory, alt_values_fea_file.format(name=name))):
         alt_values = _read_feather(alt_values_fea_file)
     else:
         alt_values = _read_csv(alt_values_file)
-        _to_feather(df=alt_values, filename=alt_values_fea_file)
+        if alt_values_to_feather:
+            _to_feather(df=alt_values, filename=alt_values_fea_file)
     chooser_data = _read_csv(chooser_file)
     landuse = _read_csv(landuse_file, index_col="zone_id")
     master_size_spec = _read_csv(size_spec_file)
@@ -181,10 +185,12 @@ def split(a, n):
         k, m = divmod(len(a), n)
         return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
 
+    # process x_ca with cv_to_ca with or without chunking
     x_ca_pickle_file = "{name}_x_ca.pkl"
     if chunking_size == None:
         x_ca = cv_to_ca(alt_values.set_index([chooser_index_name, alt_values.columns[1]]))
     elif _file_exists(x_ca_pickle_file):
+        # if pickle file from previous x_ca processing exist, load it to save time
         time_start = datetime.now()
         x_ca = _read_pickle(x_ca_pickle_file)
         print(
@@ -208,6 +214,7 @@ def split(a, n):
             )
             i = i + 1
         x_ca = pd.concat(x_ca_list, axis=0)
+        # save final x_ca result as pickle file to save time for future data loading
         _to_pickle(df=x_ca, filename=x_ca_pickle_file)
         print(
             f"x_ca compute done - time elapsed {(datetime.now() - time_start).total_seconds()}")

From 425467e32bc91cb6b0db32fe546449130f38039d Mon Sep 17 00:00:00 2001
From: Bo Wen <56280490+bwentl@users.noreply.github.com>
Date: Fri, 23 Jun 2023 10:18:33 -0700
Subject: [PATCH 3/4] format code with black

---
 activitysim/estimation/larch/location_choice.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/activitysim/estimation/larch/location_choice.py b/activitysim/estimation/larch/location_choice.py
index 38a304543..a93002418 100644
--- a/activitysim/estimation/larch/location_choice.py
+++ b/activitysim/estimation/larch/location_choice.py
@@ -89,7 +89,9 @@ def _file_exists(filename):
 
     # read alternative values either as csv or feather file
     alt_values_fea_file = alt_values_file.replace(".csv", ".fea")
-    if os.path.exists(os.path.join(edb_directory, alt_values_fea_file.format(name=name))):
+    if os.path.exists(
+        os.path.join(edb_directory, alt_values_fea_file.format(name=name))
+    ):
         alt_values = _read_feather(alt_values_fea_file)
     else:
         alt_values = _read_csv(alt_values_file)
@@ -183,12 +185,14 @@ def _file_exists(filename):
 
     def split(a, n):
         k, m = divmod(len(a), n)
-        return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
+        return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))
 
     # process x_ca with cv_to_ca with or without chunking
     x_ca_pickle_file = "{name}_x_ca.pkl"
     if chunking_size == None:
-        x_ca = cv_to_ca(alt_values.set_index([chooser_index_name, alt_values.columns[1]]))
+        x_ca = cv_to_ca(
+            alt_values.set_index([chooser_index_name, alt_values.columns[1]])
+        )
     elif _file_exists(x_ca_pickle_file):
         # if pickle file from previous x_ca processing exist, load it to save time
         time_start = datetime.now()
@@ -207,7 +211,8 @@ def split(a, n):
         for chunk_ids in split_ids:
             alt_values_i = alt_values[alt_values["person_id"].isin(chunk_ids)]
             x_ca_i = cv_to_ca(
-                alt_values_i.set_index([chooser_index_name, alt_values_i.columns[1]]))
+                alt_values_i.set_index([chooser_index_name, alt_values_i.columns[1]])
+            )
             x_ca_list.append(x_ca_i)
             print(
                 f"\rx_ca_i compute done for chunk {i}/{num_chunks} - time elapsed {(datetime.now() - time_start).total_seconds()}"
@@ -217,7 +222,8 @@ def split(a, n):
         # save final x_ca result as pickle file to save time for future data loading
         _to_pickle(df=x_ca, filename=x_ca_pickle_file)
         print(
-            f"x_ca compute done - time elapsed {(datetime.now() - time_start).total_seconds()}")
+            f"x_ca compute done - time elapsed {(datetime.now() - time_start).total_seconds()}"
+        )
 
     if CHOOSER_SEGMENT_COLUMN_NAME is not None:
         # label segments with names

From 688539e624afd0f57080f1cfd0a524313197b19a Mon Sep 17 00:00:00 2001
From: Bo Wen <56280490+bwentl@users.noreply.github.com>
Date: Tue, 6 Feb 2024 11:45:07 -0800
Subject: [PATCH 4/4] format with black

---
 activitysim/abm/models/joint_tour_participation.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/activitysim/abm/models/joint_tour_participation.py b/activitysim/abm/models/joint_tour_participation.py
index 2c5e5fb7b..73825d162 100644
--- a/activitysim/abm/models/joint_tour_participation.py
+++ b/activitysim/abm/models/joint_tour_participation.py
@@ -432,7 +432,8 @@ def joint_tour_participation(
         # its value depends on whether the candidate's 'participant_id' is in the joint_tour_participant index
         survey_participants_df = estimator.get_survey_table("joint_tour_participants")
         participate = pd.Series(
-            choices.index.isin(survey_participants_df.participant_id), index=choices.index
+            choices.index.isin(survey_participants_df.participant_id),
+            index=choices.index,
         )
 
         # but estimation software wants to know the choices value (alternative index)