Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data Type Optimization #782

Merged
merged 50 commits into from
Feb 13, 2024
Merged
Show file tree
Hide file tree
Changes from 48 commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
84a2a0c
trying pandas categorical dtype
i-am-sijia Jul 12, 2023
228cc28
sweeping out string variables
i-am-sijia Jul 17, 2023
c128447
telecommute freq string
i-am-sijia Jul 25, 2023
93c427f
vehicle type strings
i-am-sijia Jul 25, 2023
e91a115
categorical is not recognized by numpy
i-am-sijia Jul 25, 2023
0de0043
df info verbose
i-am-sijia Jul 25, 2023
224474c
categoricals for school escorting
i-am-sijia Aug 2, 2023
575ef45
categoricals for vehicle allocation
i-am-sijia Aug 2, 2023
ef6fbbf
add condition for pandas categoricals
i-am-sijia Aug 2, 2023
9a582da
adhoc string variables to categoricals
i-am-sijia Aug 2, 2023
fabd596
auto downcast for interim variables
i-am-sijia Aug 22, 2023
4453877
not use unsigned int
i-am-sijia Aug 22, 2023
6f64dd2
numeric overflow on chaperone weights
i-am-sijia Aug 22, 2023
e420633
numeric overflow in veh type preprocessor
i-am-sijia Aug 22, 2023
5c7ee34
downcast tour variables in source code
i-am-sijia Aug 22, 2023
282885d
condition the temps
i-am-sijia Oct 6, 2023
e06bd7d
sort time period cat in tour scheduling
i-am-sijia Oct 6, 2023
9018aa4
add loggings
i-am-sijia Oct 6, 2023
7cffe4f
formatting
i-am-sijia Oct 6, 2023
3fd3ed5
update dependencies
i-am-sijia Oct 6, 2023
be8c087
update unit tests under util\test
i-am-sijia Dec 7, 2023
0a34951
update unit tests under abm.test
i-am-sijia Dec 7, 2023
37e7ab5
Merge branch 'develop' into data-type-op-pd-cat
i-am-sijia Dec 7, 2023
cbe6310
make int downcast an option
i-am-sijia Dec 7, 2023
e4e73d3
test moving str comp out of utility evaluation
i-am-sijia Dec 11, 2023
51f9a24
fix mtc and sandag ci tests
i-am-sijia Dec 11, 2023
e1e5d41
get categories from alternatives
i-am-sijia Dec 12, 2023
ed2ae96
black formatting
i-am-sijia Dec 12, 2023
e02b0b4
add `downcast_int` and `downcast_float` into settings
i-am-sijia Dec 16, 2023
3aefc59
fix primary purpose in school escorting
i-am-sijia Dec 16, 2023
21fdb9f
sort mode categories
i-am-sijia Dec 27, 2023
379c4f4
fix estimation tests
i-am-sijia Dec 27, 2023
78db977
fix typo
i-am-sijia Jan 2, 2024
08f3be8
remove code in comments
i-am-sijia Jan 31, 2024
cfe034a
add check column type in the tests in util/test
i-am-sijia Jan 31, 2024
83d9e67
remove unwanted tracing
i-am-sijia Jan 31, 2024
3c60efc
blacken
i-am-sijia Jan 31, 2024
a9dd383
move comment
i-am-sijia Feb 1, 2024
ccc195c
relax progressive checks for categorical dtypes
jpn-- Feb 2, 2024
f81211a
update pointers
jpn-- Feb 2, 2024
54afbb0
use empty string as NaN
jpn-- Feb 6, 2024
f09ac76
Merge commit '25e4f6c9055cccaeee2145b2924bdc7d00459abc' into data-typ…
jpn-- Feb 6, 2024
ae126f1
Merge pull request #8 from camsys/data-type-op-pd-cat
i-am-sijia Feb 6, 2024
277bbb8
update mtc_ext progressive test for categorical outputs
jpn-- Feb 6, 2024
74d4f96
Merge commit 'ae126f1f85b7ac858f921a762e49e9da534a9efb' into data-typ…
jpn-- Feb 6, 2024
9cd9bbe
Merge commit '69465d9534e2ca889ad5a5acaed1eadc9a330a6f' into data-typ…
jpn-- Feb 6, 2024
42d4e8c
Merge pull request #9 from camsys/data-type-dev-merge
i-am-sijia Feb 6, 2024
a3cb622
Merge pull request #10 from wsp-sag/develop
i-am-sijia Feb 7, 2024
1f40c06
Merge branch 'develop' into data-type-op-pd-cat
jpn-- Feb 9, 2024
db03dae
Merge pull request #11 from camsys/data-type-op-pd-cat
i-am-sijia Feb 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions activitysim/abm/models/atwork_subtour_destination.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,21 @@ def atwork_subtour_destination(
estimator.end_estimation()

subtours[destination_column_name] = choices_df["choice"]
assign_in_place(tours, subtours[[destination_column_name]])
assign_in_place(
tours,
subtours[[destination_column_name]],
state.settings.downcast_int,
state.settings.downcast_float,
)

if want_logsums:
subtours[logsum_column_name] = choices_df["logsum"]
assign_in_place(tours, subtours[[logsum_column_name]])
assign_in_place(
tours,
subtours[[logsum_column_name]],
state.settings.downcast_int,
state.settings.downcast_float,
)

state.add_table("tours", tours)

Expand Down
21 changes: 20 additions & 1 deletion activitysim/abm/models/atwork_subtour_frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,15 @@

def add_null_results(state, trace_label, tours):
    """
    Add an empty ``atwork_subtour_frequency`` column when the model is skipped.

    Every row of ``tours`` receives the empty string, stored as a pandas
    categorical whose only category is ``""``.  The table is then handed to
    ``state.add_table`` under the name ``"tours"``.

    Parameters
    ----------
    state
        Object exposing ``add_table(name, df)``.
    trace_label : str
        Label interpolated into the "Skipping ..." log message.
    tours : pandas.DataFrame
        Tours table; the new column is added in place.
    """
    logger.info("Skipping %s: add_null_results", trace_label)

    # The empty string is the sole category: it stands for "no choice made".
    cat_type = pd.api.types.CategoricalDtype([""], ordered=False)

    # There are no choices in this code path, so nothing but the column itself
    # needs converting.  (An earlier `choices = choices.astype(cat_type)` here
    # referenced a name that does not exist in this function and raised
    # UnboundLocalError before the column could be added.)
    tours["atwork_subtour_frequency"] = ""
    tours["atwork_subtour_frequency"] = tours["atwork_subtour_frequency"].astype(
        cat_type
    )
    state.add_table("tours", tours)


Expand Down Expand Up @@ -117,6 +125,11 @@ def atwork_subtour_frequency(

# convert indexes to alternative names
choices = pd.Series(model_spec.columns[choices.values], index=choices.index)
cat_type = pd.api.types.CategoricalDtype(
alternatives.index.tolist() + [""],
ordered=False,
)
choices = choices.astype(cat_type)

if estimator:
estimator.write_choices(choices)
Expand All @@ -137,6 +150,12 @@ def atwork_subtour_frequency(

subtours = process_atwork_subtours(state, work_tours, alternatives)

# convert purpose to pandas categoricals
purpose_type = pd.api.types.CategoricalDtype(
alternatives.columns.tolist() + ["atwork"], ordered=False
)
subtours["tour_type"] = subtours["tour_type"].astype(purpose_type)

tours = state.extend_table("tours", subtours)

state.tracing.register_traceable_table("tours", subtours)
Expand Down
4 changes: 3 additions & 1 deletion activitysim/abm/models/atwork_subtour_mode_choice.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,9 @@ def atwork_subtour_mode_choice(
"%s choices" % trace_label, choices_df[mode_column_name], value_counts=True
)

assign_in_place(tours, choices_df)
assign_in_place(
tours, choices_df, state.settings.downcast_int, state.settings.downcast_float
)
state.add_table("tours", tours)

# - annotate tours table
Expand Down
4 changes: 3 additions & 1 deletion activitysim/abm/models/atwork_subtour_scheduling.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,9 @@ def atwork_subtour_scheduling(
choices.to_frame("tdd"), tdd_alts, left_on=["tdd"], right_index=True, how="left"
)

assign_in_place(tours, tdd_choices)
assign_in_place(
tours, tdd_choices, state.settings.downcast_int, state.settings.downcast_float
)
state.add_table("tours", tours)

if trace_hh_id:
Expand Down
2 changes: 2 additions & 0 deletions activitysim/abm/models/cdap.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,8 @@ def cdap_simulate(
estimator.end_estimation()

choices = choices.reindex(persons.index)
cap_cat_type = pd.api.types.CategoricalDtype(["", "M", "N", "H"], ordered=False)
choices = choices.astype(cap_cat_type)
persons["cdap_activity"] = choices

expressions.assign_columns(
Expand Down
10 changes: 9 additions & 1 deletion activitysim/abm/models/joint_tour_composition.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
def add_null_results(state, trace_label, tours):
    """
    Add an empty ``composition`` column when the model is skipped.

    Every row of ``tours`` receives the empty string, stored as a pandas
    categorical with categories ``""``, ``"adults"``, ``"children"`` and
    ``"mixed"``.  The table is then handed to ``state.add_table`` under the
    name ``"tours"``.

    Parameters
    ----------
    state
        Object exposing ``add_table(name, df)``.
    trace_label : str
        Label interpolated into the "Skipping ..." log message.
    tours : pandas.DataFrame
        Tours table; the new column is added in place.
    """
    # Pass the label as a logger argument so formatting is deferred until the
    # record is actually emitted.
    logger.info("Skipping %s: add_null_results", trace_label)

    cat_type = pd.api.types.CategoricalDtype(
        ["", "adults", "children", "mixed"], ordered=False
    )
    tours["composition"] = ""
    tours["composition"] = tours["composition"].astype(cat_type)
    state.add_table("tours", tours)


Expand Down Expand Up @@ -123,6 +127,10 @@ def joint_tour_composition(

# convert indexes to alternative names
choices = pd.Series(model_spec.columns[choices.values], index=choices.index)
cat_type = pd.api.types.CategoricalDtype(
model_spec.columns.tolist() + [""], ordered=False
)
choices = choices.astype(cat_type)

if estimator:
estimator.write_choices(choices)
Expand All @@ -134,7 +142,7 @@ def joint_tour_composition(
joint_tours["composition"] = choices

# reindex since we ran model on a subset of households
tours["composition"] = choices.reindex(tours.index).fillna("").astype(str)
tours["composition"] = choices.reindex(tours.index).fillna("")
state.add_table("tours", tours)

tracing.print_summary(
Expand Down
14 changes: 12 additions & 2 deletions activitysim/abm/models/joint_tour_destination.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,22 @@ def joint_tour_destination(

# add column as we want joint_tours table for tracing.
joint_tours["destination"] = choices_df.choice
assign_in_place(tours, joint_tours[["destination"]])
assign_in_place(
tours,
joint_tours[["destination"]],
state.settings.downcast_int,
state.settings.downcast_float,
)
state.add_table("tours", tours)

if want_logsums:
joint_tours[logsum_column_name] = choices_df["logsum"]
assign_in_place(tours, joint_tours[[logsum_column_name]])
assign_in_place(
tours,
joint_tours[[logsum_column_name]],
state.settings.downcast_int,
state.settings.downcast_float,
)

tracing.print_summary("destination", joint_tours.destination, describe=True)

Expand Down
15 changes: 13 additions & 2 deletions activitysim/abm/models/joint_tour_frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,11 @@ def joint_tour_frequency(

# convert indexes to alternative names
choices = pd.Series(model_spec.columns[choices.values], index=choices.index)
cat_type = pd.api.types.CategoricalDtype(
model_spec.columns.tolist(),
ordered=False,
)
choices = choices.astype(cat_type)

if estimator:
estimator.write_choices(choices)
Expand All @@ -138,6 +143,12 @@ def joint_tour_frequency(

joint_tours = process_joint_tours(state, choices, alternatives, temp_point_persons)

# convert purpose to pandas categoricals
purpose_type = pd.api.types.CategoricalDtype(
alternatives.columns.tolist(), ordered=False
)
joint_tours["tour_type"] = joint_tours["tour_type"].astype(purpose_type)

tours = state.extend_table("tours", joint_tours)

state.tracing.register_traceable_table("tours", joint_tours)
Expand All @@ -147,8 +158,8 @@ def joint_tour_frequency(

# we expect there to be an alt with no tours - which we can use to backfill non-travelers
no_tours_alt = (alternatives.sum(axis=1) == 0).index[0]
households["joint_tour_frequency"] = (
choices.reindex(households.index).fillna(no_tours_alt).astype(str)
households["joint_tour_frequency"] = choices.reindex(households.index).fillna(
no_tours_alt
)

households["num_hh_joint_tours"] = (
Expand Down
9 changes: 7 additions & 2 deletions activitysim/abm/models/joint_tour_participation.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def get_tour_satisfaction(candidates, participate):

x = (
candidates[cols]
.groupby(["tour_id", "composition"])
.groupby(["tour_id", "composition"], observed=True)
.agg(
participants=("adult", "size"),
adults=("adult", "sum"),
Expand Down Expand Up @@ -475,7 +475,12 @@ def joint_tour_participation(
# update number_of_participants which was initialized to 1
joint_tours["number_of_participants"] = participants.groupby("tour_id").size()

assign_in_place(tours, joint_tours[["person_id", "number_of_participants"]])
assign_in_place(
tours,
joint_tours[["person_id", "number_of_participants"]],
state.settings.downcast_int,
state.settings.downcast_float,
)

state.add_table("tours", tours)

Expand Down
4 changes: 3 additions & 1 deletion activitysim/abm/models/joint_tour_scheduling.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,9 @@ def joint_tour_scheduling(
choices.to_frame("tdd"), tdd_alts, left_on=["tdd"], right_index=True, how="left"
)

assign_in_place(tours, choices)
assign_in_place(
tours, choices, state.settings.downcast_int, state.settings.downcast_float
)
state.add_table("tours", tours)

# updated df for tracing
Expand Down
4 changes: 3 additions & 1 deletion activitysim/abm/models/mandatory_scheduling.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ def mandatory_tour_scheduling(
tour_segment_col,
)

assign_in_place(tours, choices)
assign_in_place(
tours, choices, state.settings.downcast_int, state.settings.downcast_float
)
state.add_table("tours", tours)

# updated df for tracing
Expand Down
20 changes: 16 additions & 4 deletions activitysim/abm/models/mandatory_tour_frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,11 @@ def add_null_results(state, trace_label, mandatory_tour_frequency_settings):
logger.info("Skipping %s: add_null_results", trace_label)

persons = state.get_dataframe("persons")
persons["mandatory_tour_frequency"] = ""
# Fill the column with the empty-string category for every person.
# `pd.Categorical` is the pandas class (there is no lowercase `pd.categorical`),
# and it needs list-like values, so one "" is supplied per row of `persons`.
persons["mandatory_tour_frequency"] = pd.Categorical(
    [""] * len(persons),
    categories=["", "work1", "work2", "school1", "school2", "work_and_school"],
    ordered=False,
)

tours = pd.DataFrame()
tours["tour_category"] = None
Expand Down Expand Up @@ -134,6 +138,10 @@ def mandatory_tour_frequency(

# convert indexes to alternative names
choices = pd.Series(model_spec.columns[choices.values], index=choices.index)
cat_type = pd.api.types.CategoricalDtype(
model_spec.columns.tolist() + [""], ordered=False
)
choices = choices.astype(cat_type)

if estimator:
estimator.write_choices(choices)
Expand All @@ -158,6 +166,12 @@ def mandatory_tour_frequency(
state, persons=choosers, mandatory_tour_frequency_alts=alternatives
)

# convert purpose to pandas categoricals
purpose_type = pd.api.types.CategoricalDtype(
alternatives.columns.tolist() + ["univ", "home", "escort"], ordered=False
)
mandatory_tours["tour_type"] = mandatory_tours["tour_type"].astype(purpose_type)

tours = state.extend_table("tours", mandatory_tours)
state.tracing.register_traceable_table("tours", mandatory_tours)
state.get_rn_generator().add_channel("tours", mandatory_tours)
Expand All @@ -166,9 +180,7 @@ def mandatory_tour_frequency(
persons = state.get_dataframe("persons")

# need to reindex as we only handled persons with cdap_activity == 'M'
persons["mandatory_tour_frequency"] = (
choices.reindex(persons.index).fillna("").astype(str)
)
persons["mandatory_tour_frequency"] = choices.reindex(persons.index).fillna("")

expressions.assign_columns(
state,
Expand Down
14 changes: 12 additions & 2 deletions activitysim/abm/models/non_mandatory_destination.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,11 +107,21 @@ def non_mandatory_tour_destination(
[pure_school_escort_tours, non_mandatory_tours]
).set_index(nm_tour_index)

assign_in_place(tours, non_mandatory_tours[["destination"]])
assign_in_place(
tours,
non_mandatory_tours[["destination"]],
state.settings.downcast_int,
state.settings.downcast_float,
)

if want_logsums:
non_mandatory_tours[logsum_column_name] = choices_df["logsum"]
assign_in_place(tours, non_mandatory_tours[[logsum_column_name]])
assign_in_place(
tours,
non_mandatory_tours[[logsum_column_name]],
state.settings.downcast_int,
state.settings.downcast_float,
)

assert all(
~tours["destination"].isna()
Expand Down
4 changes: 3 additions & 1 deletion activitysim/abm/models/non_mandatory_scheduling.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ def non_mandatory_tour_scheduling(
tour_segment_col,
)

assign_in_place(tours, choices)
assign_in_place(
tours, choices, state.settings.downcast_int, state.settings.downcast_float
)
state.add_table("tours", tours)

# updated df for tracing
Expand Down
8 changes: 8 additions & 0 deletions activitysim/abm/models/non_mandatory_tour_frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,14 @@ def non_mandatory_tour_frequency(
)
assert len(non_mandatory_tours) == extended_tour_counts.sum().sum()

# convert purpose to pandas categoricals
purpose_type = pd.api.types.CategoricalDtype(
alternatives.columns.tolist(), ordered=False
)
non_mandatory_tours["tour_type"] = non_mandatory_tours["tour_type"].astype(
purpose_type
)

if estimator:
# make sure they created the right tours
survey_tours = estimation.manager.get_survey_table("tours").sort_index()
Expand Down
14 changes: 12 additions & 2 deletions activitysim/abm/models/parking_location_choice.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,12 @@ def run_parking_destination(
if fail_some_trips_for_testing:
parking_df = parking_df.drop(parking_df.index[0])

assign_in_place(trips, parking_df.to_frame(parking_location_column_name))
assign_in_place(
trips,
parking_df.to_frame(parking_location_column_name),
state.settings.downcast_int,
state.settings.downcast_float,
)
trips[parking_location_column_name] = trips[
parking_location_column_name
].fillna(-1)
Expand Down Expand Up @@ -398,7 +403,12 @@ def parking_location(
trace_label=trace_label,
)

assign_in_place(trips_df, parking_locations.to_frame(alt_destination_col_name))
assign_in_place(
trips_df,
parking_locations.to_frame(alt_destination_col_name),
state.settings.downcast_int,
state.settings.downcast_float,
)

state.add_table("trips", trips_df)

Expand Down
Loading
Loading