Commit 6a20954

Merge pull request #687 from TransLinkForecasting/location_estimation_patch
Add options to handle larger datasets for location models
2 parents 62c3523 + 17a85ec
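This merge adds two opt-in keyword arguments to `location_choice_model`: `alt_values_to_feather`, which caches the (often very large) alternatives-values CSV as a feather file so later runs can skip CSV parsing, and `chunking_size`, which caps how many `alt_values` rows are run through `cv_to_ca` at a time. A minimal usage sketch, assuming the function is importable from the module changed below and returns `(model, data)` when `return_data=True`; the `name` and `edb_directory` values here are illustrative, not part of this commit:

```python
from activitysim.estimation.larch.location_choice import location_choice_model

# Hypothetical example values; only alt_values_to_feather and chunking_size
# are new in this patch.
model, data = location_choice_model(
    name="workplace_location",  # assumed model name
    edb_directory="output/estimation_data_bundle/{name}/",  # assumed EDB path
    return_data=True,
    alt_values_to_feather=True,  # cache the alternatives CSV as .fea for reuse
    chunking_size=500_000,       # max alt_values rows per cv_to_ca chunk
)
```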

File tree

1 file changed: +77 −2 lines changed

activitysim/estimation/larch/location_choice.py

Lines changed: 77 additions & 2 deletions
@@ -3,6 +3,8 @@
 import os
 from pathlib import Path
 from typing import Collection
+import pickle
+from datetime import datetime
 
 import numpy as np
 import pandas as pd
@@ -46,6 +48,8 @@ def location_choice_model(
     settings_file="{name}_model_settings.yaml",
     landuse_file="{name}_landuse.csv",
     return_data=False,
+    alt_values_to_feather=False,
+    chunking_size=None,
 ):
     model_selector = name.replace("_location", "")
     model_selector = model_selector.replace("_destination", "")
@@ -59,12 +63,42 @@ def _read_csv(filename, **kwargs):
         filename = filename.format(name=name)
         return pd.read_csv(os.path.join(edb_directory, filename), **kwargs)
 
+    def _read_feather(filename, **kwargs):
+        filename = filename.format(name=name)
+        return pd.read_feather(os.path.join(edb_directory, filename), **kwargs)
+
+    def _to_feather(df, filename, **kwargs):
+        filename = filename.format(name=name)
+        return df.to_feather(os.path.join(edb_directory, filename), **kwargs)
+
+    def _read_pickle(filename, **kwargs):
+        filename = filename.format(name=name)
+        return pd.read_pickle(os.path.join(edb_directory, filename))
+
+    def _to_pickle(df, filename, **kwargs):
+        filename = filename.format(name=name)
+        return df.to_pickle(os.path.join(edb_directory, filename))
+
+    def _file_exists(filename):
+        filename = filename.format(name=name)
+        return os.path.exists(os.path.join(edb_directory, filename))
+
     coefficients = _read_csv(
         coefficients_file,
         index_col="coefficient_name",
     )
     spec = _read_csv(spec_file, comment="#")
-    alt_values = _read_csv(alt_values_file)
+
+    # read alternative values either as csv or feather file
+    alt_values_fea_file = alt_values_file.replace(".csv", ".fea")
+    if os.path.exists(
+        os.path.join(edb_directory, alt_values_fea_file.format(name=name))
+    ):
+        alt_values = _read_feather(alt_values_fea_file)
+    else:
+        alt_values = _read_csv(alt_values_file)
+        if alt_values_to_feather:
+            _to_feather(df=alt_values, filename=alt_values_fea_file)
     chooser_data = _read_csv(chooser_file)
     landuse = _read_csv(landuse_file, index_col="zone_id")
     master_size_spec = _read_csv(size_spec_file)
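The new read path above goes through a feather cache: if a `.fea` copy of the alternatives file exists in the EDB directory it is preferred; otherwise the CSV is read and, when `alt_values_to_feather=True`, written back out as feather for subsequent runs. The same pattern in isolation, as a minimal sketch (the helper name and paths are hypothetical; `read_feather`/`to_feather` require `pyarrow`):

```python
import os
import pandas as pd

def load_alt_values(csv_path: str, cache: bool = True) -> pd.DataFrame:
    # Prefer the binary feather copy when present; it avoids re-parsing
    # a multi-gigabyte CSV on every estimation run.
    fea_path = csv_path.replace(".csv", ".fea")
    if os.path.exists(fea_path):
        return pd.read_feather(fea_path)
    df = pd.read_csv(csv_path)
    if cache:
        # feather needs a default RangeIndex, which a fresh read_csv has
        df.to_feather(fea_path)
    return df
```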
@@ -152,7 +186,48 @@ def _read_csv(filename, **kwargs):
 
     chooser_index_name = chooser_data.columns[0]
     x_co = chooser_data.set_index(chooser_index_name)
-    x_ca = cv_to_ca(alt_values.set_index([chooser_index_name, alt_values.columns[1]]))
+
+    def split(a, n):
+        k, m = divmod(len(a), n)
+        return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))
+
+    # process x_ca with cv_to_ca, with or without chunking
+    x_ca_pickle_file = "{name}_x_ca.pkl"
+    if chunking_size is None:
+        x_ca = cv_to_ca(
+            alt_values.set_index([chooser_index_name, alt_values.columns[1]])
+        )
+    elif _file_exists(x_ca_pickle_file):
+        # if a pickle file from previous x_ca processing exists, load it to save time
+        time_start = datetime.now()
+        x_ca = _read_pickle(x_ca_pickle_file)
+        print(
+            f"x_ca data loaded from {name}_x_ca.pkl - time elapsed {(datetime.now() - time_start).total_seconds()}"
+        )
+    else:
+        time_start = datetime.now()
+        # calculate num_chunks based on chunking_size (max number of rows per chunk)
+        num_chunks = int(len(alt_values) / chunking_size)
+        all_person_ids = list(alt_values["person_id"].unique())
+        split_ids = list(split(all_person_ids, num_chunks))
+        x_ca_list = []
+        i = 0
+        for chunk_ids in split_ids:
+            alt_values_i = alt_values[alt_values["person_id"].isin(chunk_ids)]
+            x_ca_i = cv_to_ca(
+                alt_values_i.set_index([chooser_index_name, alt_values_i.columns[1]])
+            )
+            x_ca_list.append(x_ca_i)
+            print(
+                f"\rx_ca_i compute done for chunk {i}/{num_chunks} - time elapsed {(datetime.now() - time_start).total_seconds()}"
+            )
+            i = i + 1
+        x_ca = pd.concat(x_ca_list, axis=0)
+        # save final x_ca result as a pickle file to speed up future data loading
+        _to_pickle(df=x_ca, filename=x_ca_pickle_file)
+        print(
+            f"x_ca compute done - time elapsed {(datetime.now() - time_start).total_seconds()}"
+        )
 
     if CHOOSER_SEGMENT_COLUMN_NAME is not None:
         # label segments with names
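In the chunked branch above, rows are partitioned by `person_id` rather than by raw row position, so each chooser's full set of alternatives lands in a single chunk; `cv_to_ca` pivots on the (chooser, alternative) index, and splitting a person across chunks would risk producing incomplete cases after the final concat. The `split` helper cuts a list into `n` contiguous, near-equal pieces; a quick standalone check with a throwaway ten-element list:

```python
def split(a, n):
    # copy of the helper from this patch: n contiguous, near-equal slices
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))

person_ids = list(range(10))  # stand-in for alt_values["person_id"].unique()
print([list(chunk) for chunk in split(person_ids, 3)])
# -> [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]
```

Note that the finished `x_ca` is cached as `{name}_x_ca.pkl` and reloaded on later runs whenever `chunking_size` is set, so a stale pickle must be deleted by hand if the underlying EDB data changes.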
