 import os
 from pathlib import Path
 from typing import Collection
+import pickle
+from datetime import datetime
 
 import numpy as np
 import pandas as pd
@@ -46,6 +48,8 @@ def location_choice_model(
     settings_file="{name}_model_settings.yaml",
     landuse_file="{name}_landuse.csv",
     return_data=False,
+    alt_values_to_feather=False,
+    chunking_size=None,
 ):
     model_selector = name.replace("_location", "")
     model_selector = model_selector.replace("_destination", "")
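For orientation, here is a sketch of how the two new keyword arguments might be passed; the model name, the `edb_directory` value, and the return convention are illustrative assumptions, not taken from this diff:

```python
# Hypothetical call; "workplace_location" and the edb_directory path are
# placeholders, and return_data=True is assumed to also return the loaded data.
result = location_choice_model(
    name="workplace_location",
    edb_directory="output/estimation_data_bundle/{name}/",
    return_data=True,
    alt_values_to_feather=True,  # cache alt values as feather after the first csv read
    chunking_size=100_000,       # roughly this many alt_values rows per cv_to_ca chunk
)
```

Both arguments default to the previous behavior, so existing callers are unaffected.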
@@ -59,12 +63,42 @@ def _read_csv(filename, **kwargs):
         filename = filename.format(name=name)
         return pd.read_csv(os.path.join(edb_directory, filename), **kwargs)
 
+    # helpers mirroring _read_csv, for feather and pickle files in the EDB directory
+    def _read_feather(filename, **kwargs):
+        filename = filename.format(name=name)
+        return pd.read_feather(os.path.join(edb_directory, filename), **kwargs)
+
+    def _to_feather(df, filename, **kwargs):
+        filename = filename.format(name=name)
+        return df.to_feather(os.path.join(edb_directory, filename), **kwargs)
+
+    def _read_pickle(filename, **kwargs):
+        filename = filename.format(name=name)
+        return pd.read_pickle(os.path.join(edb_directory, filename), **kwargs)
+
+    def _to_pickle(df, filename, **kwargs):
+        filename = filename.format(name=name)
+        return df.to_pickle(os.path.join(edb_directory, filename), **kwargs)
+
+    def _file_exists(filename):
+        filename = filename.format(name=name)
+        return os.path.exists(os.path.join(edb_directory, filename))
+
     coefficients = _read_csv(
         coefficients_file,
         index_col="coefficient_name",
     )
     spec = _read_csv(spec_file, comment="#")
-    alt_values = _read_csv(alt_values_file)
+
+    # read alternative values from the feather cache if present, else from csv
+    alt_values_fea_file = alt_values_file.replace(".csv", ".fea")
+    if _file_exists(alt_values_fea_file):
+        alt_values = _read_feather(alt_values_fea_file)
+    else:
+        alt_values = _read_csv(alt_values_file)
+        if alt_values_to_feather:
+            # write the cache so subsequent runs skip the slow csv parse
+            _to_feather(df=alt_values, filename=alt_values_fea_file)
     chooser_data = _read_csv(chooser_file)
     landuse = _read_csv(landuse_file, index_col="zone_id")
     master_size_spec = _read_csv(size_spec_file)
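The block above is a simple read-through cache: the `.fea` file is preferred when it exists, and is optionally written after the first CSV parse so later runs load the binary format instead. A standalone sketch of the same idiom with a hypothetical path (note that `read_feather`/`to_feather` require the `pyarrow` package, and `to_feather` only accepts a default RangeIndex):

```python
import os
import pandas as pd

def load_with_feather_cache(csv_path: str) -> pd.DataFrame:
    # prefer the binary cache when present; otherwise parse the csv and create it
    fea_path = csv_path.replace(".csv", ".fea")
    if os.path.exists(fea_path):
        return pd.read_feather(fea_path)  # fast columnar load on repeat runs
    df = pd.read_csv(csv_path)            # slow text parse on the first run
    df.to_feather(fea_path)               # needs pyarrow and a default index
    return df
```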
@@ -152,7 +186,48 @@ def _read_csv(filename, **kwargs):
 
     chooser_index_name = chooser_data.columns[0]
     x_co = chooser_data.set_index(chooser_index_name)
-    x_ca = cv_to_ca(alt_values.set_index([chooser_index_name, alt_values.columns[1]]))
+
190
+ def split (a , n ):
191
+ k , m = divmod (len (a ), n )
192
+ return (a [i * k + min (i , m ) : (i + 1 ) * k + min (i + 1 , m )] for i in range (n ))
193
+
194
+ # process x_ca with cv_to_ca with or without chunking
195
+ x_ca_pickle_file = "{name}_x_ca.pkl"
196
+ if chunking_size == None :
197
+ x_ca = cv_to_ca (
198
+ alt_values .set_index ([chooser_index_name , alt_values .columns [1 ]])
199
+ )
200
+ elif _file_exists (x_ca_pickle_file ):
201
+ # if pickle file from previous x_ca processing exist, load it to save time
202
+ time_start = datetime .now ()
203
+ x_ca = _read_pickle (x_ca_pickle_file )
204
+ print (
205
+ f"x_ca data loaded from { name } _x_ca.fea - time elapsed { (datetime .now () - time_start ).total_seconds ()} "
206
+ )
207
+ else :
208
+ time_start = datetime .now ()
209
+ # calculate num_chunks based on chunking_size (or max number of rows per chunk)
210
+ num_chunks = int (len (alt_values ) / chunking_size )
211
+ all_person_ids = list (alt_values ["person_id" ].unique ())
212
+ split_ids = list (split (all_person_ids , num_chunks ))
213
+ x_ca_list = []
214
+ i = 0
215
+ for chunk_ids in split_ids :
216
+ alt_values_i = alt_values [alt_values ["person_id" ].isin (chunk_ids )]
217
+ x_ca_i = cv_to_ca (
218
+ alt_values_i .set_index ([chooser_index_name , alt_values_i .columns [1 ]])
219
+ )
220
+ x_ca_list .append (x_ca_i )
221
+ print (
222
+ f"\r x_ca_i compute done for chunk { i } /{ num_chunks } - time elapsed { (datetime .now () - time_start ).total_seconds ()} "
223
+ )
224
+ i = i + 1
225
+ x_ca = pd .concat (x_ca_list , axis = 0 )
226
+ # save final x_ca result as pickle file to save time for future data loading
227
+ _to_pickle (df = x_ca , filename = x_ca_pickle_file )
228
+ print (
229
+ f"x_ca compute done - time elapsed { (datetime .now () - time_start ).total_seconds ()} "
230
+ )
 
     if CHOOSER_SEGMENT_COLUMN_NAME is not None:
         # label segments with names
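The nested `split` helper partitions a list into `n` slices whose lengths differ by at most one, so each chunk receives a nearly equal share of chooser ids. A quick standalone demonstration (the values are illustrative):

```python
def split(a, n):
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))

print(list(split(list(range(10)), 3)))
# [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]] -- the first len(a) % n slices get one extra item
```

One caveat worth noting: the `{name}_x_ca.pkl` cache is never invalidated, so it must be deleted by hand whenever the alternative values change.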