Skip to content

Commit

Permalink
categorical time period dtype
Browse files: browse the repository at this point in the history
  • Loading branch information
jpn-- committed Sep 26, 2023
1 parent 8c98fbc commit 128a5fc
Show file tree
Hide file tree
Showing 7 changed files with 90 additions and 34 deletions.
2 changes: 1 addition & 1 deletion activitysim/abm/models/parking_location_choice.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ def parking_location(
if "trip_period" not in trips_merged_df:
# TODO: resolve this to the skim time period index rather than the label; it will be faster
trips_merged_df["trip_period"] = network_los.skim_time_period_label(
trips_merged_df[proposed_trip_departure_period]
trips_merged_df[proposed_trip_departure_period], as_cat=True
)
model_settings["TRIP_DEPARTURE_PERIOD"] = "trip_period"

Expand Down
2 changes: 1 addition & 1 deletion activitysim/abm/models/trip_mode_choice.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def trip_mode_choice(
# setup skim keys
assert "trip_period" not in trips_merged
trips_merged["trip_period"] = network_los.skim_time_period_label(
trips_merged.depart
trips_merged.depart, as_cat=True
)

orig_col = "origin"
Expand Down
16 changes: 10 additions & 6 deletions activitysim/abm/models/util/logsums.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,10 @@ def compute_logsums(
# FIXME - are we ok with altering choosers (so caller doesn't have to set these)?
if (in_period_col is not None) and (out_period_col is not None):
choosers["in_period"] = network_los.skim_time_period_label(
choosers[in_period_col]
choosers[in_period_col], as_cat=True
)
choosers["out_period"] = network_los.skim_time_period_label(
choosers[out_period_col]
choosers[out_period_col], as_cat=True
)
elif ("in_period" not in choosers.columns) and (
"out_period" not in choosers.columns
Expand All @@ -92,17 +92,21 @@ def compute_logsums(
and tour_purpose in model_settings["OUT_PERIOD"]
):
choosers["in_period"] = network_los.skim_time_period_label(
model_settings["IN_PERIOD"][tour_purpose]
model_settings["IN_PERIOD"][tour_purpose],
as_cat=True,
broadcast_to=choosers.index,
)
choosers["out_period"] = network_los.skim_time_period_label(
model_settings["OUT_PERIOD"][tour_purpose]
model_settings["OUT_PERIOD"][tour_purpose],
as_cat=True,
broadcast_to=choosers.index,
)
else:
choosers["in_period"] = network_los.skim_time_period_label(
model_settings["IN_PERIOD"]
model_settings["IN_PERIOD"], as_cat=True, broadcast_to=choosers.index
)
choosers["out_period"] = network_los.skim_time_period_label(
model_settings["OUT_PERIOD"]
model_settings["OUT_PERIOD"], as_cat=True, broadcast_to=choosers.index
)
else:
logger.error("Choosers table already has columns 'in_period' and 'out_period'.")
Expand Down
8 changes: 6 additions & 2 deletions activitysim/abm/models/util/mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,12 @@ def run_tour_mode_choice_simulate(
assert ("in_period" not in choosers) and ("out_period" not in choosers)
in_time = skims["in_time_col_name"]
out_time = skims["out_time_col_name"]
choosers["in_period"] = network_los.skim_time_period_label(choosers[in_time])
choosers["out_period"] = network_los.skim_time_period_label(choosers[out_time])
choosers["in_period"] = network_los.skim_time_period_label(
choosers[in_time], as_cat=True
)
choosers["out_period"] = network_los.skim_time_period_label(
choosers[out_time], as_cat=True
)

expressions.annotate_preprocessors(
state, choosers, locals_dict, skims, model_settings, trace_label
Expand Down
50 changes: 34 additions & 16 deletions activitysim/abm/models/util/vectorize_tour_scheduling.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,12 @@ def dedupe_alt_tdd(state: workflow.State, alt_tdd, tour_purpose, trace_label):

logger.info("tdd_alt_segments specified for representative logsums")

if tdd_segments is not None:
# apply categorical dtypes
tdd_segments["time_period"] = tdd_segments["time_period"].astype(
alt_tdd["out_period"].dtype
)

with chunk.chunk_log(
state, tracing.extend_trace_label(trace_label, "dedupe_alt_tdd")
) as chunk_sizer:
Expand Down Expand Up @@ -328,11 +334,12 @@ def compute_tour_scheduling_logsums(
assert "out_period" not in alt_tdd
assert "in_period" not in alt_tdd

# FIXME:MEMORY
# These two lines each generate a massive array of strings,
# using a bunch of RAM and slowing things down.
alt_tdd["out_period"] = network_los.skim_time_period_label(alt_tdd["start"])
alt_tdd["in_period"] = network_los.skim_time_period_label(alt_tdd["end"])
alt_tdd["out_period"] = network_los.skim_time_period_label(
alt_tdd["start"], as_cat=True
)
alt_tdd["in_period"] = network_los.skim_time_period_label(
alt_tdd["end"], as_cat=True
)

alt_tdd["duration"] = alt_tdd["end"] - alt_tdd["start"]

Expand Down Expand Up @@ -383,17 +390,28 @@ def compute_tour_scheduling_logsums(

# tracing.log_runtime(model_name=trace_label, start_time=t0)

# redupe - join the alt_tdd_period logsums to alt_tdd to get logsums for alt_tdd
logsums = (
pd.merge(
alt_tdd.reset_index(),
deduped_alt_tdds.reset_index(),
on=[index_name] + redupe_columns,
how="left",
)
.set_index(index_name)
.logsums
)
logsums = pd.Series(data=0, index=alt_tdd.index, dtype=np.float64)
left_on = [alt_tdd.index]
right_on = [deduped_alt_tdds.index]
for i in redupe_columns:
if (
alt_tdd[i].dtype == "category"
and alt_tdd[i].dtype.ordered
and alt_tdd[i].dtype == deduped_alt_tdds[i].dtype
):
left_on += [alt_tdd[i].cat.codes]
right_on += [deduped_alt_tdds[i].cat.codes]
else:
left_on += [alt_tdd[i].to_numpy()]
right_on += [deduped_alt_tdds[i].to_numpy()]

logsums.iloc[:] = pd.merge(
pd.DataFrame(index=alt_tdd.index),
deduped_alt_tdds.logsums,
left_on=left_on,
right_on=right_on,
how="left",
).logsums.to_numpy()
chunk_sizer.log_df(trace_label, "logsums", logsums)

del deduped_alt_tdds
Expand Down
16 changes: 13 additions & 3 deletions activitysim/core/los.py
Original file line number Diff line number Diff line change
Expand Up @@ -845,7 +845,9 @@ def get_tappairs3d(self, otap, dtap, dim3, key):

return s.values

def skim_time_period_label(self, time_period, fillna=None):
def skim_time_period_label(
self, time_period, fillna=None, as_cat=False, broadcast_to=None
):
"""
convert time period times to skim time period labels (e.g. 9 -> 'AM')
Expand Down Expand Up @@ -888,6 +890,12 @@ def skim_time_period_label(self, time_period, fillna=None):
result = self.skim_time_periods["labels"].get(bin, default=default)
else:
result = self.skim_time_periods["labels"][bin]
if broadcast_to is not None and as_cat:
result = pd.Series(
data=result,
index=broadcast_to,
dtype=self.skim_dicts["taz"].time_label_dtype,
)
else:
result = pd.cut(
time_period,
Expand All @@ -898,8 +906,10 @@ def skim_time_period_label(self, time_period, fillna=None):
if fillna is not None:
default = self.skim_time_periods["labels"][fillna]
result = result.fillna(default)
result = result.astype(str)

if as_cat:
result = result.astype(self.skim_dicts["taz"].time_label_dtype)
else:
result = result.astype(str)
return result

def get_tazs(self, state):
Expand Down
30 changes: 25 additions & 5 deletions activitysim/core/skim_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ def __init__(self, dataset):
self.time_map = {
j: i for i, j in enumerate(self.dataset.indexes["time_period"])
}
self.time_label_dtype = pd.api.types.CategoricalDtype(
self.dataset.indexes["time_period"],
ordered=True,
)
self.usage = set() # track keys of skims looked up

@property
Expand Down Expand Up @@ -184,6 +188,10 @@ def __init__(self, dataset, orig_key, dest_key, time_key=None, *, time_map=None)
}
else:
self.time_map = time_map
self.time_label_dtype = pd.api.types.CategoricalDtype(
self.dataset.indexes["time_period"],
ordered=True,
)

@property
def odim(self):
Expand Down Expand Up @@ -246,6 +254,11 @@ def set_df(self, df):
):
logger.info(f"natural use for time_period={self.time_key}")
positions["time_period"] = df[self.time_key]
elif (
df[self.time_key].dtype == "category"
and df[self.time_key].dtype == self.time_label_dtype
):
positions["time_period"] = df[self.time_key].cat.codes
else:
logger.info(f"vectorize lookup for time_period={self.time_key}")
positions["time_period"] = pd.Series(
Expand All @@ -257,11 +270,18 @@ def set_df(self, df):
self.positions = {}
for k, v in positions.items():
try:
self.positions[k] = v.astype(int)
except TypeError:
# possibly some missing values that are not relevant,
# fill with zeros to continue.
self.positions[k] = v.fillna(0).astype(int)
is_int = np.issubdtype(v.dtype, np.integer)
except Exception:
is_int = False
if is_int:
self.positions[k] = v
else:
try:
self.positions[k] = v.astype(int)
except TypeError:
# possibly some missing values that are not relevant,
# fill with zeros to continue.
self.positions[k] = v.fillna(0).astype(int)
else:
self.positions = pd.DataFrame(positions).astype(int)

Expand Down

0 comments on commit 128a5fc

Please sign in to comment.