Skip to content

Commit 9a62b98

Browse files
committed
[#154] Add logging to model_exploration.link_step_train_test_models
1 parent 6ed140a commit 9a62b98

File tree

1 file changed

+32
-3
lines changed

1 file changed

+32
-3
lines changed

hlink/linking/model_exploration/link_step_train_test_models.py

Lines changed: 32 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,10 @@
44
# https://github.com/ipums/hlink
55

66
import itertools
7+
import logging
78
import math
89
import re
10+
from time import perf_counter
911
from typing import Any
1012
import numpy as np
1113
import pandas as pd
@@ -19,6 +21,8 @@
1921

2022
from hlink.linking.link_step import LinkStep
2123

24+
logger = logging.getLogger(__name__)
25+
2226

2327
class LinkStepTrainTestModels(LinkStep):
2428
def __init__(self, task) -> None:
@@ -64,7 +68,15 @@ def _run(self) -> None:
6468
splits = self._get_splits(prepped_data, id_a, n_training_iterations, seed)
6569

6670
model_parameters = self._get_model_parameters(config)
67-
for run in model_parameters:
71+
72+
logger.info(
73+
f"There are {len(model_parameters)} sets of model parameters to explore; "
74+
f"each of these has {n_training_iterations} train-test splits to test on"
75+
)
76+
for run_index, run in enumerate(model_parameters, 1):
77+
logger.info(
78+
f"Starting run {run_index} of {len(model_parameters)} with these parameters: {run}"
79+
)
6880
params = run.copy()
6981
model_type = params.pop("type")
7082

@@ -83,20 +95,31 @@ def _run(self) -> None:
8395
threshold_ratio = False
8496

8597
threshold_matrix = _calc_threshold_matrix(alpha_threshold, threshold_ratio)
98+
logger.debug(f"The threshold matrix has {len(threshold_matrix)} entries")
99+
86100
results_dfs: dict[int, pd.DataFrame] = {}
87101
for i in range(len(threshold_matrix)):
88102
results_dfs[i] = _create_results_df()
89103

90104
first = True
91-
for training_data, test_data in splits:
105+
for split_index, (training_data, test_data) in enumerate(splits, 1):
106+
logger.debug(
107+
f"Training and testing the model on train-test split {split_index} of {n_training_iterations}"
108+
)
92109
training_data.cache()
93110
test_data.cache()
94111

95112
classifier, post_transformer = classifier_core.choose_classifier(
96113
model_type, params, dep_var
97114
)
98115

116+
logger.debug("Training the model on the training data split")
117+
start_train_time = perf_counter()
99118
model = classifier.fit(training_data)
119+
end_train_time = perf_counter()
120+
logger.debug(
121+
f"Successfully trained the model in {end_train_time - start_train_time:.2f}s"
122+
)
100123

101124
predictions_tmp = _get_probability_and_select_pred_columns(
102125
test_data, model, post_transformer, id_a, id_b, dep_var
@@ -137,7 +160,13 @@ def _run(self) -> None:
137160
first = False
138161

139162
i = 0
140-
for alpha_threshold, threshold_ratio in threshold_matrix:
163+
for threshold_index, (alpha_threshold, threshold_ratio) in enumerate(
164+
threshold_matrix, 1
165+
):
166+
logger.debug(
167+
f"Predicting with threshold matrix entry {threshold_index} of {len(threshold_matrix)}: "
168+
f"{alpha_threshold=} and {threshold_ratio=}"
169+
)
141170
predictions = threshold_core.predict_using_thresholds(
142171
predictions_tmp,
143172
alpha_threshold,

0 commit comments

Comments
 (0)