4
4
# https://github.com/ipums/hlink
5
5
6
6
import itertools
7
+ import logging
7
8
import math
8
9
import re
10
+ from time import perf_counter
9
11
from typing import Any
10
12
import numpy as np
11
13
import pandas as pd
19
21
20
22
from hlink .linking .link_step import LinkStep
21
23
24
+ logger = logging .getLogger (__name__ )
25
+
22
26
23
27
class LinkStepTrainTestModels (LinkStep ):
24
28
def __init__ (self , task ) -> None :
@@ -64,7 +68,15 @@ def _run(self) -> None:
64
68
splits = self ._get_splits (prepped_data , id_a , n_training_iterations , seed )
65
69
66
70
model_parameters = self ._get_model_parameters (config )
67
- for run in model_parameters :
71
+
72
+ logger .info (
73
+ f"There are { len (model_parameters )} sets of model parameters to explore; "
74
+ f"each of these has { n_training_iterations } train-test splits to test on"
75
+ )
76
+ for run_index , run in enumerate (model_parameters , 1 ):
77
+ logger .info (
78
+ f"Starting run { run_index } of { len (model_parameters )} with these parameters: { run } "
79
+ )
68
80
params = run .copy ()
69
81
model_type = params .pop ("type" )
70
82
@@ -83,20 +95,31 @@ def _run(self) -> None:
83
95
threshold_ratio = False
84
96
85
97
threshold_matrix = _calc_threshold_matrix (alpha_threshold , threshold_ratio )
98
+ logger .debug (f"The threshold matrix has { len (threshold_matrix )} entries" )
99
+
86
100
results_dfs : dict [int , pd .DataFrame ] = {}
87
101
for i in range (len (threshold_matrix )):
88
102
results_dfs [i ] = _create_results_df ()
89
103
90
104
first = True
91
- for training_data , test_data in splits :
105
+ for split_index , (training_data , test_data ) in enumerate (splits , 1 ):
106
+ logger .debug (
107
+ f"Training and testing the model on train-test split { split_index } of { n_training_iterations } "
108
+ )
92
109
training_data .cache ()
93
110
test_data .cache ()
94
111
95
112
classifier , post_transformer = classifier_core .choose_classifier (
96
113
model_type , params , dep_var
97
114
)
98
115
116
+ logger .debug ("Training the model on the training data split" )
117
+ start_train_time = perf_counter ()
99
118
model = classifier .fit (training_data )
119
+ end_train_time = perf_counter ()
120
+ logger .debug (
121
+ f"Successfully trained the model in { end_train_time - start_train_time :.2f} s"
122
+ )
100
123
101
124
predictions_tmp = _get_probability_and_select_pred_columns (
102
125
test_data , model , post_transformer , id_a , id_b , dep_var
@@ -137,7 +160,13 @@ def _run(self) -> None:
137
160
first = False
138
161
139
162
i = 0
140
- for alpha_threshold , threshold_ratio in threshold_matrix :
163
+ for threshold_index , (alpha_threshold , threshold_ratio ) in enumerate (
164
+ threshold_matrix , 1
165
+ ):
166
+ logger .debug (
167
+ f"Predicting with threshold matrix entry { threshold_index } of { len (threshold_matrix )} : "
168
+ f"{ alpha_threshold = } and { threshold_ratio = } "
169
+ )
141
170
predictions = threshold_core .predict_using_thresholds (
142
171
predictions_tmp ,
143
172
alpha_threshold ,
0 commit comments