10
10
11
11
import numpy as np
12
12
import xgboost as xgb
13
- import pandas as pd
13
+ import pandas as pd
14
14
15
15
from sklearn .model_selection import train_test_split
16
16
from sklearn .preprocessing import RobustScaler
24
24
25
25
26
26
class HarvesterMaintenance ():
27
-
27
+
28
28
def __init__ (self , model_name : str ):
29
29
self .model_name = model_name
30
30
self .file = ''
@@ -39,21 +39,21 @@ def __init__(self, model_name: str):
39
39
self .run_id = ''
40
40
self .active_experiment = ''
41
41
self .xgb_model = ''
42
-
42
+
43
43
def mlflow_tracking(self, tracking_uri: str = './mlruns',
                    experiment: str = None, new_experiment: str = None):
    """Configure MLflow tracking for this model's runs.

    Parameters
    ----------
    tracking_uri : str, optional
        MLflow tracking URI (backend store), by default './mlruns'
    experiment : str, optional
        name of an existing experiment to log runs under; when None,
        a new experiment named *new_experiment* is created instead
    new_experiment : str, optional
        name of the experiment to create when *experiment* is None

    Raises
    ------
    ValueError
        if neither *experiment* nor *new_experiment* is provided
    """
    # Point MLflow at the requested backend store.
    mlflow.set_tracking_uri(tracking_uri)

    # Creates a new experiment if no existing experiment is specified.
    if experiment is None:
        # Fail fast with a clear message instead of letting
        # mlflow.create_experiment(None) raise deeper in the stack.
        if new_experiment is None:
            raise ValueError(
                'either experiment or new_experiment must be provided')
        # NOTE(review): create_experiment raises if the name already
        # exists — presumably callers only pass fresh names; verify.
        mlflow.create_experiment(new_experiment)
        self.active_experiment = new_experiment
        mlflow.set_experiment(new_experiment)
    else:
        mlflow.set_experiment(experiment)
        self.active_experiment = experiment
57
57
def process_data (self , file : str , test_size : int = .25 ):
58
58
"""processes raw data for training
59
59
@@ -72,11 +72,11 @@ def process_data(self, file: str, test_size: int = .25):
72
72
except FileNotFoundError :
73
73
sys .exit ('Dataset file not found' )
74
74
75
-
76
75
X = data .drop ('Asset_Label' , axis = 1 )
77
76
y = data .Asset_Label
78
77
79
- X_train , X_test , self .y_train , self .y_test = train_test_split (X , y , test_size = test_size )
78
+ X_train , X_test , self .y_train , self .y_test = train_test_split (
79
+ X , y , test_size = test_size )
80
80
81
81
df_num_train = X_train .select_dtypes (['float' , 'int' , 'int32' ])
82
82
df_num_test = X_test .select_dtypes (['float' , 'int' , 'int32' ])
@@ -97,21 +97,24 @@ def process_data(self, file: str, test_size: int = .25):
97
97
del X_test_scaled_transformed ['Number_Repairs' ]
98
98
99
99
# Dropping the unscaled numerical columns
100
- X_train = X_train .drop (['Age' , 'Temperature' , 'Last_Maintenance' , 'Motor_Current' ], axis = 1 )
101
- X_test = X_test .drop (['Age' , 'Temperature' , 'Last_Maintenance' , 'Motor_Current' ], axis = 1 )
102
-
100
+ X_train = X_train .drop (
101
+ ['Age' , 'Temperature' , 'Last_Maintenance' , 'Motor_Current' ], axis = 1 )
102
+ X_test = X_test .drop (
103
+ ['Age' , 'Temperature' , 'Last_Maintenance' , 'Motor_Current' ], axis = 1 )
104
+
103
105
X_train = X_train .astype (int )
104
106
X_test = X_test .astype (int )
105
107
106
108
# Creating train and test data with scaled numerical columns
107
- X_train_scaled_transformed = pd .concat ([X_train_scaled_transformed , X_train ], axis = 1 )
108
- X_test_scaled_transformed = pd .concat ([X_test_scaled_transformed , X_test ], axis = 1 )
109
+ X_train_scaled_transformed = pd .concat (
110
+ [X_train_scaled_transformed , X_train ], axis = 1 )
111
+ X_test_scaled_transformed = pd .concat (
112
+ [X_test_scaled_transformed , X_test ], axis = 1 )
109
113
110
114
self .X_train_scaled_transformed = X_train_scaled_transformed .astype (
111
- {'Motor_Current' : 'float64' })
115
+ {'Motor_Current' : 'float64' })
112
116
self .X_test_scaled_transformed = X_test_scaled_transformed .astype (
113
- {'Motor_Current' : 'float64' })
114
-
117
+ {'Motor_Current' : 'float64' })
115
118
116
119
def train(self, ncpu: int = 4):
    """Train an XGBoost classifier, tracking the run with MLflow.

    Parameters
    ----------
    ncpu : int, optional
        number of CPU threads used for training, by default 4
    """
    # Hyperparameters for the 3-class softmax objective.
    # NOTE(review): 'lambda_l2' and 'num_leaves' look like
    # LightGBM-style names — confirm xgboost actually honors them.
    self.parameters = dict(
        max_bin=256,
        scale_pos_weight=2,
        lambda_l2=1,
        alpha=0.9,
        max_depth=8,
        num_leaves=2 ** 8,
        verbosity=0,
        objective='multi:softmax',
        learning_rate=0.3,
        num_class=3,
        nthread=ncpu,
    )

    # Autologging records params, metrics and the model artifact
    # for the run opened here.
    with mlflow.start_run():
        mlflow.xgboost.autolog()
        dtrain = xgb.DMatrix(self.X_train_scaled_transformed,
                             label=np.array(self.y_train))
        self.xgb_model = xgb.train(self.parameters, dtrain,
                                   num_boost_round=100)
146
150
147
151
def validate(self):
    """Evaluate the trained model on the held-out test split.

    Computes accuracy as 1 - (misclassified / total), logs it to the
    latest MLflow run of the active experiment, and returns it.

    Returns
    -------
    float
        accuracy score on the test data
    """
    dtest = xgb.DMatrix(self.X_test_scaled_transformed, self.y_test)
    xgb_prediction = self.xgb_model.predict(dtest)
    # Any non-zero difference between prediction and label is a
    # misclassification.
    xgb_errors_count = np.count_nonzero(
        xgb_prediction - np.ravel(self.y_test))
    self.accuracy_scr = 1 - xgb_errors_count / xgb_prediction.shape[0]

    # Use the public experiment_id attribute instead of the private
    # _experiment_id internal of mlflow.entities.Experiment.
    xp = mlflow.get_experiment_by_name(
        self.active_experiment).experiment_id
    # Assumes search_runs returns newest-first so [0] is the training
    # run just completed — TODO confirm for the MLflow version in use.
    self.run_id = mlflow.search_runs(
        xp, output_format="list")[0].info.run_id

    # Re-open the training run to attach the accuracy metric to it.
    with mlflow.start_run(self.run_id):
        mlflow.log_metric("accuracy", self.accuracy_scr)

    return self.accuracy_scr
167
-
174
+
168
175
def save(self, model_path):
    """Save the trained model and fitted scaler under *model_path*.

    Parameters
    ----------
    model_path : str
        path prefix where artifacts are written; the model name is
        appended directly, so a trailing separator is expected
    """
    # NOTE(review): plain string concatenation — model_path must end
    # with a path separator; consider os.path.join at the call site.
    self.model_path = model_path + self.model_name + '.joblib'
    self.scaler_path = model_path + self.model_name + '_scaler.joblib'

    logger.info("Saving model")
    with open(self.model_path, "wb") as fh:
        # Dump into the already-open handle; the original passed
        # fh.name, which reopened the same file a second time.
        joblib.dump(self.xgb_model, fh)

    logger.info("Saving Scaler")
    with open(self.scaler_path, "wb") as fh:
        joblib.dump(self.robust_scaler, fh)
0 commit comments