forked from jdwittenauer/kaggle
-
Notifications
You must be signed in to change notification settings - Fork 0
/
example.py
216 lines (171 loc) · 7.27 KB
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
import sys
import pandas as pd
from datetime import datetime
from ionyx.utils import *
from ionyx.ensemble import *
from ionyx.experiment import *
from ionyx.visualization import *
from common import *
# Root directory holding the Property Inspection competition data (Windows path).
data_dir = 'C:\\Users\\jdwittenauer\\Documents\\Data\\Property Inspection\\'
# Logger comes from ionyx.utils (wildcard import above); presumably it tees
# writes to both the console and the output file — TODO confirm.
logger = Logger(data_dir + 'output.txt')
# Redirect all print() output through the logger for the rest of the run.
sys.stdout = logger
def generate_features(data):
    """
    Generates new derived features to add to the data set for model training.

    Currently a placeholder: no derived features are defined, so the input
    data set is returned unmodified.
    """
    # No feature engineering implemented yet; pass the data straight through.
    return data
def process_data(directory, train_file, test_file, label_index, column_offset, ex_generate_features):
    """
    Reads in training and test data and prepares numpy arrays for model training.

    Parameters
    ----------
    directory : str
        Path to the directory containing the CSV files.
    train_file, test_file : str
        File names of the training and test sets (loaded with an 'Id' index).
    label_index : int
        Column position of the label within the training frame.
    column_offset : int
        Position of the first feature column within the training frame.
    ex_generate_features : bool
        When True, run generate_features() on both frames before extraction.

    Returns
    -------
    tuple of (train_data, test_data, X, y, X_test)
    """
    train_data = load_csv_data(directory, train_file, index='Id')
    test_data = load_csv_data(directory, test_file, index='Id')

    if ex_generate_features:
        train_data = generate_features(train_data)
        test_data = generate_features(test_data)

    X = train_data.iloc[:, column_offset:].values
    y = train_data.iloc[:, label_index].values
    X_test = test_data.values

    # Label encode the categorical variables.  BUG FIX: fit each encoder on
    # the union of train and test values — fitting on the training column
    # alone raises a ValueError at transform time whenever a category appears
    # only in the test set.  LabelEncoder sorts its classes, so the mapping
    # is unchanged for columns where the test set adds nothing new.
    for i in range(X.shape[1]):
        if type(X[0, i]) is str:
            le = LabelEncoder()
            le.fit(list(X[:, i]) + list(X_test[:, i]))
            X[:, i] = le.transform(X[:, i])
            X_test[:, i] = le.transform(X_test[:, i])

    print('Data processing complete.')
    return train_data, test_data, X, y, X_test
def bag_of_models():
    """
    Defines the set of models used in the ensemble.

    Returns an empty list until ensemble members are configured here.
    """
    # No ensemble members have been added yet.
    return []
def create_submission(test_data, y_pred, data_dir, submit_file):
    """
    Create a new submission file with test data and predictions generated by the model.

    Writes a CSV with 'Id' (taken from the test frame's index) and 'Hazard'
    (the model predictions) columns to data_dir + submit_file.
    """
    submission = pd.DataFrame({'Id': test_data.index, 'Hazard': y_pred},
                              columns=['Id', 'Hazard'])
    submission.to_csv(data_dir + submit_file, sep=',', index=False, index_label=False)
    print('Submission file complete.')
def main():
    """
    Runs the end-to-end modeling pipeline for the Property Inspection data:
    data processing, optional visualizations, model definition and training,
    evaluation, optional ensembling, and submission-file creation.  Each
    stage is switched on or off via the ex_* flags below.
    """
    # --- pipeline stage toggles ---
    ex_process_train_data = True
    ex_generate_features = False
    ex_load_model = False
    ex_save_model = False
    ex_visualize_variable_relationships = False
    ex_visualize_feature_distributions = False
    ex_visualize_correlations = False
    ex_visualize_sequential_relationships = False
    ex_visualize_transforms = False
    ex_define_model = True
    ex_train_model = True
    ex_visualize_feature_importance = False
    ex_cross_validate = True
    ex_plot_learning_curve = False
    ex_parameter_search = False
    ex_train_ensemble = False
    ex_create_submission = True

    # --- file names and model configuration ---
    train_file = 'train.csv'
    test_file = 'test.csv'
    submit_file = 'submission.csv'
    model_file = 'model.pkl'
    model_type = 'regression'  # classification, regression
    algorithm = 'ridge'  # bayes, logistic, ridge, svm, sgd, forest, xt, boost, xgb, nn
    metric = 'mean_squared_error'  # accuracy, f1, log_loss, mean_absolute_error, mean_squared_error, r2, roc_auc
    ensemble_mode = 'stacking'  # averaging, stacking
    eval_model = False  # renamed from 'eval' to avoid shadowing the builtin
    plot_eval_history = False
    early_stopping = False
    early_stopping_rounds = 10
    label_index = 0
    column_offset = 1
    plot_size = 16
    n_components = 2
    n_folds = 5

    # --- pipeline state, populated by the stages below ---
    train_data = None
    test_data = None
    X = None
    y = None
    X_test = None
    y_pred = None
    model = None
    transforms = [StandardScaler()]

    print('Starting process (' + datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ')...')
    print('Model Type = {0}'.format(model_type))
    print('Algorithm = {0}'.format(algorithm))
    print('Scoring Metric = {0}'.format(metric))
    print('Generate Features = {0}'.format(ex_generate_features))
    print('Transforms = {0}'.format(transforms))

    if ex_process_train_data:
        print('Reading in and processing data files...')
        train_data, test_data, X, y, X_test = process_data(data_dir, train_file, test_file, label_index,
                                                           column_offset, ex_generate_features)

    if ex_visualize_variable_relationships:
        print('Visualizing pairwise relationships...')
        # scatter, reg, resid, kde, hex
        visualize_variable_relationships(train_data, 'scatter', ['Hazard', 'T1_V1'])

    if ex_visualize_feature_distributions:
        print('Visualizing feature distributions...')
        # hist, kde
        visualize_feature_distributions(train_data, 'hist', plot_size)

    if ex_visualize_correlations:
        print('Visualizing feature correlations...')
        visualize_correlations(train_data)

    if ex_visualize_sequential_relationships:
        print('Visualizing sequential relationships...')
        visualize_sequential_relationships(train_data, plot_size)

    if ex_visualize_transforms:
        print('Visualizing transformed data...')
        visualize_transforms(X, y, model_type, n_components, transforms)

    if ex_load_model:
        print('Loading model from disk...')
        model = load_model(data_dir + model_file)

    # NOTE: if both ex_load_model and ex_define_model are True, the freshly
    # defined model replaces the one loaded from disk.
    if ex_define_model:
        print('Building model definition...')
        model = define_model(model_type, algorithm)

    if ex_train_model:
        print('Training model...')
        model, training_history = train_model(X, y, model, model_type, metric, transforms, eval_model,
                                              plot_eval_history, early_stopping, early_stopping_rounds)

    if ex_visualize_feature_importance and algorithm in ['forest', 'xt', 'boost']:
        print('Generating feature importance plot...')
        # BUG FIX: DataFrame.columns is an attribute, not a callable —
        # the original train_data.columns() raised a TypeError here.
        visualize_feature_importance(model.feature_importances_, train_data.columns, column_offset)

    if ex_cross_validate:
        print('Performing cross-validation...')
        cross_validate(X, y, model, metric, transforms, n_folds)

    if ex_plot_learning_curve:
        print('Generating learning curve...')
        plot_learning_curve(X, y, model, metric, transforms, n_folds)

    if ex_save_model:
        print('Saving model to disk...')
        save_model(model, data_dir + model_file)

    if ex_parameter_search:
        print('Performing hyper-parameter grid search...')
        parameter_grid_search(X, y, model, metric, [transforms], get_param_grid(algorithm))

    if ex_train_ensemble:
        print('Creating an ensemble of models...')
        if ensemble_mode == 'averaging':
            y_pred = train_averaged_ensemble(X, y, X_test, bag_of_models(), metric, transforms, n_folds)
        else:
            y_models, y_true, y_models_test, y_pred = train_stacked_ensemble(X, y, X_test, bag_of_models(),
                                                                             metric, transforms, n_folds)
            # Persist the stacker's intermediate training artifacts for reuse.
            pd.DataFrame(y_models).to_csv(data_dir + 'stacker_train.csv')
            pd.DataFrame(y_true).to_csv(data_dir + 'stacker_label.csv')
            pd.DataFrame(y_models_test).to_csv(data_dir + 'stacker_test.csv')

    if ex_create_submission:
        # Ensemble training already produced y_pred; otherwise predict with
        # the single trained model after applying the fitted transforms.
        if not ex_train_ensemble:
            print('Predicting test data...')
            transforms = fit_transforms(X, y, transforms)
            X_test = apply_transforms(X_test, transforms)
            y_pred = model.predict(X_test)
        print('Creating submission file...')
        create_submission(test_data, y_pred, data_dir, submit_file)

    print('Process complete. (' + datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ')')
    print('')
    print('')
    logger.flush()
# Script entry point: run the pipeline only when executed directly, not on import.
if __name__ == "__main__":
    main()