This repository has been archived by the owner on Apr 13, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathtrain.py
124 lines (101 loc) · 4.28 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import pandas as pd, numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.model_selection import GridSearchCV
import sklearn
from sklearn_pandas import DataFrameMapper
import joblib
import roro
# Location of the Lending Club loans CSV read by train().
INPUT_FILE = 'https://s3.amazonaws.com/rorodata-datasets/lending-club-data.csv'

# Every input column used by the model (categorical and numerical alike).
features = [
    'grade',                  # grade of the loan (categorical)
    'sub_grade_num',          # sub-grade of the loan as a number from 0 to 1
    'short_emp',              # one year or less of employment
    'emp_length_num',         # number of years of employment
    'home_ownership',         # home_ownership status: own, mortgage or rent
    'dti',                    # debt to income ratio
    'purpose',                # the purpose of the loan
    'payment_inc_ratio',      # ratio of the monthly payment to income
    'delinq_2yrs',            # number of delinquencies
    'delinq_2yrs_zero',       # no delinquencies in last 2 years
    'inq_last_6mths',         # number of creditor inquiries in last 6 months
    'last_delinq_none',       # has borrower had a delinquency
    'last_major_derog_none',  # has borrower had 90 day or worse rating
    'open_acc',               # number of open credit accounts
    'pub_rec',                # number of derogatory public records
    'pub_rec_zero',           # no derogatory public records
    'revol_util',             # percent of available credit being used
]

# Name of the target column.
response = 'bad_loans'

# Columns fed to the model as-is. The order matters: it fixes the column
# order of the numerical part of the design matrix at train and predict time.
numerical_cols = [
    'sub_grade_num',
    'short_emp',
    'emp_length_num',
    'dti',
    'payment_inc_ratio',
    'delinq_2yrs',
    'delinq_2yrs_zero',
    'inq_last_6mths',
    'last_delinq_none',
    'last_major_derog_none',
    'open_acc',
    'pub_rec',
    'pub_rec_zero',
    'revol_util',
]

# Columns that are binarized by the mapper before being fed to the model.
categorical_cols = [
    'grade',
    'home_ownership',
    'purpose',
]
def make_mapper():
    """Build the DataFrameMapper that binarizes the categorical columns.

    Returns an unfitted mapper with one independent LabelBinarizer per
    column, applied in the order: grade, home_ownership, purpose.
    """
    binarized_columns = ('grade', 'home_ownership', 'purpose')
    # A fresh LabelBinarizer per column: each one learns its own label set.
    column_steps = [
        (column, sklearn.preprocessing.LabelBinarizer())
        for column in binarized_columns
    ]
    return DataFrameMapper(column_steps)
def train():
    """Train the logistic-regression loan model and persist it.

    Pipeline, in order:
      1. Read the CSV at INPUT_FILE and keep only the `features` and
         `response` columns, dropping every row with a missing value.
      2. Fit the categorical mapper and stack its output to the left of the
         numerical columns to form the design matrix.
      3. Hold out 33% of the rows (stratified on the target, fixed seed) and
         fit a default LogisticRegression on the rest.
      4. Print the held-out accuracy, dump the fitted mapper and model with
         joblib under /volumes/data, and save a model image (with column and
         score metadata) to the "logistic-regression-model" repository of the
         current roro project.

    Returns None; all results are emitted as side effects.
    """
    print("reading the input file")
    # No `parse_dates` is requested, so the former `infer_datetime_format=True`
    # argument had no effect; it is also deprecated in pandas 2.x, hence dropped.
    loans = pd.read_csv(INPUT_FILE)
    clean_data = loans[features + [response]].dropna()

    print("transforming the data")
    mapper = make_mapper()
    X1 = mapper.fit_transform(clean_data)
    X2 = np.array(clean_data[numerical_cols])
    # Categorical (binarized) columns first, numerical columns second;
    # preprocess() must stack in the same order at prediction time.
    X = np.hstack((X1, X2))
    # Use the `response` constant rather than repeating the column name, so
    # the target stays in sync with the column selection above.
    y = np.array(clean_data[response])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=100, stratify=y)

    print("building the model")
    log_lm = LogisticRegression()
    log_lm.fit(X_train, y_train)
    test_score = log_lm.score(X_test, y_test)
    print(test_score)

    print("saving the model")
    # These paths are the ones load_mapper() / load_model() read back.
    joblib.dump(mapper, "/volumes/data/mapper.pkl")
    joblib.dump(log_lm, "/volumes/data/logistic-regression-model.pkl")

    p = roro.get_current_project()
    repo = p.get_model_repository("logistic-regression-model")
    image = repo.new_model_image(log_lm)
    # Metadata stored alongside the model image.
    image["Numerical-Columns"] = ",".join(numerical_cols)
    image["Categorical-Columns"] = ",".join(categorical_cols)
    image["Test-Score"] = str(test_score)
    image.save(comment="built new model")
    print("done")
# Module-level caches for the fitted mapper and model. They start empty and
# are filled on first use by load_mapper() / load_model(), so the pickles are
# read from disk at most once per process.
_mapper = None
_model = None
def load_mapper():
    """Return the fitted mapper, reading it from disk on first call only.

    The loaded object is cached in the module-level `_mapper`; later calls
    return the cached object without touching the file again.
    """
    global _mapper
    if _mapper is not None:
        return _mapper
    _mapper = joblib.load("/volumes/data/mapper.pkl")
    return _mapper
def load_model():
    """Return the trained model, reading it from disk on first call only.

    The loaded object is cached in the module-level `_model`; later calls
    return the cached object without touching the file again.
    """
    global _model
    if _model is not None:
        return _model
    _model = joblib.load("/volumes/data/logistic-regression-model.pkl")
    return _model
def preprocess(mapper, row):
    """Turn one input record into a single-row feature matrix.

    Args:
        mapper: fitted mapper exposing `transform(DataFrame)`.
        row: mapping of column name -> value for one record; it must contain
            the columns the mapper reads and every name in `numerical_cols`.

    Returns:
        A 2-D array with one row: the mapper's output followed by the
        numerical columns, i.e. the same layout train() builds.
    """
    column_names = list(row.keys())
    column_values = list(row.values())
    # One-row frame so the mapper sees the same structure it was fitted on.
    frame = pd.DataFrame(data=[column_values], columns=column_names)
    encoded_part = mapper.transform(frame)
    numeric_part = frame[numerical_cols]
    return np.hstack((encoded_part, numeric_part))
def predict(row):
    """Score a single record with the persisted model.

    Args:
        row: mapping of column name -> value for one record, as accepted by
            preprocess().

    Returns:
        The value in column 1 of the model's `predict_proba` output for this
        record (presumably the probability of the positive `bad_loans`
        class -- confirm against the model's `classes_`), or the sentinel
        -1 if the model fails to score the row.

    Errors raised while loading the artifacts or preprocessing the row are
    not caught here and propagate to the caller.
    """
    mapper = load_mapper()
    model = load_model()
    X = preprocess(mapper, row)
    try:
        return model.predict_proba(X)[:, 1][0]
    except Exception:
        # Keep the -1 sentinel for scoring failures, but no longer use a bare
        # `except:`, which also swallowed KeyboardInterrupt and SystemExit.
        return -1
# Run the full training pipeline only when executed as a script; importing
# this module (e.g. to call predict()) must not trigger training.
if __name__ == '__main__':
    train()