# -*- coding: utf-8 -*-
"""
Created on Wed May 9 23:20:03 2018
Decision Tree classifier, used to classify datasets with any number of continuous attributes.
@author: Samuel Oswald
"""
##Import numpy for management of arrays.
import numpy as np


def dt_train(data, max_depth, min_size=1):
    """Build a decision tree. data is the training dataset.
    max_depth limits how deep the tree can grow. min_size is the minimum
    number of samples a node may hold before it becomes a leaf."""
    max_depth = int(max_depth)
    min_size = int(min_size)
    attr, split_val, left, right = split(data)
    tree = {"attribute": attr, "split": split_val, "left": left, "right": right, "current_mode": leaf(data)}
    decision(tree, max_depth, min_size)
    return tree


def gini(node):
    """Calculate the Gini impurity of a node. The aim is to minimise impurity (maximise gain)."""
    ##Find the classifications present in the current node.
    classifications = node[:, -1]
    samples = classifications.size
    unique, counts = np.unique(classifications, return_counts=True)
    ##Calculate Gini impurity from the class proportions.
    gini = 1
    for i in range(0, unique.size):
        proportion = counts[i] / samples
        gini = gini - proportion * proportion
    return gini
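##Illustrative example: for a node with labels [0, 0, 1, 1] the class proportions are
##0.5 and 0.5, so gini = 1 - 0.5**2 - 0.5**2 = 0.5; a pure node gives gini = 0.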


def gain(values, cur_gini, attribute, split):
    """Calculate the information gain for an attribute split.
    Inputs are the current subset of data, the Gini impurity of the parent node,
    the attribute (column index) to split on and the split value."""
    i = attribute
    samples = values[:, -1].size
    left = values[values[:, i] < split, :]
    right = values[values[:, i] >= split, :]
    left_samples = left[:, -1].size
    right_samples = right[:, -1].size
    ##Calculate left and right side Gini impurity.
    left_gini = gini(left)
    right_gini = gini(right)
    ##Calculate information gain at this split value.
    gain = cur_gini - (left_samples / samples) * left_gini - (right_samples / samples) * right_gini
    return gain, left, right
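##Illustrative example: continuing the node above (gini = 0.5), a split that sends both
##0-labelled samples left and both 1-labelled samples right yields
##gain = 0.5 - (2/4)*0 - (2/4)*0 = 0.5, the largest gain achievable for that node.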


def split(node):
    """Find the best split point by searching for the highest information gain
    over a random subset of attributes and their candidate split values.
    If no split improves the gain, the right side is left with 0 samples so a leaf node is created."""
    cur_gini = gini(node)
    best_gain = 0
    best_attr = 0
    best_split = 0
    ##Implement a greedy, exhaustive search for the best information gain.
    ##Random forest behaviour: consider only a random subset of roughly sqrt(p) feature
    ##columns at each split, excluding the class label held in the last column.
    variables = len(node[0])
    cols = np.random.choice(variables - 1, size=max(1, int(np.sqrt(variables - 1))), replace=False)
    best_left = node
    best_right = np.empty([0, variables])
    ##Search through each unique value of the selected attributes to find the best division.
    for v in range(0, variables - 1):
        if v in cols:
            uniques = np.unique(node[:, v])
            for row in uniques:
                new_gain, left, right = gain(node, cur_gini, v, row)
                ##Keep the best gain and its associated attribute and split value.
                if new_gain > best_gain:
                    best_gain = new_gain
                    best_attr = v
                    best_split = row
                    best_left = left
                    best_right = right
    return best_attr, best_split, best_left, best_right


def leaf(node):
    """Return the classification value for a leaf node, used when the maximum depth
    is reached or the node is suitably weighted to one class."""
    classes = node[:, -1].tolist()
    return max(set(classes), key=classes.count)
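##Illustrative example: for labels [0., 0., 1.] this returns 0.0, the modal class of the node.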


def decision(tree, max_depth=10, min_size=0, depth=0):
    """Use the split and leaf functions to grow a tree from the root data set.
    Leaf nodes are assigned once the maximum depth or minimum sample count is reached.
    Each node dict holds the data reaching it along with the decision rule applied at that point."""
    left = tree["left"]
    right = tree["right"]
    ##If the tree is at maximum depth, assign the most common class on each side.
    if depth >= max_depth:
        tree['left'] = leaf(left)
        tree['right'] = leaf(right)
    ##Otherwise continue splitting.
    else:
        ##Left side child.
        ##If the node holds no more than the minimum number of samples, make it a leaf with the most frequently occurring class.
        if left[:, -1].size <= min_size:
            tree['left'] = leaf(left)
        ##Else continue building the tree.
        else:
            left_attr, left_split, left_left, left_right = split(left)
            ##Check if the node is terminal (no useful split found). Make it a leaf node if so.
            if left_left.size == 0 or left_right.size == 0:
                tree['left'] = leaf(np.vstack([left_left, left_right]))
            ##Otherwise keep splitting.
            else:
                tree['left'] = {"attribute": left_attr, "split": left_split, "left": left_left, "right": left_right, "current_mode": leaf(left)}
                decision(tree['left'], max_depth, min_size, depth + 1)
        ##Right side child. Same process as above.
        if right[:, -1].size <= min_size:
            tree['right'] = leaf(right)
        else:
            right_attr, right_split, right_left, right_right = split(right)
            if right_left.size == 0 or right_right.size == 0:
                tree['right'] = leaf(np.vstack([right_left, right_right]))
            else:
                tree['right'] = {"attribute": right_attr, "split": right_split, "left": right_left, "right": right_right, "current_mode": leaf(right)}
                decision(tree['right'], max_depth, min_size, depth + 1)


def classify(tree, row):
    """Classify a single sample (row) by walking the tree.
    Branches are followed according to the sample's attribute values,
    and the classification value is returned once a leaf is reached."""
    ##Recursively descend the branches until a leaf value (non-dict) is found.
    if row[tree['attribute']] < tree['split']:
        if isinstance(tree['left'], dict):
            return classify(tree['left'], row)
        else:
            return tree['left']
    else:
        if isinstance(tree['right'], dict):
            return classify(tree['right'], row)
        else:
            return tree['right']


def dt_predict(tree, data):
    """For every row in the validation data, call the classify function
    and append the result to the list of predictions."""
    predictions = []
    for row in data:
        pred = classify(tree, row)
        predictions.append(int(pred))
    return predictions
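##Minimal usage sketch (variable names are illustrative; assumes NumPy arrays whose last
##column holds an integer class label):
##    tree = dt_train(train, max_depth=5, min_size=2)
##    preds = dt_predict(tree, test)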


##Functions for validation.
def dt_confusion_matrix(predicted, actual, classes):
    """Return a confusion matrix comparing actual values with model predictions
    (rows are actual classes, columns are predicted classes), along with the overall accuracy in %."""
    matrix = np.zeros((len(classes), len(classes)))
    for a, p in zip(actual, predicted):
        matrix[a][p] += 1
    accuracy = (actual == predicted).sum() / float(len(actual)) * 100
    return matrix, accuracy


def print_dt(tree, depth=0):
    """Iterate through the decision tree, printing the decision rule at each node
    and the classification value at each leaf."""
    print((" " * depth) + "attribute " + str(tree['attribute']) + " > " + str(tree['split']))
    if isinstance(tree['left'], dict):
        print_dt(tree['left'], depth + 1)
    else:
        print((" " * (depth + 1)) + str(tree['left']))
    if isinstance(tree['right'], dict):
        print_dt(tree['right'], depth + 1)
    else:
        print((" " * (depth + 1)) + str(tree['right']))
"""Bagged decision trees contain a user-specified number of decision trees.
Classification of a sample is done by using the mode of each of these decision trees.
subsample is a fraction of the total dataset to be used.
trees refers to the number of trees to use in "forest" of trees.
By leaving default values for subsample and trees, a single decision tree classifier is created."""
def bt_train( data, max_depth, min_size = 1, subsample_ratio = 1,trees =1):
##Create a series of trees using sampling with replacement.
size = data[:, -1].size
division = int(size * subsample_ratio)
forest = []
for i in range (0,trees):
samples = data[np.random.choice(data.shape[0], division, replace = True)]
forest.append([])
forest[i] = dt_train(samples, max_depth, min_size)
return forest


def bt_predict(forest, data):
    """Classify the validation data set using the built bagged trees,
    by taking the mode of the classifications from each decision tree."""
    ##Use the predict function from the single decision tree.
    ##Number of trees in the forest and number of validation samples, used to create the matrix of per-tree classifications.
    forest_size = len(forest)
    samples = len(data)
    tree_classification = np.zeros((samples, forest_size))
    ##With each tree, find the classification of each validation sample.
    for i in range(0, forest_size):
        tree_classification[:, i] = dt_predict(forest[i], data)
    ##Create a list of modal classifications for each sample, using the tree_classification matrix.
    predictions = []
    for i in range(0, samples):
        tree_pred = tree_classification[i, :].tolist()
        predictions.append(int(max(set(tree_pred), key=tree_pred.count)))
    return predictions


def bt_confusion_matrix(predicted, actual, classes):
    """Create a confusion matrix for bagged trees. Delegates to the decision tree method."""
    matrix, accuracy = dt_confusion_matrix(predicted, actual, classes)
    return matrix, accuracy
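

##Minimal self-test sketch: trains bagged trees on a small synthetic two-class dataset and
##prints the confusion matrix. The dataset, parameter values and variable names below are
##illustrative assumptions only.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    ##Two Gaussian clusters with 4 continuous attributes; the class label is stored in the last column.
    class0 = np.hstack([rng.normal(0.0, 1.0, (60, 4)), np.zeros((60, 1))])
    class1 = np.hstack([rng.normal(3.0, 1.0, (60, 4)), np.ones((60, 1))])
    dataset = np.vstack([class0, class1])
    rng.shuffle(dataset)
    train, test = dataset[:90], dataset[90:]
    forest = bt_train(train, max_depth=5, min_size=2, subsample_ratio=1, trees=5)
    preds = bt_predict(forest, test)
    actual = test[:, -1].astype(int)
    matrix, accuracy = bt_confusion_matrix(preds, actual, classes=[0, 1])
    print(matrix)
    print("accuracy: " + str(accuracy) + "%")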