-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFinal Project.m
314 lines (258 loc) · 11.9 KB
/
Final Project.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
%%
% Count the missing entries in every variable of the table and list the
% counts next to the variable names.
missingValues = sum(ismissing(diabetes))
varNames = diabetes.Properties.VariableNames;
disp('Missing Values in Each Column:');
missingReport = table(varNames(:), missingValues(:), ...
    'VariableNames', {'Variable', 'MissingValues'});
disp(missingReport);
%%
% Names of the numeric predictor columns that are standardized below
numericalColumns = {'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', ...
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'}
%%
% Standardize every numeric predictor with a z-score: subtract the column
% mean and divide by the column standard deviation. The scaled values
% overwrite the original columns of the table.
diabetes{:, numericalColumns} = zscore(diabetes{:, numericalColumns})
%%
% Sanity check of the scaling: tabulate the mean and standard deviation of
% each scaled column. After a z-score the means are expected to be close to
% zero and the standard deviations equal to 1.
scaledValues = diabetes{:, numericalColumns};
scaledStats = array2table([mean(scaledValues); std(scaledValues)], ...
    'VariableNames', numericalColumns, ...
    'RowNames', {'Mean', 'Std'});
disp('Mean and Standard Deviation After Scaling:');
disp(scaledStats);
%%
% Pairwise correlation between the numeric predictors, shown as a table
% whose rows and columns are both labelled with the feature names.
featureMatrix = diabetes{:, numericalColumns};
correlationMatrix = corr(featureMatrix);
disp('Correlation Matrix:');
correlationTable = array2table(correlationMatrix, ...
    'RowNames', numericalColumns, ...
    'VariableNames', numericalColumns);
disp(correlationTable);
%%
% Look for highly correlated feature pairs (|r| above corrThreshold) so that
% multicollinearity can be avoided.
corrThreshold = 0.7;
aboveThreshold = abs(correlationMatrix) > corrThreshold;
% Linear indices of all off-diagonal entries above the threshold. Kept for
% backward compatibility; every pair appears twice because the matrix is
% symmetric.
highlyCorrelated = find(aboveThreshold & eye(size(correlationMatrix)) == 0);
% For reporting, use only the strict upper triangle so each pair is listed
% once, and translate the indices into feature names.
[pairRow, pairCol] = find(triu(aboveThreshold, 1));
disp('Highly Correlated Features:');
if isempty(pairRow)
    disp(['None (no pair exceeds |r| = ' num2str(corrThreshold) ')']);
else
    pairValues = correlationMatrix(sub2ind(size(correlationMatrix), pairRow, pairCol));
    disp(table(reshape(numericalColumns(pairRow), [], 1), ...
        reshape(numericalColumns(pairCol), [], 1), ...
        pairValues(:), ...
        'VariableNames', {'FeatureA', 'FeatureB', 'Correlation'}));
end
%%
% Feature selection with the 'Minimum Redundancy Maximum Relevance' (MRMR)
% MATLAB function, which ranks the predictors against the target variable.
% fscmrmr returns the predictor INDICES ordered from most to least important
% (featureRanking) and the scores in the original predictor order
% (mrmrScores), so both must be re-indexed before they are labelled.
[featureRanking, mrmrScores] = fscmrmr(diabetes{:, numericalColumns}, diabetes.Outcome);
rankedFeatures = reshape(numericalColumns(featureRanking), [], 1);
rankedScores = reshape(mrmrScores(featureRanking), [], 1);
rankPosition = (1:numel(featureRanking))';
disp('Ranked Features using MRMR:');
disp(table(rankPosition, rankedFeatures, rankedScores, ...
    'VariableNames', {'Rank', 'Feature', 'Score'}));
%%
% Fit a classification tree on all numeric predictors
predictors = diabetes{:, numericalColumns};
treeModel = fitctree(predictors, diabetes.Outcome);
%%
% Show the predictor importance estimates of the fitted tree, one column
% per feature (same order as numericalColumns)
importance = predictorImportance(treeModel);
disp('Feature Importance from Decision Tree Model:');
importanceTable = array2table(importance, 'VariableNames', numericalColumns);
disp(importanceTable);
%%
% Feature engineering: bin BMI with fixed cut-offs into four named
% categories, then drop the column again (the percentile-based binning in
% the following section creates BMICategory anew).
% NOTE(review): diabetes.BMI is z-scored earlier in this script, so these
% cut-offs are applied to standardized values rather than raw BMI - confirm
% whether this section is still needed.
edges = [0 18.5 24.9 29.9 Inf];
labels = {'Underweight', 'Normal Weight', 'Overweight', 'Obese'};
bmiBinIndex = discretize(diabetes.BMI, edges);
diabetes.BMICategory = categorical(bmiBinIndex, 1:numel(labels), labels);
diabetes = removevars(diabetes, "BMICategory");
%%
% Bin BMI into four groups, using the quartiles of the column as bin edges
labels = {'Underweight', 'Normal Weight', 'Overweight', 'Obese'};
percentiles = prctile(diabetes.BMI, [0 25 50 75 100]);
bmiBinIndex = discretize(diabetes.BMI, percentiles);
diabetes.BMICategory = categorical(bmiBinIndex, 1:numel(labels), labels);
%%
% Bin Age into three groups, using the 33.3rd and 66.6th percentiles of the
% column as the inner bin edges
age_labels = {'Young', 'Adult', 'Senior'};
percentiles_age = prctile(diabetes.Age, [0 33.3 66.6 100]);
ageBinIndex = discretize(diabetes.Age, percentiles_age);
diabetes.AgeGroup = categorical(ageBinIndex, 1:numel(age_labels), age_labels);
%%
% Show the table including the two new categorical columns
disp(diabetes);
%%
% |Data Quality Report and additional measures|
disp('Summary Statistics:');
disp(summary(diabetes));
% Data quality report for the continuous features: one row per entry of
% numericalColumns with count, missing count, cardinality, min, quartiles,
% mean, median, max and standard deviation.
disp('Data Quality report for Continuous Features:');
nContinuous = length(numericalColumns);
contRows = cell(nContinuous, 1);
for k = 1:nContinuous
    colData = diabetes.(numericalColumns{k});
    % The table below takes its variable names from these workspace names,
    % so they are kept exactly as they appear in the report header.
    count = numel(colData);
    missing = sum(ismissing(colData));
    cardinality = numel(unique(colData));
    quartiles = prctile(colData, [25 75]);
    minimum = min(colData);
    firstQuartile = quartiles(1);
    meanVal = mean(colData);
    medianVal = median(colData);
    thirdQuartile = quartiles(2);
    maximum = max(colData);
    stdDev = std(colData);
    contRows{k} = table(count, missing, cardinality, minimum, firstQuartile, ...
        meanVal, medianVal, thirdQuartile, maximum, stdDev);
end
% Stack the one-row tables in a single step
dataQualityCont = vertcat(contRows{:});
dataQualityCont.Properties.RowNames = numericalColumns;
disp(dataQualityCont);
% Data quality report for the categorical features: count, missing count,
% number of categories, and the most / second most frequent category with
% their frequencies and percentages of the column length.
categoricalColumns = {'BMICategory', 'AgeGroup'};
disp('Data Quality report for Categorical Features:');
nCategorical = length(categoricalColumns);
catRows = cell(nCategorical, 1);
for k = 1:nCategorical
    colData = diabetes.(categoricalColumns{k});
    % The table below takes its variable names from these workspace names.
    count = numel(colData);
    missing = sum(ismissing(colData));
    cardinality = numel(categories(colData));
    [modeVal, modeFreq] = mode(colData);
    modePercent = modeFreq / count * 100;
    % Second mode: the mode of what is left after removing the first mode
    remaining = colData(colData ~= modeVal);
    [secondModeVal, secondModeFreq] = mode(remaining);
    secondModePercent = secondModeFreq / count * 100;
    catRows{k} = table(count, missing, cardinality, modeVal, modeFreq, modePercent, ...
        secondModeVal, secondModeFreq, secondModePercent);
end
% Stack the one-row tables in a single step
dataQualityCat = vertcat(catRows{:});
dataQualityCat.Properties.RowNames = categoricalColumns;
disp(dataQualityCat);
% One figure with a histogram per continuous feature (3-by-3 grid)
nPlots = length(numericalColumns);
figure;
for k = 1:nPlots
    featureName = numericalColumns{k};
    subplot(3, 3, k);
    histogram(diabetes.(featureName));
    title(['Distribution of ' featureName]);
end
% A second figure with a boxplot per continuous feature (3-by-3 grid)
figure;
for k = 1:nPlots
    featureName = numericalColumns{k};
    subplot(3, 3, k);
    boxplot(diabetes.(featureName));
    title(['Boxplot of ' featureName]);
end
% Bar chart of the target distribution: values of Outcome are counted in
% the bins [0, 0.5) and [0.5, 1.5] and plotted at x = 0 and x = 1.
classEdges = [0 0.5 1.5];
classPositions = [0, 1];
figure;
outcomeCounts = histcounts(diabetes.Outcome, classEdges);
bar(classPositions, outcomeCounts);
xticks(classPositions);
xticklabels({'Class 0', 'Class 1'});
xlabel('Outcome');
ylabel('Count');
title('Class Distribution');
% Check for duplicate rows.
% unique returns the index of the first occurrence of every distinct row;
% any row that is NOT a first occurrence is a repeat of an earlier row.
% (Testing membership in the set of unique rows would flag every row,
% because each row is trivially a member of that set.)
[~, firstOccurrence] = unique(diabetes, 'stable');
isDuplicate = true(height(diabetes), 1);
isDuplicate(firstOccurrence) = false;
duplicateRows = diabetes(isDuplicate, :);
disp('Duplicate Rows:');
disp(duplicateRows);
% Check for zero values in the numerical columns.
% The comparison yields one logical per cell (rows x columns); it has to be
% reduced to one flag per row before it can be used as a row subscript.
% NOTE(review): the numerical columns are z-scored earlier in this script,
% so an exact 0 here is a standardized value, not a raw zero measurement -
% confirm whether this check should run before the scaling step.
hasZero = any(diabetes{:, numericalColumns} == 0, 2);
zeroValues = diabetes(hasZero, :);
disp('Rows with Zero Values in Numerical Columns:');
disp(zeroValues);
% Re-count the missing entries in the numerical columns after processing
processedValues = diabetes{:, numericalColumns};
missingValuesAfterProcessing = sum(ismissing(processedValues));
disp('Missing Values Recheck:');
recheckTable = table(numericalColumns', missingValuesAfterProcessing', ...
    'VariableNames', {'Variable', 'MissingValues'});
disp(recheckTable);
%%
% Split the dataset into training and testing sets (70% training, 30% testing)
% using a hold-out partition over the number of observations.
n = height(diabetes);
part = cvpartition(n, 'HoldOut', 0.3);
% Logical row masks for the two sets
idxTrain = training(part);
idxTest = test(part);
% Row subsets of the table
tblTrain = diabetes(idxTrain, :);
tblTest = diabetes(idxTest, :);
%%
% Report how many observations ended up in each set
trainCountText = num2str(sum(idxTrain));
testCountText = num2str(sum(idxTest));
disp(['Number of Observations in Training Set: ' trainCountText]);
disp(['Number of Observations in Test Set: ' testCountText]);
% Decision tree: fit on the training rows, predict the test rows
trainPredictors = tblTrain{:, numericalColumns};
testPredictors = tblTest{:, numericalColumns};
treeModel = fitctree(trainPredictors, tblTrain.Outcome);
yPred = predict(treeModel, testPredictors);
% Confusion matrix (rows: true class, columns: predicted class)
confMat = confusionmat(tblTest.Outcome, yPred);
% Precision, recall and F1 score, treating the second class as positive
truePositives = confMat(2,2);
predictedPositives = sum(confMat(:,2)); % True Positives + False Positives
actualPositives = sum(confMat(2,:));    % True Positives + False Negatives
precision = truePositives / predictedPositives;
recall = truePositives / actualPositives;
F1Score = 2 * (precision * recall) / (precision + recall);
% Show the results
disp('Confusion Matrix:');
disp(confMat);
disp(['Precision: ' num2str(precision)]);
disp(['Recall: ' num2str(recall)]);
disp(['F1 Score: ' num2str(F1Score)]);
%%
% Boxplot for Testing Accuracy for five best algorithms
% One label per column of experimentResults; shared by the plot and the tables.
algorithmLabels = {'Linear SVM', 'Binary GLM Logistic Regression', 'Efficient Logistic Regression', 'Efficient Linear SVM', 'Coarse Gaussian SVM'};
% table2array only accepts a table. The guard lets this section be re-run
% after experimentResults has already been converted to a numeric matrix.
% (No semicolon: the converted matrix is echoed, as before.)
if istable(experimentResults)
    experimentResults = table2array(experimentResults)
end
% Open a dedicated figure so the boxplot does not draw over the figure that
% happens to be current.
figure;
boxplot(experimentResults, 'Labels', algorithmLabels)
xlabel("Algorithm")
ylabel("Testing Accuracy")
% Calculate the IQR, range + median for each algorithm for spread analysis
iqrValues = iqr(experimentResults);
iqrTable = table(algorithmLabels(:), iqrValues(:), 'VariableNames', {'Algorithm', 'IQR'});
disp(iqrTable);
rangeValues = range(experimentResults);
rangeTable = table(algorithmLabels(:), rangeValues(:), 'VariableNames', {'Algorithm', 'Range'});
disp(rangeTable);
medians = median(experimentResults);
medianTable = table(algorithmLabels(:), medians(:), 'VariableNames', {'Algorithm', 'Median'});
disp(medianTable);
%%
% Boxplot for Validation Accuracy for five best algorithms
% One label per column of experimentResults2; shared by the plot and the tables.
algorithmLabels = {'Linear SVM', 'Binary GLM Logistic Regression', 'Efficient Logistic Regression', 'Efficient Linear SVM', 'Coarse Gaussian SVM'};
% table2array only accepts a table. The guard lets this section be re-run
% after experimentResults2 has already been converted to a numeric matrix.
% (No semicolon: the converted matrix is echoed, as before.)
if istable(experimentResults2)
    experimentResults2 = table2array(experimentResults2)
end
% Open a dedicated figure so the boxplot does not draw over the figure that
% happens to be current.
figure;
boxplot(experimentResults2, 'Labels', algorithmLabels)
xlabel("Algorithm")
ylabel("Validation Accuracy")
% Calculate the IQR, range + median for each algorithm for spread analysis
iqrValues = iqr(experimentResults2);
iqrTable = table(algorithmLabels(:), iqrValues(:), 'VariableNames', {'Algorithm', 'IQR'});
disp(iqrTable);
rangeValues = range(experimentResults2);
rangeTable = table(algorithmLabels(:), rangeValues(:), 'VariableNames', {'Algorithm', 'Range'});
disp(rangeTable);
medians = median(experimentResults2);
medianTable = table(algorithmLabels(:), medians(:), 'VariableNames', {'Algorithm', 'Median'});
disp(medianTable);
%%
% Boxplot for Prediction Speed for five best algorithms
% One label per column of experimentResults3; shared by the plot and the tables.
algorithmLabels = {'Linear SVM', 'Binary GLM Logistic Regression', 'Efficient Logistic Regression', 'Efficient Linear SVM', 'Coarse Gaussian SVM'};
% table2array only accepts a table. The guard lets this section be re-run
% after experimentResults3 has already been converted to a numeric matrix.
% (No semicolon: the converted matrix is echoed, as before.)
if istable(experimentResults3)
    experimentResults3 = table2array(experimentResults3)
end
% Open a dedicated figure so the boxplot does not draw over the figure that
% happens to be current.
figure;
boxplot(experimentResults3, 'Labels', algorithmLabels)
xlabel("Algorithm")
ylabel("Prediction Speed")
% Calculate the IQR, range + median for each algorithm for spread analysis
iqrValues = iqr(experimentResults3);
iqrTable = table(algorithmLabels(:), iqrValues(:), 'VariableNames', {'Algorithm', 'IQR'});
disp(iqrTable);
rangeValues = range(experimentResults3);
rangeTable = table(algorithmLabels(:), rangeValues(:), 'VariableNames', {'Algorithm', 'Range'});
disp(rangeTable);
medians = median(experimentResults3);
medianTable = table(algorithmLabels(:), medians(:), 'VariableNames', {'Algorithm', 'Median'});
disp(medianTable);
%%
% Boxplot for Training Time for five best algorithms
% One label per column of experimentResults4; shared by the plot and the tables.
algorithmLabels = {'Linear SVM', 'Binary GLM Logistic Regression', 'Efficient Logistic Regression', 'Efficient Linear SVM', 'Coarse Gaussian SVM'};
% table2array only accepts a table. The guard lets this section be re-run
% after experimentResults4 has already been converted to a numeric matrix.
% (No semicolon: the converted matrix is echoed, as before.)
if istable(experimentResults4)
    experimentResults4 = table2array(experimentResults4)
end
% Open a dedicated figure so the boxplot does not draw over the figure that
% happens to be current.
figure;
boxplot(experimentResults4, 'Labels', algorithmLabels)
xlabel("Algorithm")
ylabel("Training Time")
% Calculate the IQR, range + median for each algorithm for spread analysis
iqrValues = iqr(experimentResults4);
iqrTable = table(algorithmLabels(:), iqrValues(:), 'VariableNames', {'Algorithm', 'IQR'});
disp(iqrTable);
rangeValues = range(experimentResults4);
rangeTable = table(algorithmLabels(:), rangeValues(:), 'VariableNames', {'Algorithm', 'Range'});
disp(rangeTable);
medians = median(experimentResults4);
medianTable = table(algorithmLabels(:), medians(:), 'VariableNames', {'Algorithm', 'Median'});
disp(medianTable);
%%
% Bar graph of the mean testing accuracy of all algorithms (before the list
% was narrowed down to the top 5), with one rotated tick label per algorithm
algorithms = experimentResults5.Algorithm;
testingAccuracyMean = experimentResults5.TestingAccuracyMean;
figure;
bar(testingAccuracyMean);
tickPositions = 1:length(algorithms);
xticks(tickPositions);
xticklabels(algorithms);
xtickangle(45);
xlabel('Algorithm');
ylabel('Testing Accuracy Mean');
title('Testing Accuracy Mean for 24 Different Algorithms');