-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdayFive.R
64 lines (47 loc) · 2.66 KB
/
dayFive.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Using Elastic Net to help us select the correct input variables for multilpe regression
library(tidyverse)
library(glmnet) # implementation of Elastic Net for glms.
library(car) # AV plots
coders_2016 <- read.csv("Data/dayFive/2016-FCC-New-Coders-Survey-Data.csv")
# create a subset of 2016 data with only the variables of interest. Both inputs & output
dataSubset <- coders_2016 %>%
mutate(IsWoman = (Gender == 'female')) %>% # In the column Gender, IsWoman = 1 else, 0. Boolean, if TRUE = 1, else FALSE which is 0.
select(Age, CommuteTime, HasChildren, AttendedBootcamp, IsWoman, HasDebt, HoursLearning, MonthsProgramming, Income) %>%
na.omit() # remove non-numeric values
# glmnet receives inputs inform of a matrix, therefore, we'll have to convert all inputs to numeric
# convert the selected input variables to a matrix
input <- dataSubset %>%
select(-Income) %>% # Select all variables but the Income. Because Income is the output we're trying to predict.
as.matrix()
# output as a vector
output <- dataSubset$Income
length(output) # count of the data points
print("Mean of our predicted value")
mean(output)
# When the mean is very far from 0, gaussian & poisson distribution tend to converge.
# In this cases, for cross-validation we'll taking 10 diff sub-samples, train a model on each of the sub-sample then average all those models to get our final model
# cv = cross validation
cv_fit <- cv.glmnet(input, output, family = "gaussian")
# get coefficients & the intercept for the best model
coef(cv_fit, s = "lambda.min")
# The code above generates a sparse matrix
# All the variables but HasDebt are useful in predicting the Income. Coefficient of HasDebt was pushed to is 0. In the console, 0 is represented by a dot. (.)
# convert the coefficients to a non-sparse matrix
# fetch the variables that != 0
coef_matrix <- coefficients(cv_fit, s = "lambda.min") %>%
as.matrix()
input_variables <- row.names(coef_matrix)[coef_matrix != 0] %>% # Get the row names of the matrix with non-zero's
setdiff("(Intercept)") # Remove the Intercept, if present
new_variables <- paste(input_variables, collapse = "+") # 'Joining all the variables with a "+"
formular <- paste("Income ~ ", new_variables, sep = " ")
income2016_model <- glm(formular, data = dataSubset, family = ("gaussian"))
# diagonstic plots
graphics.off()
par(mfrow = c(2,2))
par("mar")
par(mar=c(1,1,1,1))
plot(income2016_model)
summary(income2016_model)
# from the summary, most important value is HasChildren because of the estimate is quite big and also has a small std. error. Also MonthsProgramming
# the higher the estimate, the higher the input value has in influencing the prediction
avPlots(income2016_model)