-
Notifications
You must be signed in to change notification settings - Fork 0
/
Regression Analysis.R
174 lines (138 loc) · 5.8 KB
/
Regression Analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#Reading Data
data <- read.csv("C:/ST3131 Regression Analysis/Project/Life Expectancy/Life Expectancy Data.csv", header=TRUE)
library(dplyr)
library(tidyverse)
library(leaps)
#Cleaning Data (point 2.2 in report)
#Drop NA values --> 183 to 130 Observations left.
data <- data %>% drop_na()
attach(data)
#Convert categorial variables to numeric, 1 is developed countries, 0 is developed countries
data$Status = as.numeric(Status)
data$Status[Status == 2] = 0
#To exclude country and year from regression model
#Note the change of data name to data1, creating another copy to work on.
data1 = subset(data, select = -c(Country, Year))
#Basic Linear Model (2.4.1)
model1 <- lm(data1$Life.expectancy~.,data = data1)
summary(model1)
#Histogram of Life Expectancy
#hist(data1$Life.expectancy, breaks=seq(50,85,5),freq=FALSE, main="Histogram of Life Expectancy")
#curve(dnorm(x, mean=mean(data1$Life.expectancy), sd=sqrt(var(data1$Life.expectancy))), col="darkblue", lwd=2, add=TRUE, yaxt="n")
#Using Stepwise to Obtain Best Fit (2.4.2)
library(MASS)
null_model <- lm(data1$Life.expectancy~1,data=data1)
full_model <- lm(data1$Life.expectancy~.,data=data1) #. means all variables not already mentioned in the formula
step <- stepAIC(null_model,direction="both", scope=list(upper=full_model,lower=null_model), k=2, test="F")
step$anova
summary(step)
#After getting best fit.
y <- data1$Life.expectancy
x1 <- data1$Status
x2 <- data1$Income.composition.of.resources
x3 <- data1$Hepatitis.B
x4 <- data1$Adult.Mortality
x5 <- data1$HIV.AIDS
data2 = subset(data1,select=c(Life.expectancy, HIV.AIDS, Hepatitis.B,Income.composition.of.resources, Adult.Mortality))
#Residual Plot vs Fitted Values (3.1)
modelfull <- lm(y~x2+x3+x4+x5)
res <- modelfull$res
fv <- modelfull$fit
par(mfrow=c(1,1))
plot(fv,res,xlab="Fitted Values",ylab="Residuals",main="Plot of residuals against fitted values")
abline(h=0,lty=2)
#Residual Plot vs Independent Variables (3.1)
plot(x2,res,xlab="Income composition of resources",ylab="Residuals",main="Plot of residuals against Income composition of resources")
abline(h=0,lty=2)
plot(x3,res,xlab="Hepatitis B",ylab="Residuals",main="Plot of residuals against Hepatitis B")
abline(h=0,lty=2)
plot(x4,res,xlab="Adult Mortality",ylab="Residuals",main="Plot of residuals against Adult Mortality")
abline(h=0,lty=2)
plot(x5,res,xlab="HIV/AIDS",ylab="Residuals",main="Plot of residuals against HIV/AIDS")
#Testing Normality using Q-Q Plot (3.2)
par(mfrow=c(2,2))
plot(model1)
#Testing Normality using Kolmogorov-Smirnov Test (3.2)
res <- model1$residuals
ks.test(res, "pnorm", mean(res), sd(res))
#Testing for Collinearity (3.3)
library(lattice)
splom(~data2[,1:5],groups = NULL,data = data2,axis.line.tck=0,axis.text.alpha=0)
cor(data2[1:5],use="complete.obs", method="kendall")
#Interaction Effect and Second Order Variable
library(ggplot2)
data2$HIV[data2$HIV.AIDS<0.2] = "low"
data2$HIV[data2$HIV.AIDS>=0.2] = "high"
data2$HIV
data2$HepB[data2$Hepatitis.B < 92] = "low"
data2$HepB[data2$Hepatitis.B >= 92] = "high"
data2$HepB
ggplot(data=data2, mapping = aes(x = Adult.Mortality,y = Life.expectancy, col = HepB))+
geom_point()+
geom_smooth(method = 'lm', fullrange="true")
ggplot(data=data2, mapping = aes(x = Adult.Mortality,y = Life.expectancy, col = HIV))+
geom_point()+
geom_smooth(method = 'lm',fullrange = "true")
#Improving Model By Adding Interaction Terms
#All possible interactions
x23<-x2*x3
x24<-x2*x4
x25<-x2*x5
x34<-x3*x4
x35<-x3*x5
x45<-x4*x5
m<-data.frame(y,x2,x3,x4,x5,x23,x24,x25,x34,x35,x45)
nullmodel = lm(y~1, data = m)
fullmodel = lm(y~., data = m)
step(nullmodel,data = m, scope = list(upper = fullmodel, lower = nullmodel), direction = "both",k = 2, test = "F")
#Best fit model: y ~ x2 + x3 + x25 + x34 + x35
best_model = lm(y~x2+x3+x25+x34+x35, data = m)
summary(best_model)
anova(best_model)
#Test for Coincidence
x12<-x1*x2
x13<-x1*x3
x125<-x1*x25
x134<-x1*x34
x135<-x1*x35
model3 <- lm(y~x2+x3+x25+x34+x35)
model4 <- lm(y~x2+x3+x25+x34+x35+x12+x13+x125+x134+x135)
anova(model3,model4)
#Residual Plot vs Fitted Values
res2 <- model3$res
fv2 <- model3$fit
par(mfrow=c(1,1))
plot(fv2,res,xlab="Fitted Values",ylab="Residuals",main="Plot of residuals against fitted values")
abline(h=0,lty=2)
#Residual Plot vs Independent Variables for model (2) (4.4)
#x1 <- Status
#x2 <- Income.composition.of.resources
#x3 <- Hepatitis.B
#x4 <- Adult.Mortality
#x5 <- HIV.AIDS
plot(x25,res2,xlab="Income composition of resources",ylab="Residuals",main="Plot of residuals against HIV/AIDS and Income Composiiton of Resources")
abline(h=0,lty=2)
plot(x2,res2,xlab="Income composition of resources",ylab="Residuals",main="Plot of residuals against Income Composiiton of Resources")
abline(h=0,lty=2)
plot(x34,res2,xlab="Income composition of resources",ylab="Residuals",main="Plot of residuals against Hepatitis B and Adult Mortality")
abline(h=0,lty=2)
plot(x3,res2,xlab="Income composition of resources",ylab="Residuals",main="Plot of residuals against Hepatitis B")
abline(h=0,lty=2)
plot(x35,res2,xlab="Income composition of resources",ylab="Residuals",main="Plot of residuals against Hepatitis B and HIV/AIDS")
abline(h=0,lty=2)
#Testing Normality using Q-Q Plot (4.5)
par(mfrow=c(2,2))
plot(model3)
#Testing Normality using Kolmogorov-Smirnov Test (4.5)
res3 <- model3$residuals
ks.test(res, "pnorm", mean(res3), sd(res3))
#Influential Points
library(car)
influence.measures(model3)
rstudent(model3)
#Plot showing Influential Points (7.12)
model3 <- lm(y~x2+x3+x25+x34+x35)
par(cex.axis = 0.5, cex.lab = 0.5, mar = c(5,5,2,2),mfrow = c(1,2))
plot(abs(dffits(model3)), ylim=c(0,1),type = "n", ylab = "DFFITS")
text(abs(dffits(model3)), labels = seq(1,n),cex=0.5)
abline(h=0.45, col = "red")