diff --git a/docs/assets/Scripts/Day1.R b/docs/assets/Scripts/Day1.R index 2386827..933e08f 100644 --- a/docs/assets/Scripts/Day1.R +++ b/docs/assets/Scripts/Day1.R @@ -1,12 +1,15 @@ #----------------- #----------------- # Introduction to statistics with R -# 2024 +# 2025 # In Lausanne #----------------- #----------------- - +weight <- c(65,72,55,91,95,72) #this is the weight of our classroom people +height <- c(1.73, 1.80, 1.62, 1.90, 1.78, 1.93) +bmi <- weight / height^2 +bmi # Type this in R to see the computed values #----------------- #----------------- @@ -60,23 +63,24 @@ plot(diameter ~ log(conc), data=hellung) # a fancy plot with ggplot :) library(ggplot2) -ggplot(hellung, aes(x=log(conc), y=diameter)) + geom_point() - +ggplot(hellung, aes(x=log(conc), y=diameter,col=glucose)) + geom_point() +hellung$glucose <- factor(hellung$glucose) #----------------- #----------------- # 2nd exercise #----------------- +data <- read.csv("~/Desktop/Introduction-to-statistics-with-R/docs/assets/exercises/data.csv", header=FALSE) -setwd("/Users/Rachel/Desktop/Introduction-to-statistics-with-R/docs/assets/exercises/") +setwd("/Users/rachelmarcone/Desktop/Introduction-to-statistics-with-R/docs/assets/exercises/") data <- read.csv("data.csv") data summary(data) sd(data[,1]); sd(data[,2]); sd(data[,3]) - +attach(data) datatoplot <- data[,1] #pdf("datanumber1.pdf") ## Plot 4 rows of graphs on one plot @@ -145,6 +149,23 @@ boxplot(datatoplot, horizontal=TRUE, ylim=range(datatoplot)) #dev.off() +x <- rnorm(10000,mean=0, sd=1) +hist(x) +t.test(x,mu=0)$p.value + + +s <- rep(0,100) # this is an empty vector with 10 entries +for(i in 1:100){ # this is a loop, called a "for" loop, it will repeat + # everything in parenthesis 10 times changing the variable + # i from 1 to 10 at each iteration + x <- rnorm(10,mean=10, sd=1) + s[i] <- t.test(x,mu=10)$p.value # does a t.test then takes the p.value obtained and + # puts it into the i-th entry of s + +} + +s_adj <- p.adjust(s) + ## Last exercise @@ -163,9 +184,10 @@ student$leftrighthanded <- as.factor(as.character(student$leftrighthanded)) summary(student) student[student[,"height"] ==1.77,"height"] <-177 +student[,"siblings"]<- as.factor(student[,"siblings"]) plot(student$height,student$weight,col=student$gender) boxplot(height~gender,data=student) boxplot(weight~gender,data=student) hist(student$weight) - +pairs(student) ## two by two plots of data diff --git a/docs/assets/Scripts/Day2.R b/docs/assets/Scripts/Day2.R index 3dbec8a..4247587 100644 --- a/docs/assets/Scripts/Day2.R +++ b/docs/assets/Scripts/Day2.R @@ -1,7 +1,7 @@ #----------------- #----------------- # Introduction to statistics with R -# January 2024 +# January 2025 # In Lausanne #----------------- #----------------- @@ -37,16 +37,18 @@ ggboxplot(intake$pre, width = 0.5, add = c("mean","jitter"), identify_outliers(as.data.frame(intake$pre)) # Assumption: normality -qqplot -qqline + +qqnorm(intake$pre) +qqline(intake$pre) + ggqqplot(intake,"pre") shapiro_test(intake$pre) # t test -t.test(pre, mu=7725) -t.test(pre, mu=7725, alternative="less") +t.test(pre, mu=7725) #one sample two sided t test +t.test(pre, mu=7725, alternative="less") #one sample one sided t test @@ -61,12 +63,15 @@ energy # assumption 1: data in each group are normally distributed. -ind.obese <- which(energy$stature == "obese") +ind.obese <- which(energy[,"stature"] == "obese") ind.lean <- which(energy$stature == "lean") shapiro_test(energy$expend[ind.obese]) +qqnorm(energy$expend[ind.obese]) +qqline(energy$expend[ind.obese]) shapiro_test(energy$expend[ind.lean]) - +qqnorm(energy$expend[ind.lean]) +qqline(energy$expend[ind.lean]) # assumption 2: the variances for the two independent groups are equal. levene_test(energy, expend~stature) @@ -127,6 +132,8 @@ names(WT) <- "weight" WT$genotype <- "WT" KO_WT <- rbind(KO,WT) +shapiro.test(WT$weight) +shapiro.test(KO$weight) boxplot(KO_WT$weight ~ KO_WT$genotype, main="Mice weight at 18 weeks", xlab="", ylab="") @@ -144,13 +151,13 @@ sim.p.t.test <- NULL sim.p.wilcox.test <- NULL for (i in 1:1000) { - KO <- runif(3, min=27, max=29) - WT <- runif(3, min=29, max=34) + KO <- runif(3, min=30, max=34) + WT <- runif(3, min=27, max=29) KO <- as.data.frame(KO) - names(KO)[1] <- "weight" + names(KO) <- "weight" KO$genotype <- "KO" WT <- as.data.frame(WT) - names(WT)[1] <- "weight" + names(WT) <- "weight" WT$genotype <- "WT" KO_WT <- rbind(KO,WT) @@ -167,7 +174,9 @@ str(sim.p.welch.test) str(sim.p.t.test) sum(sim.p.welch.test < 0.05) sum(sim.p.t.test < 0.05) +sum(sim.p.wilcox.test<0.05) plot(sim.p.t.test,sim.p.welch.test) + adj.bonf <- p.adjust(sim.p.welch.test, method="bonf") sum(adj.bonf < 0.05) adj.BH <- p.adjust(sim.p.welch.test, method="BH") @@ -192,7 +201,7 @@ coagulation %>% group_by(diet) %>% get_summary_stats(coag, type = "mean_sd") boxplot(coagulation$coag ~ coagulation$diet) -ggboxplot(coagulation, x="diet",y="coag") +ggboxplot(coagulation, x="diet",y="coag",add="jitter") # check normality ind.A <- which(coagulation$diet=="A") diff --git a/docs/assets/Scripts/Day3_afternoon_multiple_regression.R b/docs/assets/Scripts/Day3_afternoon_multiple_regression.R new file mode 100644 index 0000000..b79024b --- /dev/null +++ b/docs/assets/Scripts/Day3_afternoon_multiple_regression.R @@ -0,0 +1,144 @@ +##################################### +# Introduction to Statistics with R # +# Multiple Regression # +# Joao Lourenco # +# 29.01.2025 # +##################################### + +# clear the environment +rm(list = ls()) + +# clear the command line +cat("\014") + + +########################################### +# multiple regression with two variables # +########################################### + +# load the class dataset +class <- read.csv("/Users/rachelmarcone/Desktop/Introduction-to-statistics-with-R/docs/assets/exercises/class.csv") + +# fit model with two independent variables +model <- lm(Height ~ Age + Weight, data = class) +summary(model) + +# The order doesn't matter +summary(lm(Height ~ Weight + Age , data = class)) + +# compare to the models where each variable is considered separately +summary(lm(Height ~Age , data = class)) +summary(lm(Height ~ Weight, data = class)) + + +# example of a case in which the adjusted R2 decreseases when the number of variables increases + +y <- rnorm(10) # create a random vector +X <- matrix(rnorm(100),ncol = 10, nrow = 10) #create a random matrix + +# r-squared +plot(sapply( 1:10, function(i) summary(lm(y ~ X[,1:i]))$r.squared),type = "l") + +# adjusted r-squared +plot(sapply( 1:10, function(i) summary(lm(y ~ X[,1:i]))$adj.r.squared),type = "l") + +# last model +summary(lm(y ~ X)) + +######################################################### +# Categorical variables, dummy variables and contrasts # +######################################################### + +## categorical variable + +# convert Gender to a factor +class$Gender <- as.factor(class$Gender) +class$Gender +as.numeric(class$Gender) + +summary(lm(Height ~ Age + Gender, data = class )) + + +# difference in means between males and females +#?tapply +#?diff +means <- tapply(class$Height, class$Gender, mean) +diff(means) + +# where does this difference come from +summary(lm(Height ~ Gender, data = class )) +summary(lm(Height ~ Age + Gender, data = class )) + + +## interaction between Age and Gender +summary(lm(Height ~ Age + Gender + Age:Gender, data = class)) +#summary(lm(Height ~ Age*Gender, data = class)) + + +## What if males were the baseline + +# create a new categorical variable +class$Gender1 <- relevel(class$Gender, ref="M") + +# fit the model +summary(lm(Height ~ Age + Gender1, data = class)) + + + +########################### +# Diagnostic tools # +########################### + +attach(class) + + +## examination of residuals +model <- lm(Height ~ Age , data = class) +plot(class$Age, residuals(model)) # works only for simple regression +plot(fitted(model), residuals(model)) # works only for simple regression + +# hat values +?lm.influence +hat <- lm.influence(model) +plot(hat$hat, ylim = range(0,hat$hat, 3*2/19)) +abline(h=c(2,3)*2/19, lty=2, col=c("blue","red")) + + +# Predictions with confidence intervals +?predict.lm +preds <- predict(model, interval = "prediction") +attach(class) +# Plot the data +plot(Age, Height) + +# Add regression line +abline(model, col = "red", lwd = 2) + + + + +# Generate new predictor values for smooth plotting +new_x = seq(min(Age),max(Age),length.out = 100) + +# get the prediction interval +prediction_interval <- predict(model, + newdata = data.frame(Age = new_x), + interval = "prediction") + +# get the confidence intervals for a given level of confidence +confidence_interval <- predict(model, + newdata = data.frame(Age = new_x), + interval = "confidence") + +# Create the scatterplot +plot(Age, Height, ylim = range(prediction_interval[, "lwr"],prediction_interval[, "upr"])) + +# Add regression line +abline(model, col = "red", lwd = 2) + + +# Add bands +lines(new_x, confidence_interval[, "lwr"], lty = "dashed") +lines(new_x, confidence_interval[, "upr"], lty = "dashed") +lines(new_x, prediction_interval[, "lwr"], lty = "dotted") +lines(new_x, prediction_interval[, "upr"], lty = "dotted") diff --git a/docs/assets/Scripts/Day3_morning_correlation_simple_regression.R b/docs/assets/Scripts/Day3_morning_correlation_simple_regression.R new file mode 100644 index 0000000..3db8f5e --- /dev/null +++ b/docs/assets/Scripts/Day3_morning_correlation_simple_regression.R @@ -0,0 +1,97 @@ +##################################### +# Introduction to Statistics with R # +# Correlation and Simple Regression # +# Joao Lourenco # +# 29.01.2025 # +##################################### + +# clear the environment +rm(list = ls()) + +# clear the command line +cat("\014") + +################## +# Correlation # +################## + +# example +x <- c(70,60,0,90,20,100,120) +y <- c(6,5,1,8,2,10,10) + +# cor function +?cor +cor(x,y) + +# cor.test (Test for Association/Correlation Between Paired Samples) +?cor.test +cor.test(x, y) + +################## +# Regression # +################## + +# load the class dataset +class <- read.csv("/data_big/Teaching/Introduction_to_statistics_with_R/2025/datasets/class.csv" ) + +# descriptive statistics +dim(class) +head(class) +summary(class) + +# Create scatterplot to show the relationship between variables + +pairs(class[,-1]) # exclude the "Name" +#Error in pairs.default(class[, -1]) : non-numeric argument to 'pairs' +#the pairs() function in R expects all columns in the input data to be numeric. + +# Alternative 1: exclude the "Gender" +pairs(class[,c(-1,-2)]) + +# Alternative 2: convert "Gender to a factor" (which we should do, anyway) +class$Gender <- factor(class$Gender, levels = c("F","M")) +pairs(class[,-1]) + + +# fit the linear model with lm +?lm + +lm(formula = Height ~ Age, data = class) +# lm(formula = "Height ~ Age", data = class) +# lm(Height ~ Age, data = class) +# lm(Height ~ Age, class) + + +# if we want to keep the results in an object +model <- lm(formula = Height ~ Age, data = class) +model + +# plot the regression line in the scatterplot +plot(class$Age, class$Height) +abline(model, col="red", lwd = 2) + +# show the intercept +# attach(class) # columns can be accessed by simply giving their names +plot(class$Age, class$Height, xlim= range(0, Age), ylim = range(coef(model)[1], Height)) +abline(model, col="red", lwd = 2) + +# get more details from the results +summary(model) + + +# five-number summary of the residuals +fivenum(residuals(model)) + +# boxplot with the distribution of the residuals +boxplot(residuals(model), horizontal = TRUE) + +# RSE +sd(residuals(model)) +sqrt(sum(residuals(model)^2)/18) # here, we are using n-1 +sqrt(sum(residuals(model)^2)/17) # here, we are using n-2 + +# Multiple and adjusted r-squared +summary(model)$r.squared +cor(class$Age, class$Height)^2 + + diff --git a/docs/assets/exercises/easy_R_script.R b/docs/assets/exercises/easy_R_script.R index 97c533e..144f5a6 100644 --- a/docs/assets/exercises/easy_R_script.R +++ b/docs/assets/exercises/easy_R_script.R @@ -81,7 +81,7 @@ read.table() data <- matrix(c(1,2,3,5,6,7),nrow=2) # you can access the row 2 and the column 3 -data[2,3] +data[2,] # you can also have dataframes (mix of matrix with different types for the columns)