-
Notifications
You must be signed in to change notification settings - Fork 0
/
R_intro_LMatyjek.R
363 lines (276 loc) · 12 KB
/
R_intro_LMatyjek.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
################################################################################
# #
# Introduction to R #
# Lena Matyjek, 14/12/2020 #
# #
################################################################################
# BASICS:
# script area, console/terminal/etc, environment/history/etc, Files/Plots/etc.
# R as calculator (numbers and operators: +,-,/,*,>,<,==,!=, &, |)
# objects: 1 is meaningful, but a is not; vector, matrix, table, function, model, etc.
# objects and values (= or <-)
# removing objects, e.g. rm(a)
# "" and ''
# packages
# arrow up, tab, ctrl/command+enter
################################################################################
# Settings
rm(list = ls()) # This cleans the environment (removes all objects from it)
############################### WORKING DIRECTORY ##############################
# Set up your working directory
# setwd("xxx") # replace xxx with your directory
# Set the working directory to the folder in which this script is.
# The script location is asked from RStudio. Outside RStudio, or when the
# script has not been saved yet, there is no path to use, so the current
# directory is kept and a message is shown instead of stopping with an error.
script_path <- ""
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  script_path <- rstudioapi::getActiveDocumentContext()$path
}
if (nzchar(script_path)) {
  setwd(dirname(script_path))
} else {
  message("Could not detect the script location; set it manually with setwd().")
}
rm(script_path) # the helper object is not needed any more
# Check your working directory
getwd()
################################# LOADING FILE #################################
# Load from CSV. Without an assignment the data are only printed in the console.
# NOTE: read.csv2() uses dec = "," by default; only the separator is changed
# here. The measurements are therefore presumably not read in as numbers, which
# is what the CLASSES section below fixes - check with sapply(dane, class).
read.csv2("./iris.csv", header = TRUE, sep = ",")
# Load from CSV and ascribe to object "dane".
# Always write TRUE/FALSE in full: T and F are ordinary objects that can be
# overwritten, TRUE and FALSE cannot.
dane <- read.csv2("./iris.csv", header = TRUE, sep = ",")
# # You can also take these data from existing R datasets:
# library(datasets) # this loads a package. It's in the R base, no need to install it.
# data(iris)
################################# VIEWING FILE #################################
# Have a first look at the data:
colnames(dane)    # names of the columns
rownames(dane)    # names of the rows; usually created automatically when a CSV
                  # is read in, so they can be ignored for now
head(dane, n = 5) # the first 5 rows; replace 5 with any other number
tail(dane, n = 5) # the last 5 rows
# View(dane)      # opens the table in a new tab; the same happens when you
                  # click on the object in the global environment
############################ MANIPULATING DATA FRAME ###########################
# The $ operator refers to a column.
# Data frames are organised in rows and columns, like this: df[r, c]
# All rows and columns are indexed, so we can refer to them by numbers.

# Column no 1:
dane[, 1]
# Row no 1:
dane[1, ]
# The last column of the data frame
# (note: the length of a data frame is its number of columns!)
dane[, length(dane)]
# The last row of the data frame
# (note: the length of a single column is the number of rows!)
dane[length(dane$X), ]
# One column, picked by its name
dane$Sepal.Length

# All data for one species
unique(dane$Species)                        # unique values of Species (no matter how often each occurs)
dane[dane$Species == "setosa", ]            # every column, setosa rows only
dane$Sepal.Length[dane$Species == "setosa"] # sepal length, setosa rows only

# Remove a column (assign NULL to it) and add a new one (assign a value to it)
colnames(dane)   # before
dane$X <- NULL   # column X is dropped
dane$trojki <- 3 # new column "trojki"; the single value 3 is repeated in every row
colnames(dane)   # after
#################################### CLASSES ##################################
# The most commonly used classes in data frames are: numeric/integer, character, factor
# But of course there are more classes of objects. Way more.

# Check the class of an object
class(1)
a <- 1
class(a)
a <- "ach, poniedziałki"
class(a)
class(dane)
class(dane$Sepal.Length)

# Check the classes of all columns of the data frame at once
sapply(dane, class)

# Change the classes so that the data make sense!
head(dane) # look at the beginning of the data; sepal length should probably be a number
dane$Sepal.Length <- as.numeric(dane$Sepal.Length)
class(dane$Sepal.Length) # check if it is numeric now
# TASK: change also: sepal width, petal length, petal width
# (...)
# EXTRA: a faster way is to convert the first four columns in one go:
dane[, 1:4] <- lapply(dane[, 1:4], as.numeric)

# Look at Species as character, numeric, and factor
as.character(dane$Species)
as.numeric(dane$Species) # notice the warning!
as.factor(dane$Species)  # notice the "Levels"!
as.factor(as.character(dane$Sepal.Length))

# Convert Species to a factor and view it
dane$Species <- as.factor(dane$Species)
dane$Species # notice the levels and their order
########################## DESCRIPTIVE STATS ##################################
# Mean of the sepal length
mean(dane$Sepal.Length)
# Round the number to 2 decimal places
round(mean(dane$Sepal.Length), 2)

# See what happens when there are missing data: save one value to an object,
# replace it with NA in the data frame, and find the mean.
missing_value <- dane$Sepal.Length[1]
missing_value
dane$Sepal.Length[1] <- NA
dane$Sepal.Length[1]
mean(dane$Sepal.Length)               # the mean is NA! That's because NA isn't a number that can be added or divided.
mean(dane$Sepal.Length, na.rm = TRUE) # that's a solution: we ask R to ignore the NAs in the column
dane$Sepal.Length[1] <- missing_value # assign the value back
rm(missing_value)                     # remove the helper object

# Median
median(dane$Sepal.Length)
# Min / max
min(dane$Sepal.Length)
max(dane$Sepal.Length)

# Standard deviation & error
sd(dane$Sepal.Length)
# The standard error is just the standard deviation divided by the square root
# of the sample size.
sd(dane$Sepal.Length) / sqrt(length(dane$Sepal.Length))

# EXTRA
# We can create a custom function to get se! For that we need to create an
# object (se), which is a function, and which does what we already did:
# se <- function(x) sd(x) / sqrt(length(x))

# We can see different descriptive stats with one function, but we need a
# package which provides a function for it:
# install.packages("psych")
library(psych) # load the package
describe(dane$Sepal.Length)
# Changing the names of factor levels
# The column is turned into plain text first, the values are replaced one
# species at a time with logical indexing, and then it is made a factor again.
unique(dane$Species)
dane$Species <- as.character(dane$Species)
dane$Species[dane$Species == "setosa"]     <- "1"
dane$Species[dane$Species == "versicolor"] <- "2"
dane$Species[dane$Species == "virginica"]  <- "3"
unique(dane$Species) # the values are now "1", "2", "3"
class(dane$Species)  # still character at this point
dane$Species <- as.factor(dane$Species)

# Now the other way round: back to the full names, in the same three steps
dane$Species <- as.character(dane$Species)
dane$Species[dane$Species == "1"] <- "setosa"
dane$Species[dane$Species == "2"] <- "versicolor"
dane$Species[dane$Species == "3"] <- "virginica"
unique(dane$Species)
class(dane$Species)
dane$Species <- as.factor(dane$Species)
# Descriptive stats per group with dplyr: split the data by Species and
# calculate the mean sepal length and sepal width within each species.
library(dplyr)
dane %>%
  group_by(Species) %>%
  # New columns are named with "=" inside summarise(). Using "<-" here would
  # not name the columns sl_mean / sw_mean.
  summarise(
    sl_mean = mean(Sepal.Length),
    sw_mean = mean(Sepal.Width)
  )
# TASK
# Find means, sd, and se for all 4 variables across the species!
############################## DATA VISUALISATION #############################
# Base R plots
# Histogram
hist(dane$Sepal.Length)
# Scatter plot of one variable (values against their position in the column)
plot(dane$Sepal.Length)
# Scatter plot of two variables
plot(dane$Sepal.Length, dane$Sepal.Width)
# Boxplot
boxplot(dane$Sepal.Length)

# The real deal for plotting is ggplot. Let's (install and) load the package
library(ggplot2)

# Only the data and the axes: R doesn't know yet what to draw
ggplot(dane, aes(Sepal.Length, Sepal.Width))

# Add a layer to say what should be drawn:
ggplot(dane, aes(Sepal.Length, Sepal.Width)) +
  geom_point() # the raw data points

ggplot(dane, aes(Sepal.Length, Sepal.Width)) +
  geom_line() # the points connected with a line; not informative here

ggplot(dane, aes(Sepal.Length, Sepal.Width)) +
  geom_smooth() # R chooses a smoothing function to show the relationship between x and y

# Let's customise a bit: one colour per species
ggplot(dane, aes(Sepal.Length, Sepal.Width, colour = Species)) +
  geom_smooth()

# ... a different theme
ggplot(dane, aes(Sepal.Length, Sepal.Width, colour = Species)) +
  geom_smooth() +
  theme_classic()

# ... colours taken from a ready-made palette
ggplot(dane, aes(Sepal.Length, Sepal.Width, colour = Species)) +
  geom_smooth() +
  theme_minimal() +
  scale_colour_brewer(palette = 1)

# ... colours chosen by hand (names, or the same number of HEX codes)
ggplot(dane, aes(Sepal.Length, Sepal.Width, colour = Species)) +
  geom_smooth() +
  theme_minimal() +
  scale_colour_manual(values = c("black", "red", "blue"))

# ... and readable axis titles
ggplot(dane, aes(Sepal.Length, Sepal.Width, colour = Species)) +
  geom_smooth() +
  theme_minimal() +
  scale_colour_manual(values = c("black", "red", "blue")) +
  labs(x = "Length of the sepal", y = "Width of the sepal")
################################## LOOPS ######################################
# Let's say we want to add to the data frame info about the colour of the flower.
# Let's say that all flowers with a sepal length of 6 or less are red and all
# others are blue.
# Let's add this information using a for loop with an if statement.
# Then, let's add that all setosas are from lab1, all versicolor and virginica from lab2.

# For loop
# seq_len(nrow(dane)) goes through the row numbers 1, 2, ..., and - unlike
# 1:length(x) - gives nothing to loop over when there are no rows.
for (i in seq_len(nrow(dane))) {
  if (dane$Sepal.Length[i] <= 6) {
    dane$colour[i] <- "red" # see the comment below
  } else {
    dane$colour[i] <- "blue"
  }
}
# additional comment: note that R can ascribe a value to a column that doesn't yet exist!
# This may be sometimes problematic, so you may want to first create an empty column.
# As an example, we will do it below before the while loop.

# While loop
dane <- plyr::ddply(dane, c("Species")) # Let's say we have a data frame sorted by the column "Species"
dane["lab"] <- NA_character_ # create the column first: text type, filled with NA
i <- 1 # set a counter
while (i <= nrow(dane)) {
  # if() looks at a single value, so the conditions are joined with ||
  # (| is for comparing whole vectors element by element)
  if (dane$Species[i] == "setosa") {
    dane$lab[i] <- "lab1"
  } else if (dane$Species[i] == "versicolor" || dane$Species[i] == "virginica") {
    dane$lab[i] <- "lab2"
  }
  i <- i + 1 # don't forget to increase the counter!
}
# NOTE
# Often, loops can be replaced by indexing. Indexing is faster and more elegant in the code.
# Let's see how they work for both of the previous loops.

# 1) The for loop, timed
t1 <- Sys.time() # get the time of the system
for (i in seq_len(nrow(dane))) {
  if (dane$Sepal.Length[i] <= 6) {
    dane$colour[i] <- "red"
  } else {
    dane$colour[i] <- "blue"
  }
}
t2 <- Sys.time() # get the time of the system again
text <- "This loop took:" # create a message to be printed in the console
text2 <- "secs."
# The difference is asked for in seconds explicitly, so that the number always
# matches the "secs." label.
info_whole <- paste(text, round(as.numeric(difftime(t2, t1, units = "secs")), 3), text2)
print(info_whole) # print it

# 2) The same result with indexing, timed
t1 <- Sys.time()
dane$colour[dane$Sepal.Length <= 6] <- "red"
dane$colour[dane$Sepal.Length > 6] <- "blue"
t2 <- Sys.time()
text <- "Indexing took:" # create a message to be printed in the console
text2 <- "secs."
info_whole <- paste(text, round(as.numeric(difftime(t2, t1, units = "secs")), 3), text2)
print(info_whole) # print it
# So indeed, it's faster with indexing and it takes less code to achieve the same.
# Take-home message: use indexing if you can! Leave loops for when it's necessary.
# Let's check the while loop too:
t1 <- Sys.time()
i <- 1 # set a counter
while (i <= nrow(dane)) {
  # if() looks at a single value, so the conditions are joined with ||
  if (dane$Species[i] == "setosa") {
    dane$lab[i] <- "lab1"
  } else if (dane$Species[i] == "versicolor" || dane$Species[i] == "virginica") {
    dane$lab[i] <- "lab2"
  }
  i <- i + 1 # don't forget to increase the counter!
}
t2 <- Sys.time()
# The difference is asked for in seconds explicitly, so that the number always
# matches the "secs." label.
print(paste("This loop took:", round(as.numeric(difftime(t2, t1, units = "secs")), 3), "secs."))

# The same result with indexing: setosa rows get lab1, all other rows get lab2
t1 <- Sys.time()
dane$lab[dane$Species == "setosa"] <- "lab1"
dane$lab[dane$Species != "setosa"] <- "lab2"
t2 <- Sys.time()
print(paste("Indexing took:", round(as.numeric(difftime(t2, t1, units = "secs")), 3), "secs."))
# TASK
# add a dummy variable (0,1) to the data frame so that only cases of flowers
# with sepals longer than the mean of the whole sample AND petals wider than
# the median of the sample get value 1. You can use indexes or a loop.
################################# SIMPLE STATS ################################
# EXTRA
# Always check the classes of your columns before running stats!

# t test - difference between petal length in setosa and in virginica
t.test(
  dane$Petal.Length[dane$Species == "setosa"],
  dane$Petal.Length[dane$Species == "virginica"]
)

# anova - petal length explained by species
aov(Petal.Length ~ Species, data = dane)               # printed straight away
my_anova <- aov(Petal.Length ~ Species, data = dane)   # saved as an object
summary(my_anova)
options(scipen = 999) # remove scientific notation
summary(my_anova)     # the same table, now without scientific notation

# linear regression
my_model <- lm(formula = Petal.Length ~ Species, data = dane)
summary(my_model)

# correlation between the four measurements (columns 1 to 4)
library(Hmisc)
rcorr(as.matrix(dane[, 1:4]), type = "pearson")