Skip to content

Commit b7b0e00

Browse files
committed
add 2018 and 2019 data; change preprocessing for these car data
1 parent 0a4e4e5 commit b7b0e00

File tree

2 files changed

+66
-31
lines changed

2 files changed

+66
-31
lines changed

src/01_download_data.R

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@ dir.create("data/raw/cars_unzipped", showWarnings = F)
1313

1414
# DOWNLOAD BICYCLE DATA
1515
url = "https://github.com/codeformuenster/open-data/raw/master/verkehrsdaten/fahrrad/Fahrradzaehlstellen-Stundenwerte.csv"
16-
download.file(url = url,
17-
destfile = "data/raw/Fahrradzaehlstellen-Stundenwerte.csv")
16+
download.file(url = url, destfile = "data/raw/Fahrradzaehlstellen-Stundenwerte.csv")
1817

1918
# DOWNLOAD CAR DATA
2019
# 2015
@@ -29,13 +28,18 @@ download.file(url = url, destfile = "data/raw/kfzzaehlstellen2017.zip")
2928
# 2018
3029
url = "https://github.com/codeformuenster/open-data/raw/master/verkehrsdaten/kfz/kfzzaehlstellen2018.zip"
3130
download.file(url = url, destfile = "data/raw/kfzzaehlstellen2018.zip")
31+
# 2019 until August
32+
url = "https://github.com/codeformuenster/open-data/raw/master/verkehrsdaten/kfz/kfzzaehlstellen2019BisAugust.zip"
33+
download.file(url = url, destfile = "data/raw/kfzzaehlstellen2019BisAugust.zip")
3234

3335
# UNZIP CAR DATA
34-
unzip(zipfile = "data/raw/kfzzaehlstellen2015.zip",
36+
unzip(zipfile = "data/raw/kfzzaehlstellen2015.zip",
3537
exdir = "data/raw/cars_unzipped/")
36-
unzip(zipfile = "data/raw/kfzzaehlstellen2016.zip",
38+
unzip(zipfile = "data/raw/kfzzaehlstellen2016.zip",
3739
exdir = "data/raw/cars_unzipped/")
38-
unzip(zipfile = "data/raw/kfzzaehlstellen2017.zip",
40+
unzip(zipfile = "data/raw/kfzzaehlstellen2017.zip",
3941
exdir = "data/raw/cars_unzipped/")
40-
unzip(zipfile = "data/raw/kfzzaehlstellen2018.zip",
42+
unzip(zipfile = "data/raw/kfzzaehlstellen2018.zip",
43+
exdir = "data/raw/cars_unzipped/")
44+
unzip(zipfile = "data/raw/kfzzaehlstellen2019BisAugust.zip",
4145
exdir = "data/raw/cars_unzipped/")

src/02_cars_to_db.R

Lines changed: 56 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -9,29 +9,59 @@
99
sapply(c("dplyr", "assertthat", "lubridate", "chron", "tidyr", "DBI", "RSQLite"),
1010
require, character.only = TRUE)
1111

12-
process_df <- function(df) {
13-
# shift header left and remove last column
12+
process_df <- function(df, filename) {
13+
14+
# shift header left and remove last column, if necessary
1415
if (!is.na(colnames(df)[26])) {
15-
colnames(df) <-
16-
c(colnames(df), "NA") %>%
17-
tail(-1)
18-
assert_that(df %>% dplyr::select(26) %>% is.na %>% all)
16+
# the second check is nested instead of being joined with `&`, because the
17+
# elementwise `&` does not short-circuit (without a 26th column, df[26] fails)
18+
if (all(is.na(df[26]))) {
19+
colnames(df) <-
20+
c(colnames(df), "NA") %>%
21+
tail(-1)
22+
assert_that(df %>% dplyr::select(26) %>% is.na %>% all)
23+
df <-
24+
df %>%
25+
dplyr::select(-26)
26+
}
27+
}
28+
29+
# remove whole column even if classification seems to work for two junctions
30+
# see https://github.com/codeformuenster/traffic-dynamics/issues/13
31+
if ("X" %in% colnames(df)) {
1932
df <-
2033
df %>%
21-
dplyr::select(-26)
34+
dplyr::select(-X)
35+
}
36+
# from Oct 2018 onwards
37+
if ("Klasse" %in% colnames(df)) {
38+
df <-
39+
df %>%
40+
dplyr::select(-Klasse)
41+
}
42+
# from mid-December onwards: wrong naming of columns (see raw files)
43+
# crude measure: remove the supposed classification column if there are too many "500"s
44+
if (sum(df[,2] == 500, na.rm = T) > 510) {
45+
df <-
46+
df %>%
47+
dplyr::select(-2)
48+
}
49+
50+
# the not-NA check is needed for empty files
51+
if (!is.na(df[1, 1])) {
52+
# remove metadata about not happening classification (from August 2018 onwards)
53+
if (startsWith(as.character(df[1, 1]), "Datum")) {
54+
df <- df[-seq(1,12), ]
55+
}
2256
}
2357

2458
# DATE
25-
# identify date from first column label
26-
date <-
27-
df %>%
28-
colnames %>%
29-
.[1] %>%
30-
ymd(.)
31-
# add date to new column
59+
# identify date from filename
60+
date_from_filename <- ymd(substr(filename, 1, 10))
61+
# add date from filename to new column
3262
df <-
3363
df %>%
34-
mutate(date = as.character(date))
64+
mutate(date = as.character(date_from_filename))
3565
# rename first header to 'location'
3666
colnames(df)[1] <- "location"
3767

@@ -43,7 +73,7 @@ process_df <- function(df) {
4373
"04050", # Wolbecker Straße / Servatiiplatz
4474
"03052", # Hüfferstraße
4575
"07030", # Hammer Straße
46-
"04051", # Eisenbahnstraße (there are no traffic lights on the Promenade, this one is one the parallel street
76+
"04051", # Eisenbahnstraße (there are no traffic lights on the Promenade at this place; this one is on the parallel street)
4777
"04073", # Gartenstraße
4878
"04061", # Warendorfer Straße
4979
"04010", # Hafenstraße
@@ -65,10 +95,10 @@ process_df <- function(df) {
6595
mutate(day = as.integer(day(date))) %>%
6696
# subtract 1 because sqlite counts Sun = 0 but lubridate Sun = 1
6797
mutate(weekday = wday(date, label = F) - 1) %>%
68-
mutate(weekend = is.weekend(date)) %>%
98+
mutate(weekend = is.weekend(date)) %>%
6999
# 'hour' to integer format
70-
mutate(hour = substring(hour, 2)) %>%
71-
mutate(hour = as.integer(hour)) %>%
100+
mutate(hour = substring(hour, 2)) %>%
101+
mutate(hour = as.integer(hour)) %>%
72102
mutate(vehicle = "car")
73103

74104
return(df)
@@ -84,15 +114,16 @@ con <- dbConnect(SQLite(), dbname = "data/database/traffic_data.sqlite")
84114
if (dbExistsTable(con, "cars")) { dbRemoveTable(con, "cars") }
85115

86116
# EACH source file: read, preprocess, add to 'df_target'
87-
for (raw_file in raw_files) {
88-
print(paste("processing ", raw_file))
117+
for (raw_file_name in raw_files) {
118+
print(paste("processing ", raw_file_name))
89119
df_source <-
90-
read.csv(paste(data_folder, raw_file, sep = ""),
91-
sep = ";", row.names = NULL) %>%
92-
process_df()
120+
read.csv(paste(data_folder, raw_file_name, sep = ""),
121+
sep = ";", row.names = NULL)
122+
123+
df_processed <- process_df(df_source, strsplit(raw_file_name, "/")[[1]][2])
93124

94125
# write 'df_processed' to SQLite database
95-
dbWriteTable(con, "cars", df_source,
126+
dbWriteTable(con, "cars", df_processed,
96127
append = T, row.names = F, overwrite = F)
97128
}
98129

0 commit comments

Comments
 (0)