9
9
sapply(c(" dplyr" , " assertthat" , " lubridate" , " chron" , " tidyr" , " DBI" , " RSQLite" ),
10
10
require , character.only = TRUE )
11
11
12
- process_df <- function (df ) {
13
- # shift header left and remove last column
12
+ process_df <- function (df , filename ) {
13
+
14
+ # shift header left and remove last column, if necessary
14
15
if (! is.na(colnames(df )[26 ])) {
15
- colnames(df ) <-
16
- c(colnames(df ), " NA" ) %> %
17
- tail(- 1 )
18
- assert_that(df %> % dplyr :: select(26 ) %> % is.na %> % all )
16
+ # the second check is nested instead of being combined with the first one:
17
+ # with a non-short-circuit `&`, df[26] would be evaluated (and fail) if there is no 26th column
18
+ if (all(is.na(df [26 ]))) {
19
+ colnames(df ) <-
20
+ c(colnames(df ), " NA" ) %> %
21
+ tail(- 1 )
22
+ assert_that(df %> % dplyr :: select(26 ) %> % is.na %> % all )
23
+ df <-
24
+ df %> %
25
+ dplyr :: select(- 26 )
26
+ }
27
+ }
28
+
29
+ # remove whole column even if classification seems to work for two junctions
30
+ # see https://github.com/codeformuenster/traffic-dynamics/issues/13
31
+ if (" X" %in% colnames(df )) {
19
32
df <-
20
33
df %> %
21
- dplyr :: select(- 26 )
34
+ dplyr :: select(- X )
35
+ }
36
+ # from Oct 2018 onwards
37
+ if (" Klasse" %in% colnames(df )) {
38
+ df <-
39
+ df %> %
40
+ dplyr :: select(- Klasse )
41
+ }
42
+ # from mid-December onwards: wrong naming of columns (see raw files)
43
+ # crude measure: remove the supposed classification column if there are too many "500"s
44
+ if (sum(df [,2 ] == 500 , na.rm = T ) > 510 ) {
45
+ df <-
46
+ df %> %
47
+ dplyr :: select(- 2 )
48
+ }
49
+
50
+ # the not-NA check is needed for empty files
51
+ if (! is.na(df [1 , 1 ])) {
52
+ # remove metadata about not happening classification (from August 2018 onwards)
53
+ if (startsWith(as.character(df [1 , 1 ]), " Datum" )) {
54
+ df <- df [- seq(1 ,12 ), ]
55
+ }
22
56
}
23
57
24
58
# DATE
25
- # identify date from first column label
26
- date <-
27
- df %> %
28
- colnames %> %
29
- . [1 ] %> %
30
- ymd(. )
31
- # add date to new column
59
+ # identify date from filename
60
+ date_from_filename <- ymd(substr(filename , 1 , 10 ))
61
+ # add date from filename to new column
32
62
df <-
33
63
df %> %
34
- mutate(date = as.character(date ))
64
+ mutate(date = as.character(date_from_filename ))
35
65
# rename first header to 'location'
36
66
colnames(df )[1 ] <- " location"
37
67
@@ -43,7 +73,7 @@ process_df <- function(df) {
43
73
" 04050" , # Wolbecker Straße / Servatiiplatz
44
74
" 03052" , # Hüfferstraße
45
75
" 07030" , # Hammer Straße
46
- " 04051" , # Eisenbahnstraße (there are no traffic lights on the Promenade, this one is one the parallel street
76
+ " 04051" , # Eisenbahnstraße (there are no traffic lights on the Promenade at this place, this one is on the parallel street)
47
77
" 04073" , # Gartenstraße
48
78
" 04061" , # Warendorfer Straße
49
79
" 04010" , # Hafenstraße
@@ -65,10 +95,10 @@ process_df <- function(df) {
65
95
mutate(day = as.integer(day(date ))) %> %
66
96
# subtract 1 because sqlite counts Sun = 0 but lubridate Sun = 1
67
97
mutate(weekday = wday(date , label = F ) - 1 ) %> %
68
- mutate(weekend = is.weekend(date )) %> %
98
+ mutate(weekend = is.weekend(date )) %> %
69
99
# 'hour' to integer format
70
- mutate(hour = substring(hour , 2 )) %> %
71
- mutate(hour = as.integer(hour )) %> %
100
+ mutate(hour = substring(hour , 2 )) %> %
101
+ mutate(hour = as.integer(hour )) %> %
72
102
mutate(vehicle = " car" )
73
103
74
104
return (df )
@@ -84,15 +114,16 @@ con <- dbConnect(SQLite(), dbname = "data/database/traffic_data.sqlite")
84
114
if (dbExistsTable(con , " cars" )) { dbRemoveTable(con , " cars" ) }
85
115
86
116
# EACH source file: read, preprocess, add to 'df_target'
87
- for (raw_file in raw_files ) {
88
- print(paste(" processing " , raw_file ))
117
+ for (raw_file_name in raw_files ) {
118
+ print(paste(" processing " , raw_file_name ))
89
119
df_source <-
90
- read.csv(paste(data_folder , raw_file , sep = " " ),
91
- sep = " ;" , row.names = NULL ) %> %
92
- process_df()
120
+ read.csv(paste(data_folder , raw_file_name , sep = " " ),
121
+ sep = " ;" , row.names = NULL )
122
+
123
+ df_processed <- process_df(df_source , strsplit(raw_file_name , " /" )[[1 ]][2 ])
93
124
94
125
# write 'df_processed' to SQLite database
95
- dbWriteTable(con , " cars" , df_source ,
126
+ dbWriteTable(con , " cars" , df_processed ,
96
127
append = T , row.names = F , overwrite = F )
97
128
}
98
129
0 commit comments