# utils_text_processing.R
# Helper functions for token filtering, quantile-based token pruning, and
# time-limited reading/splitting of text data.
drop_quantile <- function(data, # data
tokens, # column with tokens
quantile = 0.1, # quantile to be dropped
ignore_case = TRUE, # ignore casing?
group, # grouping column, e.g. type of word, for report statistics
                          verbose = TRUE # report statistics
)
{
require(dplyr)
require(stringr)
require(scales)
require(stats)
if (ignore_case == TRUE) {
data <- data %>%
dplyr::mutate(
orig_tokens = !!as.name(tokens), # preserve original tokens
{{tokens}} := stringr::str_to_lower(!!as.name(tokens))) # convert to lower case to ignore case
}
# add count for tokens
res <- data %>%
dplyr::group_by(!!as.name(tokens)) %>%
dplyr::mutate(entity_count = dplyr::n()) %>%
dplyr::ungroup()
# save min and max counts for verbose output
if (verbose == TRUE) {
range <- dplyr::tibble(min = min(res$entity_count),
max = max(res$entity_count))
}
# calculate quantile for filtering
threshold <- res %>%
dplyr::distinct(!!as.name(tokens), .keep_all = T) %>%
dplyr::pull(entity_count) %>%
stats::quantile(probs = quantile) %>%
.[[1]]
# drop quantile
res <- res %>%
dplyr::filter(entity_count > threshold)
if (verbose == TRUE) {
cat(paste(scales::percent(quantile), "Quantile:", threshold, "\n"))
cat(paste("Count Range:", range$min, "-", range$max, "\n"))
original_count <- data %>% # save unprocessed count statistics
dplyr::distinct(!!as.name(tokens), !!as.name(group)) %>%
dplyr::summarise(distinct_unprocessed = n(), .by = {{group}})
res %>%
dplyr::distinct(!!as.name(tokens),!!as.name(group)) %>%
dplyr::group_by(!!as.name(group)) %>%
dplyr::summarise(distinct_processed = n()) %>%
dplyr::left_join(original_count, by = group) %>%
      dplyr::arrange(!!as.name(group)) %>%
dplyr::add_row(
{{group}} := "total",
distinct_unprocessed = sum(.$distinct_unprocessed),
distinct_processed = sum(.$distinct_processed)
) %>%
dplyr::mutate(
dropped = distinct_unprocessed - distinct_processed,
reduction = scales::percent(1 - distinct_processed / distinct_unprocessed)
) %>%
print()
}
if (ignore_case == TRUE) {
res <- res %>% dplyr::mutate({{tokens}} := orig_tokens) %>% # restore original tokens
dplyr::select(!orig_tokens)
}
return(res)
}
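# Minimal usage sketch for drop_quantile(), assuming a hypothetical tokens tibble
# with "lemma" and "tag" columns (not run when sourcing this file):
# toks <- dplyr::tibble(
#   lemma = c("climate", "Climate", "climate", "summit", "summit", "delegate"),
#   tag   = c("NN", "NN", "NN", "NN", "NN", "NN")
# )
# drop_quantile(toks, tokens = "lemma", quantile = 0.25, group = "tag")
# # With ignore_case = TRUE, "Climate" is counted together with "climate";
# # tokens whose count falls below the 25% quantile threshold (here "delegate") are dropped.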
filter_tokens <- function(tokens,
tokens_col = "lemma", # which column should be used for token filtering. Usually "token" or "lemma"
tags = c("NN", "NE"), # tags of a tokens object to keep. Expects a "tag" column. NULL to skip
min_str_length = 2, # minimum string length to keep
tolower = TRUE, # should tokens be set to lower? Ignores URLs and all-caps acronyms. Note that this is executed before stopword filtering, which is case-sensitive
stopword_languages = c("en", "de", "fr"), # stopword dictionaries to be removed. NULL to skip
stopword_dictionary = "snowball", # stopword dictionary to be used. See stopwords::stopwords()
additional_stopwords = c("amp", "&", "RT", "rt", "--", "---"), # provide additional stopwords to drop here
replies = NULL, # should replies be considered for the random walks? Expects a column named is_reply. NULL to skip
                          keep_mentions = TRUE, # keep @-mentions? If FALSE, all tokens containing "@" are dropped
                          keep_urls = TRUE # keep URLs? If FALSE, all tokens containing "http" are dropped
) {
require(dplyr)
require(stopwords)
require(stringr)
require(purrr)
# some checks
if (!is.null(tags) & !("tag" %in% colnames(tokens))){
stop("Missing 'tag' column for tag filtering")
}
if (!is.null(replies) & !("is_reply" %in% colnames(tokens))){
stop("Missing 'is_reply' column for reply filtering")
}
  missing_languages <- stopword_languages[!stopword_languages %in%
                                            stopwords::stopwords_getlanguages(stopword_dictionary)]
  if (length(missing_languages) > 0) {
    stop("Language(s) '", paste(missing_languages, collapse = "', '"),
         "' not provided in stopword dictionary '", stopword_dictionary, "'.")
  }
# Filtering Sequences
if (!is.null(tags)) {
tokens <- tokens %>%
dplyr::filter(tag %in% tags)
}
tokens <- tokens %>%
dplyr::filter(stringr::str_length(!!as.name(tokens_col)) >= min_str_length)
  if (tolower) {
    tokens <- tokens %>%
      dplyr::mutate(!!as.name(tokens_col) := # lower case ...
                      dplyr::case_when(
                        stringr::str_detect(!!as.name(tokens_col), "http") # ... except for URLs (so they don't break) ...
                        ~ !!as.name(tokens_col),
                        !!as.name(tokens_col) == toupper(!!as.name(tokens_col)) # ... and acronyms (all caps), e.g. "UN", "IT"
                        ~ !!as.name(tokens_col),
                        .default = tolower(!!as.name(tokens_col))))
  }
if (!is.null(stopword_languages)) {
stopwords <- stopword_languages %>% # combine stopwords dicts
purrr::map(\(lang) stopwords::stopwords(language = lang,
source = stopword_dictionary)) %>%
unlist()
tokens <- tokens %>%
dplyr::filter(!(!!as.name(tokens_col) %in% stopwords))
}
if (!is.null(additional_stopwords)) {
tokens <- tokens %>%
dplyr::filter(!(!!as.name(tokens_col) %in% additional_stopwords))
}
if (!is.null(replies)) {
tokens <- tokens %>%
dplyr::filter(is_reply == replies | is_reply == FALSE) # filter for reply condition (TRUE includes replies, FALSE does not)
}
if (keep_mentions == FALSE) {
tokens <- tokens %>%
dplyr::filter(!stringr::str_detect(!!as.name(tokens_col), "@")) # drop all lemmas containing "@" - that is, all mentions
}
if (keep_urls == FALSE) {
tokens <- tokens %>%
dplyr::filter(!stringr::str_detect(!!as.name(tokens_col), "http")) # drop all URLs
}
return(tokens)
}
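# Minimal usage sketch for filter_tokens(), assuming a hypothetical part-of-speech-tagged
# tokens tibble (not run when sourcing this file):
# toks <- dplyr::tibble(
#   lemma    = c("Climate", "the", "UN", "https://example.org", "@user", "amp"),
#   tag      = c("NN", "ART", "NE", "NN", "NN", "NN"),
#   is_reply = FALSE
# )
# filter_tokens(toks, tokens_col = "lemma", tags = c("NN", "NE"),
#               stopword_languages = "en", keep_mentions = FALSE)
# # keeps "climate" (lower-cased), "UN" (all-caps acronym) and the URL;
# # drops the non-NN/NE token "the", the mention "@user" and the extra stopword "amp".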
get_latest_tokens_file <- function(path,
                                   pattern = "tokens.csv.tar.gz", # regular updates of the tokens file will always be named "tokens.csv.tar.gz"
fallback = "tokens_init") { # fallback pattern to look for, with only the latest returned
# if there are multiple matches according to the pattern "tokens_init_[date]"
require(stringr)
require(dplyr)
files <- list.files(path)
if (pattern %in% files) { # check for latest tokens file and load
latest_tokens <- pattern
} else { # else, fall back to initial tokenization
    latest_tokens <- dplyr::tibble(files = files) %>%
      dplyr::filter(stringr::str_detect(files, fallback)) %>%
      dplyr::arrange(dplyr::desc(files)) %>%
      dplyr::slice_head(n = 1) %>%
      dplyr::pull(files)
}
return(file.path(path, latest_tokens))
}
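# Usage sketch for get_latest_tokens_file(), assuming a hypothetical directory
# "data/tokens" containing dated fallback files such as "tokens_init_2023-01-01.csv.tar.gz"
# (not run when sourcing this file):
# get_latest_tokens_file("data/tokens")
# # returns "data/tokens/tokens.csv.tar.gz" if present, otherwise the path of the
# # alphabetically latest file matching the "tokens_init" fallback pattern.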
read_timelimited_data <- function(file,
..., # arguments to pass to vroom()
filter_var = "_source.created_at", # filtering var as datetime
starting_point, # the starting point, as "YYYY-MM-DD"
timeframe = lubridate::weeks(1), # the timeframe before or after the starting point, as a lubridate period
before_after = c("before", "after")) {
require(rlang)
require(dplyr)
require(lubridate)
require(vroom)
  before_after <- rlang::arg_match(before_after)
if (before_after == "before") {
data <- vroom::vroom(file, ...) %>%
dplyr::filter(!!as.name(filter_var) <= lubridate::ymd(starting_point) &
!!as.name(filter_var) >= (lubridate::ymd(starting_point) - timeframe))}
if (before_after == "after") {
data <- vroom::vroom(file, ...) %>%
dplyr::filter(!!as.name(filter_var) >= lubridate::ymd(starting_point) &
!!as.name(filter_var) <= (lubridate::ymd(starting_point) + timeframe))}
gc(verbose = FALSE)
return(data)
}
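# Usage sketch for read_timelimited_data(), assuming a hypothetical CSV file with a
# parseable "_source.created_at" datetime column (not run when sourcing this file):
# read_timelimited_data(
#   "data/posts.csv",
#   starting_point = "2023-03-01",
#   timeframe = lubridate::weeks(2),
#   before_after = "after"
# )
# # reads the file with vroom() and keeps only rows from the two weeks following 2023-03-01.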
split_timeframes <- function(data, # the dataframe to read in
datetime_var, # the name of the datetime var to use for splitting
timeframe = lubridate::weeks(1), # the timeframe. can be any lubridate period
before_after = c("before", "after") # split before or after the timeframe's endpoint
                             ){
  require(rlang)
  require(dplyr)
  require(lubridate)
  require(data.table)
  before_after <- rlang::arg_match(before_after)
if (before_after == "before") {
out <- data %>%
dplyr::mutate(period = lubridate::ceiling_date(!!as.name(datetime_var), unit = timeframe)) # end of the week
}
if (before_after == "after") {
out <- data %>%
dplyr::mutate(period = lubridate::floor_date(!!as.name(datetime_var), unit = timeframe)) # beginning of the week
}
out <- out %>%
data.table::as.data.table() %>%
split(by = "period")
return(out)
}
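# Usage sketch for split_timeframes(), assuming a hypothetical data frame with a
# "created_at" datetime column (not run when sourcing this file):
# df <- dplyr::tibble(
#   created_at = lubridate::ymd_hms(c("2023-03-01 10:00:00", "2023-03-09 12:00:00")),
#   text       = c("a", "b")
# )
# split_timeframes(df, datetime_var = "created_at",
#                  timeframe = lubridate::weeks(1), before_after = "after")
# # returns a named list of data.tables, one per weekly period.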