forked from EPINetz/EPINetz-Policy-Parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathworkflow_walk_terms.R
194 lines (160 loc) · 9.46 KB
/
workflow_walk_terms.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
## Full Walk Terms Workflow ##
##############################
## A high-level function incorporating the complete workflow to get random walk terms based on seeds
## For testing and API purposes
## Should not be used to process multiple samples/time frames, as it would unnecessarily reproduce certain steps
walk_terms_workflow <- function(
tokens = get_latest_tokens_file(), # tokens object. Expects tokenized data returned by the tokenizer workflow
seed_terms, # seeds objects. Expects a liste with the named dataframes "seed_terms_ministries", "seed_term_committees" and "seed_term_committee_members"
date = Sys.Date(), # date for filtering purposes (max date
walk_replies = FALSE, # should replies be utilized?
walk_mentions = TRUE, # should mentions be utilizued?
walk_urls = TRUE, # should URLs be utilized?
time_frame_walks = weeks(12), # length of the time frame for random walk term extraction
quantile_drop = 0.1, # what quantile should be dropped from the token counts? NULL to skip
walk_score_normalization = "seeds", # Should scores be normalized? "seeds" to normalize the scores for each seed walk. "group" to normalize within grouping vars. Set to NULL for no normalization.
calculate_means = TRUE, # should the means of the score be calculated and displayed? The can also be used for minimum walk_score filtering (see below)
normalize_means = TRUE, # a second normalization of the means
reduce_to_means = TRUE, # should only means be returned, dropping duplicated Nodes and their associated scores?
positive_scores_only = TRUE, # should negative Walk Scores (i.e. very unlikely connection due to negative weights) and 0 scores be dropped? Applied before normalization
walk_score = 0.9, # cutoff value for normalized random walk score. Non-Null Values require the selection of a measure to filter on if more than one walk_score_normalization method is picked
walk_score_measure = "seeds_mean", # value to apply the walk_score filter on.
# Possible are: "default" (auto-pick), "raw" (non-normalized rwr score), "seeds" (seed normalized), "seeds_mean" (mean of seed normalized), "group" (group normalized), "group_mean" (mean of group normalized). Needs to be specified if more than one
walk_score_quantile = TRUE, # Should the quantile be calculated as a dynamic minimum walk_score for each group? If TRUE, walk_score specifies the quantile, rather than a fixed value. Cutoff values may differ from group to group
keep_seed_terms = TRUE, # should seed terms within the policy field of the same period always be kept, regardless of walk score?
seedterm_value = NULL, # should the actual value of seed terms be overwritten by a default value, e.g. 1? NULL to skip
seed = as.numeric(date), # seed to prevent RNG issues in parallelization. by default the numeric conversion of the date
verbose = F
)
{
source("utils_text_processing.R") # text processing utils
source("get_rwr_terms.R") # random walk functions
require(purrr)
require(future)
require(dplyr)
require(data.table)
# some checks
expected_tokens_cols <- c("doc_id", "lemma", "tag", "is_reply", "_source.created_at", "_source.author_id")
if (any(!(expected_tokens_cols %in% colnames(tokens)))) {
stop(
cat("Expected column",
expected_tokens_cols[!(expected_tokens_cols %in% colnames(tokens))],
"not found in tokens. \n", sep = " ")
)
}
expected_seed_dataframes <- c("seed_terms_ministries", "seed_terms_committees","seed_terms_committee_members")
if(any(!(expected_seed_dataframes %in% names(seed_terms)))) {
stop(cat("Expected Data Frame",
expected_seed_dataframes[!(expected_seed_dataframes %in% names(seed_terms))],
"not found in seed_terms. \n", sep = " "))
}
expected_seed_cols <- c("feature", "policy_field")
seed_terms %>% purrr::iwalk(\(dataframe, name)
{
try({ # try and silent to suppress the purrr error
if (any(!(expected_seed_cols %in% colnames(dataframe)))) {
stop(
cat("Expected column",
expected_seed_cols[!(expected_seed_cols %in% colnames(dataframe))],
"not found in",
name,
"\n", sep = " ")
)
}
}, silent = TRUE)
})
walk_NE <- tokens %>% dplyr::as_tibble() %>%
dplyr::filter(`_source.created_at` >= (date - time_frame_walks) & # time frame filtering
`_source.created_at` <= date) %>%
filter_tokens(tokens_col = "lemma",
tags = c("NN", "NE"), # Noun words and NEs only
#minimum string length, stopwords dictionaries, additional stopwords and lower casing set to default
replies = walk_replies, # filter for reply condition (TRUE includes replies, FALSE does not)
keep_mentions = walk_mentions, # should @-mentions be kept?
keep_urls = walk_urls) # should URLs be kept?
## droppercentile of counts
if (!is.null(drop_quantile))
{
if(verbose){
cat("\nRemoving lower Token Quantiles:\n")
}
walk_NE <- drop_quantile(walk_NE,
tokens = "lemma",
quantile = quantile_drop,
ignore_case = FALSE, # case is already lowered
group = "tag",
verbose = verbose)
}
# Prepare Network for Random Walks
walk_network <- future::future({make_multiplex_objects(walk_NE,
vertex_a = "doc_id",
vertex_b = "lemma",
directed = F,
pmi_weight = T,
keep_igraph_network = F,
keep_multiplex_network = T,
keep_adjacency_matrix = F,
keep_normalized_adjacency_matrix = T)},
seed = seed,
stdout = FALSE)
# Prepare Seeds
seeds <- future::future({
data.table::rbindlist(list(seed_terms$seed_terms_ministries, # bind seed terms of subsets together...
seed_terms$seed_terms_committees),
fill = TRUE) %>%
dplyr::anti_join(seed_terms$seed_terms_committee_members, # drop seed terms prevalent for single committee members
by = join_by(feature, committee)) %>%
dplyr::filter(feature %in% walk_NE$lemma) %>% # drop seed terms not in the walk network
dplyr::distinct(feature, policy_field) %>% # drop features duplicated within policy fields (from committees etc)
split(.$policy_field)}, # ... and split by policy field
seed = seed,
stdout = FALSE)
# Housekeeping
rm(tokens)
rm(walk_NE)
gc(verbose = FALSE)
if (verbose) {
# make internal indicators for the verbose loop
cat("..")
seeds_done <- FALSE
networks_done <- FALSE
while(any(!(future::resolved(seeds)) |
!(future::resolved(walk_network)))) {
cat(".")
Sys.sleep(0.5)
if (future::resolved(seeds) &
seeds_done == FALSE) {
cat("Seed preparation done")
seeds_done <- TRUE
}
if (future::resolved(walk_network) &
networks_done == FALSE) {
cat("Network calculation done")
networks_done <- TRUE
}
}
cat("...Calculating Random Walks...\n")
}
# Compute Random Walks
walk_terms <- get_rwr_terms(walk_network = future::value(walk_network),
network_name = NULL, # not required for a single period
seeds = future::value(seeds),
seed_var = "feature",
match_var = NULL, # not required for a single period
flatten_results = TRUE,
group_name = "policy_field",
normalize_score = walk_score_normalization,
calculate_means = calculate_means,
normalize_means = normalize_means, # a second normalization of the means
reduce_to_means = reduce_to_means, # should only means be returned, dropping duplicated Nodes and their associated scores?
positive_scores_only = positive_scores_only,
walk_score = walk_score,
walk_score_measure = walk_score_measure,
walk_score_quantile = walk_score_quantile,
report_quantiles = verbose,
keep_seed_terms = keep_seed_terms,
seedterm_value = seedterm_value, # should the actual value of seed terms be overwritten by a default value, e.g. 1? NULL to skip
progress = verbose)
## The terms returned for 'normalize_score = "seeds"' include duplicates, esp. seed terms with keep_seed_terms = TRUE
return(walk_terms)
}