-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclassify_documents.R
250 lines (216 loc) · 11.5 KB
/
classify_documents.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
#' Classify documents into groups based on scored walk terms
#'
#' Matches the tokens of each document against the terms in `walk_terms`,
#' sums the chosen score per document and group, and optionally rescales
#' and thresholds the result.
#'
#' @param walk_terms Data with terms generated by `get_rwr_terms()`. Must
#'   contain the columns `NodeNames`, the column named in `group_name`, and
#'   the column named in `classification_measure`. A `seed_term` column is
#'   read when `cutoff` (with `keep_seed_terms`), `seedterm_value`, or
#'   `return_walk_terms` is used.
#' @param group_name Name of the group variable in `walk_terms` that
#'   specifies the classes the documents are sorted into.
#' @param document_tokens Tokenized documents to classify: one token per
#'   row, with a document id column.
#' @param tokens_var Name of the token column in `document_tokens`
#'   (usually "tokens", "lemma", etc.).
#' @param doc_id Name of the document id column in `document_tokens`.
#' @param classification_measure Name of the score column in `walk_terms`
#'   used for classification. Defaults to the first choice, `"Score"`.
#' @param cutoff Optional numeric threshold on `classification_measure`;
#'   terms scoring below it are dropped. `NULL` to skip.
#' @param keep_seed_terms Keep seed terms even if they score below
#'   `cutoff`? Only relevant when `cutoff` is set.
#' @param seedterm_value Optional fixed value that overwrites the
#'   `classification_measure` of all seed terms. `NULL` to skip.
#' @param normalize_scores `"doc"` to rescale scores to `[0, 1]` within each
#'   document, `"group"` to rescale within each group, or `NULL` to skip.
#'   Defaults to `"doc"`. Adds a `score_norm` column.
#' @param cut_lower_quantile_fields Optional quantile (passed to
#'   `stats::quantile()`); scores below it are set to 0. When scores are
#'   normalized, `score_norm` is cut at its own quantile. `NULL` to skip.
#' @param cut_frequent_group_terms Drop terms that appear in many groups:
#'   `"auto"` (the default) drops terms appearing in more than half of the
#'   groups, a number drops terms appearing in more than that many groups,
#'   `NULL` skips the step.
#' @param return_walk_terms Return the processed walk terms as well?
#' @param return_unclassified_docs Return the ids of documents without any
#'   matching term as well?
#' @param verbose Print the number of unclassified documents?
#'
#' @return If `return_walk_terms` or `return_unclassified_docs` is `TRUE`, a
#'   list with `classified_documents` and, as requested, `walk_terms` and
#'   `unclassified_documents`. Otherwise only the `classified_documents`
#'   data frame (one row per document/group combination with a `score`).
classify_documents <- function(
    walk_terms,
    group_name = "policy_field",
    document_tokens,
    tokens_var,
    doc_id,
    classification_measure = c("Score", "ScoreMean",
                               "ScoreNorm", "ScoreNormMean",
                               "ScoreNormGroup", "ScoreNormGroupMean"),
    cutoff = NULL,
    keep_seed_terms = TRUE,
    seedterm_value = NULL,
    normalize_scores = c("doc", "group", NULL),
    cut_lower_quantile_fields = NULL,
    cut_frequent_group_terms = c(NULL, numeric(), "auto"),
    return_walk_terms = TRUE,
    return_unclassified_docs = TRUE,
    verbose = TRUE
) {
  ## Data and Input Checks
  # The matched value must be assigned back; otherwise the default keeps
  # all six choices and every later scalar use of the measure breaks.
  classification_measure <- match.arg(classification_measure)
  # NULL is a documented way to skip normalization, so only match non-NULL.
  if (!is.null(normalize_scores)) {
    normalize_scores <- match.arg(normalize_scores)
  }

  if (!is.null(cut_frequent_group_terms)) {
    if (length(cut_frequent_group_terms) != 1L) {
      stop(
        "cut_frequent_group_terms must be NULL, 'auto', or a single numeric value.\n",
        call. = FALSE
      )
    }
    if (is.na(cut_frequent_group_terms)) {
      stop(
        "cut_frequent_group_terms must be NULL, 'auto', or a numeric value, not NA.\n",
        call. = FALSE
      )
    }
    if (!(is.numeric(cut_frequent_group_terms) ||
          isTRUE(cut_frequent_group_terms == "auto"))) {
      stop(
        paste0(
          "cut_frequent_group_terms must be NULL, 'auto', or a numeric value, not '",
          cut_frequent_group_terms,
          "'.\n"
        ),
        call. = FALSE
      )
    }
  }

  if (!(classification_measure %in% names(walk_terms))) {
    stop(paste("classification_measure", classification_measure,
               "not present in walk_terms data\n"), call. = FALSE)
  }
  if (!(group_name %in% names(walk_terms))) {
    stop(paste("group_name", group_name, "not present in walk_terms data\n"),
         call. = FALSE)
  }
  if (!("NodeNames" %in% names(walk_terms))) {
    stop("NodeNames not present in walk_terms data\n", call. = FALSE)
  }
  if (!(doc_id %in% names(document_tokens))) {
    stop(paste("doc_id", doc_id, "not present in document_tokens data\n"),
         call. = FALSE)
  }
  if (!(tokens_var %in% names(document_tokens))) {
    stop(paste("tokens_var", tokens_var,
               "not present in document_tokens data\n"), call. = FALSE)
  }

  # Column names arrive as strings; build the symbols once for injection.
  measure_sym <- as.name(classification_measure)
  group_sym <- as.name(group_name)
  token_sym <- as.name(tokens_var)
  doc_sym <- as.name(doc_id)
  # Join spec: token column in the documents == NodeNames in the walk terms.
  token_to_node <- stats::setNames("NodeNames", tokens_var)
  uses_mean_measure <- grepl("Mean", classification_measure, fixed = TRUE)

  ## Data Prep
  ### apply cutoff
  # External values are injected with `!!` so that a same-named column in the
  # data cannot shadow them.
  if (!is.null(cutoff)) {
    walk_terms <- if (isTRUE(keep_seed_terms)) {
      dplyr::filter(walk_terms,
                    (!!measure_sym) >= (!!cutoff) | seed_term == TRUE)
    } else {
      dplyr::filter(walk_terms, (!!measure_sym) >= (!!cutoff))
    }
  }

  ### overwrite seed term values (if desired)
  if (!is.null(seedterm_value)) {
    walk_terms <- dplyr::mutate(
      walk_terms,
      !!measure_sym := dplyr::case_when(
        seed_term == TRUE ~ (!!seedterm_value),
        .default = (!!measure_sym)
      )
    )
  }

  ### one score per term and group
  # Note: any mean computed here is taken AFTER the cutoff filter above, so
  # values below the cutoff no longer pull it down.
  term_scores <- if (uses_mean_measure) {
    # a *Mean measure is used as provided; only duplicates are dropped
    dplyr::distinct(walk_terms, NodeNames, !!measure_sym, !!group_sym)
  } else {
    # otherwise average over the rows of a term within a group (e.g.
    # duplicates stemming from different seed terms)
    dplyr::summarise(
      walk_terms,
      !!measure_sym := mean(!!measure_sym),
      .by = dplyr::all_of(c("NodeNames", group_name))
    )
  }

  # Keep only tokens that are walk terms, then attach group and score.
  # many-to-many: a term occurs in several documents and in several groups.
  classification_terms <- document_tokens %>%
    dplyr::semi_join(walk_terms, by = token_to_node) %>%
    dplyr::left_join(term_scores, by = token_to_node,
                     relationship = "many-to-many")

  ### cut frequent group terms
  if (!is.null(cut_frequent_group_terms)) {
    max_groups <- if (is.numeric(cut_frequent_group_terms)) {
      cut_frequent_group_terms
    } else {
      # "auto": at most half of the groups present in the matched terms
      dplyr::n_distinct(classification_terms[[group_name]]) / 2
    }
    kept_terms <- classification_terms %>%
      dplyr::count(!!token_sym, !!group_sym) %>%
      dplyr::count(!!token_sym) %>% # n = number of groups per term
      dplyr::filter(n <= (!!max_groups)) %>%
      dplyr::pull(!!token_sym)
    classification_terms <- dplyr::filter(
      classification_terms,
      (!!token_sym) %in% (!!kept_terms)
    )
  }

  ## classify
  # Sum the scores by document and group; document/group combinations
  # without any matching term get a score of 0.
  classified_documents <- classification_terms %>%
    dplyr::summarize(score = sum(!!measure_sym),
                     .by = dplyr::all_of(c(doc_id, group_name))) %>%
    tidyr::complete(!!doc_sym, !!group_sym, fill = list(score = 0))

  if (!is.null(normalize_scores)) {
    rescale_within <- if (normalize_scores == "doc") doc_id else group_name
    classified_documents <- dplyr::mutate(
      classified_documents,
      score_norm = scales::rescale(score, to = c(0, 1)),
      .by = dplyr::all_of(rescale_within)
    )
  }

  if (!is.null(cut_lower_quantile_fields)) {
    # set scores below the requested quantile to 0
    score_floor <- stats::quantile(classified_documents$score,
                                   cut_lower_quantile_fields)[[1]]
    classified_documents <- dplyr::mutate(
      classified_documents,
      score = dplyr::case_when(score < (!!score_floor) ~ 0,
                               .default = score)
    )
    if (!is.null(normalize_scores)) {
      # the normalized scores are cut at their own quantile
      norm_floor <- stats::quantile(classified_documents$score_norm,
                                    cut_lower_quantile_fields)[[1]]
      classified_documents <- dplyr::mutate(
        classified_documents,
        score_norm = dplyr::case_when(score_norm < (!!norm_floor) ~ 0,
                                      .default = score_norm)
      )
    }
  }

  ## report unclassified documents
  # Documents whose id does not appear in the classification result.
  if (verbose || return_unclassified_docs) {
    unclassified_documents <- document_tokens %>%
      dplyr::anti_join(classified_documents, by = doc_id) %>%
      dplyr::distinct(!!doc_sym)
    if (verbose) {
      unclassified <- nrow(unclassified_documents)
      total <- dplyr::n_distinct(document_tokens[[doc_id]])
      cat(paste(unclassified,
                "out of",
                total,
                "documents could not be classified",
                paste0("(", scales::percent(unclassified / total), ")"),
                "\n"))
    }
  }

  ## return the result
  # Setting return_walk_terms or return_unclassified_docs to TRUE returns a
  # list of data frames rather than a single data frame.
  if (!(return_walk_terms || return_unclassified_docs)) {
    return(classified_documents)
  }

  out <- list(classified_documents = classified_documents)

  if (return_walk_terms) {
    # the terms actually used for classification, one row per term and group
    used_terms <- dplyr::distinct(classification_terms,
                                  !!token_sym, !!group_sym, !!measure_sym)
    term_key <- c(stats::setNames(group_name, group_name), token_to_node)

    term_info <- if (uses_mean_measure) {
      # add the seed_term indicator
      dplyr::distinct(walk_terms, !!group_sym, NodeNames, seed_term)
    } else {
      # means were calculated above, so also report the unprocessed values
      walk_terms %>%
        dplyr::distinct(!!group_sym, NodeNames, seed_term, !!measure_sym) %>%
        dplyr::rename_with(
          function(nm) paste0(nm, "_unprocessed"),
          dplyr::all_of(classification_measure)
        )
    }

    out$walk_terms <- used_terms %>%
      dplyr::left_join(term_info, by = term_key) %>%
      dplyr::arrange(!!group_sym, dplyr::desc(!!measure_sym)) %>%
      dplyr::select(!!token_sym, !!group_sym, seed_term, !!measure_sym,
                    dplyr::everything())
  }

  if (return_unclassified_docs) {
    out$unclassified_documents <- unclassified_documents
  }

  out
}