This repository was archived by the owner on Feb 11, 2024. It is now read-only.

Commit f324e6a

Add tolower ref #35 (#43)
* Make `tolower` the default for `tokens_proximity()`
* Update README
1 parent 767b580 commit f324e6a

File tree (5 files changed: +43 −16)

- R/get_dist.R
- README.Rmd
- README.md
- man/tokens_proximity.Rd
- tests/testthat/test-tokens_dist.R


R/get_dist.R

Lines changed: 15 additions & 7 deletions
```diff
@@ -41,17 +41,21 @@ resolve_keywords <- function(keywords, features, valuetype) {
 #' Extract Proximity Information
 #'
 #' This function extracts distance information from a [quanteda::tokens()] object.
-#' @param x a `tokens` or `tokens_with_proximity` object
-#' @param pattern Pattern for selecting keywords, see [quanteda::pattern] for details.
+#' @param x a `tokens` or `tokens_with_proximity` object.
+#' @param pattern pattern for selecting keywords, see [quanteda::pattern] for details.
 #' @param get_min logical, whether to return only the minimum distance or raw distance information; it is more relevant when `keywords` have more than one word. See details.
-#' @param valuetype See [quanteda::valuetype]
+#' @param valuetype See [quanteda::valuetype].
 #' @param count_from numeric, how proximity is counted from when `get_min` is `TRUE`. The keyword is assigned this proximity. Defaults to 1 (not zero) to prevent division by 0 with the default behaviour of [dfm.tokens_with_proximity()].
+#' @param tolower logical, convert all features to lowercase.
+#' @param keep_acronyms logical, if `TRUE`, do not lowercase any all-uppercase words. See [quanteda::tokens_tolower()].
 #' @details Proximity is measured by the number of tokens away from the keyword. Given a tokenized sentence \["I", "eat", "this", "apple"\], suppose "eat" is the keyword. The vector of minimum proximity for each word from "eat" is \[2, 1, 2, 3\], if `count_from` is 1. In another case, \["I", "wash", "and", "eat", "this", "apple"\] with keywords \["wash", "eat"\], the minimum distance vector is \[2, 1, 2, 1, 2, 3\]. If `get_min` is `FALSE`, the output is a list of two vectors. For "wash", the distance vector is \[1, 0, 1, 2, 3, 4\]. For "eat", \[3, 2, 1, 0, 1, 2\].
 #' Please conduct all text manipulation tasks with `tokens_*()` functions before calling this function. To convert the output back to a `tokens` object, use [quanteda::as.tokens()].
 #' @return a `tokens_with_proximity` object. It is similar to [quanteda::tokens()], but only [dfm.tokens_with_proximity()], [quanteda::convert()], [quanteda::docvars()], and [quanteda::meta()] methods are available. A `tokens_with_proximity` has a modified [print()] method. Also, additional data slots are included:
 #' * a document variable `dist`
 #' * a metadata slot `keywords`
 #' * a metadata slot `get_min`
+#' * a metadata slot `tolower`
+#' * a metadata slot `keep_acronyms`
 #' @examples
 #' library(quanteda)
 #' tok1 <- data_char_ukimmig2010 %>%
@@ -75,20 +79,26 @@ resolve_keywords <- function(keywords, features, valuetype) {
 #' tok1 %>% tokens_proximity("britain")
 #' @seealso [dfm.tokens_with_proximity()] [quanteda::tokens()]
 #' @export
-tokens_proximity <- function(x, pattern, get_min = TRUE, valuetype = c("glob", "regex", "fixed"), count_from = 1) {
+tokens_proximity <- function(x, pattern, get_min = TRUE, valuetype = c("glob", "regex", "fixed"), count_from = 1,
+                             tolower = TRUE, keep_acronyms = FALSE) {
     if (!inherits(x, "tokens") && !inherits(x, "tokens_with_proximity")) {
         stop("x is not a `tokens` or `tokens_with_proximity` object.", call. = FALSE)
     }
     if (inherits(x, "tokens_with_proximity")) {
         x <- as.tokens(x, remove_docvars_proximity = TRUE)
     }
+    if (tolower) {
+        x <- quanteda::tokens_tolower(x, keep_acronyms = keep_acronyms)
+    }
     valuetype <- match.arg(valuetype)
     keywords <- resolve_keywords(pattern, attr(x, "types"), valuetype)
     toks <- x
     proximity <- get_proximity(x = toks, keywords = keywords, get_min = get_min, count_from = count_from)
     quanteda::docvars(toks)$proximity <- I(proximity)
     quanteda::meta(toks, field = "keywords") <- keywords
     quanteda::meta(toks, field = "get_min") <- get_min
+    quanteda::meta(toks, field = "tolower") <- tolower
+    quanteda::meta(toks, field = "keep_acronyms") <- keep_acronyms
     class(toks) <- c("tokens_with_proximity")
     return(toks)
 }
@@ -104,9 +114,7 @@ convert_df <- function(tokens_obj, proximity_obj, doc_id) {
 #' @method print tokens_with_proximity
 #' @export
 print.tokens_with_proximity <- function(x, ...) {
-    y <- x
-    class(y) <- "tokens"
-    print(y, ...)
+    print(as.tokens(x), ...)
     cat("With proximity vector(s).\n")
     cat("keywords: ", quanteda::meta(x, field = "keywords"), "\n")
 }
```
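For illustration, here is a minimal sketch of the new default behaviour, mirroring the test cases added in this commit. The `library()` call for this package is assumed, since the package name is not shown in the diff:

```r
library(quanteda)
# also attach this package, which provides tokens_proximity()

toks <- tokens("this is my MIT life")

# Default (tolower = TRUE): all features are lowercased, so "MIT" becomes "mit".
res <- tokens_proximity(toks, "my")
"MIT" %in% attr(res, "types")  # FALSE

# Opt out of lowercasing entirely.
res <- tokens_proximity(toks, "my", tolower = FALSE)
"MIT" %in% attr(res, "types")  # TRUE

# Lowercase everything except all-uppercase acronyms.
res <- tokens_proximity(toks, "my", keep_acronyms = TRUE)
"MIT" %in% attr(res, "types")  # TRUE
```

Both settings are also recorded as metadata, so `quanteda::meta(res)` should list `tolower` and `keep_acronyms` alongside `keywords` and `get_min`.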

README.Rmd

Lines changed: 1 addition & 1 deletion
````diff
@@ -45,7 +45,7 @@ c("Turkish President Tayyip Erdogan, in his strongest comments yet on the Gaza c
 `tokens_proximity()` generates the proximity vectors and stores them as a `docvar` (document variable).
 
 ```{r tokens_proximity}
-tok1 <- txt1 %>% tokens() %>% tokens_tolower() %>%
+tok1 <- txt1 %>% tokens() %>%
     tokens_proximity(pattern = "turkish")
 tok1
 ```
````

README.md

Lines changed: 4 additions & 4 deletions
````diff
@@ -40,10 +40,10 @@ c("Turkish President Tayyip Erdogan, in his strongest comments yet on the Gaza c
 a `docvar` (document variable).
 
 ``` r
-tok1 <- txt1 %>% tokens() %>% tokens_tolower() %>%
+tok1 <- txt1 %>% tokens() %>%
     tokens_proximity(pattern = "turkish")
 tok1
-#> Tokens consisting of 2 documents and 1 docvar.
+#> Tokens consisting of 2 documents.
 #> text1 :
 #> [1] "turkish"   "president" "tayyip"    "erdogan"   ","         "in"
 #> [7] "his"       "strongest" "comments"  "yet"       "on"        "the"
@@ -137,7 +137,7 @@ How about changing the target to “Hamas”?
 ``` r
 tok2 <- tok1 %>% tokens_proximity(pattern = "hamas")
 tok2
-#> Tokens consisting of 2 documents and 1 docvar.
+#> Tokens consisting of 2 documents.
 #> text1 :
 #> [1] "turkish"   "president" "tayyip"    "erdogan"   ","         "in"
 #> [7] "his"       "strongest" "comments"  "yet"       "on"        "the"
@@ -164,7 +164,7 @@ Can we use two targets, e.g. “EU” and “Brussels”?
 ``` r
 tok3 <- tok1 %>% tokens_proximity(pattern = c("eu", "brussels"))
 tok3
-#> Tokens consisting of 2 documents and 1 docvar.
+#> Tokens consisting of 2 documents.
 #> text1 :
 #> [1] "turkish"   "president" "tayyip"    "erdogan"   ","         "in"
 #> [7] "his"       "strongest" "comments"  "yet"       "on"        "the"
````

man/tokens_proximity.Rd

Lines changed: 12 additions & 4 deletions
Some generated files are not rendered by default.

tests/testthat/test-tokens_dist.R

Lines changed: 11 additions & 0 deletions
```diff
@@ -51,3 +51,14 @@ test_that("token_proximity() only emit token_proximity #35", {
     expect_error(tokens_select(as.tokens(res), "life"), NA)
 })
 
+test_that("tolower", {
+    suppressPackageStartupMessages(library(quanteda))
+    "this is my MIT life" %>% tokens() %>% tokens_proximity("my") -> res
+    expect_false("MIT" %in% attr(res, "types"))
+    "this is my MIT life" %>% tokens() %>% tokens_proximity("my", tolower = FALSE) -> res
+    expect_true("MIT" %in% attr(res, "types"))
+    "this is my MIT life" %>% tokens() %>% tokens_proximity("my", tolower = TRUE, keep_acronyms = TRUE) -> res
+    expect_true("MIT" %in% attr(res, "types"))
+    expect_true("tolower" %in% names(meta(res)))
+    expect_true("keep_acronyms" %in% names(meta(res)))
+})
```
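The proximity vectors themselves are stored as a docvar, so they can be inspected directly with `quanteda::docvars()`, which the return-value documentation lists as a supported method. A hedged sketch based on the `@details` example above (the package attach call is again assumed):

```r
library(quanteda)
# also attach this package, which provides tokens_proximity()

toks <- tokens("I eat this apple") %>% tokens_proximity("eat")
quanteda::docvars(toks)$proximity
# Per the documented behaviour with the default count_from = 1, the
# minimum proximity vector for this single document should be: 2 1 2 3
```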
