From f324e6a7ecfd1eae4b5c504e6ad0fb4c1fe5cdaa Mon Sep 17 00:00:00 2001
From: chainsawriot
Date: Mon, 20 Nov 2023 16:37:13 +0100
Subject: [PATCH] Add `tolower` ref #35 (#43)

* Make `tolower` default for tokens_proximity()

* Update README
---
 R/get_dist.R                      | 22 +++++++++++++++-------
 README.Rmd                        |  2 +-
 README.md                         |  8 ++++----
 man/tokens_proximity.Rd           | 16 ++++++++++++----
 tests/testthat/test-tokens_dist.R | 11 +++++++++++
 5 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/R/get_dist.R b/R/get_dist.R
index 027e496..ab39f28 100644
--- a/R/get_dist.R
+++ b/R/get_dist.R
@@ -41,17 +41,21 @@ resolve_keywords <- function(keywords, features, valuetype) {
 #' Extract Proximity Information
 #'
 #' This function extracts distance information from a [quanteda::tokens()] object.
-#' @param x a `tokens` or `tokens_with_proximity` object
-#' @param pattern Pattern for selecting keywords, see [quanteda::pattern] for details.
+#' @param x a `tokens` or `tokens_with_proximity` object.
+#' @param pattern pattern for selecting keywords, see [quanteda::pattern] for details.
 #' @param get_min logical, whether to return only the minimum distance or raw distance information; it is more relevant when `keywords` have more than one word. See details.
-#' @param valuetype See [quanteda::valuetype]
+#' @param valuetype See [quanteda::valuetype].
 #' @param count_from numeric, how proximity is counted from when `get_min` is `TRUE`. The keyword is assigned this proximity. Defaults to 1 (not zero) to prevent division by 0 with the default behaviour of [dfm.tokens_with_proximity()].
+#' @param tolower logical, convert all features to lowercase.
+#' @param keep_acronyms logical, if `TRUE`, do not lowercase any all-uppercase words. See [quanteda::tokens_tolower()].
 #' @details Proximity is measured by the number of tokens away from the keyword. Given the tokenized sentence \["I", "eat", "this", "apple"\] and the keyword "eat", the vector of minimum proximity for each word is \[2, 1, 2, 3\] if `count_from` is 1. In another case, with the tokenized sentence \["I", "wash", "and", "eat", "this", "apple"\] and the keywords \["wash", "eat"\], the minimum distance vector is \[2, 1, 2, 1, 2, 3\]. If `get_min` is `FALSE`, the output is a list of two vectors. For "wash", the distance vector is \[1, 0, 1, 2, 3, 4\]; for "eat", it is \[3, 2, 1, 0, 1, 2\].
 #' Please conduct all text manipulation tasks with `tokens_*()` functions before calling this function. To convert the output back to a `tokens` object, use [quanteda::as.tokens()].
 #' @return a `tokens_with_proximity` object. It is similar to [quanteda::tokens()], but only the [dfm.tokens_with_proximity()], [quanteda::convert()], [quanteda::docvars()], and [quanteda::meta()] methods are available. A `tokens_with_proximity` object has a modified [print()] method. Also, additional data slots are included:
 #' * a document variable `dist`
 #' * a metadata slot `keywords`
 #' * a metadata slot `get_min`
+#' * a metadata slot `tolower`
+#' * a metadata slot `keep_acronyms`
 #' @examples
 #' library(quanteda)
 #' tok1 <- data_char_ukimmig2010 %>%
@@ -75,13 +79,17 @@ resolve_keywords <- function(keywords, features, valuetype) {
 #' tok1 %>% tokens_proximity("britain")
 #' @seealso [dfm.tokens_with_proximity()] [quanteda::tokens()]
 #' @export
-tokens_proximity <- function(x, pattern, get_min = TRUE, valuetype = c("glob", "regex", "fixed"), count_from = 1) {
+tokens_proximity <- function(x, pattern, get_min = TRUE, valuetype = c("glob", "regex", "fixed"), count_from = 1,
+                             tolower = TRUE, keep_acronyms = FALSE) {
     if (!inherits(x, "tokens") && !inherits(x, "tokens_with_proximity")) {
         stop("x is not a `tokens` or `tokens_with_proximity` object.", call. = FALSE)
     }
     if (inherits(x, "tokens_with_proximity")) {
         x <- as.tokens(x, remove_docvars_proximity = TRUE)
     }
+    if (tolower) {
+        x <- quanteda::tokens_tolower(x, keep_acronyms = keep_acronyms)
+    }
     valuetype <- match.arg(valuetype)
     keywords <- resolve_keywords(pattern, attr(x, "types"), valuetype)
     toks <- x
@@ -89,6 +97,8 @@ tokens_proximity <- function(x, pattern, get_min = TRUE, valuetype = c("glob", "
     quanteda::docvars(toks)$proximity <- I(proximity)
     quanteda::meta(toks, field = "keywords") <- keywords
     quanteda::meta(toks, field = "get_min") <- get_min
+    quanteda::meta(toks, field = "tolower") <- tolower
+    quanteda::meta(toks, field = "keep_acronyms") <- keep_acronyms
     class(toks) <- c("tokens_with_proximity")
     return(toks)
 }
@@ -104,9 +114,7 @@ convert_df <- function(tokens_obj, proximity_obj, doc_id) {
 #' @method print tokens_with_proximity
 #' @export
 print.tokens_with_proximity <- function(x, ...) {
-    y <- x
-    class(y) <- "tokens"
-    print(y, ...)
+    print(as.tokens(x), ...)
     cat("With proximity vector(s).\n")
     cat("keywords: ", quanteda::meta(x, field = "keywords"), "\n")
 }
diff --git a/README.Rmd b/README.Rmd
index c9fdd37..747a1e9 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -45,7 +45,7 @@ c("Turkish President Tayyip Erdogan, in his strongest comments yet on the Gaza c
 `tokens_proximity()` generates the proximity vectors and stores them as a `docvar` (document variable).
 
 ```{r tokens_proximity}
-tok1 <- txt1 %>% tokens() %>% tokens_tolower() %>%
+tok1 <- txt1 %>% tokens() %>%
   tokens_proximity(pattern = "turkish")
 tok1
 ```
diff --git a/README.md b/README.md
index 6f3c156..ac3f5cf 100644
--- a/README.md
+++ b/README.md
@@ -40,10 +40,10 @@ c("Turkish President Tayyip Erdogan, in his strongest comments yet on the Gaza c
 a `docvar` (document variable).
 
 ``` r
-tok1 <- txt1 %>% tokens() %>% tokens_tolower() %>%
+tok1 <- txt1 %>% tokens() %>%
   tokens_proximity(pattern = "turkish")
 tok1
-#> Tokens consisting of 2 documents and 1 docvar.
+#> Tokens consisting of 2 documents.
 #> text1 :
 #> [1] "turkish" "president" "tayyip" "erdogan" "," "in"
 #> [7] "his" "strongest" "comments" "yet" "on" "the"
@@ -137,7 +137,7 @@ How about changing the target to “Hamas”?
 ``` r
 tok2 <- tok1 %>% tokens_proximity(pattern = "hamas")
 tok2
-#> Tokens consisting of 2 documents and 1 docvar.
+#> Tokens consisting of 2 documents.
 #> text1 :
 #> [1] "turkish" "president" "tayyip" "erdogan" "," "in"
 #> [7] "his" "strongest" "comments" "yet" "on" "the"
@@ -164,7 +164,7 @@ Can we use two targets, e.g. “EU” and “Brussels”?
 ``` r
 tok3 <- tok1 %>% tokens_proximity(pattern = c("eu", "brussels"))
 tok3
-#> Tokens consisting of 2 documents and 1 docvar.
+#> Tokens consisting of 2 documents.
 #> text1 :
 #> [1] "turkish" "president" "tayyip" "erdogan" "," "in"
 #> [7] "his" "strongest" "comments" "yet" "on" "the"
diff --git a/man/tokens_proximity.Rd b/man/tokens_proximity.Rd
index ef2862e..3b90534 100644
--- a/man/tokens_proximity.Rd
+++ b/man/tokens_proximity.Rd
@@ -9,19 +9,25 @@ tokens_proximity(
   pattern,
   get_min = TRUE,
   valuetype = c("glob", "regex", "fixed"),
-  count_from = 1
+  count_from = 1,
+  tolower = TRUE,
+  keep_acronyms = FALSE
 )
 }
 \arguments{
-\item{x}{a \code{tokens} or \code{tokens_with_proximity} object}
+\item{x}{a \code{tokens} or \code{tokens_with_proximity} object.}
 
-\item{pattern}{Pattern for selecting keywords, see \link[quanteda:pattern]{quanteda::pattern} for details.}
+\item{pattern}{pattern for selecting keywords, see \link[quanteda:pattern]{quanteda::pattern} for details.}
 
 \item{get_min}{logical, whether to return only the minimum distance or raw distance information; it is more relevant when \code{keywords} have more than one word. See details.}
 
-\item{valuetype}{See \link[quanteda:valuetype]{quanteda::valuetype}}
+\item{valuetype}{See \link[quanteda:valuetype]{quanteda::valuetype}.}
 
 \item{count_from}{numeric, how proximity is counted from when \code{get_min} is \code{TRUE}. The keyword is assigned this proximity. Defaults to 1 (not zero) to prevent division by 0 with the default behaviour of \code{\link[=dfm.tokens_with_proximity]{dfm.tokens_with_proximity()}}.}
+
+\item{tolower}{logical, convert all features to lowercase.}
+
+\item{keep_acronyms}{logical, if \code{TRUE}, do not lowercase any all-uppercase words. See \code{\link[quanteda:tokens_tolower]{quanteda::tokens_tolower()}}.}
 }
 \value{
 a \code{tokens_with_proximity} object. It is similar to \code{\link[quanteda:tokens]{quanteda::tokens()}}, but only the \code{\link[=dfm.tokens_with_proximity]{dfm.tokens_with_proximity()}}, \code{\link[quanteda:convert]{quanteda::convert()}}, \code{\link[quanteda:docvars]{quanteda::docvars()}}, and \code{\link[quanteda:meta]{quanteda::meta()}} methods are available. A \code{tokens_with_proximity} object has a modified \code{\link[=print]{print()}} method. Also, additional data slots are included:
@@ -29,6 +35,8 @@ a \code{tokens_with_proximity} object. It is similar to \code{\link[quanteda:tok
 \item a document variable \code{dist}
 \item a metadata slot \code{keywords}
 \item a metadata slot \code{get_min}
+\item a metadata slot \code{tolower}
+\item a metadata slot \code{keep_acronyms}
 }
 }
 \description{
diff --git a/tests/testthat/test-tokens_dist.R b/tests/testthat/test-tokens_dist.R
index 20e1c5c..fbb85bf 100644
--- a/tests/testthat/test-tokens_dist.R
+++ b/tests/testthat/test-tokens_dist.R
@@ -51,3 +51,14 @@ test_that("token_proximity() only emit token_proximity #35", {
     expect_error(tokens_select(as.tokens(res), "life"), NA)
 })
 
+test_that("tolower", {
+    suppressPackageStartupMessages(library(quanteda))
+    "this is my MIT life" %>% tokens() %>% tokens_proximity("my") -> res
+    expect_false("MIT" %in% attr(res, "types"))
+    "this is my MIT life" %>% tokens() %>% tokens_proximity("my", tolower = FALSE) -> res
+    expect_true("MIT" %in% attr(res, "types"))
+    "this is my MIT life" %>% tokens() %>% tokens_proximity("my", tolower = TRUE, keep_acronyms = TRUE) -> res
+    expect_true("MIT" %in% attr(res, "types"))
+    expect_true("tolower" %in% names(meta(res)))
+    expect_true("keep_acronyms" %in% names(meta(res)))
+})
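
For reference, a minimal usage sketch of the behaviour this patch introduces. It mirrors the new unit test in `tests/testthat/test-tokens_dist.R`; the commented results restate the test's `expect_*` assertions rather than captured output.

``` r
library(quanteda) # provides tokens(), meta() and the %>% pipe; tokens_proximity() comes from this package

txt <- "this is my MIT life" # sentence used in the new unit test

# Default after this patch: features are lowercased before proximity is computed.
res_default <- txt %>% tokens() %>% tokens_proximity("my")
"MIT" %in% attr(res_default, "types") # FALSE, "MIT" has become "mit"

# tolower = FALSE keeps the original casing of all features.
res_cased <- txt %>% tokens() %>% tokens_proximity("my", tolower = FALSE)
"MIT" %in% attr(res_cased, "types") # TRUE

# keep_acronyms = TRUE lowercases everything except all-uppercase words.
res_acronyms <- txt %>% tokens() %>% tokens_proximity("my", tolower = TRUE, keep_acronyms = TRUE)
"MIT" %in% attr(res_acronyms, "types") # TRUE

# Both settings are recorded as metadata on the returned object.
meta(res_default, field = "tolower") # TRUE
meta(res_default, field = "keep_acronyms") # FALSE
```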