diff --git a/NAMESPACE b/NAMESPACE index 9650ae4..78af033 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,10 +1,16 @@ # Generated by roxygen2: do not edit by hand +S3method(as.tokens,tokens_with_proximity) S3method(convert,tokens_with_proximity) S3method(dfm,tokens_with_proximity) +S3method(docvars,tokens_with_proximity) +S3method(meta,tokens_with_proximity) S3method(print,tokens_with_proximity) export(tokens_proximity) +importFrom(quanteda,as.tokens) importFrom(quanteda,convert) importFrom(quanteda,dfm) +importFrom(quanteda,docvars) +importFrom(quanteda,meta) useDynLib(quanteda.proximity, .registration = TRUE) useDynLib(quanteda.proximity,row_mins_) diff --git a/R/get_dist.R b/R/get_dist.R index a531a23..027e496 100644 --- a/R/get_dist.R +++ b/R/get_dist.R @@ -41,14 +41,14 @@ resolve_keywords <- function(keywords, features, valuetype) { #' Extract Proximity Information #' #' This function extracts distance information from a [quanteda::tokens()] object. -#' @param x a `tokens` object +#' @param x a `tokens` or `tokens_with_proximity` object #' @param pattern Pattern for selecting keywords, see [quanteda::pattern] for details. #' @param get_min logical, whether to return only the minimum distance or raw distance information; it is more relevant when `keywords` have more than one word. See details. #' @param valuetype See [quanteda::valuetype] #' @param count_from numeric, how proximity is counted from when `get_min` is `TRUE`. The keyword is assigned with this proximity. Default to 1 (not zero) to prevent division by 0 with the default behaviour of [dfm.tokens_with_proximity()]. #' @details Proximity is measured by the number of tokens away from the keyword. Given a tokenized sentence: \["I", "eat", "this", "apple"\] and suppose "eat" is the keyword. The vector of minimum proximity for each word from "eat" is \[2, 1, 2, 3\], if `count_from` is 1. In another case: \["I", "wash", "and", "eat", "this", "apple"\] and \["wash", "eat"\] are the keywords. The minimal distance vector is \[2, 1, 2, 1, 2, 3\]. If `get_min` is `FALSE`, the output is a list of two vectors. For "wash", the distance vector is \[1, 0, 1, 2, 3\]. For "eat", \[3, 2, 1, 0, 1, 2\]. -#' It is recommended conducting all text maniputation tasks with `tokens_*()` functions before calling this function. -#' @return a `tokens_with_proximity` object. It is a derivative of [quanteda::tokens()], i.e. all `token_*` functions still work. A `tokens_with_proximity` has a modified [print()] method. Also, additional data slots are included +#' Please conduct all text maniputation tasks with `tokens_*()` functions before calling this function. To convert the output back to a `tokens` object, use [quanteda::as.tokens()]. +#' @return a `tokens_with_proximity` object. It is similar to [quanteda::tokens()], but only [dfm.tokens_with_proximity()], [quanteda::convert()], [quanteda::docvars()], and [quanteda::meta()] methods are available. A `tokens_with_proximity` has a modified [print()] method. Also, additional data slots are included #' * a document variation `dist` #' * a metadata slot `keywords` #' * a metadata slot `get_min` @@ -76,8 +76,11 @@ resolve_keywords <- function(keywords, features, valuetype) { #' @seealso [dfm.tokens_with_proximity()] [quanteda::tokens()] #' @export tokens_proximity <- function(x, pattern, get_min = TRUE, valuetype = c("glob", "regex", "fixed"), count_from = 1) { - if (!inherits(x, "tokens")) { - stop("x is not a `tokens` object.", call. = FALSE) + if (!inherits(x, "tokens") && !inherits(x, "tokens_with_proximity")) { + stop("x is not a `tokens` or `tokens_with_proximity` object.", call. = FALSE) + } + if (inherits(x, "tokens_with_proximity")) { + x <- as.tokens(x, remove_docvars_proximity = TRUE) } valuetype <- match.arg(valuetype) keywords <- resolve_keywords(pattern, attr(x, "types"), valuetype) @@ -86,7 +89,7 @@ tokens_proximity <- function(x, pattern, get_min = TRUE, valuetype = c("glob", " quanteda::docvars(toks)$proximity <- I(proximity) quanteda::meta(toks, field = "keywords") <- keywords quanteda::meta(toks, field = "get_min") <- get_min - class(toks) <- c("tokens_with_proximity", "tokens") + class(toks) <- c("tokens_with_proximity") return(toks) } @@ -108,16 +111,42 @@ print.tokens_with_proximity <- function(x, ...) { cat("keywords: ", quanteda::meta(x, field = "keywords"), "\n") } +#' @importFrom quanteda as.tokens +#' @method as.tokens tokens_with_proximity +#' @export +as.tokens.tokens_with_proximity <- function(x, concatenator = "/", remove_docvars_proximity = TRUE, ...) { + if (remove_docvars_proximity) { + attr(x, which = "docvars")$proximity <- NULL + } + class(x) <- "tokens" + return(x) +} + +#' @importFrom quanteda docvars +#' @method docvars tokens_with_proximity +#' @export +docvars.tokens_with_proximity <- function(x, field = NULL) { + quanteda::docvars(as.tokens(x, remove_docvars_proximity = FALSE), field = field) +} + +#' @importFrom quanteda meta +#' @method meta tokens_with_proximity +#' @export +meta.tokens_with_proximity <- function(x, field = NULL, type = c("user", "object", "system", "all")) { + quanteda::meta(as.tokens(x, remove_docvars_proximity = FALSE), field = field, type = type) +} + #' @method convert tokens_with_proximity #' @export #' @importFrom quanteda convert convert.tokens_with_proximity <- function(x, to = c("data.frame"), ...) { to <- match.arg(to) + x_docnames <- attr(x, "docvars")$docname_ result_list <- mapply( FUN = convert_df, tokens_obj = as.list(x), proximity_obj = quanteda::docvars(x, "proximity"), - doc_id = quanteda::docnames(x), + doc_id = x_docnames, SIMPLIFY = FALSE, USE.NAMES = FALSE ) return(do.call(rbind, result_list)) @@ -168,7 +197,8 @@ dfm.tokens_with_proximity <- function(x, tolower = TRUE, remove_padding = FALSE, }, ...) { x_attrs <- attributes(x) x_docvars <- quanteda::docvars(x) - type <- quanteda::types(x) + x_docnames <- attr(x, "docvars")$docname_ + type <- attr(x, "types") temp <- unclass(x) index <- unlist(temp, use.names = FALSE) val <- weight_function(unlist(quanteda::docvars(x, "proximity"), use.names = FALSE)) @@ -180,7 +210,7 @@ dfm.tokens_with_proximity <- function(x, tolower = TRUE, remove_padding = FALSE, length(x), length(type) ), - dimnames = list(quanteda::docnames(x), type) + dimnames = list(x_docnames, type) ) output <- quanteda::as.dfm(temp) attributes(output)[["meta"]] <- x_attrs[["meta"]] diff --git a/man/tokens_proximity.Rd b/man/tokens_proximity.Rd index 1153a29..ef2862e 100644 --- a/man/tokens_proximity.Rd +++ b/man/tokens_proximity.Rd @@ -13,7 +13,7 @@ tokens_proximity( ) } \arguments{ -\item{x}{a \code{tokens} object} +\item{x}{a \code{tokens} or \code{tokens_with_proximity} object} \item{pattern}{Pattern for selecting keywords, see \link[quanteda:pattern]{quanteda::pattern} for details.} @@ -24,7 +24,7 @@ tokens_proximity( \item{count_from}{numeric, how proximity is counted from when \code{get_min} is \code{TRUE}. The keyword is assigned with this proximity. Default to 1 (not zero) to prevent division by 0 with the default behaviour of \code{\link[=dfm.tokens_with_proximity]{dfm.tokens_with_proximity()}}.} } \value{ -a \code{tokens_with_proximity} object. It is a derivative of \code{\link[quanteda:tokens]{quanteda::tokens()}}, i.e. all \verb{token_*} functions still work. A \code{tokens_with_proximity} has a modified \code{\link[=print]{print()}} method. Also, additional data slots are included +a \code{tokens_with_proximity} object. It is similar to \code{\link[quanteda:tokens]{quanteda::tokens()}}, but only \code{\link[=dfm.tokens_with_proximity]{dfm.tokens_with_proximity()}}, \code{\link[quanteda:convert]{quanteda::convert()}}, \code{\link[quanteda:docvars]{quanteda::docvars()}}, and \code{\link[quanteda:meta]{quanteda::meta()}} methods are available. A \code{tokens_with_proximity} has a modified \code{\link[=print]{print()}} method. Also, additional data slots are included \itemize{ \item a document variation \code{dist} \item a metadata slot \code{keywords} @@ -35,8 +35,8 @@ a \code{tokens_with_proximity} object. It is a derivative of \code{\link[quanted This function extracts distance information from a \code{\link[quanteda:tokens]{quanteda::tokens()}} object. } \details{ -Proximity is measured by the number of tokens away from the keyword. Given a tokenized sentence: ["I", "eat", "this", "apple"] and suppose "eat" is the target. The vector of minimum proximity for each word from "eat" is [2, 1, 2, 3], if \code{count_from} is 1. In another case: ["I", "wash", "and", "eat", "this", "apple"] and ["wash", "eat"] are the keywords. The minimal distance vector is [2, 1, 2, 1, 2, 3]. If \code{get_min} is \code{FALSE}, the output is a list of two vectors. For "wash", the distance vector is [1, 0, 1, 2, 3]. For "eat", [3, 2, 1, 0, 1, 2]. -It is recommended conducting all text maniputation tasks with \verb{tokens_*()} functions before calling this function. +Proximity is measured by the number of tokens away from the keyword. Given a tokenized sentence: ["I", "eat", "this", "apple"] and suppose "eat" is the keyword. The vector of minimum proximity for each word from "eat" is [2, 1, 2, 3], if \code{count_from} is 1. In another case: ["I", "wash", "and", "eat", "this", "apple"] and ["wash", "eat"] are the keywords. The minimal distance vector is [2, 1, 2, 1, 2, 3]. If \code{get_min} is \code{FALSE}, the output is a list of two vectors. For "wash", the distance vector is [1, 0, 1, 2, 3]. For "eat", [3, 2, 1, 0, 1, 2]. +Please conduct all text maniputation tasks with \verb{tokens_*()} functions before calling this function. To convert the output back to a \code{tokens} object, use \code{\link[quanteda:as.tokens]{quanteda::as.tokens()}}. } \examples{ library(quanteda) diff --git a/tests/testthat/test-tokens_dist.R b/tests/testthat/test-tokens_dist.R index 663a081..fd83e5c 100644 --- a/tests/testthat/test-tokens_dist.R +++ b/tests/testthat/test-tokens_dist.R @@ -35,3 +35,10 @@ test_that("convert no strange rownames, #39", { expect_true(is.data.frame(res)) expect_equal(rownames(res), c("1", "2", "3", "4")) ## default rownames }) + +test_that("Changing keywords", { + suppressPackageStartupMessages(library(quanteda)) + "this is my life" %>% tokens() %>% tokens_proximity("my") -> res + expect_error(res2 <- tokens_proximity(res, "life"), NA) + expect_equal(meta(res2, "keywords"), "life") +})