Make the class unique and add several methods ref #35 (#42)

* Make the class unique and add several methods ref #35 * Make tokens_proxitmity() still work for changing keywords [no ci] * Update Doc [no ci]
gesistsa · Nov 20, 2023 · 3fdd505 · 3fdd505
1 parent ce7af1e
commit 3fdd505
Show file tree

Hide file tree

Showing 4 changed files with 56 additions and 13 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,10 +1,16 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(as.tokens,tokens_with_proximity)
 S3method(convert,tokens_with_proximity)
 S3method(dfm,tokens_with_proximity)
+S3method(docvars,tokens_with_proximity)
+S3method(meta,tokens_with_proximity)
 S3method(print,tokens_with_proximity)
 export(tokens_proximity)
+importFrom(quanteda,as.tokens)
 importFrom(quanteda,convert)
 importFrom(quanteda,dfm)
+importFrom(quanteda,docvars)
+importFrom(quanteda,meta)
 useDynLib(quanteda.proximity, .registration = TRUE)
 useDynLib(quanteda.proximity,row_mins_)
diff --git a/R/get_dist.R b/R/get_dist.R
@@ -41,14 +41,14 @@ resolve_keywords <- function(keywords, features, valuetype) {
 #' Extract Proximity Information
 #'
 #' This function extracts distance information from a [quanteda::tokens()] object.
-#' @param x a `tokens` object
+#' @param x a `tokens` or `tokens_with_proximity` object
 #' @param pattern Pattern for selecting keywords, see [quanteda::pattern] for details.
 #' @param get_min logical, whether to return only the minimum distance or raw distance information; it is more relevant when `keywords` have more than one word. See details.
 #' @param valuetype See [quanteda::valuetype]
 #' @param count_from numeric, how proximity is counted from when `get_min` is `TRUE`. The keyword is assigned with this proximity. Default to 1 (not zero) to prevent division by 0 with the default behaviour of [dfm.tokens_with_proximity()].
 #' @details Proximity is measured by the number of tokens away from the keyword. Given a tokenized sentence: \["I", "eat", "this", "apple"\] and suppose "eat" is the keyword. The vector of minimum proximity for each word from "eat" is \[2, 1, 2, 3\], if `count_from` is 1. In another case: \["I", "wash", "and", "eat", "this", "apple"\] and \["wash", "eat"\] are the keywords. The minimal distance vector is \[2, 1, 2, 1, 2, 3\]. If `get_min` is `FALSE`, the output is a list of two vectors. For "wash", the distance vector is \[1, 0, 1, 2, 3\]. For "eat", \[3, 2, 1, 0, 1, 2\].
-#' It is recommended conducting all text maniputation tasks with `tokens_*()` functions before calling this function.
-#' @return a `tokens_with_proximity` object. It is a derivative of [quanteda::tokens()], i.e. all `token_*` functions still work. A `tokens_with_proximity` has a modified [print()] method. Also, additional data slots are included
+#' Please conduct all text maniputation tasks with `tokens_*()` functions before calling this function. To convert the output back to a `tokens` object, use [quanteda::as.tokens()].
+#' @return a `tokens_with_proximity` object. It is similar to [quanteda::tokens()], but only [dfm.tokens_with_proximity()], [quanteda::convert()], [quanteda::docvars()], and [quanteda::meta()] methods are available. A `tokens_with_proximity` has a modified [print()] method. Also, additional data slots are included
 #' * a document variation `dist`
 #' * a metadata slot `keywords`
 #' * a metadata slot `get_min`
@@ -76,8 +76,11 @@ resolve_keywords <- function(keywords, features, valuetype) {
 #' @seealso [dfm.tokens_with_proximity()] [quanteda::tokens()]
 #' @export
 tokens_proximity <- function(x, pattern, get_min = TRUE, valuetype = c("glob", "regex", "fixed"), count_from = 1) {
-    if (!inherits(x, "tokens")) {
-        stop("x is not a `tokens` object.", call. = FALSE)
+    if (!inherits(x, "tokens") && !inherits(x, "tokens_with_proximity")) {
+        stop("x is not a `tokens` or `tokens_with_proximity` object.", call. = FALSE)
+    }
+    if (inherits(x, "tokens_with_proximity")) {
+        x <- as.tokens(x, remove_docvars_proximity = TRUE)
     }
     valuetype <- match.arg(valuetype)
     keywords <- resolve_keywords(pattern, attr(x, "types"), valuetype)
@@ -86,7 +89,7 @@ tokens_proximity <- function(x, pattern, get_min = TRUE, valuetype = c("glob", "
     quanteda::docvars(toks)$proximity <- I(proximity)
     quanteda::meta(toks, field = "keywords") <- keywords
     quanteda::meta(toks, field = "get_min") <- get_min
-    class(toks) <- c("tokens_with_proximity", "tokens")
+    class(toks) <- c("tokens_with_proximity")
     return(toks)
 }
 
@@ -108,16 +111,42 @@ print.tokens_with_proximity <- function(x, ...) {
     cat("keywords: ", quanteda::meta(x, field = "keywords"), "\n")
 }
 
+#' @importFrom quanteda as.tokens
+#' @method as.tokens tokens_with_proximity
+#' @export
+as.tokens.tokens_with_proximity <- function(x, concatenator = "/", remove_docvars_proximity = TRUE, ...) {
+    if (remove_docvars_proximity) {
+        attr(x, which = "docvars")$proximity <- NULL
+    }
+    class(x) <- "tokens"
+    return(x)
+}
+
+#' @importFrom quanteda docvars
+#' @method docvars tokens_with_proximity
+#' @export
+docvars.tokens_with_proximity <- function(x, field = NULL) {
+    quanteda::docvars(as.tokens(x, remove_docvars_proximity = FALSE), field = field)
+}
+
+#' @importFrom quanteda meta
+#' @method meta tokens_with_proximity
+#' @export
+meta.tokens_with_proximity <- function(x, field = NULL, type = c("user", "object", "system", "all")) {
+    quanteda::meta(as.tokens(x, remove_docvars_proximity = FALSE), field = field, type = type)
+}
+
 #' @method convert tokens_with_proximity
 #' @export
 #' @importFrom quanteda convert
 convert.tokens_with_proximity <- function(x, to = c("data.frame"), ...) {
     to <- match.arg(to)
+    x_docnames <- attr(x, "docvars")$docname_
     result_list <- mapply(
         FUN = convert_df,
         tokens_obj = as.list(x),
         proximity_obj = quanteda::docvars(x, "proximity"),
-        doc_id = quanteda::docnames(x),
+        doc_id = x_docnames,
         SIMPLIFY = FALSE, USE.NAMES = FALSE
     )
     return(do.call(rbind, result_list))
@@ -168,7 +197,8 @@ dfm.tokens_with_proximity <- function(x, tolower = TRUE, remove_padding = FALSE,
                                       }, ...) {
     x_attrs <- attributes(x)
     x_docvars <- quanteda::docvars(x)
-    type <- quanteda::types(x)
+    x_docnames <- attr(x, "docvars")$docname_
+    type <- attr(x, "types")
     temp <- unclass(x)
     index <- unlist(temp, use.names = FALSE)
     val <- weight_function(unlist(quanteda::docvars(x, "proximity"), use.names = FALSE))
@@ -180,7 +210,7 @@ dfm.tokens_with_proximity <- function(x, tolower = TRUE, remove_padding = FALSE,
             length(x),
             length(type)
         ),
-        dimnames = list(quanteda::docnames(x), type)
+        dimnames = list(x_docnames, type)
     )
     output <- quanteda::as.dfm(temp)
     attributes(output)[["meta"]] <- x_attrs[["meta"]]

diff --git a/man/tokens_proximity.Rd b/man/tokens_proximity.Rd
diff --git a/tests/testthat/test-tokens_dist.R b/tests/testthat/test-tokens_dist.R
@@ -35,3 +35,10 @@ test_that("convert no strange rownames, #39", {
     expect_true(is.data.frame(res))
     expect_equal(rownames(res), c("1", "2", "3", "4")) ## default rownames
 })
+
+test_that("Changing keywords", {
+    suppressPackageStartupMessages(library(quanteda))
+    "this is my life" %>% tokens() %>% tokens_proximity("my") -> res
+    expect_error(res2 <- tokens_proximity(res, "life"), NA)
+    expect_equal(meta(res2, "keywords"), "life")
+})