gesistsa · chainsawriot · Nov 21, 2023 · Nov 21, 2023 · Nov 21, 2023 · Nov 21, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -19,6 +19,5 @@ Suggests:
 Config/testthat/edition: 3
 Imports: 
     quanteda,
-    Matrix,
-    utils
+    Matrix
 VignetteBuilder: knitr
diff --git a/R/get_dist.R b/R/get_dist.R
@@ -3,39 +3,49 @@ row_mins_c <- function(mat) {
     .Call("row_mins_", mat, as.integer(nrow(mat)), as.integer(ncol(mat)))
 }
 
-cal_dist <- function(y, poss) {
-    return(abs(y - poss))
+cal_dist <- function(from, to, poss) {
+    return(pmin(abs(to - poss), abs(from - poss)))
 }
 
-cal_proximity <- function(tokenized_text, keywords_poss, get_min = TRUE, count_from = 1) {
-    target_idx <- which(tokenized_text %in% keywords_poss)
-    poss <- seq_along(tokenized_text)
-    if (length(target_idx) == 0) {
-        return(rep(length(poss) + count_from, length(poss)))
-    }
-    res <- sapply(target_idx, cal_dist, poss = poss)
-    if (get_min) {
-        return(row_mins_c(res) + count_from)
-    }
-    return(res)
-}
-
-get_proximity <- function(x, keywords, get_min = TRUE, count_from = 1) {
-    keywords_poss <- which(attr(x, "types") %in% keywords)
-    return(lapply(unclass(x), cal_proximity, keywords_poss = keywords_poss, get_min = get_min, count_from = count_from))
+cal_dist_singular <- function(from, to, poss) {
+    return(abs(from - poss))
 }
 
-resolve_keywords <- function(keywords, features, valuetype) {
-    if (valuetype == "fixed") {
-        return(keywords)
+get_proximity <- function(x, pattern, get_min = TRUE, count_from = 1, valuetype, case_insensitive) {
+    output <- list()
+    idx <- quanteda::index(x, pattern = pattern, valuetype = valuetype, case_insensitive = case_insensitive)
+    singular_pattern_only <- all(idx$to == idx$from)
+    if (singular_pattern_only) {
+        cal_func <- cal_dist_singular
+    } else {
+        cal_func <- cal_dist
     }
-    if (valuetype == "glob") {
-        regex <- paste(utils::glob2rx(keywords), collapse = "|")
+    nt <- as.numeric(quanteda::ntoken(x))
+    dn <- quanteda::docnames(x)
+    for (i in seq_along(x)) {
+        if (dn[i] %in% idx$docname) {
+            poss <- seq_len(nt[i])
+            matched_rows <- idx$docname == dn[i]
+            res <- mapply(cal_func, from = idx$from[matched_rows], to = idx$to[matched_rows], MoreArgs = list("poss" = poss))
+            if (get_min) {
+                output[[i]] <- row_mins_c(res) + count_from
+            } else {
+                output[[i]] <- res
+            }
+        } else {
+            output[[i]] <- rep(nt[i] + count_from, nt[i])
+        }
     }
-    if (valuetype == "regex") {
-        regex <- paste(keywords, collapse = "|")
+    names(output) <- quanteda::docnames(x)
+    return(output)
+}
+
+pp <- function(pattern) {
+    ## pretty print the pattern if it contains phrases
+    if (!is.list(pattern)) {
+        return(pattern)
     }
-    return(grep(regex, features, value = TRUE))
+    return(vapply(pattern, paste, collapse = " ", character(1)))
 }
 
 #' Extract Proximity Information
@@ -45,6 +55,7 @@ resolve_keywords <- function(keywords, features, valuetype) {
 #' @param pattern pattern for selecting keywords, see [quanteda::pattern] for details.
 #' @param get_min logical, whether to return only the minimum distance or raw distance information; it is more relevant when `keywords` have more than one word. See details.
 #' @param valuetype See [quanteda::valuetype].
+#' @param case_insensitive logical, see [quanteda::valuetype].
 #' @param count_from numeric, how proximity is counted from when `get_min` is `TRUE`. The keyword is assigned with this proximity. Default to 1 (not zero) to prevent division by 0 with the default behaviour of [dfm.tokens_with_proximity()].
 #' @param tolower logical, convert all features to lowercase.
 #' @param keep_acronyms logical, if `TRUE`, do not lowercase any all-uppercase words. See [quanteda::tokens_tolower()].
@@ -79,7 +90,7 @@ resolve_keywords <- function(keywords, features, valuetype) {
 #' tok1 %>% tokens_proximity("britain")
 #' @seealso [dfm.tokens_with_proximity()] [quanteda::tokens()]
 #' @export
-tokens_proximity <- function(x, pattern, get_min = TRUE, valuetype = c("glob", "regex", "fixed"), count_from = 1,
+tokens_proximity <- function(x, pattern, get_min = TRUE, valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE, count_from = 1,
                              tolower = TRUE, keep_acronyms = FALSE) {
     if (!inherits(x, "tokens") && !inherits(x, "tokens_with_proximity")) {
         stop("x is not a `tokens` or `tokens_with_proximity` object.", call. = FALSE)
@@ -91,11 +102,13 @@ tokens_proximity <- function(x, pattern, get_min = TRUE, valuetype = c("glob", "
         x <- quanteda::tokens_tolower(x, keep_acronyms = keep_acronyms)
     }
     valuetype <- match.arg(valuetype)
-    keywords <- resolve_keywords(pattern, attr(x, "types"), valuetype)
     toks <- x
-    proximity <- get_proximity(x = toks, keywords = keywords, get_min = get_min, count_from = count_from)
+    proximity <- get_proximity(x = toks, pattern = pattern, get_min = get_min, count_from = count_from,
+                               valuetype = valuetype, case_insensitive = case_insensitive)
     quanteda::docvars(toks)$proximity <- I(proximity)
-    quanteda::meta(toks, field = "keywords") <- keywords
+    ## only for printing
+    quanteda::meta(toks, field = "pattern") <- pp(pattern)
+    attr(toks, "pattern") <- pattern ## custom field
     quanteda::meta(toks, field = "get_min") <- get_min
     quanteda::meta(toks, field = "tolower") <- tolower
     quanteda::meta(toks, field = "keep_acronyms") <- keep_acronyms
@@ -116,7 +129,7 @@ convert_df <- function(tokens_obj, proximity_obj, doc_id) {
 print.tokens_with_proximity <- function(x, ...) {
     print(as.tokens(x), ...)
     cat("With proximity vector(s).\n")
-    cat("keywords: ", quanteda::meta(x, field = "keywords"), "\n")
+    cat("Pattern: ", quanteda::meta(x, field = "pattern"), "\n")
 }
 
 #' @importFrom quanteda as.tokens
@@ -125,6 +138,7 @@ print.tokens_with_proximity <- function(x, ...) {
 as.tokens.tokens_with_proximity <- function(x, concatenator = "/", remove_docvars_proximity = TRUE, ...) {
     if (remove_docvars_proximity) {
         attr(x, which = "docvars")$proximity <- NULL
+        attr(x, which = "pattern") <- NULL
     }
     class(x) <- "tokens"
     return(x)
@@ -134,14 +148,14 @@ as.tokens.tokens_with_proximity <- function(x, concatenator = "/", remove_docvar
 #' @method docvars tokens_with_proximity
 #' @export
 docvars.tokens_with_proximity <- function(x, field = NULL) {
-    quanteda::docvars(as.tokens(x, remove_docvars_proximity = FALSE), field = field)
+    return(quanteda::docvars(as.tokens(x, remove_docvars_proximity = FALSE), field = field))
 }
 
 #' @importFrom quanteda meta
 #' @method meta tokens_with_proximity
 #' @export
 meta.tokens_with_proximity <- function(x, field = NULL, type = c("user", "object", "system", "all")) {
-    quanteda::meta(as.tokens(x, remove_docvars_proximity = FALSE), field = field, type = type)
+    return(quanteda::meta(as.tokens(x, remove_docvars_proximity = FALSE), field = field, type = type))
 }
 
 #' @method convert tokens_with_proximity

diff --git a/README.Rmd b/README.Rmd
@@ -110,6 +110,21 @@ docvars(tok3, "proximity")
 dfm(tok3) %>% dfm_lookup(dict1) %>% rowSums()
 ```
 
+Can we use [phrase](https://quanteda.io/reference/phrase.html)?
+
+```{r}
+tok4 <- tok1 %>% tokens_proximity(pattern = phrase("Tayyip Erdogan"))
+tok4
+```
+
+```{r}
+docvars(tok4, "proximity")
+```
+
+```{r}
+dfm(tok4) %>% dfm_lookup(dict1) %>% rowSums()
+```
+
 # Similar functions
 
 * [quanteda](https://quanteda.io/): `quanteda::tokens_select(window)`, `quanteda::fcm()`

diff --git a/README.md b/README.md
@@ -56,7 +56,7 @@ tok1
 #> [ ... and 31 more ]
 #> 
 #> With proximity vector(s).
-#> keywords:  turkish
+#> Pattern:  turkish
 ```
 
 You can access the proximity vectors by
@@ -150,7 +150,7 @@ tok2
 #> [ ... and 31 more ]
 #> 
 #> With proximity vector(s).
-#> keywords:  hamas
+#> Pattern:  hamas
 ```
 
 ``` r
@@ -177,7 +177,7 @@ tok3
 #> [ ... and 31 more ]
 #> 
 #> With proximity vector(s).
-#> keywords:  eu brussels
+#> Pattern:  eu brussels
 ```
 
 ``` r
@@ -197,6 +197,44 @@ dfm(tok3) %>% dfm_lookup(dict1) %>% rowSums()
 #> 0.02564103 0.45833333
 ```
 
+Can we use [phrase](https://quanteda.io/reference/phrase.html)?
+
+``` r
+tok4 <- tok1 %>% tokens_proximity(pattern = phrase("Tayyip Erdogan"))
+tok4
+#> Tokens consisting of 2 documents.
+#> text1 :
+#>  [1] "turkish"   "president" "tayyip"    "erdogan"   ","         "in"       
+#>  [7] "his"       "strongest" "comments"  "yet"       "on"        "the"      
+#> [ ... and 26 more ]
+#> 
+#> text2 :
+#>  [1] "eu"           "policymakers" "proposed"     "the"          "new"         
+#>  [6] "agency"       "in"           "2021"         "to"           "stop"        
+#> [11] "financial"    "firms"       
+#> [ ... and 31 more ]
+#> 
+#> With proximity vector(s).
+#> Pattern:  Tayyip Erdogan
+```
+
+``` r
+docvars(tok4, "proximity")
+#> $text1
+#>  [1]  3  2  1  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
+#> [26] 23 24 25 26 27 28 29 30 31 32 33 34 35
+#> 
+#> $text2
+#>  [1] 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44
+#> [26] 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44
+```
+
+``` r
+dfm(tok4) %>% dfm_lookup(dict1) %>% rowSums()
+#>      text1      text2 
+#> 0.04166667 0.04545455
+```
+
 # Similar functions
 
   - [quanteda](https://quanteda.io/): `quanteda::tokens_select(window)`,

diff --git a/man/tokens_proximity.Rd b/man/tokens_proximity.Rd
diff --git a/tests/testthat/test-tokens_dist.R b/tests/testthat/test-tokens_dist.R
@@ -6,12 +6,6 @@ test_that("edge cases", {
     expect_error("" %>% tokens() %>% tokens_proximity("") %>% convert(), NA)
 })
 
-test_that("resolve_keywords", {
-    expect_equal(resolve_keywords(c("abc", "def"), c("abcd", "defa"), valuetype = "fixed"), c("abc", "def"))
-    expect_equal(resolve_keywords(c("abc*", "def*"), c("abcd", "defa"), valuetype = "glob"), c("abcd", "defa"))
-    expect_equal(resolve_keywords(c("a"), c("abcd", "defa"), valuetype = "regex"), c("abcd", "defa"))
-})
-
 test_that("count_from", {
     suppressPackageStartupMessages(library(quanteda))
     "this is my life" %>% tokens() %>% tokens_proximity("my") %>% docvars("proximity") -> res
@@ -36,11 +30,11 @@ test_that("convert no strange rownames, #39", {
     expect_equal(rownames(res), c("1", "2", "3", "4")) ## default rownames
 })
 
-test_that("Changing keywords", {
+test_that("Changing pattern", {
     suppressPackageStartupMessages(library(quanteda))
     "this is my life" %>% tokens() %>% tokens_proximity("my") -> res
     expect_error(res2 <- tokens_proximity(res, "life"), NA)
-    expect_equal(meta(res2, "keywords"), "life")
+    expect_equal(meta(res2, "pattern"), "life")
 })
 
 test_that("token_proximity() only emit token_proximity #35", {
@@ -62,3 +56,19 @@ test_that("tolower", {
     expect_true("tolower" %in% names(meta(res)))
     expect_true("keep_acronyms" %in% names(meta(res)))    
 })
+
+test_that("case_insensitive", {
+    suppressPackageStartupMessages(library(quanteda))
+    "this is my MIT life" %>% tokens() %>% tokens_proximity("MIT") -> res
+    expect_false("MIT" %in% attr(res, "types"))
+    expect_equal(docvars(res, "proximity")$text1, c(4, 3, 2, 1, 2))
+    "this is my MIT life" %>% tokens() %>% tokens_proximity("MIT", case_insensitive = FALSE) -> res
+    expect_false("MIT" %in% attr(res, "types"))
+    expect_equal(docvars(res, "proximity")$text1, c(6, 6, 6, 6, 6))    
+})
+
+test_that("phrase", {
+    suppressPackageStartupMessages(library(quanteda))
+    expect_error("Seid ihr das Essen? Nein, wir sind die Jäger." %>% tokens() %>% tokens_proximity(phrase("das Essen")) -> res, NA)
+    expect_equal(docvars(res, "proximity")$text1, c(3,2,1,1,2,3,4,5,6,7,8,9))
+})