Skip to content
This repository was archived by the owner on Feb 11, 2024. It is now read-only.

Use quanteda::index() ref #38 #44

Merged
merged 7 commits into from
Nov 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,5 @@ Suggests:
Config/testthat/edition: 3
Imports:
quanteda,
Matrix,
utils
Matrix
VignetteBuilder: knitr
80 changes: 47 additions & 33 deletions R/get_dist.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,39 +3,49 @@ row_mins_c <- function(mat) {
.Call("row_mins_", mat, as.integer(nrow(mat)), as.integer(ncol(mat)))
}

cal_dist <- function(y, poss) {
return(abs(y - poss))
cal_dist <- function(from, to, poss) {
return(pmin(abs(to - poss), abs(from - poss)))
}

cal_proximity <- function(tokenized_text, keywords_poss, get_min = TRUE, count_from = 1) {
target_idx <- which(tokenized_text %in% keywords_poss)
poss <- seq_along(tokenized_text)
if (length(target_idx) == 0) {
return(rep(length(poss) + count_from, length(poss)))
}
res <- sapply(target_idx, cal_dist, poss = poss)
if (get_min) {
return(row_mins_c(res) + count_from)
}
return(res)
}

get_proximity <- function(x, keywords, get_min = TRUE, count_from = 1) {
keywords_poss <- which(attr(x, "types") %in% keywords)
return(lapply(unclass(x), cal_proximity, keywords_poss = keywords_poss, get_min = get_min, count_from = count_from))
cal_dist_singular <- function(from, to, poss) {
return(abs(from - poss))
}

resolve_keywords <- function(keywords, features, valuetype) {
if (valuetype == "fixed") {
return(keywords)
get_proximity <- function(x, pattern, get_min = TRUE, count_from = 1, valuetype, case_insensitive) {
output <- list()
idx <- quanteda::index(x, pattern = pattern, valuetype = valuetype, case_insensitive = case_insensitive)
singular_pattern_only <- all(idx$to == idx$from)
if (singular_pattern_only) {
cal_func <- cal_dist_singular
} else {
cal_func <- cal_dist
}
if (valuetype == "glob") {
regex <- paste(utils::glob2rx(keywords), collapse = "|")
nt <- as.numeric(quanteda::ntoken(x))
dn <- quanteda::docnames(x)
for (i in seq_along(x)) {
if (dn[i] %in% idx$docname) {
poss <- seq_len(nt[i])
matched_rows <- idx$docname == dn[i]
res <- mapply(cal_func, from = idx$from[matched_rows], to = idx$to[matched_rows], MoreArgs = list("poss" = poss))
if (get_min) {
output[[i]] <- row_mins_c(res) + count_from
} else {
output[[i]] <- res
}
} else {
output[[i]] <- rep(nt[i] + count_from, nt[i])
}
}
if (valuetype == "regex") {
regex <- paste(keywords, collapse = "|")
names(output) <- quanteda::docnames(x)
return(output)
}

pp <- function(pattern) {
## pretty print the pattern if it contains phrases
if (!is.list(pattern)) {
return(pattern)
}
return(grep(regex, features, value = TRUE))
return(vapply(pattern, paste, collapse = " ", character(1)))
}

#' Extract Proximity Information
Expand All @@ -45,6 +55,7 @@ resolve_keywords <- function(keywords, features, valuetype) {
#' @param pattern pattern for selecting keywords, see [quanteda::pattern] for details.
#' @param get_min logical, whether to return only the minimum distance or raw distance information; it is more relevant when `keywords` have more than one word. See details.
#' @param valuetype See [quanteda::valuetype].
#' @param case_insensitive logical, see [quanteda::valuetype].
#' @param count_from numeric, how proximity is counted from when `get_min` is `TRUE`. The keyword is assigned with this proximity. Default to 1 (not zero) to prevent division by 0 with the default behaviour of [dfm.tokens_with_proximity()].
#' @param tolower logical, convert all features to lowercase.
#' @param keep_acronyms logical, if `TRUE`, do not lowercase any all-uppercase words. See [quanteda::tokens_tolower()].
Expand Down Expand Up @@ -79,7 +90,7 @@ resolve_keywords <- function(keywords, features, valuetype) {
#' tok1 %>% tokens_proximity("britain")
#' @seealso [dfm.tokens_with_proximity()] [quanteda::tokens()]
#' @export
tokens_proximity <- function(x, pattern, get_min = TRUE, valuetype = c("glob", "regex", "fixed"), count_from = 1,
tokens_proximity <- function(x, pattern, get_min = TRUE, valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE, count_from = 1,
tolower = TRUE, keep_acronyms = FALSE) {
if (!inherits(x, "tokens") && !inherits(x, "tokens_with_proximity")) {
stop("x is not a `tokens` or `tokens_with_proximity` object.", call. = FALSE)
Expand All @@ -91,11 +102,13 @@ tokens_proximity <- function(x, pattern, get_min = TRUE, valuetype = c("glob", "
x <- quanteda::tokens_tolower(x, keep_acronyms = keep_acronyms)
}
valuetype <- match.arg(valuetype)
keywords <- resolve_keywords(pattern, attr(x, "types"), valuetype)
toks <- x
proximity <- get_proximity(x = toks, keywords = keywords, get_min = get_min, count_from = count_from)
proximity <- get_proximity(x = toks, pattern = pattern, get_min = get_min, count_from = count_from,
valuetype = valuetype, case_insensitive = case_insensitive)
quanteda::docvars(toks)$proximity <- I(proximity)
quanteda::meta(toks, field = "keywords") <- keywords
## only for printing
quanteda::meta(toks, field = "pattern") <- pp(pattern)
attr(toks, "pattern") <- pattern ## custom field
quanteda::meta(toks, field = "get_min") <- get_min
quanteda::meta(toks, field = "tolower") <- tolower
quanteda::meta(toks, field = "keep_acronyms") <- keep_acronyms
Expand All @@ -116,7 +129,7 @@ convert_df <- function(tokens_obj, proximity_obj, doc_id) {
print.tokens_with_proximity <- function(x, ...) {
print(as.tokens(x), ...)
cat("With proximity vector(s).\n")
cat("keywords: ", quanteda::meta(x, field = "keywords"), "\n")
cat("Pattern: ", quanteda::meta(x, field = "pattern"), "\n")
}

#' @importFrom quanteda as.tokens
Expand All @@ -125,6 +138,7 @@ print.tokens_with_proximity <- function(x, ...) {
as.tokens.tokens_with_proximity <- function(x, concatenator = "/", remove_docvars_proximity = TRUE, ...) {
if (remove_docvars_proximity) {
attr(x, which = "docvars")$proximity <- NULL
attr(x, which = "pattern") <- NULL
}
class(x) <- "tokens"
return(x)
Expand All @@ -134,14 +148,14 @@ as.tokens.tokens_with_proximity <- function(x, concatenator = "/", remove_docvar
#' @method docvars tokens_with_proximity
#' @export
docvars.tokens_with_proximity <- function(x, field = NULL) {
quanteda::docvars(as.tokens(x, remove_docvars_proximity = FALSE), field = field)
return(quanteda::docvars(as.tokens(x, remove_docvars_proximity = FALSE), field = field))
}

#' @importFrom quanteda meta
#' @method meta tokens_with_proximity
#' @export
meta.tokens_with_proximity <- function(x, field = NULL, type = c("user", "object", "system", "all")) {
quanteda::meta(as.tokens(x, remove_docvars_proximity = FALSE), field = field, type = type)
return(quanteda::meta(as.tokens(x, remove_docvars_proximity = FALSE), field = field, type = type))
}

#' @method convert tokens_with_proximity
Expand Down
15 changes: 15 additions & 0 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,21 @@ docvars(tok3, "proximity")
dfm(tok3) %>% dfm_lookup(dict1) %>% rowSums()
```

Can we use [phrase](https://quanteda.io/reference/phrase.html)?

```{r}
tok4 <- tok1 %>% tokens_proximity(pattern = phrase("Tayyip Erdogan"))
tok4
```

```{r}
docvars(tok4, "proximity")
```

```{r}
dfm(tok4) %>% dfm_lookup(dict1) %>% rowSums()
```

# Similar functions

* [quanteda](https://quanteda.io/): `quanteda::tokens_select(window)`, `quanteda::fcm()`
Expand Down
44 changes: 41 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ tok1
#> [ ... and 31 more ]
#>
#> With proximity vector(s).
#> keywords: turkish
#> Pattern: turkish
```

You can access the proximity vectors by
Expand Down Expand Up @@ -150,7 +150,7 @@ tok2
#> [ ... and 31 more ]
#>
#> With proximity vector(s).
#> keywords: hamas
#> Pattern: hamas
```

``` r
Expand All @@ -177,7 +177,7 @@ tok3
#> [ ... and 31 more ]
#>
#> With proximity vector(s).
#> keywords: eu brussels
#> Pattern: eu brussels
```

``` r
Expand All @@ -197,6 +197,44 @@ dfm(tok3) %>% dfm_lookup(dict1) %>% rowSums()
#> 0.02564103 0.45833333
```

Can we use [phrase](https://quanteda.io/reference/phrase.html)?

``` r
tok4 <- tok1 %>% tokens_proximity(pattern = phrase("Tayyip Erdogan"))
tok4
#> Tokens consisting of 2 documents.
#> text1 :
#> [1] "turkish" "president" "tayyip" "erdogan" "," "in"
#> [7] "his" "strongest" "comments" "yet" "on" "the"
#> [ ... and 26 more ]
#>
#> text2 :
#> [1] "eu" "policymakers" "proposed" "the" "new"
#> [6] "agency" "in" "2021" "to" "stop"
#> [11] "financial" "firms"
#> [ ... and 31 more ]
#>
#> With proximity vector(s).
#> Pattern: Tayyip Erdogan
```

``` r
docvars(tok4, "proximity")
#> $text1
#> [1] 3 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
#> [26] 23 24 25 26 27 28 29 30 31 32 33 34 35
#>
#> $text2
#> [1] 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44
#> [26] 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44
```

``` r
dfm(tok4) %>% dfm_lookup(dict1) %>% rowSums()
#> text1 text2
#> 0.04166667 0.04545455
```

# Similar functions

- [quanteda](https://quanteda.io/): `quanteda::tokens_select(window)`,
Expand Down
3 changes: 3 additions & 0 deletions man/tokens_proximity.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 18 additions & 8 deletions tests/testthat/test-tokens_dist.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,6 @@ test_that("edge cases", {
expect_error("" %>% tokens() %>% tokens_proximity("") %>% convert(), NA)
})

test_that("resolve_keywords", {
expect_equal(resolve_keywords(c("abc", "def"), c("abcd", "defa"), valuetype = "fixed"), c("abc", "def"))
expect_equal(resolve_keywords(c("abc*", "def*"), c("abcd", "defa"), valuetype = "glob"), c("abcd", "defa"))
expect_equal(resolve_keywords(c("a"), c("abcd", "defa"), valuetype = "regex"), c("abcd", "defa"))
})

test_that("count_from", {
suppressPackageStartupMessages(library(quanteda))
"this is my life" %>% tokens() %>% tokens_proximity("my") %>% docvars("proximity") -> res
Expand All @@ -36,11 +30,11 @@ test_that("convert no strange rownames, #39", {
expect_equal(rownames(res), c("1", "2", "3", "4")) ## default rownames
})

test_that("Changing keywords", {
test_that("Changing pattern", {
suppressPackageStartupMessages(library(quanteda))
"this is my life" %>% tokens() %>% tokens_proximity("my") -> res
expect_error(res2 <- tokens_proximity(res, "life"), NA)
expect_equal(meta(res2, "keywords"), "life")
expect_equal(meta(res2, "pattern"), "life")
})

test_that("token_proximity() only emit token_proximity #35", {
Expand All @@ -62,3 +56,19 @@ test_that("tolower", {
expect_true("tolower" %in% names(meta(res)))
expect_true("keep_acronyms" %in% names(meta(res)))
})

test_that("case_insensitive", {
suppressPackageStartupMessages(library(quanteda))
"this is my MIT life" %>% tokens() %>% tokens_proximity("MIT") -> res
expect_false("MIT" %in% attr(res, "types"))
expect_equal(docvars(res, "proximity")$text1, c(4, 3, 2, 1, 2))
"this is my MIT life" %>% tokens() %>% tokens_proximity("MIT", case_insensitive = FALSE) -> res
expect_false("MIT" %in% attr(res, "types"))
expect_equal(docvars(res, "proximity")$text1, c(6, 6, 6, 6, 6))
})

test_that("phrase", {
suppressPackageStartupMessages(library(quanteda))
expect_error("Seid ihr das Essen? Nein, wir sind die Jäger." %>% tokens() %>% tokens_proximity(phrase("das Essen")) -> res, NA)
expect_equal(docvars(res, "proximity")$text1, c(3,2,1,1,2,3,4,5,6,7,8,9))
})