Skip to content

Commit

Permalink
Merge pull request #102 from schochastics/data.table
Browse files Browse the repository at this point in the history
reimplemented with Data.table
  • Loading branch information
schochastics authored Mar 11, 2024
2 parents 7acacce + a64e034 commit b615834
Show file tree
Hide file tree
Showing 13 changed files with 184 additions and 231 deletions.
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: webtrackR
Title: Preprocessing and Analyzing Web Tracking Data
Version: 0.2.0
Version: 0.2.0.9000
Authors@R: c(
person("David", "Schoch", email = "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "0000-0003-2952-4812")),
Expand All @@ -19,13 +19,14 @@ Depends:
License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.0
Imports:
utils,
stats,
fastmatch,
adaR,
httr
httr,
data.table
LazyData: true
Suggests:
knitr,
Expand Down
4 changes: 3 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ export(parse_path)
export(sum_activity)
export(sum_durations)
export(sum_visits)
importFrom(data.table,":=")
importFrom(data.table,.N)
importFrom(data.table,.SD)
importFrom(fastmatch,"%fin%")
importFrom(fastmatch,`%fin%`)
importFrom(stats,aggregate)
importFrom(stats,ave)
importFrom(stats,reshape)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# webtrackR 0.2.0.9000

* reverted to a data.table implementation, replacing the base R implementation from 0.2.0

# webtrackR 0.2.0

* reimplemented in base R removing several dependencies
Expand Down
22 changes: 14 additions & 8 deletions R/classify.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,31 +59,37 @@ classify_visits <- function(wt, classes, match_by = "domain",
return_rows_val = NULL) {
abort_if_not_wtdt(wt)
match_by <- match.arg(match_by, c("domain", "host", "regex"))
if (!data.table::is.data.table(classes)) {
stop("classes needs to be a data.table")
}
if (match_by == "domain") {
vars_exist(wt, "domain")
vars_exist(classes, "domain")
wt <- merge(wt, classes, by = "domain", all.x = TRUE)
wt <- classes[wt, on = "domain"]
} else if (match_by == "host") {
vars_exist(wt, "host")
vars_exist(classes, "host")
wt <- merge(wt, classes, by = "host", all.x = TRUE)
wt <- classes[wt, on = "host"]
} else if (match_by == "regex") {
stopifnot("You have to specify regex_on if match_by is set to 'regex'" = !is.null(regex_on))
vars_exist(wt, "url")
vars_exist(classes, regex_on)
wt <- drop_query(wt)
wt <- wt[, tmp_index := seq_len(.N)]
tmp_wt <- wt[, list(tmp_index, url_noquery)]
pattern <- paste(classes[[regex_on]], collapse = "|")
idx <- grepl(pattern, wt$url_noquery)
wt$match <- NA
wt$match[idx] <- regmatches(wt$url_noquery, regexpr(pattern, wt$url_noquery))
names(wt)[names(wt) == "match"] <- names(classes)[names(classes) != regex_on]
tmp_wt_matched <- tmp_wt[grepl(pattern, url_noquery)]
tmp_wt_matched <- tmp_wt_matched[, match := regmatches(url_noquery, regexpr(pattern, url_noquery))]
wt_matched <- tmp_wt_matched[, url_noquery := NULL][wt, on = "tmp_index"]
data.table::setnames(wt_matched, "match", regex_on)
wt <- classes[wt_matched, on = regex_on][, c("url_noquery", "tmp_index") := NULL]
}

if (!is.null(return_rows_by)) {
vars_exist(classes, return_rows_by)
stopifnot("You have to specify return_rows_val if return_rows_by is not NULL" = !is.null(return_rows_val))
wt <- wt[!is.na(wt[[return_rows_by]]) & wt[[return_rows_by]] == return_rows_val, ]
wt <- wt[get(return_rows_by) == return_rows_val]
}
class(wt) <- c("wt_dt", class(wt))
wt
wt[]
}
8 changes: 8 additions & 0 deletions R/globals.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Names used as bare symbols inside non-standard-evaluation calls (for example
# the data.table `[` expressions in this package, such as `tmp_index` and
# `url_noquery`). Registering them here tells the code-analysis step of
# `R CMD check` that they are intentional and not undefined global variables.
# Each name is listed exactly once.
utils::globalVariables(c(
  "duration", "timestamp", "panelist_id", "domain", "visit", "day", "type",
  "prev_type", "tmp", "session", "path",
  "host", "suffix", "domain_name", "url_next", "host_next", "domain_next",
  "url_previous", "host_previous", "domain_previous",
  "date", "week", "month", "year", "wave", "duplicate", "i.type", "title",
  "visits", "url_noquery", "referral",
  "tmp_timestamp_next", "tmp_url_next", "tmp_host", "tmp_suffix", "tmp_path",
  "tmp_scheme", "tmp_domain_name", "tmp_index",
  "tmp_class", "tmp_duration", "tmp_timeframe", "tmp_visits", "tmp_last",
  "device", "device_next",
  "tmp_timestamp_prev", "tmp_url_prev"
))
Loading

0 comments on commit b615834

Please sign in to comment.