From 8548c669dd3fbd0d79a4b69080f138e610d68edc Mon Sep 17 00:00:00 2001 From: schochastics Date: Sun, 24 Sep 2023 20:32:07 +0200 Subject: [PATCH 1/3] added cpp implementation and tests --- NAMESPACE | 1 + R/RcppExports.R | 12 ++++++++++++ R/parse.R | 7 +++++-- man/url_decode.Rd | 17 +++++++++++++++++ src/RcppExports.cpp | 12 ++++++++++++ src/urldecode.cpp | 34 +++++++++++++++++++++++++++++++++ tests/testthat/test-urldecode.R | 6 ++++++ 7 files changed, 87 insertions(+), 2 deletions(-) create mode 100644 man/url_decode.Rd create mode 100644 src/urldecode.cpp diff --git a/NAMESPACE b/NAMESPACE index 5f09851..32d3b7a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,5 +20,6 @@ export(ada_has_port) export(ada_has_search) export(ada_url_parse) export(public_suffix) +export(url_decode) importFrom(Rcpp,sourceCpp) useDynLib(adaR, .registration = TRUE) diff --git a/R/RcppExports.R b/R/RcppExports.R index 0065e86..12a1931 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -77,3 +77,15 @@ Rcpp_ada_get_protocol <- function(input_vec, length_vec) { .Call(`_adaR_Rcpp_ada_get_protocol`, input_vec, length_vec) } +#' Function to percent-decode characters in URLs +#' +#' Similar to [utils::URLdecode] +#' +#' @param url a character vector +#' @export +#' @examples +#' url_decode("Hello%20World") +url_decode <- function(url) { + .Call(`_adaR_url_decode`, url) +} + diff --git a/R/parse.R b/R/parse.R index 0ab0066..241a508 100644 --- a/R/parse.R +++ b/R/parse.R @@ -22,10 +22,13 @@ ada_url_parse <- function(url, decode = TRUE) { df } -## NA-aware utils::URLdecode, hopefully without great performance impact +## NA/NULL-aware utils::URLdecode, hopefully without great performance impact .URLdecode <- function(URL) { + if (is.null(URL)) { + return(character(0)) + } non_na_index <- which(!is.na(URL)) - URL[non_na_index] <- utils::URLdecode(URL[non_na_index]) + URL[non_na_index] <- url_decode(URL[non_na_index]) URL[!non_na_index] <- NA_character_ return(URL) } diff --git a/man/url_decode.Rd 
b/man/url_decode.Rd new file mode 100644 index 0000000..230bab1 --- /dev/null +++ b/man/url_decode.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{url_decode} +\alias{url_decode} +\title{Function to percent-decode characters in URLs} +\usage{ +url_decode(url) +} +\arguments{ +\item{url}{a character vector} +} +\description{ +Similar to \link[utils:URLencode]{utils::URLdecode} +} +\examples{ +url_decode("Hello\%20World") +} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 49c17cc..0d06307 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -238,6 +238,17 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// url_decode +CharacterVector url_decode(CharacterVector url); +RcppExport SEXP _adaR_url_decode(SEXP urlSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< CharacterVector >::type url(urlSEXP); + rcpp_result_gen = Rcpp::wrap(url_decode(url)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_adaR_Rcpp_ada_parse", (DL_FUNC) &_adaR_Rcpp_ada_parse, 2}, @@ -259,6 +270,7 @@ static const R_CallMethodDef CallEntries[] = { {"_adaR_Rcpp_ada_get_pathname", (DL_FUNC) &_adaR_Rcpp_ada_get_pathname, 2}, {"_adaR_Rcpp_ada_get_search", (DL_FUNC) &_adaR_Rcpp_ada_get_search, 2}, {"_adaR_Rcpp_ada_get_protocol", (DL_FUNC) &_adaR_Rcpp_ada_get_protocol, 2}, + {"_adaR_url_decode", (DL_FUNC) &_adaR_url_decode, 1}, {NULL, NULL, 0} }; diff --git a/src/urldecode.cpp b/src/urldecode.cpp new file mode 100644 index 0000000..69577fd --- /dev/null +++ b/src/urldecode.cpp @@ -0,0 +1,34 @@ +#include <Rcpp.h> + +using namespace Rcpp; + +//' Function to percent-decode characters in URLs +//' +//' Similar to [utils::URLdecode] +//' +//' @param url a character vector +//' @export +//' @examples +//' url_decode("Hello%20World") +// [[Rcpp::export]] +CharacterVector url_decode(CharacterVector url) { + return 
sapply(url, [](const String& u) { + std::string input = u; + std::string output; + size_t i = 0; + + while (i < input.length()) { + if (input[i] != '%') { + output += input[i]; + i++; + } else { + int value; + sscanf(input.substr(i + 1, 2).c_str(), "%x", &value); + output += static_cast<char>(value); + i += 3; + } + } + + return output; + }); +} diff --git a/tests/testthat/test-urldecode.R b/tests/testthat/test-urldecode.R index a8ee49e..c98b96b 100644 --- a/tests/testthat/test-urldecode.R +++ b/tests/testthat/test-urldecode.R @@ -12,3 +12,9 @@ test_that("Integration #21", { res <- adaR::ada_url_parse(c("https://www.google.co.jp/search?q=\u30c9\u30a4\u30c4", NA, "https://www.google.co.jp/search?q=\u30c9\u30a4\u30c4")) expect_equal(res$search, c("?q=\u30c9\u30a4\u30c4", NA_character_, "?q=\u30c9\u30a4\u30c4")) }) + +test_that("cpp implementation is correct", { + enc <- "https%3A%2F%2Fwww.google.de%2Fmaps%2F%4047.6647302%2C9.1389738%2C11z%3Fentry%3Dttu" + dec <- "https://www.google.de/maps/@47.6647302,9.1389738,11z?entry=ttu" + expect_equal(url_decode(enc), dec) +}) From 4000cbf32ab869991ecb8e5fd1b320a51eb6568e Mon Sep 17 00:00:00 2001 From: schochastics Date: Sun, 24 Sep 2023 20:50:04 +0200 Subject: [PATCH 2/3] housekeeping --- DESCRIPTION | 2 +- NEWS.md | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index eb03325..3453a84 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -4,7 +4,7 @@ Version: 0.1.0.9000 Authors@R: c(person("David", "Schoch", , "david@schochastics.net", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-2952-4812")), - person("Chung-hong", "Chan", role = c("ctb"), email = "chainsawtiney@gmail.com", + person("Chung-hong", "Chan", role = c("aut"), email = "chainsawtiney@gmail.com", comment = c(ORCID = "0000-0002-6232-7530")) ) Description: A wrapper for ada-url, a WHATWG-compliant and fast URL parser written in modern C++. Also contains auxiliary functions to extract public suffix. 
diff --git a/NEWS.md b/NEWS.md index 71101a5..9209334 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,8 @@ # adaR 0.1.0.9000 * split C++ files h/t Chung-hong Chan (@chainsawriot) -* add support for public suffix extraction +* add support for public suffix extraction #14 +* add support for punycode #18 # adaR 0.1.0 From af40ec96057d9cbd50c1a89fa6f17e7c30b2a660 Mon Sep 17 00:00:00 2001 From: schochastics Date: Sun, 24 Sep 2023 20:53:24 +0200 Subject: [PATCH 3/3] value is unsigned int --- src/urldecode.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/urldecode.cpp b/src/urldecode.cpp index 69577fd..c6f5484 100644 --- a/src/urldecode.cpp +++ b/src/urldecode.cpp @@ -22,7 +22,7 @@ CharacterVector url_decode(CharacterVector url) { output += input[i]; i++; } else { - int value; + unsigned int value; sscanf(input.substr(i + 1, 2).c_str(), "%x", &value); output += static_cast<char>(value); i += 3;