Skip to content

Commit

Permalink
Use the silent conversion approach fix #66 (#69)
Browse files Browse the repository at this point in the history
* added to_unicode parameter

* added test

* Use the silent conversion approach fix #66

See #68 and #67

This passes the basic tests.

* A better test suite for #66

* More tests

* Head hygiene and default parameter

* Bürokratie

---------

Co-authored-by: schochastics <[email protected]>
  • Loading branch information
chainsawriot and schochastics authored Jan 31, 2024
1 parent 3b43e60 commit 43284da
Show file tree
Hide file tree
Showing 8 changed files with 93 additions and 54 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: adaR
Title: A Fast 'WHATWG' Compliant URL Parser
Version: 0.3.1
Version: 0.3.2
Authors@R:
c(person("David", "Schoch", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "0000-0003-2952-4812")),
Expand All @@ -15,7 +15,7 @@ BugReports: https://github.com/gesistsa/adaR/issues
License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
LinkingTo:
Rcpp
Imports:
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# adaR 0.3.2

* fixed #66

# adaR 0.3.1

* bumped ada-url to 2.7.3
Expand Down
55 changes: 27 additions & 28 deletions src/adaR.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,21 @@

#include "urldecode.h"

std::string charsub(const ada_string stringi) {
std::string charsub(const ada_string stringi, bool to_unicode = true) {
// to_unicode = false should only be used for href, see #66
const char* res = stringi.data;
size_t len = stringi.length;
ada_owned_string stringi_new = ada_idna_to_unicode(res, len);
std::string_view output(stringi_new.data, stringi_new.length);
std::string output2 = {output.begin(), output.end()};
ada_free_owned_string(stringi_new);
return output2;
if (to_unicode) {
ada_owned_string stringi_new = ada_idna_to_unicode(res, len);
std::string_view output(stringi_new.data, stringi_new.length);
std::string output2 = {output.begin(), output.end()};
ada_free_owned_string(stringi_new);
return output2;
} else {
std::string_view output(stringi.data, stringi.length);
std::string output2 = {output.begin(), output.end()};
return output2;
}
}

// [[Rcpp::export]]
Expand All @@ -30,7 +37,7 @@ DataFrame Rcpp_ada_parse(const CharacterVector& input_vec, bool decode) {
std::string_view input(s.get_cstring());
ada_url url = ada_parse(input.data(), input.length());
if (ada_is_valid(url)) {
href[i] = charsub(ada_get_href(url));
href[i] = charsub(ada_get_href(url), false);
protocol[i] = charsub(ada_get_protocol(url));
username[i] = charsub(ada_get_username(url));
password[i] = charsub(ada_get_password(url));
Expand Down Expand Up @@ -135,7 +142,7 @@ LogicalVector Rcpp_ada_has_search(const CharacterVector& url_vec) {
// higher-order function for all Rcpp_ada_get_*
CharacterVector Rcpp_ada_get(const CharacterVector& url_vec,
std::function<ada_string(ada_url)> func,
bool decode) {
bool decode, bool to_unicode = true) {
unsigned int n = url_vec.length();
CharacterVector out(n);
for (int i = 0; i < url_vec.length(); i++) {
Expand All @@ -145,7 +152,7 @@ CharacterVector Rcpp_ada_get(const CharacterVector& url_vec,
if (!ada_is_valid(url)) {
out[i] = NA_STRING;
} else {
out[i] = charsub(func(url));
out[i] = charsub(func(url), to_unicode);
}
ada_free(url);
}
Expand All @@ -157,7 +164,7 @@ CharacterVector Rcpp_ada_get(const CharacterVector& url_vec,

// [[Rcpp::export]]
CharacterVector Rcpp_ada_get_href(const CharacterVector& url_vec, bool decode) {
return Rcpp_ada_get(url_vec, &ada_get_href, decode);
return Rcpp_ada_get(url_vec, &ada_get_href, decode, false);
}

// [[Rcpp::export]]
Expand Down Expand Up @@ -200,8 +207,7 @@ CharacterVector Rcpp_ada_get_pathname(const CharacterVector& url_vec,
}

// [[Rcpp::export]]
CharacterVector Rcpp_ada_get_search(const CharacterVector& url_vec,
bool decode) {
CharacterVector Rcpp_ada_get_search(const CharacterVector& url_vec, bool decode) {
return Rcpp_ada_get(url_vec, &ada_get_search, decode);
}

Expand Down Expand Up @@ -229,7 +235,7 @@ CharacterVector Rcpp_ada_set(
out[i] = NA_STRING;
} else {
func(url, replace.data(), replace.length());
out[i] = charsub(ada_get_href(url));
out[i] = charsub(ada_get_href(url), false);
}
ada_free(url);
}
Expand All @@ -246,15 +252,13 @@ CharacterVector Rcpp_ada_set_href(const CharacterVector& url_vec,

// [[Rcpp::export]]
CharacterVector Rcpp_ada_set_username(const CharacterVector& url_vec,
const CharacterVector& subst,
bool decode) {
const CharacterVector& subst, bool decode) {
return Rcpp_ada_set<bool>(url_vec, &ada_set_username, subst, decode);
}

// [[Rcpp::export]]
CharacterVector Rcpp_ada_set_password(const CharacterVector& url_vec,
const CharacterVector& subst,
bool decode) {
const CharacterVector& subst, bool decode) {
return Rcpp_ada_set<bool>(url_vec, &ada_set_password, subst, decode);
}

Expand All @@ -272,22 +276,19 @@ CharacterVector Rcpp_ada_set_host(const CharacterVector& url_vec,

// [[Rcpp::export]]
CharacterVector Rcpp_ada_set_hostname(const CharacterVector& url_vec,
const CharacterVector& subst,
bool decode) {
const CharacterVector& subst, bool decode) {
return Rcpp_ada_set<bool>(url_vec, &ada_set_hostname, subst, decode);
}

// [[Rcpp::export]]
CharacterVector Rcpp_ada_set_pathname(const CharacterVector& url_vec,
const CharacterVector& subst,
bool decode) {
const CharacterVector& subst, bool decode) {
return Rcpp_ada_set<bool>(url_vec, &ada_set_pathname, subst, decode);
}

// [[Rcpp::export]]
CharacterVector Rcpp_ada_set_protocol(const CharacterVector& url_vec,
const CharacterVector& subst,
bool decode) {
const CharacterVector& subst, bool decode) {
return Rcpp_ada_set<bool>(url_vec, &ada_set_protocol, subst, decode);
}

Expand Down Expand Up @@ -316,7 +317,7 @@ CharacterVector Rcpp_ada_clear(const CharacterVector& url_vec,
out[i] = NA_STRING;
} else {
func(url);
out[i] = charsub(ada_get_href(url));
out[i] = charsub(ada_get_href(url), false);
}
ada_free(url);
}
Expand All @@ -327,14 +328,12 @@ CharacterVector Rcpp_ada_clear(const CharacterVector& url_vec,
}

// [[Rcpp::export]]
CharacterVector Rcpp_ada_clear_port(const CharacterVector& url_vec,
bool decode) {
CharacterVector Rcpp_ada_clear_port(const CharacterVector& url_vec, bool decode) {
return Rcpp_ada_clear(url_vec, &ada_clear_port, decode);
}

// [[Rcpp::export]]
CharacterVector Rcpp_ada_clear_hash(const CharacterVector& url_vec,
bool decode) {
CharacterVector Rcpp_ada_clear_hash(const CharacterVector& url_vec, bool decode) {
return Rcpp_ada_clear(url_vec, &ada_clear_hash, decode);
}

Expand Down
2 changes: 1 addition & 1 deletion src/adaR.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

using namespace Rcpp;

std::string charsub(ada_string stringi);
std::string charsub(const ada_string stringi, bool to_unicode);

// cash cow
DataFrame Rcpp_ada_parse(CharacterVector input_vec, bool decode);
Expand Down
5 changes: 5 additions & 0 deletions tests/testthat/test-clear.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,8 @@ test_that("invalid urls should return NA", {
expect_equal(func(url), NA_character_)
}
})

test_that("clear with puny code will still return puny, ref#66", {
url <- "http://xn--53-6kcainf4buoffq.xn--p1ai/junior-programmer.html?ap=123"
expect_equal(ada_clear_search(url), "http://xn--53-6kcainf4buoffq.xn--p1ai/junior-programmer.html")
})
16 changes: 16 additions & 0 deletions tests/testthat/test-get.R
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,19 @@ test_that("domain with path #51", {
expect_equal(ada_get_domain(url), "example.org")
expect_equal(ada_get_domain(corner_cases), corner_domains)
})

test_that("href fix #66", {
url <- "http://xn--53-6kcainf4buoffq.xn--p1ai/doof/junior-programmer.html"
yes <- ada_get_href(url)
expect_true(url == yes)
examples <- c(
"http://xn--53-6kcainf4buoffq.xn--p1ai/pood/junior-electrical-engineer-jobs-remote.html",
"http://xn--80abb0biooohbv.xn--p1ai/",
"http://xn--alicantesueo-khb.com/insomnio",
"https://normal-url.com/this-path-will-be-fine",
"http://xn--53-6kcainf4buoffq.xn--p1ai/this-path-will-not-be-fine"
)
pathnames <- ada_get_pathname(examples, decode = FALSE)
result_pathnames <- ada_set_pathname(examples, pathnames, decode = FALSE)
expect_true(all(examples == result_pathnames))
})
55 changes: 32 additions & 23 deletions tests/testthat/test-parse.R
Original file line number Diff line number Diff line change
@@ -1,39 +1,39 @@
test_that("works with standard url", {
res <- ada_url_parse("https://www.google.de")
expect_equal(res[["href"]], "https://www.google.de/")
expect_equal(res[["host"]], "www.google.de")
res <- ada_url_parse("https://www.google.de/")
expect_equal(res[["href"]], "https://www.google.de/")
expect_equal(res[["host"]], "www.google.de")
})

test_that("works with complex url", {
res <- ada_url_parse("https://user_1:[email protected]:8080/dir/../api?q=1#frag")
expect_equal(res[["username"]], "user_1")
expect_equal(res[["password"]], "password_1")
expect_equal(res[["host"]], "example.org:8080")
expect_equal(res[["hostname"]], "example.org")
expect_equal(res[["port"]], "8080")
expect_equal(res[["pathname"]], "/api")
expect_equal(res[["search"]], "?q=1")
expect_equal(res[["hash"]], "#frag")
res <- ada_url_parse("https://user_1:[email protected]:8080/dir/../api?q=1#frag")
expect_equal(res[["username"]], "user_1")
expect_equal(res[["password"]], "password_1")
expect_equal(res[["host"]], "example.org:8080")
expect_equal(res[["hostname"]], "example.org")
expect_equal(res[["port"]], "8080")
expect_equal(res[["pathname"]], "/api")
expect_equal(res[["search"]], "?q=1")
expect_equal(res[["hash"]], "#frag")
})

test_that("works with utf8", {
res <- ada_url_parse("https://www.hk01.com/zone/1/\u6e2f\u805e")
expect_equal(res[["pathname"]], "/zone/1/\u6e2f\u805e")
res <- ada_url_parse("http://www.m\u00fcller.de")
expect_equal(res[["host"]], "www.m\u00fcller.de")
res <- ada_url_parse("https://\u4e2d\u56fd\u79fb\u52a8.\u4e2d\u56fd")
expect_equal(res$href[1], "https://xn--fiq02ib9d179b.xn--fiqs8s/")
expect_equal(res$host[1], "\u4e2d\u56fd\u79fb\u52a8.\u4e2d\u56fd")
res <- ada_url_parse("https://www.hk01.com/zone/1/\u6e2f\u805e")
expect_equal(res[["pathname"]], "/zone/1/\u6e2f\u805e")
res <- ada_url_parse("http://www.m\u00fcller.de")
expect_equal(res[["host"]], "www.m\u00fcller.de")
res <- ada_url_parse("https://\u4e2d\u56fd\u79fb\u52a8.\u4e2d\u56fd")
expect_equal(res$href[1], "https://xn--fiq02ib9d179b.xn--fiqs8s/")
expect_equal(res$host[1], "\u4e2d\u56fd\u79fb\u52a8.\u4e2d\u56fd")
})

test_that("URLdecode optional #5", {
expect_equal(ada_url_parse("https://www.google.co.jp/search?q=\u30c9\u30a4\u30c4")$search, "?q=\u30c9\u30a4\u30c4") ## default TRUE
expect_equal(ada_url_parse("https://www.google.co.jp/search?q=\u30c9\u30a4\u30c4", decode = FALSE)$search, "?q=%E3%83%89%E3%82%A4%E3%83%84")
expect_equal(ada_url_parse("https://www.google.co.jp/search?q=\u30c9\u30a4\u30c4")$search, "?q=\u30c9\u30a4\u30c4") ## default TRUE
expect_equal(ada_url_parse("https://www.google.co.jp/search?q=\u30c9\u30a4\u30c4", decode = FALSE)$search, "?q=%E3%83%89%E3%82%A4%E3%83%84")
})

test_that("multiple urls", {
urls <- rep("https://user_1:[email protected]:8080/dir/../api?q=1#frag", 5)
expect_equal(nrow(ada_url_parse(urls)), 5L)
urls <- rep("https://user_1:[email protected]:8080/dir/../api?q=1#frag", 5)
expect_equal(nrow(ada_url_parse(urls)), 5L)
})

test_that("corner cases", {
Expand All @@ -45,3 +45,12 @@ test_that("corner cases", {
expect_equal(x$host, c(NA_character_, NA_character_, NA_character_, NA_character_))
expect_equal(y$host, c(NA_character_, NA_character_, NA_character_, NA_character_))
})

test_that("#66", {
## unicode see #67
url <- "http://xn--53-6kcainf4buoffq.xn--p1ai/\u6e2f\u805e/junior-programmer.html"
res <- adaR::ada_url_parse(url)
expect_equal(res$href, url) ## doesn't mess up
expect_equal(res$pathname, "/\u6e2f\u805e/junior-programmer.html")
expect_false(res$host == "xn--53-6kcainf4buoffq.xn--p1ai") ## puny code is converted
})
6 changes: 6 additions & 0 deletions tests/testthat/test-set.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,9 @@ test_that("uneven vectors", {
test_that("NULL input", {
expect_equal(ada_set_protocol("https://google.de", NULL), "https://google.de")
})

test_that("setting with puny code will still return puny, ref#66", {
url <- "http://xn--53-6kcainf4buoffq.xn--p1ai/junior-programmer.html"
expect_equal(ada_set_protocol(url, "ws:"), "ws://xn--53-6kcainf4buoffq.xn--p1ai/junior-programmer.html")
expect_equal(ada_set_href(url, "http://xn--53-6kcainf4buoffq.xn--p1ai/junior-programmer.html"), "http://xn--53-6kcainf4buoffq.xn--p1ai/junior-programmer.html")
})

0 comments on commit 43284da

Please sign in to comment.