Merge pull request #20 from reconhub/fix-19

Allow user-specific transformation IDs in clean_labels
reconhub · May 9, 2019 · d9abd97 · d9abd97
2 parents 22214da + 25b5f56
commit d9abd97
Show file tree

Hide file tree

Showing 26 changed files with 611 additions and 422 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -4,6 +4,7 @@
 ^README\.Rmd$
 ^README\.md
 ^README-.*\.png$
+^README_files$
 ^\.travis\.yml$
 ^\.lintr$
 ^appveyor\.yml$

diff --git a/.travis.yml b/.travis.yml
@@ -7,6 +7,7 @@ addons:
       - sourceline: 'ppa:chris-lea/libsodium'
     packages:
       - libsodium-dev
+      - libicu-dev
 
 matrix:
   include:

diff --git a/NEWS.md b/NEWS.md
@@ -17,6 +17,14 @@
 
  - functions from `dplyr`, `purrr`, `rlang`, and `tidyr` are now imported.
 
+## Bug Fix
+
+ - `clean_labels()` can now handle non-latin characters and gains the `trans_id`
+   argument, which allows the user to customise the transformations 
+   (see https://github.com/reconhub/epitrix/issues/19 for details).
+ - `clean_labels()` gains the `protect` argument to protect meaningful symbols
+   in the data.
+
 # epitrix 0.2.2
 
  - Replace `digest` with `sodium` in Imports

diff --git a/R/clean_labels.R b/R/clean_labels.R
@@ -9,52 +9,86 @@
 #'
 #' The following changes are performed:
 #'
-#' \itemize{
+#'  - all non-ascii characters are removed
+#'  - all diacritics are replaced with their non-accentuated equivalents,
+#'    e.g. 'é', 'ê' and 'è' become 'e'.
+#'  - all characters are set to lower case
+#'  - separators are standardised to the use of a single character provided
+#'    in `sep` (defaults to '_'); heading and trailing separators are removed.
 #'
-#' \item all non-ascii characters are removed
-#'
-#' \item all diacritics are replaced with their non-accentuated equivalents,
-#' e.g. 'é', 'ê' and 'è' become 'e', 'ö' becomes 'o', etc.
-#'
-#' \item all characters are set to lower case
-#'
-#' \item separators are standardised to the use of a single character provided
-#' in \code{sep} (defaults to '_'); heading and trailing separators are removed.
-#'
-#' }
-#'
-#
-#' @author Thibaut Jombart \email{thibautjombart@@gmail.com}
+#' @author Thibaut Jombart \email{thibautjombart@@gmail.com}, Zhian N. Kamvar
 #'
 #' @export
 #'
 #' @param x A vector of labels, normally provided as characters.
 #'
 #' @param sep A character string used as separator, defaulting to '_'.
+#' 
+#' @param transformation a string to be passed on to [stringi::stri_trans_general()]
+#'   for conversion. Default is "Any-Latin; Latin-ASCII", which will convert
+#'   any non-latin characters to latin and then converts all accented
+#'   characters to ASCII characters. See [stringi::stri_trans_list()] for a
+#'   full list of options.
+#' @param protect a character string defining the punctuation that should be
+#'   protected. This helps prevent meaninful symbols like > and < from being
+#'   removed.
+#'
+#' @md
+#' @note Because of differences between the underlying transliteration engine
+#'   (ICU), the default transformations will not transilierate German umlaute
+#'   correctly. You can add them by specifying "de-ASCII" in the `transformation` 
+#'   string after "Any-Latin".
 #'
 #' @examples
 #'
 #' clean_labels("-_-This is; A    WeÏrD**./sêntënce...")
 #' clean_labels("-_-This is; A    WeÏrD**./sêntënce...", sep = ".")
-#' input <- c("Peter and stëven", "peter-and.stëven", "pëtêr and stëven  _-")
+#' input <- c("ますだ, よしひこ", "Peter and stëven", "peter-and.stëven", "pëtêr and stëven  _-")
 #' input
 #' clean_labels(input)
+#' 
+#' # Don't transliterate non-latin words
+#' clean_labels(input, transformation = "Latin-ASCII")
+#'
+#' # protect useful symbols
+#' clean_labels(c("energy > 9000", "energy < 9000"), protect = "><")
+#'
+#' # if you only want to clean accents, transform to lower, and transliterate,
+#' # you can specify "[:punct:][:space:]" for protect:
+#' clean_labels(input, protect = "[:punct:][:space:]")
+#' 
+#' # appropriately transliterate Germanic umlaute
+#' if (stringi::stri_info()$ICU.system) {
+#'   # This will only be true if you have the correct version of ICU installed
+#'
+#'   clean_labels("'é', 'ê' and 'è' become 'e', 'ö' becomes 'oe', etc.", 
+#'                transformation = "Any-Latin; de-ASCII; Latin-ASCII")
+#' }
 #'
-
-clean_labels <- function(x, sep = "_") {
+clean_labels <- function(x, sep = "_", transformation = "Any-Latin; Latin-ASCII",
+                         protect = "") {
   x <- as.character(x)
 
   ## On the processing of the input:
 
   ## - coercion to lower case
   ## - replace accentuated characters by closest matches
-  ## - replace non-alphanumeric characters by 'sep'
+  ## - replace punctuation and spaces not in the protected list with sep, cautiously
   ## - remove starting / trailing seps
+  sep <- gsub("([.*?])", "\\\\\\1", sep)
 
   out <- tolower(x)
-  out <- stringi::stri_trans_general(out, "latin-ASCII")
-  out <- gsub("[^a-z0-9]+", sep, out)
-  out <- sub("^[^a-z0-9]+", "", out)
-  out <- sub("[^a-z0-9]+$", "", out)
+  out <- stringi::stri_trans_general(out, id = transformation)
+  # Negative lookahead for alphanumeric and any protected symbols
+  to_protect <- sprintf("(?![a-z0-9%s])", paste(protect, collapse = ""))
+  # If the negative lookahead doesn't find what it's looking for, then do the
+  # replacement. 
+  to_replace <- sprintf("%s[[:punct:][:space:]]+?", to_protect)
+
+  # workhorse
+  out <- gsub(to_replace, sep, out, perl = TRUE)
+  out <- gsub(paste0("(", sep, ")+"), sep, out, perl = TRUE)
+  out <- sub(paste0("^", sep), "", out, perl = TRUE)
+  out <- sub(paste0(sep, "$"), "", out, perl = TRUE)
   out
 }