Winsorize based on the MAD (#179)

rempsyc · mattansb · web-flow · commit 3d2992aaa116 · 2022-06-27T08:35:51.000+02:00
* addresses #177 & #49 & #47 for winsorizing based on the MAD * forgot to push updated documentation * new argument "method", updated NEWS, resolved failed test, #179 * update winsorize.numeric added raw method made the code easier to maintain by modularizing it made doc more explicit about the methods updated examples to visualize the effect update NEWS * minor modifications to docs * removed tidyr from Suggests, replaced `tidyr::pivot_longer` with `datawizard::data_to_long` in vignette * added new tests for new winsorization methods, insight::format_message(), data[] <- lapply... Co-authored-by: RemPsyc <RemPsyc@users.noreply.github.com> Co-authored-by: Mattan S. Ben-Shachar <matanshm@post.bgu.ac.il>
diff --git a/NEWS.md b/NEWS.md
@@ -19,6 +19,7 @@ CHANGES
 
 * Some of the text formatting helpers (like `text_concatenate()`) gain an
   `enclose` argument, to wrap text elements with surrounding characters.
+* `winsorize` now accepts "raw" and "zscore" methods (in addition to "percentile"). Additionally, when `robust` is set to `TRUE` together with `method = "zscore"`, winsorizes via the median and median absolute deviation (MAD); else via the mean and standard deviation. (@rempsyc, #177, #49, #47).
 
 NEW FUNCTIONS
 
diff --git a/R/winsorize.R b/R/winsorize.R
@@ -17,13 +17,33 @@
 #' A dataframe with winsorized columns or a winsorized vector.
 #'
 #' @param data Dataframe or vector.
-#' @param threshold The amount of winsorization.
+#' @param threshold The amount of winsorization, depends on the value of `method`:
+#' - For `method = "percentile"`: the amount to winsorize from *each* tail.
+#' - For `method = "zscore"`: the number of *SD*/*MAD*-deviations from the *mean*/*median* (see `robust`)
+#' - For `method = "raw"`: a vector of length 2 with the lower and upper bound for winsorization.
 #' @param verbose Toggle warnings.
+#' @param method One of "percentile" (default), "zscore", or "raw".
+#' @param robust Logical, if TRUE, winsorizing through the "zscore" method is done via the median and the median absolute deviation (MAD); if FALSE, via the mean and the standard deviation.
 #' @param ... Currently not used.
 #'
 #' @examples
-#' winsorize(iris$Sepal.Length, threshold = 0.2)
+#' hist(iris$Sepal.Length, main = "Original data")
+#'
+#' hist(winsorize(iris$Sepal.Length, threshold = 0.2),
+#'      xlim = c(4, 8), main = "Percentile Winsorization")
+#'
+#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"),
+#'      xlim = c(4, 8), main = "Mean (+/- SD) Winsorization")
+#'
+#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE),
+#'      xlim = c(4, 8), main = "Median (+/- MAD) Winsorization")
+#'
+#' hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"),
+#'      xlim = c(4, 8), main = "Raw Thresholds")
+#'
+#' # Also works on a data frame:
 #' winsorize(iris, threshold = 0.2)
+#'
 #' @inherit data_rename seealso
 #' @export
 winsorize <- function(data, ...) {
@@ -43,27 +63,65 @@ winsorize.character <- winsorize.factor
 winsorize.logical <- winsorize.factor
 
 #' @export
-winsorize.data.frame <- function(data, threshold = 0.2, verbose = TRUE, ...) {
-  out <- sapply(data, winsorize, threshold = threshold, verbose = verbose)
-  as.data.frame(out)
+winsorize.data.frame <- function(data, threshold = 0.2, method = "percentile", robust = FALSE,
+                                 verbose = TRUE, ...) {
+  data[] <- lapply(data, winsorize, threshold = threshold, method = method, robust = robust, verbose = verbose)
+  data
 }
 
 #' @rdname winsorize
 #' @export
-winsorize.numeric <- function(data, threshold = 0.2, verbose = TRUE, ...) {
-  if (threshold < 0 || threshold > 1) {
-    if (isTRUE(verbose)) {
-      warning("'threshold' for winsorization must be a scalar between 0 and 1. Did not winsorize data.", call. = FALSE)
+winsorize.numeric <- function(data, threshold = 0.2, method = "percentile", robust = FALSE,
+                              verbose = TRUE, ...) {
+  method <- match.arg(method, choices = c("percentile", "zscore", "raw"))
+
+  if (method == "raw") {
+    if (length(threshold) != 2L) {
+      if (isTRUE(verbose)) {
+        warning(insight::format_message("threshold must be of length 2 for lower and upper bound. Did not winsorize data."), call. = FALSE)
+      }
+      return(data)
     }
-    return(data)
   }
 
-  y <- sort(data)
-  n <- length(data)
-  ibot <- floor(threshold * n) + 1
-  itop <- length(data) - ibot + 1
-  xbot <- y[ibot]
-  xtop <- y[itop]
-  winval <- ifelse(data <= xbot, xbot, data)
-  ifelse(winval >= xtop, xtop, winval)
+  if(method == "percentile") {
+    if (threshold < 0 || threshold > 0.5) {
+      if (isTRUE(verbose)) {
+        warning(insight::format_message("'threshold' for winsorization must be a scalar between 0 and 0.5. Did not winsorize data."), call. = FALSE)
+      }
+      return(data)
+    }
+
+    y <- sort(data)
+    n <- length(data)
+    ibot <- floor(threshold * n) + 1
+    itop <- length(data) - ibot + 1
+
+    threshold <- c(y[ibot], y[itop])
+  }
+
+  if(method == "zscore") {
+
+    if (threshold <= 0) {
+      if (isTRUE(verbose)) {
+        warning(insight::format_message("'threshold' for winsorization must be a scalar greater than 0. Did not winsorize data."), call. = FALSE)
+      }
+      return(data)
+    }
+
+    if(isTRUE(robust)) {
+      centeral <- stats::median(data, na.rm = TRUE)
+      deviation <- stats::mad(data, center = centeral, na.rm = TRUE)
+    } else {
+      centeral <- mean(data, na.rm = TRUE)
+      deviation <- stats::sd(data, na.rm = TRUE)
+    }
+
+    threshold <- centeral + c(-1, 1) * deviation * threshold
+  }
+
+
+  data[data < threshold[1]] <- threshold[1]
+  data[data > threshold[2]] <- threshold[2]
+  return(data)
 }
diff --git a/man/winsorize.Rd b/man/winsorize.Rd
diff --git a/tests/testthat/_snaps/data_to_numeric.md b/tests/testthat/_snaps/data_to_numeric.md
@@ -1,7 +1,7 @@
 # convert dataframe to numeric
 
     Code
-      convert_data_to_numeric(head(ToothGrowth))
+      data_to_numeric(head(ToothGrowth))
     Output
          len supp.OJ supp.VC dose
       1  4.2       0       1  0.5
@@ -14,7 +14,7 @@
 ---
 
     Code
-      convert_data_to_numeric(head(ToothGrowth), dummy_factors = FALSE)
+      data_to_numeric(head(ToothGrowth), dummy_factors = FALSE)
     Output
          len supp dose
       1  4.2    2  0.5
@@ -27,7 +27,7 @@
 # convert factor to numeric
 
     Code
-      convert_data_to_numeric(f)
+      data_to_numeric(f)
     Output
          a c i s t
       1  0 0 0 1 0
diff --git a/tests/testthat/test-winsorization.R b/tests/testthat/test-winsorization.R
@@ -11,11 +11,23 @@ test_that("with missing values", {
 test_that("winsorize: threshold must be between 0 and 1", {
   expect_warning(
     winsorize(sample(1:10, 5), threshold = -0.1),
-    regexp = "must be a scalar between 0 and 1"
+    regexp = "must be a scalar between 0 and 0.5"
   )
   expect_warning(
     winsorize(sample(1:10, 5), threshold = 1.1),
-    regexp = "must be a scalar between 0 and 1"
+    regexp = "must be a scalar between 0 and 0.5"
+  )
+  expect_warning(
+    winsorize(sample(1:10, 5), method = "zscore", threshold = -3),
+    regexp = "must be a scalar greater than 0"
+  )
+  expect_warning(
+    winsorize(sample(1:10, 5), method = "zscore", threshold = -3, robust = TRUE),
+    regexp = "must be a scalar greater than 0"
+  )
+  expect_warning(
+    winsorize(sample(1:10, 5), method = "raw", threshold = 1.1),
+    regexp = "must be of length 2 for lower and upper bound"
   )
   x <- sample(1:10, 5)
   suppressWarnings({
@@ -38,3 +50,4 @@ test_that("winsorize on data.frame", {
   )
   expect_equal(names(iris2), names(iris))
 })
+
diff --git a/vignettes/standardize_data.Rmd b/vignettes/standardize_data.Rmd
@@ -73,17 +73,16 @@ We can see that different methods give different central and variation values:
 
 ```{r, eval=FALSE}
 library(dplyr)
-library(tidyr)
 
 hardlyworking %>%
   select(starts_with("xtra_hours")) %>%
-  pivot_longer(everything()) %>%
-  group_by(name) %>%
+  data_to_long() %>%
+  group_by(Name) %>%
   summarise(
-    mean = mean(value),
-    sd = sd(value),
-    median = median(value),
-    mad = mad(value)
+    mean = mean(Value),
+    sd = sd(Value),
+    median = median(Value),
+    mad = mad(Value)
   )
 ```
 
@@ -113,13 +112,13 @@ hardlyworking_z <- standardize(hardlyworking)
 ```{r, eval=FALSE}
 hardlyworking_z %>%
   select(-xtra_hours_z, -xtra_hours_zr) %>%
-  pivot_longer(everything()) %>%
-  group_by(name) %>%
+  data_to_long() %>%
+  group_by(Name) %>%
   summarise(
-    mean = mean(value),
-    sd = sd(value),
-    median = median(value),
-    mad = mad(value)
+    mean = mean(Value),
+    sd = sd(Value),
+    median = median(Value),
+    mad = mad(Value)
   )
 ```