Skip to content

Commit 3d2992a

Browse files
rempsycmattansb
andauthored
Winsorize based on the MAD (#179)
* addresses #177 & #49 & #47 for winsorizing based on the MAD * forgot to push updated documentation * new argument "method", updated NEWS, resolved failed test, #179 * update winsorize.numeric added raw method made the code easier to maintain by modularizing it made doc more explicit about the methods updated examples to visualize the effect update NEWS * minor modifications to docs * removed tidyr from Suggests, replaced `tidyr::pivot_longer` with `datawizard::data_to_long` in vignette * added new tests for new winsorization methods, insight::format_message(), data[] <- lapply... Co-authored-by: RemPsyc <[email protected]> Co-authored-by: Mattan S. Ben-Shachar <[email protected]>
1 parent e986a8d commit 3d2992a

File tree

6 files changed

+141
-39
lines changed

6 files changed

+141
-39
lines changed

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ CHANGES
1919

2020
* Some of the text formatting helpers (like `text_concatenate()`) gain an
2121
`enclose` argument, to wrap text elements with surrounding characters.
22+
* `winsorize` now accepts "raw" and "zscore" methods (in addition to "percentile"). Additionally, when `robust` is set to `TRUE` together with `method = "zscore"`, winsorizes via the median and median absolute deviation (MAD); else via the mean and standard deviation. (@rempsyc, #177, #49, #47).
2223

2324
NEW FUNCTIONS
2425

R/winsorize.R

Lines changed: 76 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,33 @@
1717
#' A dataframe with winsorized columns or a winsorized vector.
1818
#'
1919
#' @param data Dataframe or vector.
20-
#' @param threshold The amount of winsorization.
20+
#' @param threshold The amount of winsorization, depends on the value of `method`:
21+
#' - For `method = "percentile"`: the amount to winsorize from *each* tail.
22+
#' - For `method = "zscore"`: the number of *SD*/*MAD*-deviations from the *mean*/*median* (see `robust`)
23+
#' - For `method = "raw"`: a vector of length 2 with the lower and upper bound for winsorization.
2124
#' @param verbose Toggle warnings.
25+
#' @param method One of "percentile" (default), "zscore", or "raw".
26+
#' @param robust Logical, if TRUE, winsorizing through the "zscore" method is done via the median and the median absolute deviation (MAD); if FALSE, via the mean and the standard deviation.
2227
#' @param ... Currently not used.
2328
#'
2429
#' @examples
25-
#' winsorize(iris$Sepal.Length, threshold = 0.2)
30+
#' hist(iris$Sepal.Length, main = "Original data")
31+
#'
32+
#' hist(winsorize(iris$Sepal.Length, threshold = 0.2),
33+
#' xlim = c(4, 8), main = "Percentile Winsorization")
34+
#'
35+
#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore"),
36+
#' xlim = c(4, 8), main = "Mean (+/- SD) Winsorization")
37+
#'
38+
#' hist(winsorize(iris$Sepal.Length, threshold = 1.5, method = "zscore", robust = TRUE),
39+
#' xlim = c(4, 8), main = "Median (+/- MAD) Winsorization")
40+
#'
41+
#' hist(winsorize(iris$Sepal.Length, threshold = c(5, 7.5), method = "raw"),
42+
#' xlim = c(4, 8), main = "Raw Thresholds")
43+
#'
44+
#' # Also works on a data frame:
2645
#' winsorize(iris, threshold = 0.2)
46+
#'
2747
#' @inherit data_rename seealso
2848
#' @export
2949
winsorize <- function(data, ...) {
@@ -43,27 +63,65 @@ winsorize.character <- winsorize.factor
4363
winsorize.logical <- winsorize.factor
4464

4565
#' @export
46-
winsorize.data.frame <- function(data, threshold = 0.2, verbose = TRUE, ...) {
47-
out <- sapply(data, winsorize, threshold = threshold, verbose = verbose)
48-
as.data.frame(out)
66+
winsorize.data.frame <- function(data, threshold = 0.2, method = "percentile", robust = FALSE,
67+
verbose = TRUE, ...) {
68+
data[] <- lapply(data, winsorize, threshold = threshold, method = method, robust = robust, verbose = verbose)
69+
data
4970
}
5071

5172
#' @rdname winsorize
5273
#' @export
53-
winsorize.numeric <- function(data, threshold = 0.2, verbose = TRUE, ...) {
54-
if (threshold < 0 || threshold > 1) {
55-
if (isTRUE(verbose)) {
56-
warning("'threshold' for winsorization must be a scalar between 0 and 1. Did not winsorize data.", call. = FALSE)
74+
winsorize.numeric <- function(data, threshold = 0.2, method = "percentile", robust = FALSE,
75+
verbose = TRUE, ...) {
76+
method <- match.arg(method, choices = c("percentile", "zscore", "raw"))
77+
78+
if (method == "raw") {
79+
if (length(threshold) != 2L) {
80+
if (isTRUE(verbose)) {
81+
warning(insight::format_message("threshold must be of length 2 for lower and upper bound. Did not winsorize data."), call. = FALSE)
82+
}
83+
return(data)
5784
}
58-
return(data)
5985
}
6086

61-
y <- sort(data)
62-
n <- length(data)
63-
ibot <- floor(threshold * n) + 1
64-
itop <- length(data) - ibot + 1
65-
xbot <- y[ibot]
66-
xtop <- y[itop]
67-
winval <- ifelse(data <= xbot, xbot, data)
68-
ifelse(winval >= xtop, xtop, winval)
87+
if(method == "percentile") {
88+
if (threshold < 0 || threshold > 0.5) {
89+
if (isTRUE(verbose)) {
90+
warning(insight::format_message("'threshold' for winsorization must be a scalar between 0 and 0.5. Did not winsorize data."), call. = FALSE)
91+
}
92+
return(data)
93+
}
94+
95+
y <- sort(data)
96+
n <- length(data)
97+
ibot <- floor(threshold * n) + 1
98+
itop <- length(data) - ibot + 1
99+
100+
threshold <- c(y[ibot], y[itop])
101+
}
102+
103+
if(method == "zscore") {
104+
105+
if (threshold <= 0) {
106+
if (isTRUE(verbose)) {
107+
warning(insight::format_message("'threshold' for winsorization must be a scalar greater than 0. Did not winsorize data."), call. = FALSE)
108+
}
109+
return(data)
110+
}
111+
112+
if(isTRUE(robust)) {
113+
centeral <- stats::median(data, na.rm = TRUE)
114+
deviation <- stats::mad(data, center = centeral, na.rm = TRUE)
115+
} else {
116+
centeral <- mean(data, na.rm = TRUE)
117+
deviation <- stats::sd(data, na.rm = TRUE)
118+
}
119+
120+
threshold <- centeral + c(-1, 1) * deviation * threshold
121+
}
122+
123+
124+
data[data < threshold[1]] <- threshold[1]
125+
data[data > threshold[2]] <- threshold[2]
126+
return(data)
69127
}

man/winsorize.Rd

Lines changed: 34 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/_snaps/convert_data_to_numeric.md renamed to tests/testthat/_snaps/data_to_numeric.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# convert dataframe to numeric
22

33
Code
4-
convert_data_to_numeric(head(ToothGrowth))
4+
data_to_numeric(head(ToothGrowth))
55
Output
66
len supp.OJ supp.VC dose
77
1 4.2 0 1 0.5
@@ -14,7 +14,7 @@
1414
---
1515

1616
Code
17-
convert_data_to_numeric(head(ToothGrowth), dummy_factors = FALSE)
17+
data_to_numeric(head(ToothGrowth), dummy_factors = FALSE)
1818
Output
1919
len supp dose
2020
1 4.2 2 0.5
@@ -27,7 +27,7 @@
2727
# convert factor to numeric
2828

2929
Code
30-
convert_data_to_numeric(f)
30+
data_to_numeric(f)
3131
Output
3232
a c i s t
3333
1 0 0 0 1 0

tests/testthat/test-winsorization.R

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,23 @@ test_that("with missing values", {
1111
test_that("winsorize: threshold must be between 0 and 1", {
1212
expect_warning(
1313
winsorize(sample(1:10, 5), threshold = -0.1),
14-
regexp = "must be a scalar between 0 and 1"
14+
regexp = "must be a scalar between 0 and 0.5"
1515
)
1616
expect_warning(
1717
winsorize(sample(1:10, 5), threshold = 1.1),
18-
regexp = "must be a scalar between 0 and 1"
18+
regexp = "must be a scalar between 0 and 0.5"
19+
)
20+
expect_warning(
21+
winsorize(sample(1:10, 5), method = "zscore", threshold = -3),
22+
regexp = "must be a scalar greater than 0"
23+
)
24+
expect_warning(
25+
winsorize(sample(1:10, 5), method = "zscore", threshold = -3, robust = TRUE),
26+
regexp = "must be a scalar greater than 0"
27+
)
28+
expect_warning(
29+
winsorize(sample(1:10, 5), method = "raw", threshold = 1.1),
30+
regexp = "must be of length 2 for lower and upper bound"
1931
)
2032
x <- sample(1:10, 5)
2133
suppressWarnings({
@@ -38,3 +50,4 @@ test_that("winsorize on data.frame", {
3850
)
3951
expect_equal(names(iris2), names(iris))
4052
})
53+

vignettes/standardize_data.Rmd

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -73,17 +73,16 @@ We can see that different methods give different central and variation values:
7373

7474
```{r, eval=FALSE}
7575
library(dplyr)
76-
library(tidyr)
7776
7877
hardlyworking %>%
7978
select(starts_with("xtra_hours")) %>%
80-
pivot_longer(everything()) %>%
81-
group_by(name) %>%
79+
data_to_long() %>%
80+
group_by(Name) %>%
8281
summarise(
83-
mean = mean(value),
84-
sd = sd(value),
85-
median = median(value),
86-
mad = mad(value)
82+
mean = mean(Value),
83+
sd = sd(Value),
84+
median = median(Value),
85+
mad = mad(Value)
8786
)
8887
```
8988

@@ -113,13 +112,13 @@ hardlyworking_z <- standardize(hardlyworking)
113112
```{r, eval=FALSE}
114113
hardlyworking_z %>%
115114
select(-xtra_hours_z, -xtra_hours_zr) %>%
116-
pivot_longer(everything()) %>%
117-
group_by(name) %>%
115+
data_to_long() %>%
116+
group_by(Name) %>%
118117
summarise(
119-
mean = mean(value),
120-
sd = sd(value),
121-
median = median(value),
122-
mad = mad(value)
118+
mean = mean(Value),
119+
sd = sd(Value),
120+
median = median(Value),
121+
mad = mad(Value)
123122
)
124123
```
125124

0 commit comments

Comments
 (0)