Skip to content

Commit d68f2cc

Browse files
bwiernikstrengejackeetiennebacherIndrajeetPatilmattansb
authored
Add coefficient of variation function (#236)
* Add coefficient of variation function Related to #232, easystats/performance#433 * Remove suggestion that `coef_var()` can be used with models. Will implement that as `performance_coefvar()` or similar * code-style, use namespace * typo in doc * minor * Fix logic * Move mode and CV to descriptives.R * Bugfix in switch() * bug fixes and add tests * Move processing to internal functions for reuse * Fix documentation * remove CV alias * Include more digits in text expected values * check and explain why x must be ratio scaled * fix * Update R/descriptives.R Co-authored-by: Indrajeet Patil <[email protected]> * backticks * msg * typo Co-authored-by: Daniel <[email protected]> Co-authored-by: Etienne Bacher <[email protected]> Co-authored-by: Indrajeet Patil <[email protected]> Co-authored-by: Mattan S. Ben-Shachar <[email protected]> Co-authored-by: Mattan S. Ben-Shachar <[email protected]>
1 parent bda5e40 commit d68f2cc

File tree

8 files changed

+300
-30
lines changed

8 files changed

+300
-30
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ VignetteBuilder:
6363
Encoding: UTF-8
6464
Language: en-US
6565
Roxygen: list(markdown = TRUE)
66-
RoxygenNote: 7.2.1
66+
RoxygenNote: 7.2.1.9000
6767
Config/testthat/edition: 3
6868
Config/Needs/website:
6969
rstudio/bslib,

NAMESPACE

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ S3method(center,factor)
2020
S3method(center,grouped_df)
2121
S3method(center,logical)
2222
S3method(center,numeric)
23+
S3method(coef_var,default)
24+
S3method(coef_var,numeric)
2325
S3method(convert_na_to,character)
2426
S3method(convert_na_to,data.frame)
2527
S3method(convert_na_to,default)
@@ -163,6 +165,7 @@ export(center)
163165
export(centre)
164166
export(change_code)
165167
export(change_scale)
168+
export(coef_var)
166169
export(coerce_to_numeric)
167170
export(colnames_to_row)
168171
export(column_as_rownames)
@@ -201,6 +204,7 @@ export(degroup)
201204
export(demean)
202205
export(describe_distribution)
203206
export(detrend)
207+
export(distribution_coef_var)
204208
export(distribution_mode)
205209
export(empty_columns)
206210
export(empty_rows)

R/describe_distribution.R

Lines changed: 4 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
#' @inheritParams bayestestR::point_estimate
2525
#' @inheritParams find_columns
2626
#'
27-
#' @details If `x` is a data frame, only numeric variables are kept and will be displayed in the summary.
27+
#' @details If `x` is a data frame, only numeric variables are kept and will be
28+
#' displayed in the summary.
2829
#'
2930
#' If `x` is a list, the behavior is different whether `x` is a stored list. If
3031
#' `x` is stored (for example, `describe_distribution(mylist)` where `mylist`
@@ -58,7 +59,7 @@ describe_distribution <- function(x, ...) {
5859
describe_distribution.default <- function(x, verbose = TRUE, ...) {
5960
if (verbose) {
6061
warning(
61-
insight::format_message(paste0("Can't describe variables of class '", class(x)[1], "'.")),
62+
insight::format_message(paste0("Can't describe variables of class `", class(x)[1], "`.")),
6263
call. = FALSE
6364
)
6465
}
@@ -448,7 +449,7 @@ describe_distribution.grouped_df <- function(x,
448449
verbose = verbose
449450
)
450451

451-
out <- do.call(rbind, lapply(1:length(groups), function(i) {
452+
out <- do.call(rbind, lapply(seq_along(groups), function(i) {
452453
d <- describe_distribution.data.frame(
453454
groups[[i]][select],
454455
centrality = centrality,
@@ -511,27 +512,3 @@ print.parameters_distribution <- function(x, digits = 2, ...) {
511512
)
512513
out[[1]]
513514
}
514-
515-
# distribution_mode ----------------------------------
516-
517-
#' Compute mode for a statistical distribution
518-
#'
519-
#' @param x An atomic vector, a list, or a data frame.
520-
#'
521-
#' @return
522-
#'
523-
#' The value that appears most frequently in the provided data.
524-
#' The returned data structure will be the same as the entered one.
525-
#'
526-
#' @examples
527-
#'
528-
#' distribution_mode(c(1, 2, 3, 3, 4, 5))
529-
#' distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5))
530-
#'
531-
#' @export
532-
distribution_mode <- function(x) {
533-
uniqv <- unique(x)
534-
tab <- tabulate(match(x, uniqv))
535-
idx <- which.max(tab)
536-
uniqv[idx]
537-
}

R/descriptives.R

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
# distribution_mode ----------------------------------
2+
3+
#' Compute mode for a statistical distribution
4+
#'
5+
#' @param x An atomic vector, a list, or a data frame.
6+
#'
7+
#' @return
8+
#'
9+
#' The value that appears most frequently in the provided data.
10+
#' The returned data structure will be the same as the entered one.
11+
#'
12+
#' @seealso For continuous variables, the
13+
#' **Highest Maximum a Posteriori probability estimate (MAP)** may be
14+
#' a more useful way to estimate the most commonly-observed value
15+
#' than the mode. See [bayestestR::map_estimate()].
16+
#'
17+
#' @examples
18+
#'
19+
#' distribution_mode(c(1, 2, 3, 3, 4, 5))
20+
#' distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5))
21+
#'
22+
#' @export
23+
distribution_mode <- function(x) {
24+
# TODO: Add support for weights, trim, binned (method)
25+
uniqv <- unique(x)
26+
tab <- tabulate(match(x, uniqv))
27+
idx <- which.max(tab)
28+
uniqv[idx]
29+
}
30+
31+
#' Compute the coefficient of variation
32+
#'
33+
#' Compute the coefficient of variation (CV, ratio of the standard deviation to
34+
#' the mean, \eqn{\sigma/\mu}) for a set of numeric values.
35+
#'
36+
#' @return The computed coefficient of variation for `x`.
37+
#' @export
38+
#'
39+
#' @examples
40+
#' coef_var(1:10)
41+
#' coef_var(c(1:10, 100), method = "median_mad")
42+
#' coef_var(c(1:10, 100), method = "qcd")
43+
#' coef_var(mu = 10, sigma = 20)
44+
#' coef_var(mu = 10, sigma = 20, method = "unbiased", n = 30)
45+
coef_var <- function(x, ...) {
46+
UseMethod("coef_var")
47+
}
48+
#' @name distribution_cv
49+
#' @rdname coef_var
50+
#' @export
51+
distribution_coef_var <- coef_var
52+
53+
#' @export
54+
coef_var.default <- function(x, verbose = TRUE, ...) {
55+
if (verbose) {
56+
warning(insight::format_message(
57+
paste0("Can't compute the coefficient of variation objects of class `", class(x)[1], "`.")
58+
), call. = FALSE)
59+
}
60+
NULL
61+
}
62+
63+
#' @param x A numeric vector of ratio scale (see details), or vector of values than can be coerced to one.
64+
#' @param mu A numeric vector of mean values to use to compute the coefficient
65+
#' of variation. If supplied, `x` is not used to compute the mean.
66+
#' @param sigma A numeric vector of standard deviation values to use to compute the coefficient
67+
#' of variation. If supplied, `x` is not used to compute the SD.
68+
#' @param method Method to use to compute the CV. Can be `"standard"` to compute
69+
#' by dividing the standard deviation by the mean, `"unbiased"` for the
70+
#' unbiased estimator for normally distributed data, or one of two robust
71+
#' alternatives: `"median_mad"` to divide the median by the [stats::mad()],
72+
#' or `"qcd"` (quartile coefficient of dispersion, interquartile range divided
73+
#' by the sum of the quartiles \[twice the midhinge\]: \eqn{(Q_3 - Q_1)/(Q_3 + Q_1)}.
74+
#' @param trim the fraction (0 to 0.5) of values to be trimmed from
75+
#' each end of `x` before the mean and standard deviation (or alternatvies)
76+
#' are computed. Values of `trim` outside the range of (0 to 0.5) are taken
77+
#' as the nearest endpoint.
78+
#' @param na.rm Logical. Should `NA` values be removed before computing (`TRUE`)
79+
#' or not (`FALSE`, default)?
80+
#' @param n If `method = "unbiased"` and both `mu` and `sigma` are provided (not
81+
#' computed from `x`), what sample size to use to adjust the computed CV
82+
#' for small-sample bias?
83+
#' @param ... Further arguments passed to computation functions.
84+
#'
85+
#' @details
86+
#' CV is only applicable of values taken on a ratio scale: values that have a
87+
#' *fixed* meaningfully defined 0 (which is either the lowest or highest
88+
#' possible value), and that ratios between them are interpretable For example,
89+
#' how many sandwiches have I eaten this week? 0 means "none" and 20 sandwiches
90+
#' is 4 times more than 5 sandwiches. If I were to center the number of
91+
#' sandwiches, it will no longer be on a ratio scale (0 is no "none" it is the
92+
#' mean, and the ratio between 4 and -2 is not meaningful). Scaling a ratio
93+
#' scale still results in a ratio scale. So I can re define "how many half
94+
#' sandwiches did I eat this week ( = sandwiches * 0.5) and 0 would still mean
95+
#' "none", and 20 half-sandwiches is still 4 times more than 5 half-sandwiches.
96+
#'
97+
#' This means that CV is **NOT** invariant to shifting, but it is to scaling:
98+
99+
#' ```{r}
100+
#' sandwiches <- c(0, 4, 15, 0, 0, 5, 2, 7)
101+
#' coef_var(sandwiches)
102+
#'
103+
#' coef_var(sandwiches / 2) # same
104+
#'
105+
#' coef_var(sandwiches + 4) # different! 0 is no longer meaningful!
106+
#' ````
107+
#'
108+
#' @rdname coef_var
109+
#'
110+
#' @export
111+
coef_var.numeric <- function(x, mu = NULL, sigma = NULL,
112+
method = c("standard", "unbiased", "median_mad", "qcd"),
113+
trim = 0, na.rm = FALSE, n = NULL, ...) {
114+
# TODO: Support weights
115+
if (!missing(x) && all(c(-1, 1) %in% sign(x))){
116+
stop("Coefficient of variation only applicable for ratio scale variables.", call. = FALSE)
117+
}
118+
method <- match.arg(method, choices = c("standard", "unbiased", "median_mad", "qcd"))
119+
if (is.null(mu) || is.null(sigma)) {
120+
if (isTRUE(na.rm)) {
121+
x <- .drop_na(x)
122+
}
123+
n <- length(x)
124+
x <- .trim_values(x, trim = trim, n = n)
125+
}
126+
if (is.null(mu)) {
127+
mu <- switch(
128+
method,
129+
standard = , unbiased = mean(x, ...),
130+
median_mad = stats::median(x, ...),
131+
qcd = unname(sum(stats::quantile(x, probs = c(.25, .75), ...)))
132+
)
133+
}
134+
if (is.null(sigma)) {
135+
sigma <- switch(
136+
method,
137+
standard = , unbiased = stats::sd(x, ...),
138+
median_mad = stats::mad(x, center = mu, ...),
139+
qcd = unname(diff(stats::quantile(x, probs = c(.25, .75), ...)))
140+
)
141+
}
142+
out <- sigma / mu
143+
if (method == "unbiased") {
144+
if (is.null(n)) {
145+
stop(insight::format_message(
146+
"A value for `n` must be provided when `method = \"unbiased\"` and both `mu` and `sigma` are provided."
147+
), call. = FALSE)
148+
}
149+
# from DescTools::CoefVar
150+
out <- out * (1 - 1 / (4 * (n - 1)) + 1 / n * out^2 + 1 / (2 * (n - 1)^2))
151+
}
152+
return(out)
153+
}
154+
155+
156+
157+
158+
# descriptives helpers
159+
160+
.drop_na <- function(x) {
161+
x[!is.na(x)]
162+
}
163+
164+
.trim_values <- function(x, trim = 0, n = NULL, weights = NULL) {
165+
# TODO: Support weights
166+
if (!is.numeric(trim) || length(trim) != 1L) {
167+
stop("`trim` must be a single numeric value.", call. = FALSE)
168+
}
169+
if (is.null(NULL)) {
170+
n <- length(x)
171+
}
172+
if (trim > 0 && n) {
173+
if (anyNA(x)) return(NA_real_)
174+
if (trim >= 0.5) return(stats::median(x, na.rm = FALSE))
175+
lo <- floor(n * trim) + 1
176+
hi <- n + 1 - lo
177+
x <- sort.int(x, partial = unique(c(lo, hi)))[lo:hi]
178+
}
179+
x
180+
}

man/coef_var.Rd

Lines changed: 93 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/describe_distribution.Rd

Lines changed: 2 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/distribution_mode.Rd

Lines changed: 7 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)