|
| 1 | +# distribution_mode ---------------------------------- |
| 2 | + |
| 3 | +#' Compute mode for a statistical distribution |
| 4 | +#' |
| 5 | +#' @param x An atomic vector, a list, or a data frame. |
| 6 | +#' |
| 7 | +#' @return |
| 8 | +#' |
| 9 | +#' The value that appears most frequently in the provided data. |
| 10 | +#' The returned data structure will be the same as the entered one. |
| 11 | +#' |
| 12 | +#' @seealso For continuous variables, the |
| 13 | +#' **Highest Maximum a Posteriori probability estimate (MAP)** may be |
| 14 | +#' a more useful way to estimate the most commonly-observed value |
| 15 | +#' than the mode. See [bayestestR::map_estimate()]. |
| 16 | +#' |
| 17 | +#' @examples |
| 18 | +#' |
| 19 | +#' distribution_mode(c(1, 2, 3, 3, 4, 5)) |
| 20 | +#' distribution_mode(c(1.5, 2.3, 3.7, 3.7, 4.0, 5)) |
| 21 | +#' |
| 22 | +#' @export |
| 23 | +distribution_mode <- function(x) { |
| 24 | + # TODO: Add support for weights, trim, binned (method) |
| 25 | + uniqv <- unique(x) |
| 26 | + tab <- tabulate(match(x, uniqv)) |
| 27 | + idx <- which.max(tab) |
| 28 | + uniqv[idx] |
| 29 | +} |
| 30 | + |
| 31 | +#' Compute the coefficient of variation |
| 32 | +#' |
| 33 | +#' Compute the coefficient of variation (CV, ratio of the standard deviation to |
| 34 | +#' the mean, \eqn{\sigma/\mu}) for a set of numeric values. |
| 35 | +#' |
| 36 | +#' @return The computed coefficient of variation for `x`. |
| 37 | +#' @export |
| 38 | +#' |
| 39 | +#' @examples |
| 40 | +#' coef_var(1:10) |
| 41 | +#' coef_var(c(1:10, 100), method = "median_mad") |
| 42 | +#' coef_var(c(1:10, 100), method = "qcd") |
| 43 | +#' coef_var(mu = 10, sigma = 20) |
| 44 | +#' coef_var(mu = 10, sigma = 20, method = "unbiased", n = 30) |
| 45 | +coef_var <- function(x, ...) { |
| 46 | + UseMethod("coef_var") |
| 47 | +} |
| 48 | +#' @name distribution_cv |
| 49 | +#' @rdname coef_var |
| 50 | +#' @export |
| 51 | +distribution_coef_var <- coef_var |
| 52 | + |
| 53 | +#' @export |
| 54 | +coef_var.default <- function(x, verbose = TRUE, ...) { |
| 55 | + if (verbose) { |
| 56 | + warning(insight::format_message( |
| 57 | + paste0("Can't compute the coefficient of variation objects of class `", class(x)[1], "`.") |
| 58 | + ), call. = FALSE) |
| 59 | + } |
| 60 | + NULL |
| 61 | +} |
| 62 | + |
| 63 | +#' @param x A numeric vector of ratio scale (see details), or vector of values than can be coerced to one. |
| 64 | +#' @param mu A numeric vector of mean values to use to compute the coefficient |
| 65 | +#' of variation. If supplied, `x` is not used to compute the mean. |
| 66 | +#' @param sigma A numeric vector of standard deviation values to use to compute the coefficient |
| 67 | +#' of variation. If supplied, `x` is not used to compute the SD. |
| 68 | +#' @param method Method to use to compute the CV. Can be `"standard"` to compute |
| 69 | +#' by dividing the standard deviation by the mean, `"unbiased"` for the |
| 70 | +#' unbiased estimator for normally distributed data, or one of two robust |
| 71 | +#' alternatives: `"median_mad"` to divide the median by the [stats::mad()], |
| 72 | +#' or `"qcd"` (quartile coefficient of dispersion, interquartile range divided |
| 73 | +#' by the sum of the quartiles \[twice the midhinge\]: \eqn{(Q_3 - Q_1)/(Q_3 + Q_1)}. |
| 74 | +#' @param trim the fraction (0 to 0.5) of values to be trimmed from |
| 75 | +#' each end of `x` before the mean and standard deviation (or alternatvies) |
| 76 | +#' are computed. Values of `trim` outside the range of (0 to 0.5) are taken |
| 77 | +#' as the nearest endpoint. |
| 78 | +#' @param na.rm Logical. Should `NA` values be removed before computing (`TRUE`) |
| 79 | +#' or not (`FALSE`, default)? |
| 80 | +#' @param n If `method = "unbiased"` and both `mu` and `sigma` are provided (not |
| 81 | +#' computed from `x`), what sample size to use to adjust the computed CV |
| 82 | +#' for small-sample bias? |
| 83 | +#' @param ... Further arguments passed to computation functions. |
| 84 | +#' |
| 85 | +#' @details |
| 86 | +#' CV is only applicable of values taken on a ratio scale: values that have a |
| 87 | +#' *fixed* meaningfully defined 0 (which is either the lowest or highest |
| 88 | +#' possible value), and that ratios between them are interpretable For example, |
| 89 | +#' how many sandwiches have I eaten this week? 0 means "none" and 20 sandwiches |
| 90 | +#' is 4 times more than 5 sandwiches. If I were to center the number of |
| 91 | +#' sandwiches, it will no longer be on a ratio scale (0 is no "none" it is the |
| 92 | +#' mean, and the ratio between 4 and -2 is not meaningful). Scaling a ratio |
| 93 | +#' scale still results in a ratio scale. So I can re define "how many half |
| 94 | +#' sandwiches did I eat this week ( = sandwiches * 0.5) and 0 would still mean |
| 95 | +#' "none", and 20 half-sandwiches is still 4 times more than 5 half-sandwiches. |
| 96 | +#' |
| 97 | +#' This means that CV is **NOT** invariant to shifting, but it is to scaling: |
| 98 | + |
| 99 | +#' ```{r} |
| 100 | +#' sandwiches <- c(0, 4, 15, 0, 0, 5, 2, 7) |
| 101 | +#' coef_var(sandwiches) |
| 102 | +#' |
| 103 | +#' coef_var(sandwiches / 2) # same |
| 104 | +#' |
| 105 | +#' coef_var(sandwiches + 4) # different! 0 is no longer meaningful! |
| 106 | +#' ```` |
| 107 | +#' |
| 108 | +#' @rdname coef_var |
| 109 | +#' |
| 110 | +#' @export |
| 111 | +coef_var.numeric <- function(x, mu = NULL, sigma = NULL, |
| 112 | + method = c("standard", "unbiased", "median_mad", "qcd"), |
| 113 | + trim = 0, na.rm = FALSE, n = NULL, ...) { |
| 114 | + # TODO: Support weights |
| 115 | + if (!missing(x) && all(c(-1, 1) %in% sign(x))){ |
| 116 | + stop("Coefficient of variation only applicable for ratio scale variables.", call. = FALSE) |
| 117 | + } |
| 118 | + method <- match.arg(method, choices = c("standard", "unbiased", "median_mad", "qcd")) |
| 119 | + if (is.null(mu) || is.null(sigma)) { |
| 120 | + if (isTRUE(na.rm)) { |
| 121 | + x <- .drop_na(x) |
| 122 | + } |
| 123 | + n <- length(x) |
| 124 | + x <- .trim_values(x, trim = trim, n = n) |
| 125 | + } |
| 126 | + if (is.null(mu)) { |
| 127 | + mu <- switch( |
| 128 | + method, |
| 129 | + standard = , unbiased = mean(x, ...), |
| 130 | + median_mad = stats::median(x, ...), |
| 131 | + qcd = unname(sum(stats::quantile(x, probs = c(.25, .75), ...))) |
| 132 | + ) |
| 133 | + } |
| 134 | + if (is.null(sigma)) { |
| 135 | + sigma <- switch( |
| 136 | + method, |
| 137 | + standard = , unbiased = stats::sd(x, ...), |
| 138 | + median_mad = stats::mad(x, center = mu, ...), |
| 139 | + qcd = unname(diff(stats::quantile(x, probs = c(.25, .75), ...))) |
| 140 | + ) |
| 141 | + } |
| 142 | + out <- sigma / mu |
| 143 | + if (method == "unbiased") { |
| 144 | + if (is.null(n)) { |
| 145 | + stop(insight::format_message( |
| 146 | + "A value for `n` must be provided when `method = \"unbiased\"` and both `mu` and `sigma` are provided." |
| 147 | + ), call. = FALSE) |
| 148 | + } |
| 149 | + # from DescTools::CoefVar |
| 150 | + out <- out * (1 - 1 / (4 * (n - 1)) + 1 / n * out^2 + 1 / (2 * (n - 1)^2)) |
| 151 | + } |
| 152 | + return(out) |
| 153 | +} |
| 154 | + |
| 155 | + |
| 156 | + |
| 157 | + |
| 158 | +# descriptives helpers |
| 159 | + |
| 160 | +.drop_na <- function(x) { |
| 161 | + x[!is.na(x)] |
| 162 | +} |
| 163 | + |
| 164 | +.trim_values <- function(x, trim = 0, n = NULL, weights = NULL) { |
| 165 | + # TODO: Support weights |
| 166 | + if (!is.numeric(trim) || length(trim) != 1L) { |
| 167 | + stop("`trim` must be a single numeric value.", call. = FALSE) |
| 168 | + } |
| 169 | + if (is.null(NULL)) { |
| 170 | + n <- length(x) |
| 171 | + } |
| 172 | + if (trim > 0 && n) { |
| 173 | + if (anyNA(x)) return(NA_real_) |
| 174 | + if (trim >= 0.5) return(stats::median(x, na.rm = FALSE)) |
| 175 | + lo <- floor(n * trim) + 1 |
| 176 | + hi <- n + 1 - lo |
| 177 | + x <- sort.int(x, partial = unique(c(lo, hi)))[lo:hi] |
| 178 | + } |
| 179 | + x |
| 180 | +} |
0 commit comments