|
| 1 | +# dynamic binwidth selection ---------------------------------------------- |
| 2 | + |
| 3 | +#' Dynamically select a good bin width for a dotplot |
| 4 | +#' |
| 5 | +#' Searches for a nice-looking bin width to use to draw a dotplot such that |
| 6 | +#' the height of the dotplot fits within a given space (`maxheight`). |
| 7 | +#' |
| 8 | +#' @param x <[numeric]> Data values. |
| 9 | +#' @param maxheight <scalar [numeric]> Maximum height of the dotplot. |
| 10 | +#' @param heightratio <scalar [numeric]> Ratio of bin width to dot height. |
| 11 | +#' @param stackratio <scalar [numeric]> Ratio of dot height to vertical distance |
| 12 | +#' between dot centers |
| 13 | +#' @eval rd_param_dots_layout() |
| 14 | +#' @eval rd_param_slab_side() |
| 15 | +#' |
| 16 | +#' @details |
| 17 | +#' This dynamic bin selection algorithm uses a binary search over the number of |
| 18 | +#' bins to find a bin width such that if the input data (`x`) is binned |
| 19 | +#' using a Wilkinson-style dotplot algorithm the height of the tallest bin |
| 20 | +#' will be less than `maxheight`. |
| 21 | +#' |
| 22 | +#' This algorithm is used by [geom_dotsinterval()] (and its variants) to automatically |
| 23 | +#' select bin widths. Unless you are manually implementing you own dotplot [`grob`] |
| 24 | +#' or `geom`, you probably do not need to use this function directly |
| 25 | +#' |
| 26 | +#' @return A suitable bin width such that a dotplot created with this bin width |
| 27 | +#' and `heightratio` should have its tallest bin be less than or equal to `maxheight`. |
| 28 | +#' |
| 29 | +#' @seealso [bin_dots()] for an algorithm can bin dots using bin widths selected |
| 30 | +#' by this function; [geom_dotsinterval()] for geometries that use |
| 31 | +#' these algorithms to create dotplots. |
| 32 | +#' @examples |
| 33 | +#' |
| 34 | +#' library(dplyr) |
| 35 | +#' library(ggplot2) |
| 36 | +#' |
| 37 | +#' x = qnorm(ppoints(20)) |
| 38 | +#' binwidth = find_dotplot_binwidth(x, maxheight = 4, heightratio = 1) |
| 39 | +#' binwidth |
| 40 | +#' |
| 41 | +#' bin_df = bin_dots(x = x, y = 0, binwidth = binwidth, heightratio = 1) |
| 42 | +#' bin_df |
| 43 | +#' |
| 44 | +#' # we can manually plot the binning above, though this is only recommended |
| 45 | +#' # if you are using find_dotplot_binwidth() and bin_dots() to build your own |
| 46 | +#' # grob. For practical use it is much easier to use geom_dots(), which will |
| 47 | +#' # automatically select good bin widths for you (and which uses |
| 48 | +#' # find_dotplot_binwidth() and bin_dots() internally) |
| 49 | +#' bin_df %>% |
| 50 | +#' ggplot(aes(x = x, y = y)) + |
| 51 | +#' geom_point(size = 4) + |
| 52 | +#' coord_fixed() |
| 53 | +#' |
| 54 | +#' @importFrom grDevices nclass.Sturges nclass.FD nclass.scott |
| 55 | +#' @importFrom stats optimize |
| 56 | +#' @export |
| 57 | +find_dotplot_binwidth = function( |
| 58 | + x, |
| 59 | + maxheight, |
| 60 | + heightratio = 1, |
| 61 | + stackratio = 1, |
| 62 | + layout = c("bin", "weave", "hex", "swarm", "swarm2", "bar"), |
| 63 | + side = c("topright", "top", "right", "bottomleft", "bottom", "left", "topleft", "bottomright", "both") |
| 64 | +) { |
| 65 | + x = sort(as.numeric(x), na.last = TRUE) |
| 66 | + |
| 67 | + # figure out a reasonable minimum number of bins based on histogram binning |
| 68 | + min_nbins = if (length(x) <= 1) { |
| 69 | + 1 |
| 70 | + } else { |
| 71 | + min(nclass.scott(x), nclass.FD(x), nclass.Sturges(x)) |
| 72 | + } |
| 73 | + binner = new_binner(match.arg(layout), |
| 74 | + maxheight = maxheight, |
| 75 | + heightratio = heightratio, |
| 76 | + stackratio = stackratio, |
| 77 | + side = match.arg(side) |
| 78 | + ) |
| 79 | + binner = prepare_binner(binner, x) |
| 80 | + min_binning = arrange_bins(binner, x, nbins = min_nbins) |
| 81 | + |
| 82 | + if (isTRUE(min_binning$height <= maxheight)) { |
| 83 | + # if the minimum binning (i.e. the binning constructed from the smallest |
| 84 | + # number of bins --- thus, at the upper limit of the height we will allow) |
| 85 | + # is valid, then we don't need to search and can just use it. |
| 86 | + binning = min_binning |
| 87 | + } else { |
| 88 | + # figure out a maximum number of bins based on data resolution (except |
| 89 | + # for bars, which handle duplicate values differently, so must go by |
| 90 | + # number of data points instead of unique data points) |
| 91 | + # TODO: don't special case binner_bar here --- instead, have binners |
| 92 | + # implement a method to get max_binning |
| 93 | + max_binning = if (S7_inherits(binner, binner_bar)) { |
| 94 | + arrange_bins(binner, x, nbins = length(x)) |
| 95 | + } else { |
| 96 | + arrange_bins(binner, x, binwidth = resolution(x)) |
| 97 | + } |
| 98 | + |
| 99 | + if (max_binning$nbins <= min_binning$nbins + 1) { |
| 100 | + # nowhere to search, use maximum number of bins |
| 101 | + binning = max_binning |
| 102 | + } else { |
| 103 | + # use binary search to find a reasonable number of bins |
| 104 | + repeat { |
| 105 | + binning = arrange_bins(binner, x, nbins = (min_binning$nbins + max_binning$nbins) / 2) |
| 106 | + if (isTRUE(binning$height <= maxheight)) { |
| 107 | + # binning is valid, search downwards |
| 108 | + if (binning$nbins - 1 <= min_binning$nbins) { |
| 109 | + # found it, we're done |
| 110 | + break |
| 111 | + } |
| 112 | + max_binning = binning |
| 113 | + } else { |
| 114 | + # binning is not valid, search upwards |
| 115 | + if (binning$nbins + 1 >= max_binning$nbins) { |
| 116 | + # found it, we're done |
| 117 | + binning = max_binning |
| 118 | + break |
| 119 | + } |
| 120 | + min_binning = binning |
| 121 | + } |
| 122 | + } |
| 123 | + } |
| 124 | + |
| 125 | + # attempt to refine binwidth using optimization. |
| 126 | + # after finding a reasonable candidate based on number of bins, we refine |
| 127 | + # the binwidth around that number of bins using optimization. We do this |
| 128 | + # only as a second step because just using optimization on binwidth as a |
| 129 | + # first step tends to end up in a local minimum, sometimes very far from |
| 130 | + # maxheight. |
| 131 | + candidate_binwidths = c(min_binning$binwidth, max_binning$binwidth, binning$binwidth) |
| 132 | + if (length(unique(candidate_binwidths)) != 1) { |
| 133 | + binwidth = optimize( |
| 134 | + function(binwidth) { |
| 135 | + binning = arrange_bins(binner, x, binwidth = binwidth) |
| 136 | + (binning$height - maxheight)^2 |
| 137 | + }, |
| 138 | + candidate_binwidths, |
| 139 | + tol = sqrt(.Machine$double.eps) |
| 140 | + )$minimum |
| 141 | + new_binning = arrange_bins(binner, x, binwidth = binwidth) |
| 142 | + |
| 143 | + # approximate test that binning is valid, used here to tolerate approximation with optimize() |
| 144 | + if (isTRUE(new_binning$height <= maxheight + .Machine$double.eps^0.25)) { |
| 145 | + binning = new_binning |
| 146 | + } |
| 147 | + } |
| 148 | + } |
| 149 | + |
| 150 | + # check if the selected binning is valid.... |
| 151 | + if (isTRUE(binning$height <= maxheight + .Machine$double.eps^0.25)) { |
| 152 | + binning$binwidth |
| 153 | + } else { |
| 154 | + # ... if it isn't, this means we've ended up with some bin that's too |
| 155 | + # tall, probably because we have discrete data --- we'll just |
| 156 | + # conservatively shrink things down so they fit by backing out a bin |
| 157 | + # width that works with the tallest bin |
| 158 | + binning$binwidth * maxheight / binning$height |
| 159 | + } |
| 160 | +} |
0 commit comments