Skip to content

Commit 77163bf

Browse files
committed
Refactor dotplot binning internals to improve automatic binwidth selection
1 parent b4fcf49 commit 77163bf

21 files changed

+739
-649
lines changed

DESCRIPTION

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ Collate:
9494
'deprecated.R'
9595
'distributions.R'
9696
'draw_key_slabinterval.R'
97+
'find_dotplot_binwidth.R'
9798
'geom.R'
9899
'geom_slabinterval.R'
99100
'geom_dotsinterval.R'

NEWS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ Minor changes:
66
around their mean y position. This makes the swarm more visually symmetrical,
77
and particularly makes small, isolated clusters less likely to appear lopsided
88
(inspired by a question from @jbengler at the ggextenders talk).
9+
* Automatic binwidth detection in dots geometries now accounts for `layout` and
10+
`side` parameters to improve binwidth selection for non-default layouts, most
11+
notably `layout = "swarm"` and `side = "both"`. This may cause minor changes to
12+
existing plots that use automatic binwidths.
913

1014
Internal changes:
1115

R/bin_dots.R

Lines changed: 268 additions & 331 deletions
Large diffs are not rendered by default.

R/binner.R

Lines changed: 3 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ NULL
2525
#' @noRd
2626
binner = new_class(
2727
"binner",
28+
abstract = TRUE,
2829
properties = list(
2930
maxheight = new_property(
3031
class_numeric,
@@ -61,9 +62,6 @@ binner = new_class(
6162
)
6263
)
6364

64-
65-
# binner setup -----------------------------------------------------------
66-
6765
#' Create a new dot binner
6866
#' @param layout <[string][character]> name of layout as passed to `bin_dots()`.
6967
#' @param ... Additional arguments passed to the binner constructor.
@@ -73,31 +71,9 @@ new_binner = function(layout, ...) {
7371
match_function(layout, "binner_")(...)
7472
}
7573

76-
#' Prepare binner for data
77-
#' @description
78-
#' This generic function updates a dot binner based on the provided data points.
79-
#' Used for pre-calculations that depend on the data that can be used
80-
#' to speed up automatic binwidth selection.
81-
#' @param binner <`binner`> dot binner to update.
82-
#' @param x <[numeric]> numeric vector of data points.
83-
#' @return An updated `binner`.
84-
#' @noRd
85-
prepare_binner = new_generic("prepare_binner", c("binner"), function(binner, x, ...) {
86-
S7_dispatch()
87-
})
88-
89-
method(prepare_binner, binner) = function(binner, x, ...) {
90-
binner
91-
}
92-
9374

9475
# bin-based layouts ----------------------------------------------------------------
9576

96-
# TODO: remove
97-
automatic_bin = function(x, width, binner = binner_bin()) {
98-
prepare_binner(binner, x)@bin_method(x, width)[c("bins", "bin_midpoints")]
99-
}
100-
10177
#' Bin layout
10278
#' @description
10379
#' Wilkinson-esque binner for dot plots created with `bin_dots()`.
@@ -112,7 +88,7 @@ binner_bin = new_class(
11288
properties = list(
11389
bin_method = new_property(
11490
class_function,
115-
default = automatic_bin
91+
default = function(...) cli_abort("`prepare_binner()` must be called to set `bin_method`.")
11692
),
11793
align_rows = new_property(
11894
class_logical,
@@ -121,19 +97,6 @@ binner_bin = new_class(
12197
)
12298
)
12399

124-
method(prepare_binner, binner_bin) = function(binner, x, ...) {
125-
# examines a vector of data and determines an appropriate binning method based on its properties
126-
# doing this up front allows us to avoid doing this repeatedly when finding binwidth via optimization
127-
diff_x = diff(x)
128-
if (isTRUE(all.equal(diff_x, rev(diff_x), check.attributes = FALSE))) {
129-
# x is symmetric, use centered binning
130-
binner@bin_method = wilkinson_bin_from_center
131-
} else {
132-
binner@bin_method = wilkinson_bin
133-
}
134-
binner
135-
}
136-
137100
#' Weave binner
138101
#' @description
139102
#' Weave `binner` for dot plots created with `bin_dots()`.
@@ -222,10 +185,6 @@ binner_bar = new_class(
222185
)
223186
)
224187

225-
method(prepare_binner, binner_bar) = function(binner, x, ...) {
226-
binner
227-
}
228-
229188

230189
# swarm binners ----------------------------------------------------------
231190

@@ -237,14 +196,7 @@ method(prepare_binner, binner_bar) = function(binner, x, ...) {
237196
#' @noRd
238197
binner_swarm = new_class(
239198
"binner_swarm",
240-
parent = binner,
241-
properties = list(
242-
# TODO: remove this default method when dots_heap is refactored not to call this
243-
bin_method = new_property(
244-
class_function,
245-
default = automatic_bin
246-
)
247-
)
199+
parent = binner
248200
)
249201

250202
#' Swarm2 binner

R/find_dotplot_binwidth.R

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
# dynamic binwidth selection ----------------------------------------------
2+
3+
#' Dynamically select a good bin width for a dotplot
4+
#'
5+
#' Searches for a nice-looking bin width to use to draw a dotplot such that
6+
#' the height of the dotplot fits within a given space (`maxheight`).
7+
#'
8+
#' @param x <[numeric]> Data values.
9+
#' @param maxheight <scalar [numeric]> Maximum height of the dotplot.
10+
#' @param heightratio <scalar [numeric]> Ratio of bin width to dot height.
11+
#' @param stackratio <scalar [numeric]> Ratio of dot height to vertical distance
12+
#' between dot centers
13+
#' @eval rd_param_dots_layout()
14+
#' @eval rd_param_slab_side()
15+
#'
16+
#' @details
17+
#' This dynamic bin selection algorithm uses a binary search over the number of
18+
#' bins to find a bin width such that if the input data (`x`) is binned
19+
#' using a Wilkinson-style dotplot algorithm the height of the tallest bin
20+
#' will be less than or equal to `maxheight`.
21+
#'
22+
#' This algorithm is used by [geom_dotsinterval()] (and its variants) to automatically
23+
#' select bin widths. Unless you are manually implementing your own dotplot [`grob`]
24+
#' or `geom`, you probably do not need to use this function directly.
25+
#'
26+
#' @return A suitable bin width such that a dotplot created with this bin width
27+
#' and `heightratio` should have its tallest bin be less than or equal to `maxheight`.
28+
#'
29+
#' @seealso [bin_dots()] for an algorithm that can bin dots using bin widths selected
30+
#' by this function; [geom_dotsinterval()] for geometries that use
31+
#' these algorithms to create dotplots.
32+
#' @examples
33+
#'
34+
#' library(dplyr)
35+
#' library(ggplot2)
36+
#'
37+
#' x = qnorm(ppoints(20))
38+
#' binwidth = find_dotplot_binwidth(x, maxheight = 4, heightratio = 1)
39+
#' binwidth
40+
#'
41+
#' bin_df = bin_dots(x = x, y = 0, binwidth = binwidth, heightratio = 1)
42+
#' bin_df
43+
#'
44+
#' # we can manually plot the binning above, though this is only recommended
45+
#' # if you are using find_dotplot_binwidth() and bin_dots() to build your own
46+
#' # grob. For practical use it is much easier to use geom_dots(), which will
47+
#' # automatically select good bin widths for you (and which uses
48+
#' # find_dotplot_binwidth() and bin_dots() internally)
49+
#' bin_df %>%
50+
#' ggplot(aes(x = x, y = y)) +
51+
#' geom_point(size = 4) +
52+
#' coord_fixed()
53+
#'
54+
#' @importFrom grDevices nclass.Sturges nclass.FD nclass.scott
55+
#' @importFrom stats optimize
56+
#' @export
57+
find_dotplot_binwidth = function(
58+
x,
59+
maxheight,
60+
heightratio = 1,
61+
stackratio = 1,
62+
layout = c("bin", "weave", "hex", "swarm", "swarm2", "bar"),
63+
side = c("topright", "top", "right", "bottomleft", "bottom", "left", "topleft", "bottomright", "both")
64+
) {
65+
x = sort(as.numeric(x), na.last = TRUE)
66+
67+
# figure out a reasonable minimum number of bins based on histogram binning
68+
min_nbins = if (length(x) <= 1) {
69+
1
70+
} else {
71+
min(nclass.scott(x), nclass.FD(x), nclass.Sturges(x))
72+
}
73+
binner = new_binner(match.arg(layout),
74+
maxheight = maxheight,
75+
heightratio = heightratio,
76+
stackratio = stackratio,
77+
side = match.arg(side)
78+
)
79+
binner = prepare_binner(binner, x)
80+
min_binning = arrange_bins(binner, x, nbins = min_nbins)
81+
82+
if (isTRUE(min_binning$height <= maxheight)) {
83+
# if the minimum binning (i.e. the binning constructed from the smallest
84+
# number of bins --- thus, at the upper limit of the height we will allow)
85+
# is valid, then we don't need to search and can just use it.
86+
binning = min_binning
87+
} else {
88+
# figure out a maximum number of bins based on data resolution (except
89+
# for bars, which handle duplicate values differently, so must go by
90+
# number of data points instead of unique data points)
91+
# TODO: don't special case binner_bar here --- instead, have binners
92+
# implement a method to get max_binning
93+
max_binning = if (S7_inherits(binner, binner_bar)) {
94+
arrange_bins(binner, x, nbins = length(x))
95+
} else {
96+
arrange_bins(binner, x, binwidth = resolution(x))
97+
}
98+
99+
if (max_binning$nbins <= min_binning$nbins + 1) {
100+
# nowhere to search, use maximum number of bins
101+
binning = max_binning
102+
} else {
103+
# use binary search to find a reasonable number of bins
104+
repeat {
105+
binning = arrange_bins(binner, x, nbins = (min_binning$nbins + max_binning$nbins) / 2)
106+
if (isTRUE(binning$height <= maxheight)) {
107+
# binning is valid, search downwards
108+
if (binning$nbins - 1 <= min_binning$nbins) {
109+
# found it, we're done
110+
break
111+
}
112+
max_binning = binning
113+
} else {
114+
# binning is not valid, search upwards
115+
if (binning$nbins + 1 >= max_binning$nbins) {
116+
# found it, we're done
117+
binning = max_binning
118+
break
119+
}
120+
min_binning = binning
121+
}
122+
}
123+
}
124+
125+
# attempt to refine binwidth using optimization.
126+
# after finding a reasonable candidate based on number of bins, we refine
127+
# the binwidth around that number of bins using optimization. We do this
128+
# only as a second step because just using optimization on binwidth as a
129+
# first step tends to end up in a local minimum, sometimes very far from
130+
# maxheight.
131+
candidate_binwidths = c(min_binning$binwidth, max_binning$binwidth, binning$binwidth)
132+
if (length(unique(candidate_binwidths)) != 1) {
133+
binwidth = optimize(
134+
function(binwidth) {
135+
binning = arrange_bins(binner, x, binwidth = binwidth)
136+
(binning$height - maxheight)^2
137+
},
138+
candidate_binwidths,
139+
tol = sqrt(.Machine$double.eps)
140+
)$minimum
141+
new_binning = arrange_bins(binner, x, binwidth = binwidth)
142+
143+
# approximate test that binning is valid, used here to tolerate approximation with optimize()
144+
if (isTRUE(new_binning$height <= maxheight + .Machine$double.eps^0.25)) {
145+
binning = new_binning
146+
}
147+
}
148+
}
149+
150+
# check if the selected binning is valid....
151+
if (isTRUE(binning$height <= maxheight + .Machine$double.eps^0.25)) {
152+
binning$binwidth
153+
} else {
154+
# ... if it isn't, this means we've ended up with some bin that's too
155+
# tall, probably because we have discrete data --- we'll just
156+
# conservatively shrink things down so they fit by backing out a bin
157+
# width that works with the tallest bin
158+
binning$binwidth * maxheight / binning$height
159+
}
160+
}

R/geom_dotsinterval.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ makeContent.dots_grob = function(x) {
100100
# find the best bin widths across all the dotplots we are going to draw
101101
binwidths = map_dbl_(datas, function(d) {
102102
maxheight = max(d[[ymax]] - d[[ymin]])
103-
find_dotplot_binwidth(d[[x]], maxheight, heightratio, stackratio, layout = layout)
103+
find_dotplot_binwidth(d[[x]], maxheight, heightratio, stackratio, layout = layout, side = d$side[[1]])
104104
})
105105

106106
binwidth = min(binwidths, user_max_binwidth)

0 commit comments

Comments
 (0)