Skip to content

Commit 7f495e6

Browse files
committed
add wilkinson smoothing
1 parent ca32927 commit 7f495e6

File tree

3 files changed

+104
-31
lines changed

3 files changed

+104
-31
lines changed

R/bin_dots.R

Lines changed: 95 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -447,18 +447,8 @@ wilkinson_bin_to_right = function(x, width) {
447447
bins = wilkinson_bin_to_right_(x, width)
448448

449449
# determine bin positions
450-
# can take advantage of the fact that bins is sorted runs of numbers to
451-
# get the first and last entry from each bin
452-
bin_left = x[!duplicated(bins)]
453-
bin_right = x[!duplicated(bins, fromLast = TRUE)]
454-
bin_midpoints = (bin_left + bin_right) / 2
455-
456-
list(
457-
bins = bins,
458-
bin_midpoints = bin_midpoints,
459-
bin_left = bin_left,
460-
bin_right = bin_right
461-
)
450+
bin_runs = rle_bins(bins)
451+
locate_bins(bin_runs, x)
462452
}
463453

464454
#' do a backwards sweep after a left-to-right wilkinson binning, trying to
@@ -507,16 +497,9 @@ wilkinson_sweep_back = function(x, b, width, first_slack = Inf) {
507497
b$bins[changed_x_is] = bins_changed
508498

509499
# re-number bins to be consecutive in case some bins got removed completely
510-
first_x_in_bin = !duplicated(b$bins)
511-
b$bins = cumsum(first_x_in_bin)
512-
513-
# can take advantage of the fact that b$bins is sorted runs of numbers to
514-
# get the first and last entry from each bin
515-
b$bin_left = x[first_x_in_bin]
516-
b$bin_right = x[!duplicated(b$bins, fromLast = TRUE)]
517-
b$bin_midpoints = (b$bin_left + b$bin_right) / 2
518-
519-
b
500+
bin_runs = rle_bins(b$bins)
501+
bin_runs = renumber_bins(bin_runs)
502+
locate_bins(bin_runs, x)
520503
}
521504

522505
#' a rightward or leftward wilkinson binning followed by a backwards sweep to
@@ -536,17 +519,19 @@ wilkinson_bin = function(x, width, right = TRUE, first_slack = Inf) {
536519

537520
if (right) {
538521
b = wilkinson_bin_to_right(x, width)
539-
wilkinson_sweep_back(x, b, width, first_slack = first_slack)
522+
b = wilkinson_sweep_back(x, b, width, first_slack = first_slack)
540523
} else {
541524
rev_x = -rev(x)
542525
b = wilkinson_bin_to_right(rev_x, width)
543526
b = wilkinson_sweep_back(rev_x, b, width, first_slack = first_slack)
544-
list(
527+
b = list(
545528
# renumber bins so 1,2,3,3 => 3,2,1,1 (then reverse so it matches original vector order)
546529
bins = rev(max(b$bins) + 1 - b$bins),
547530
bin_midpoints = -rev(b$bin_midpoints)
548531
)
549532
}
533+
534+
wilkinson_smooth(x, b, width)
550535
}
551536

552537
#' A modified wilkinson-style binning that expands outward from the center of
@@ -617,13 +602,98 @@ wilkinson_bin_from_center = function(x, width) {
617602
)
618603

619604
center_bin_i = length(left$bin_midpoints) + 1
620-
list(
605+
b = list(
621606
bins = c(left$bins, rep(center_bin_i, n_center), center_bin_i + right$bins),
622607
bin_midpoints = c(left$bin_midpoints, center_midpoint, right$bin_midpoints)
623608
)
609+
610+
wilkinson_smooth(x, b, width)
624611
}
625612
}
626613

614+
#' Get the run-length encoding of the bins in a Wilkinson binning
615+
#' @param bins <[list]> sorted bin numbers starting at `1`, as in the
616+
#' `"bins"` element of output from `wilkinson_` functions.
617+
#' @returns <[data.frame]> with columns:
618+
#' - `"bin"`: `unique(bins)`
619+
#' - `"count"`: occurrences of each bin.
620+
#' @noRd
621+
rle_bins = function(bins) {
622+
out = vec_unrep(bins)
623+
names(out) = c("bin", "count")
624+
out
625+
}
626+
627+
#' Re-number bins to be consecutive
628+
#'
629+
#' Removes empty bins and gaps in bins
630+
#' @param bin_runs <[data.frame]> as returned by `rle_bins()`. Missing bins
631+
#' will be removed: the `"bin"` element should be increasing but need
632+
#' not be consecutive and the `"count"` element may have zeros.
633+
#' @returns modified `bin_runs` with consecutive `"bin"` element starting
634+
#' at zero and `"count"` all positive.
635+
#' @noRd
636+
renumber_bins = function(bin_runs) {
637+
bin_runs = bin_runs[bin_runs$count > 0, ]
638+
bin_runs$bin = seq_len(nrow(bin_runs))
639+
bin_runs
640+
}
641+
642+
#' Convert run-length encoded bins into wilkinson binning format and find bin locations
643+
#' @param bin_runs <[data.frame]> as returned by `rle_bins()`. The
644+
#' `"bin"` element should be consecutive starting from `1`and the
645+
#' `"count"` element should not have zeros.
646+
#' @param x <[numeric]> data values to be binned
647+
#' @param b <[list]> as returned by `wilkinson_` methods.
648+
#' @returns <[list]> binning format returned by `wilkinson_` methods, with
649+
#' elements `"bins"`, `"bin_left"`, `"bin_right"`, `"bin_midpoints"`.
650+
#' @noRd
651+
locate_bins = function(bin_runs, x) {
652+
bins = rep.int(bin_runs$bin, times = bin_runs$count)
653+
654+
# can take advantage of the fact that bins is sorted runs of numbers to
655+
# get the first and last entry from each bin
656+
bin_left = x[bins != c(0, bins[-length(bins)])]
657+
bin_right = x[bins != c(bins[-1], 0)]
658+
bin_midpoints = (bin_left + bin_right) / 2
659+
660+
list(
661+
bins = bins,
662+
bin_left = bin_left,
663+
bin_right = bin_right,
664+
bin_midpoints = bin_midpoints
665+
)
666+
}
667+
668+
#' Adjacent-bin moving average smooth as described in Wilkinson
669+
#' @description
670+
#' Exchanges dots between adjacent bins in a dotplot as described in Wilkinson.
671+
#' @param x <[numeric]> sorted data values
672+
#' @param b <[numeric]> binning as returned by other `wilkinson_` functions: a
673+
#' [list] with elements `bins` (consecutive integers starting at `1` with the same
674+
#' length as `x`) and `bin_midpoints` (having length equal to `max(bins)`).
675+
#' @param width <scalar [numeric]> positive bin width
676+
#' @param span <scalar [numeric]> multiple of bin width giving the window within
677+
#' which to consider the next bin "adjacent". If `0`, no smoothing is done. A value
678+
#' of `1.25` is equivalent to Wilkinson's recommended smoothing if bins are within
679+
#' `width/4` of each other.
680+
#' @noRd
681+
wilkinson_smooth = function(x, b, width, span = 0) {
682+
if (span == 0) return(b)
683+
window = width * span
684+
685+
bin_runs = rle_bins(b$bins)
686+
for (i in seq_len(nrow(bin_runs) - 1)) {
687+
if (b$bin_midpoints[[i + 1]] - b$bin_midpoints[[i]] <= window) {
688+
dots_to_move = floor((bin_runs$count[[i + 1]] - bin_runs$count[[i]]) / 2)
689+
bin_runs$count[[i]] = bin_runs$count[[i]] + dots_to_move
690+
bin_runs$count[[i + 1]] = bin_runs$count[[i + 1]] - dots_to_move
691+
}
692+
}
693+
694+
bin_runs = renumber_bins(bin_runs)
695+
locate_bins(bin_runs, x)
696+
}
627697

628698
# grid swarm -------------------------------------------------------------
629699

R/geom_dotsinterval.R

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -507,11 +507,11 @@ GeomDotsinterval = ggproto("GeomDotsinterval", GeomSlabinterval,
507507
<[function] | [string][character]> Smoother to apply to dot positions.
508508
One of:
509509
- A function that takes a numeric vector of dot positions and returns a
510-
smoothed version of that vector, such as `smooth_bounded()`,
511-
`smooth_unbounded()`, smooth_discrete()`, or `smooth_bar()`.
510+
smoothed version of that vector, such as [smooth_bounded()],
511+
[smooth_unbounded()], [smooth_discrete()], or [smooth_bar()].
512512
- A string indicating what smoother to use, as the suffix to a function
513513
name starting with `smooth_`; e.g. `"none"` (the default) applies
514-
`smooth_none()`, which simply returns the given vector without
514+
[smooth_none()], which simply returns the given vector without
515515
applying smoothing.
516516
517517
Smoothing is most effective when the smoother is matched to the support of

src/bin_dots.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -216,9 +216,12 @@ inline auto place_rows(
216216
std::vector<std::vector<double>>& rows_bottom
217217
) -> bool {
218218
auto any_left = true;
219-
while (n --> 0_z && any_left) {
220-
any_left = place_row<reverse>(both, xsize, ygrid, remaining, next_remaining, rows, rows_bottom);
221-
}
219+
while (
220+
n-- > 0_z &&
221+
(any_left = place_row<reverse>(both, xsize, ygrid, remaining, next_remaining, rows, rows_bottom)) &&
222+
n-- > 0_z &&
223+
(any_left = place_row<!reverse>(both, xsize, ygrid, remaining, next_remaining, rows, rows_bottom))
224+
);
222225
return any_left;
223226
}
224227

0 commit comments

Comments
 (0)