From 1d1e0c46053d93ef2650e55be11e4e5c49bb3dd3 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Fri, 17 Feb 2023 09:50:09 +0000 Subject: [PATCH 1/3] added 2 experimental versions of corkendall --- src/KendallTau.jl | 9 + src/corkendall_experimental.jl | 469 +++++++++++++++++++++++++++++++ src/corkendall_experimental2.jl | 481 ++++++++++++++++++++++++++++++++ 3 files changed, 959 insertions(+) create mode 100644 src/corkendall_experimental.jl create mode 100644 src/corkendall_experimental2.jl diff --git a/src/KendallTau.jl b/src/KendallTau.jl index a8fb998..f73d882 100644 --- a/src/KendallTau.jl +++ b/src/KendallTau.jl @@ -3,6 +3,15 @@ module KendallTau include("corkendall.jl") include("corkendall_fromfile.jl") +module Experimental +include("corkendall_experimental.jl") +end + +module Experimental2 +include("corkendall_experimental2.jl") +end + + export corkendall, corkendall_fromfile end \ No newline at end of file diff --git a/src/corkendall_experimental.jl b/src/corkendall_experimental.jl new file mode 100644 index 0000000..620204a --- /dev/null +++ b/src/corkendall_experimental.jl @@ -0,0 +1,469 @@ +####################################### +# +# Kendall correlation +# +####################################### + +#RoM stands for "Real or Missing" +const RoMVector{T<:Real} = AbstractVector{<:Union{T,Missing}} +const RoMMatrix{T<:Real} = AbstractMatrix{<:Union{T,Missing}} + +#= +TODO 16 Feb 2023 +1) Should the default value of skipmissing be :none or :pairwise? :none forces the user to + address question of how missings should be handled, but at least for REPL use, it's + rather inconvenient. +2) KendallTau.corkendall currently slower than StatsBase.corkendall for very small + vector\matrix output(4x4 or smaller, assuming vector length 1000) , presumably thanks to + overhead of threading and populating the "scratch-space" vectors. Does this matter? +3) Ask for code review? +4) Make PR to StatsBase? +=# + +""" + corkendall(x, y=x; skipmissing::Symbol=:none) + +Compute Kendall's rank correlation coefficient, τ. `x` and `y` must both be either +vectors or matrices, with elements that are either real numbers or missing values. + +# Keyword argument + +- `skipmissing::Symbol=:none`: if `:none`, missing values in either `x` or `y` + cause the function to raise an error. Use `:pairwise` to skip entries with a missing + value in either of the two vectors used to calculate (an element of) the return. Use + `:listwise` to skip entries where a missing value appears anywhere in a given row of `x` + or `y`; note that this might drop a high proportion of entries. +""" +function corkendall(x::RoMVector{T}, y::RoMVector{U}; skipmissing::Symbol=:none) where {T,U} + + length(x) == length(y) || throw(DimensionMismatch("x and y have inconsistent dimensions")) + + x, y = handlelistwise(x, y, skipmissing) + + if x isa Vector{Missing} || y isa Vector{Missing} + return (NaN) + end + + x = copy(x) + y = copy(y) + if missing isa eltype(x) || missing isa eltype(y) + x, y = handlepairwise(x, y) + end + permx = sortperm(x) + permute!(x, permx) + + return (corkendall_sorted!(x, y, permx, similar(y), similar(y), T[], U[])) +end + +#= Function returns a vector in this case, inconsistent with with Statistics.cor and +StatsBase.corspearman. Fixing that is a breaking change. =# +function corkendall(x::RoMMatrix, y::RoMVector; skipmissing::Symbol=:none) + return (vec(corkendall(x, reshape(y, (length(y), 1)); skipmissing))) +end + +function corkendall(x::RoMVector, y::RoMMatrix; skipmissing::Symbol=:none) + return (corkendall(reshape(x, (length(x), 1)), y; skipmissing)) +end + +function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:none) where {T,U} + symmetric = x === y + if size(x, 1) != size(y, 1) + throw(DimensionMismatch("x and y have inconsistent dimensions")) + end + + #Swap x and y for more efficient threaded loop. + if size(x, 2) < size(y, 2) + return (collect(transpose(corkendall(y, x; skipmissing)))) + end + + x, y = handlelistwise(x, y, skipmissing) + m, nr = size(x) + nc = size(y, 2) + + if x isa Matrix{Missing} || y isa Matrix{Missing} + if symmetric + return (ifelse.((1:nr) .== (1:nc)', 1.0, NaN)) + else + return (fill(NaN, nr, nc)) + end + end + + C = ones(Float64, nr, nc) + ndups = min(Threads.nthreads(), symmetric ? nr - 1 : nr) + use_atomic = ndups < Threads.nthreads() + + if use_atomic + a = Threads.Atomic{Int}(1) + end + + #= Create scratch vectors so that threaded code can be non-allocating. One vector per + thread to avoid cross-talk between threads.=# + + scratchyvectors = similar(y, m, ndups) + ycolis = similar(y, m, ndups) + xcoljsorteds = similar(x, m, ndups) + permxs = zeros(Int, m, ndups) + txs = Array{T}(undef, m, ndups) + tys = Array{U}(undef, m, ndups) + sortyspaces = Array{U}(undef, m, ndups) + + Threads.@threads for j = (symmetric ? 2 : 1):nr + + if use_atomic + id = Threads.atomic_add!(a, 1)[] + else + id = Threads.threadid() + end + + scratchyvector = view(scratchyvectors, :, id) + sortyspace = view(sortyspaces, :, id) + ycoli = view(ycolis, :, id) + xcoljsorted = view(xcoljsorteds, :, id) + permx = view(permxs, :, id) + tx = view(txs, :, id) + ty = view(tys, :, id) + + sortperm!(permx, view(x, :, j)) + @inbounds for k in eachindex(xcoljsorted) + xcoljsorted[k] = x[permx[k], j] + end + + for i = 1:(symmetric ? j - 1 : nc) + ycoli .= view(y, :, i) + C[j, i] = corkendall_sorted!(xcoljsorted, ycoli, permx, scratchyvector, + sortyspace, tx, ty) + symmetric && (C[i, j] = C[j, i]) + end + end + return C +end + +# Auxiliary functions for Kendall's rank correlation + +# Knight, William R. “A Computer Method for Calculating Kendall's Tau with Ungrouped Data.” +# Journal of the American Statistical Association, vol. 61, no. 314, 1966, pp. 436–439. +# JSTOR, www.jstor.org/stable/2282833. +""" + corkendall_sorted!(sortedx::RoMVector{T}, y::RoMVector{U}, + permx::AbstractVector{<:Integer}, scratchyvector::RoMVector, sortyspace::RoMVector, + tx::AbstractVector{T}, ty::AbstractVector{U}) where {T,U} + +Kendall correlation between two vectors but this function omits the initial sorting of +the first argument. So calculating Kendall correlation between `x` and `y` is a two stage +process: a) sort `x` to get `sortedx`; b) call this function on `sortedx` and `y`, with +subsequent arguments being: +- `permx::AbstractVector{<:Integer}`: the permutation that achieved the sorting of `x` to + yield `sortedx`. +- `scratchyvector::RoMVector`: a vector of the same element type and length as `y`; used + to permute `y` without allocation. +- `sortyspace::RoMVector`: a vector of the same element type and length as `y`; used + (in the call to `merge_sort!`) to avoid allocations. +- `tx, ty`: vectors of the same length as `x` and `y` whose element types match the types + of the non-missing elements of `x` and `y` respectively; used (in the call to + `handlepairwise!`) to avoid allocations. +""" +function corkendall_sorted!(sortedx::RoMVector{T}, y::RoMVector{U}, + permx::AbstractVector{<:Integer}, scratchyvector::RoMVector, sortyspace::RoMVector, + tx::AbstractVector{T}, ty::AbstractVector{U}) where {T,U} + + @inbounds for i in eachindex(y) + scratchyvector[i] = y[permx[i]] + end + if missing isa eltype(sortedx) || missing isa eltype(scratchyvector) + sortedx, scratchyvector = handlepairwise!(sortedx, scratchyvector, tx, ty) + end + length(sortedx) >= 2 || return (NaN) + + shuffledy = scratchyvector + + if any(isnan, sortedx) || any(isnan, shuffledy) + return NaN + end + n = length(sortedx) + + # Use widen to avoid overflows on both 32bit and 64bit + npairs = div(widen(n) * (n - 1), 2) + ntiesx = ndoubleties = nswaps = widen(0) + k = 0 + + @inbounds for i = 2:n + if sortedx[i-1] == sortedx[i] + k += 1 + elseif k > 0 + #= + Sort the corresponding chunk of shuffledy, so the rows of hcat(sortedx,shuffledy) + are sorted first on sortedx, then (where sortedx values are tied) on shuffledy. + Hence double ties can be counted by calling countties. + =# + sort!(view(shuffledy, (i-k-1):(i-1))) + ntiesx += div(widen(k) * (k + 1), 2) # Must use wide integers here + ndoubleties += countties(shuffledy, i - k - 1, i - 1) + k = 0 + end + end + if k > 0 + sort!(view(shuffledy, (n-k):n)) + ntiesx += div(widen(k) * (k + 1), 2) + ndoubleties += countties(shuffledy, n - k, n) + end + + nswaps = merge_sort!(shuffledy, 1, n, sortyspace) + ntiesy = countties(shuffledy, 1, n) + + # Calls to float below prevent possible overflow errors when + # length(sortedx) exceeds 77_936 (32 bit) or 5_107_605_667 (64 bit) + (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) / + sqrt(float(npairs - ntiesx) * float(npairs - ntiesy)) +end + +""" + countties(x::AbstractVector{<:Real}, lo::Integer, hi::Integer) + +Return the number of ties within `x[lo:hi]`. Assumes `x` is sorted. +""" +function countties(x::AbstractVector{<:Real}, lo::Integer, hi::Integer) + # Use of widen below prevents possible overflow errors when + # length(x) exceeds 2^16 (32 bit) or 2^32 (64 bit) + thistiecount = result = widen(0) + checkbounds(x, lo:hi) + @inbounds for i = (lo+1):hi + if x[i] == x[i-1] + thistiecount += 1 + elseif thistiecount > 0 + result += div(thistiecount * (thistiecount + 1), 2) + thistiecount = widen(0) + end + end + + if thistiecount > 0 + result += div(thistiecount * (thistiecount + 1), 2) + end + result +end + +# Tests appear to show that a value of 64 is optimal, +# but note that the equivalent constant in base/sort.jl is 20. +const SMALL_THRESHOLD = 64 + +# merge_sort! copied from Julia Base +# (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) +""" + merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, + t::AbstractVector=similar(v, 0)) + +Mutates `v` by sorting elements `x[lo:hi]` using the merge sort algorithm. +This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort +distance. +""" +function merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, + t::AbstractVector=similar(v, 0)) + # Use of widen below prevents possible overflow errors when + # length(v) exceeds 2^16 (32 bit) or 2^32 (64 bit) + nswaps = widen(0) + @inbounds if lo < hi + hi - lo <= SMALL_THRESHOLD && return insertion_sort!(v, lo, hi) + + m = midpoint(lo, hi) + (length(t) < m - lo + 1) && resize!(t, m - lo + 1) + + nswaps = merge_sort!(v, lo, m, t) + nswaps += merge_sort!(v, m + 1, hi, t) + + i, j = 1, lo + while j <= m + t[i] = v[j] + i += 1 + j += 1 + end + + i, k = 1, lo + while k < j <= hi + if v[j] < t[i] + v[k] = v[j] + j += 1 + nswaps += m - lo + 1 - (i - 1) + else + v[k] = t[i] + i += 1 + end + k += 1 + end + while k < j + v[k] = t[i] + k += 1 + i += 1 + end + end + return nswaps +end + +# insertion_sort! and midpoint copied from Julia Base +# (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) +midpoint(lo::T, hi::T) where {T<:Integer} = lo + ((hi - lo) >>> 0x01) +midpoint(lo::Integer, hi::Integer) = midpoint(promote(lo, hi)...) + +""" + insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) + +Mutates `v` by sorting elements `x[lo:hi]` using the insertion sort algorithm. +This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort +distance. +""" +function insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) + if lo == hi + return widen(0) + end + nswaps = widen(0) + @inbounds for i = lo+1:hi + j = i + x = v[i] + while j > lo + if x < v[j-1] + nswaps += 1 + v[j] = v[j-1] + j -= 1 + continue + end + break + end + v[j] = x + end + return nswaps +end + +""" + handlepairwise(x::RoMVector{T}, y::RoMVector{U}) where {T,U} + +Return a pair `(a,b)`, filtered copies of `x` and `y`, in which elements `x[i]` and `y[i]` +are filtered out if `ismissing(x[i])||ismissing(y[i])`. +""" +function handlepairwise(x::RoMVector{T}, y::RoMVector{U}) where {T,U} + + n = length(x) + a = Vector{T}(undef, n) + b = Vector{U}(undef, n) + j::Int = 0 + + @inbounds for i in eachindex(x) + if !(ismissing(x[i]) || ismissing(y[i])) + j += 1 + a[j] = x[i] + b[j] = y[i] + end + end + + return (resize!(a, j), resize!(b, j)) +end + +""" + handlepairwise!(x::RoMVector{T}, y::RoMVector{U}, + tx::AbstractVector{T}, ty::AbstractVector{U}) where {T,U} + +Return a pair `(a,b)`, filtered copies of `(x,y)`, in which elements `x[i]` and +`y[i]` are filtered out if `ismissing(x[i])||ismissing(y[i])`. +""" +function handlepairwise!(x::RoMVector{T}, y::RoMVector{U}, + tx::AbstractVector{T}, ty::AbstractVector{U}) where {T,U} + + j = 0 + + @inbounds for i in eachindex(x) + if !(ismissing(x[i]) || ismissing(y[i])) + j += 1 + tx[j] = x[i] + ty[j] = y[i] + end + end + + return (view(tx, 1:j), view(ty, 1:j)) +end + +""" + handlelistwise(x::AbstractArray,y::AbstractArray,skipmissing::Symbol) + +If `skipmissing` is `:listwise` and `x` and `y` are both matrices then do listwise filtering +of `x` and `y`. Otherwise merely validate `skipmissing` argument. +""" +function handlelistwise(x::AbstractArray, y::AbstractArray, skipmissing::Symbol) + if skipmissing == :listwise + if x isa Matrix && y isa Matrix + return (handlelistwise(x, y)) + end + elseif skipmissing == :pairwise + elseif skipmissing == :none + if missing isa eltype(x) || missing isa eltype(y) + throw(ArgumentError("When missing is an allowed element type \ + then keyword argument skipmissing must be either\ + `:pairwise` or `:listwise`, but got `:$skipmissing`")) + end + else + if missing isa eltype(x) || missing isa eltype(y) + throw(ArgumentError("keyword argument skipmissing must be either \ + `:pairwise` or `:listwise`, but got `:$skipmissing`")) + else + throw(ArgumentError("keyword argument skipmissing must be either \ + `:pairwise`, `:listwise` or `:none` but got \ + `:$skipmissing`")) + end + end + return (x, y) +end + +""" + handlelistwise(x::RoMMatrix{T}, y::RoMMatrix{U}) where {T,U} + +Return a pair `(a,b)`, filtered copies of `(x,y)`, in which the rows `x[i,:]` and +`y[i,:]` are both filtered out if `any(ismissing,x[i,:])||any(ismissing,y[i,:])`. +""" +function handlelistwise(x::RoMMatrix{T}, y::RoMMatrix{U}) where {T,U} + + nrx, ncx = size(x) + nry, ncy = size(y) + + nrx == nry || throw(DimensionMismatch("x and y have inconsistent dimensions")) + chooser = fill(true, nrx) + + nrout = nrx + @inbounds for i = 1:nrx + for j = 1:ncx + if ismissing(x[i, j]) + chooser[i] = false + nrout -= 1 + break + end + end + if chooser[i] + for j = 1:ncy + if ismissing(y[i, j]) + chooser[i] = false + nrout -= 1 + break + end + end + end + end + + a = Matrix{T}(undef, nrout, ncx) + @inbounds for j = 1:ncx + k = 0 + for i = 1:nrx + if chooser[i] + k += 1 + a[k, j] = x[i, j] + end + end + end + + b = Matrix{U}(undef, nrout, ncy) + @inbounds for j = 1:ncy + k = 0 + for i = 1:nrx + if chooser[i] + k += 1 + b[k, j] = y[i, j] + end + end + end + + return (a, b) +end \ No newline at end of file diff --git a/src/corkendall_experimental2.jl b/src/corkendall_experimental2.jl new file mode 100644 index 0000000..4c48b2d --- /dev/null +++ b/src/corkendall_experimental2.jl @@ -0,0 +1,481 @@ +####################################### +# +# Kendall correlation +# +####################################### + +#RoM stands for "Real or Missing" +const RoMVector{T<:Real} = AbstractVector{<:Union{T,Missing}} +const RoMMatrix{T<:Real} = AbstractMatrix{<:Union{T,Missing}} + +#= +TODO 16 Feb 2023 +1) Should the default value of skipmissing be :none or :pairwise? :none forces the user to + address question of how missings should be handled, but at least for REPL use, it's + rather inconvenient. +2) KendallTau.corkendall currently slower than StatsBase.corkendall for very small + vector\matrix output(4x4 or smaller, assuming vector length 1000) , presumably thanks to + overhead of threading and populating the "scratch-space" vectors. Does this matter? +3) Ask for code review? +4) Make PR to StatsBase? +=# + +""" + corkendall(x, y=x; skipmissing::Symbol=:none) + +Compute Kendall's rank correlation coefficient, τ. `x` and `y` must both be either +vectors or matrices, with elements that are either real numbers or missing values. + +# Keyword argument + +- `skipmissing::Symbol=:none`: if `:none`, missing values in either `x` or `y` + cause the function to raise an error. Use `:pairwise` to skip entries with a missing + value in either of the two vectors used to calculate (an element of) the return. Use + `:listwise` to skip entries where a missing value appears anywhere in a given row of `x` + or `y`; note that this might drop a high proportion of entries. +""" +function corkendall(x::RoMVector{T}, y::RoMVector{U}; skipmissing::Symbol=:none) where {T,U} + + length(x) == length(y) || throw(DimensionMismatch("x and y have inconsistent dimensions")) + + x, y = handlelistwise(x, y, skipmissing) + + if x isa Vector{Missing} || y isa Vector{Missing} + return (NaN) + end + + x = copy(x) + y = copy(y) + if missing isa eltype(x) || missing isa eltype(y) + x, y = handlepairwise(x, y) + end + permx = sortperm(x) + permute!(x, permx) + + return (corkendall_sorted!(x, y, permx, similar(y), similar(y), T[], U[])) +end + +#= Function returns a vector in this case, inconsistent with with Statistics.cor and +StatsBase.corspearman. Fixing that is a breaking change. =# +function corkendall(x::RoMMatrix, y::RoMVector; skipmissing::Symbol=:none) + return (vec(corkendall(x, reshape(y, (length(y), 1)); skipmissing))) +end + +function corkendall(x::RoMVector, y::RoMMatrix; skipmissing::Symbol=:none) + return (corkendall(reshape(x, (length(x), 1)), y; skipmissing)) +end + +""" + duplicate(x) + +Construct a vector with `Threads.nthreads()` elements, each a copy of `x`. +""" +function duplicate(x,n) + [copy(x) for _ in 1:n] +end + + + + +function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:none) where {T,U} + symmetric = x === y + if size(x, 1) != size(y, 1) + throw(DimensionMismatch("x and y have inconsistent dimensions")) + end + + #Swap x and y for more efficient threaded loop. + if size(x, 2) < size(y, 2) + return (collect(transpose(corkendall(y, x; skipmissing)))) + end + + x, y = handlelistwise(x, y, skipmissing) + m, nr = size(x) + nc = size(y, 2) + + if x isa Matrix{Missing} || y isa Matrix{Missing} + if symmetric + return (ifelse.((1:nr) .== (1:nc)', 1.0, NaN)) + else + return (fill(NaN, nr, nc)) + end + end + + C = ones(Float64, nr, nc) + ndups = min(Threads.nthreads(), symmetric ? nr - 1 : nr) + use_atomic = ndups < Threads.nthreads() + + if use_atomic + a = Threads.Atomic{Int}(1) + end + + #= Create scratch vectors so that threaded code can be non-allocating. One vector per + thread to avoid cross-talk between threads.=# + + scratchyvectors = duplicate(similar(y, m),ndups) + ycolis = duplicate(similar(y, m),ndups) + xcoljsorteds = duplicate(similar(x, m),ndups) + permxs = duplicate(zeros(Int, m),ndups) + txs = duplicate(Vector{T}(undef, m),ndups) + tys = duplicate(Vector{U}(undef, m),ndups) + sortyspaces = duplicate(Vector{U}(undef, m),ndups) + + Threads.@threads for j = (symmetric ? 2 : 1):nr + + if use_atomic + id = Threads.atomic_add!(a, 1)[] + else + id = Threads.threadid() + end + + scratchyvector = scratchyvectors[id] + sortyspace = sortyspaces[id] + ycoli = ycolis[id] + xcoljsorted = xcoljsorteds[id] + permx = permxs[id] + tx = txs[id] + ty = tys[id] + + sortperm!(permx, view(x, :, j)) + @inbounds for k in eachindex(xcoljsorted) + xcoljsorted[k] = x[permx[k], j] + end + + for i = 1:(symmetric ? j - 1 : nc) + ycoli .= view(y, :, i) + C[j, i] = corkendall_sorted!(xcoljsorted, ycoli, permx, scratchyvector, + sortyspace, tx, ty) + symmetric && (C[i, j] = C[j, i]) + end + end + return C +end + +# Auxiliary functions for Kendall's rank correlation + +# Knight, William R. “A Computer Method for Calculating Kendall's Tau with Ungrouped Data.” +# Journal of the American Statistical Association, vol. 61, no. 314, 1966, pp. 436–439. +# JSTOR, www.jstor.org/stable/2282833. +""" + corkendall_sorted!(sortedx::RoMVector{T}, y::RoMVector{U}, + permx::AbstractVector{<:Integer}, scratchyvector::RoMVector, sortyspace::RoMVector, + tx::AbstractVector{T}, ty::AbstractVector{U}) where {T,U} + +Kendall correlation between two vectors but this function omits the initial sorting of +the first argument. So calculating Kendall correlation between `x` and `y` is a two stage +process: a) sort `x` to get `sortedx`; b) call this function on `sortedx` and `y`, with +subsequent arguments being: +- `permx::AbstractVector{<:Integer}`: the permutation that achieved the sorting of `x` to + yield `sortedx`. +- `scratchyvector::RoMVector`: a vector of the same element type and length as `y`; used + to permute `y` without allocation. +- `sortyspace::RoMVector`: a vector of the same element type and length as `y`; used + (in the call to `merge_sort!`) to avoid allocations. +- `tx, ty`: vectors of the same length as `x` and `y` whose element types match the types + of the non-missing elements of `x` and `y` respectively; used (in the call to + `handlepairwise!`) to avoid allocations. +""" +function corkendall_sorted!(sortedx::RoMVector{T}, y::RoMVector{U}, + permx::AbstractVector{<:Integer}, scratchyvector::RoMVector, sortyspace::RoMVector, + tx::AbstractVector{T}, ty::AbstractVector{U}) where {T,U} + + @inbounds for i in eachindex(y) + scratchyvector[i] = y[permx[i]] + end + if missing isa eltype(sortedx) || missing isa eltype(scratchyvector) + sortedx, scratchyvector = handlepairwise!(sortedx, scratchyvector, tx, ty) + end + length(sortedx) >= 2 || return (NaN) + + shuffledy = scratchyvector + + if any(isnan, sortedx) || any(isnan, shuffledy) + return NaN + end + n = length(sortedx) + + # Use widen to avoid overflows on both 32bit and 64bit + npairs = div(widen(n) * (n - 1), 2) + ntiesx = ndoubleties = nswaps = widen(0) + k = 0 + + @inbounds for i = 2:n + if sortedx[i-1] == sortedx[i] + k += 1 + elseif k > 0 + #= + Sort the corresponding chunk of shuffledy, so the rows of hcat(sortedx,shuffledy) + are sorted first on sortedx, then (where sortedx values are tied) on shuffledy. + Hence double ties can be counted by calling countties. + =# + sort!(view(shuffledy, (i-k-1):(i-1))) + ntiesx += div(widen(k) * (k + 1), 2) # Must use wide integers here + ndoubleties += countties(shuffledy, i - k - 1, i - 1) + k = 0 + end + end + if k > 0 + sort!(view(shuffledy, (n-k):n)) + ntiesx += div(widen(k) * (k + 1), 2) + ndoubleties += countties(shuffledy, n - k, n) + end + + nswaps = merge_sort!(shuffledy, 1, n, sortyspace) + ntiesy = countties(shuffledy, 1, n) + + # Calls to float below prevent possible overflow errors when + # length(sortedx) exceeds 77_936 (32 bit) or 5_107_605_667 (64 bit) + (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) / + sqrt(float(npairs - ntiesx) * float(npairs - ntiesy)) +end + +""" + countties(x::AbstractVector{<:Real}, lo::Integer, hi::Integer) + +Return the number of ties within `x[lo:hi]`. Assumes `x` is sorted. +""" +function countties(x::AbstractVector{<:Real}, lo::Integer, hi::Integer) + # Use of widen below prevents possible overflow errors when + # length(x) exceeds 2^16 (32 bit) or 2^32 (64 bit) + thistiecount = result = widen(0) + checkbounds(x, lo:hi) + @inbounds for i = (lo+1):hi + if x[i] == x[i-1] + thistiecount += 1 + elseif thistiecount > 0 + result += div(thistiecount * (thistiecount + 1), 2) + thistiecount = widen(0) + end + end + + if thistiecount > 0 + result += div(thistiecount * (thistiecount + 1), 2) + end + result +end + +# Tests appear to show that a value of 64 is optimal, +# but note that the equivalent constant in base/sort.jl is 20. +const SMALL_THRESHOLD = 64 + +# merge_sort! copied from Julia Base +# (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) +""" + merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, + t::AbstractVector=similar(v, 0)) + +Mutates `v` by sorting elements `x[lo:hi]` using the merge sort algorithm. +This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort +distance. +""" +function merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, + t::AbstractVector=similar(v, 0)) + # Use of widen below prevents possible overflow errors when + # length(v) exceeds 2^16 (32 bit) or 2^32 (64 bit) + nswaps = widen(0) + @inbounds if lo < hi + hi - lo <= SMALL_THRESHOLD && return insertion_sort!(v, lo, hi) + + m = midpoint(lo, hi) + (length(t) < m - lo + 1) && resize!(t, m - lo + 1) + + nswaps = merge_sort!(v, lo, m, t) + nswaps += merge_sort!(v, m + 1, hi, t) + + i, j = 1, lo + while j <= m + t[i] = v[j] + i += 1 + j += 1 + end + + i, k = 1, lo + while k < j <= hi + if v[j] < t[i] + v[k] = v[j] + j += 1 + nswaps += m - lo + 1 - (i - 1) + else + v[k] = t[i] + i += 1 + end + k += 1 + end + while k < j + v[k] = t[i] + k += 1 + i += 1 + end + end + return nswaps +end + +# insertion_sort! and midpoint copied from Julia Base +# (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) +midpoint(lo::T, hi::T) where {T<:Integer} = lo + ((hi - lo) >>> 0x01) +midpoint(lo::Integer, hi::Integer) = midpoint(promote(lo, hi)...) + +""" + insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) + +Mutates `v` by sorting elements `x[lo:hi]` using the insertion sort algorithm. +This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort +distance. +""" +function insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) + if lo == hi + return widen(0) + end + nswaps = widen(0) + @inbounds for i = lo+1:hi + j = i + x = v[i] + while j > lo + if x < v[j-1] + nswaps += 1 + v[j] = v[j-1] + j -= 1 + continue + end + break + end + v[j] = x + end + return nswaps +end + +""" + handlepairwise(x::RoMVector{T}, y::RoMVector{U}) where {T,U} + +Return a pair `(a,b)`, filtered copies of `x` and `y`, in which elements `x[i]` and `y[i]` +are filtered out if `ismissing(x[i])||ismissing(y[i])`. +""" +function handlepairwise(x::RoMVector{T}, y::RoMVector{U}) where {T,U} + + n = length(x) + a = Vector{T}(undef, n) + b = Vector{U}(undef, n) + j::Int = 0 + + @inbounds for i in eachindex(x) + if !(ismissing(x[i]) || ismissing(y[i])) + j += 1 + a[j] = x[i] + b[j] = y[i] + end + end + + return (resize!(a, j), resize!(b, j)) +end + +""" + handlepairwise!(x::RoMVector{T}, y::RoMVector{U}, + tx::AbstractVector{T}, ty::AbstractVector{U}) where {T,U} + +Return a pair `(a,b)`, filtered copies of `(x,y)`, in which elements `x[i]` and +`y[i]` are filtered out if `ismissing(x[i])||ismissing(y[i])`. +""" +function handlepairwise!(x::RoMVector{T}, y::RoMVector{U}, + tx::AbstractVector{T}, ty::AbstractVector{U}) where {T,U} + + j = 0 + + @inbounds for i in eachindex(x) + if !(ismissing(x[i]) || ismissing(y[i])) + j += 1 + tx[j] = x[i] + ty[j] = y[i] + end + end + + return (view(tx, 1:j), view(ty, 1:j)) +end + +""" + handlelistwise(x::AbstractArray,y::AbstractArray,skipmissing::Symbol) + +If `skipmissing` is `:listwise` and `x` and `y` are both matrices then do listwise filtering +of `x` and `y`. Otherwise merely validate `skipmissing` argument. +""" +function handlelistwise(x::AbstractArray, y::AbstractArray, skipmissing::Symbol) + if skipmissing == :listwise + if x isa Matrix && y isa Matrix + return (handlelistwise(x, y)) + end + elseif skipmissing == :pairwise + elseif skipmissing == :none + if missing isa eltype(x) || missing isa eltype(y) + throw(ArgumentError("When missing is an allowed element type \ + then keyword argument skipmissing must be either\ + `:pairwise` or `:listwise`, but got `:$skipmissing`")) + end + else + if missing isa eltype(x) || missing isa eltype(y) + throw(ArgumentError("keyword argument skipmissing must be either \ + `:pairwise` or `:listwise`, but got `:$skipmissing`")) + else + throw(ArgumentError("keyword argument skipmissing must be either \ + `:pairwise`, `:listwise` or `:none` but got \ + `:$skipmissing`")) + end + end + return (x, y) +end + +""" + handlelistwise(x::RoMMatrix{T}, y::RoMMatrix{U}) where {T,U} + +Return a pair `(a,b)`, filtered copies of `(x,y)`, in which the rows `x[i,:]` and +`y[i,:]` are both filtered out if `any(ismissing,x[i,:])||any(ismissing,y[i,:])`. +""" +function handlelistwise(x::RoMMatrix{T}, y::RoMMatrix{U}) where {T,U} + + nrx, ncx = size(x) + nry, ncy = size(y) + + nrx == nry || throw(DimensionMismatch("x and y have inconsistent dimensions")) + chooser = fill(true, nrx) + + nrout = nrx + @inbounds for i = 1:nrx + for j = 1:ncx + if ismissing(x[i, j]) + chooser[i] = false + nrout -= 1 + break + end + end + if chooser[i] + for j = 1:ncy + if ismissing(y[i, j]) + chooser[i] = false + nrout -= 1 + break + end + end + end + end + + a = Matrix{T}(undef, nrout, ncx) + @inbounds for j = 1:ncx + k = 0 + for i = 1:nrx + if chooser[i] + k += 1 + a[k, j] = x[i, j] + end + end + end + + b = Matrix{U}(undef, nrout, ncy) + @inbounds for j = 1:ncy + k = 0 + for i = 1:nrx + if chooser[i] + k += 1 + b[k, j] = y[i, j] + end + end + end + + return (a, b) +end \ No newline at end of file From bafe4093990e5699f687b13bc34ba303f9502a83 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Fri, 17 Feb 2023 12:05:34 +0000 Subject: [PATCH 2/3] work on "many threads but small correlation matrix" problem --- ...ced_allocations_prior_to_threaded_loop.svg | 1 + plots/plot_1.svg | 1 + plots/plot_2.svg | 1 + ...=> threading_with_too_much_allocation.svg} | 0 scripts/performancetests.jl | 2 +- src/corkendall_experimental2.jl | 31 +++++++++---------- test/rankcorr.jl | 2 +- 7 files changed, 20 insertions(+), 18 deletions(-) create mode 100644 Test_of_reduced_allocations_prior_to_threaded_loop.svg create mode 100644 plots/plot_1.svg create mode 100644 plots/plot_2.svg rename plots/{plot_24.svg => threading_with_too_much_allocation.svg} (100%) diff --git a/Test_of_reduced_allocations_prior_to_threaded_loop.svg b/Test_of_reduced_allocations_prior_to_threaded_loop.svg new file mode 100644 index 0000000..e92e981 --- /dev/null +++ b/Test_of_reduced_allocations_prior_to_threaded_loop.svg @@ -0,0 +1 @@ +246810121416789100μ234567890.0012345678StatsBase.corkendallKendallTau.corkendallKendallTau.Experimental.corkendallKendallTau.Experimental2.corkendallTime to evaluate corkendall(x) vs num cols in xNum cols (num rows = 1000)Seconds \ No newline at end of file diff --git a/plots/plot_1.svg b/plots/plot_1.svg new file mode 100644 index 0000000..7beb358 --- /dev/null +++ b/plots/plot_1.svg @@ -0,0 +1 @@ +234567891023100μ250.001250.012StatsBase.corkendallKendallTau.corkendallKendallTau.Experimental.corkendallKendallTau.Experimental2.corkendallTime to evaluate corkendall(x) vs num cols in xNum cols (num rows = 1000)Seconds \ No newline at end of file diff --git a/plots/plot_2.svg b/plots/plot_2.svg new file mode 100644 index 0000000..be7f940 --- /dev/null +++ b/plots/plot_2.svg @@ -0,0 +1 @@ +2345678789100μ234567890.0012StatsBase.corkendallKendallTau.corkendallKendallTau.Experimental.corkendallKendallTau.Experimental2.corkendallTime to evaluate corkendall(x) vs num cols in xNum cols (num rows = 1000)Seconds \ No newline at end of file diff --git a/plots/plot_24.svg b/plots/threading_with_too_much_allocation.svg similarity index 100% rename from plots/plot_24.svg rename to plots/threading_with_too_much_allocation.svg diff --git a/scripts/performancetests.jl b/scripts/performancetests.jl index 72946e9..6644dcb 100644 --- a/scripts/performancetests.jl +++ b/scripts/performancetests.jl @@ -376,7 +376,7 @@ function how_scaleable(fns, nr::Integer, ncs::Vector{<:Integer}, end plot([ - scatter(x=ncs, y=datatoplot[:, i], mode="line", name=fullnameof(fns[i])) for i in 1:length(fns)], Layout(; title="Time to evaluate corkendall(x) vs num cols in x", + scatter(x=ncs, y=datatoplot[:, i], mode="line", name=fullnameof(fns[i])) for i in 1:length(fns)], Layout(; title="Time to evaluate corkendall(x) vs num cols in x ($(Threads.nthreads()) threads)", xaxis=attr(title="Num cols (num rows = $nr)", type="log"), yaxis=attr(title="Seconds", type="log"))) end \ No newline at end of file diff --git a/src/corkendall_experimental2.jl b/src/corkendall_experimental2.jl index 4c48b2d..39ef629 100644 --- a/src/corkendall_experimental2.jl +++ b/src/corkendall_experimental2.jl @@ -66,17 +66,14 @@ function corkendall(x::RoMVector, y::RoMMatrix; skipmissing::Symbol=:none) end """ - duplicate(x) + duplicate(x, n) -Construct a vector with `Threads.nthreads()` elements, each a copy of `x`. +Construct a vector with `n` copies of `x`. """ -function duplicate(x,n) +function duplicate(x, n) [copy(x) for _ in 1:n] end - - - function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:none) where {T,U} symmetric = x === y if size(x, 1) != size(y, 1) @@ -101,8 +98,9 @@ function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:non end C = ones(Float64, nr, nc) - ndups = min(Threads.nthreads(), symmetric ? nr - 1 : nr) - use_atomic = ndups < Threads.nthreads() + #Avoid unnecessary allocation when nthreads is large but output matrix is small + n_duplicates = min(Threads.nthreads(), symmetric ? nr - 1 : nr) + use_atomic = n_duplicates < Threads.nthreads() if use_atomic a = Threads.Atomic{Int}(1) @@ -110,19 +108,20 @@ function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:non #= Create scratch vectors so that threaded code can be non-allocating. One vector per thread to avoid cross-talk between threads.=# - - scratchyvectors = duplicate(similar(y, m),ndups) - ycolis = duplicate(similar(y, m),ndups) - xcoljsorteds = duplicate(similar(x, m),ndups) - permxs = duplicate(zeros(Int, m),ndups) - txs = duplicate(Vector{T}(undef, m),ndups) - tys = duplicate(Vector{U}(undef, m),ndups) - sortyspaces = duplicate(Vector{U}(undef, m),ndups) + scratchyvectors = duplicate(similar(y, m), n_duplicates) + ycolis = duplicate(similar(y, m), n_duplicates) + xcoljsorteds = duplicate(similar(x, m), n_duplicates) + permxs = duplicate(zeros(Int, m), n_duplicates) + txs = duplicate(Vector{T}(undef, m), n_duplicates) + tys = duplicate(Vector{U}(undef, m), n_duplicates) + sortyspaces = duplicate(Vector{U}(undef, m), n_duplicates) Threads.@threads for j = (symmetric ? 2 : 1):nr if use_atomic id = Threads.atomic_add!(a, 1)[] + #=Check that threads are using distinct scratch vectors=# + @assert permxs[id][1] == 0 else id = Threads.threadid() end diff --git a/test/rankcorr.jl b/test/rankcorr.jl index 1c473be..f7e2255 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -28,7 +28,7 @@ x2 = x[:, 2] y = Y[:, 1] # corkendall and friends -for f in (KendallTau.corkendall, corkendall_naive) +for f in (KendallTau.corkendall, corkendall_naive,KendallTau.Experimental.corkendall,KendallTau.Experimental2.corkendall) println("f = $(Base.parentmodule(f)).$(Base.nameof(f))") # Check error, handling of NaN, Inf etc From 90f708eaa226d5c11e06033a38dcea5520fe7d41 Mon Sep 17 00:00:00 2001 From: Philip Swannell <18028484+PGS62@users.noreply.github.com> Date: Fri, 17 Feb 2023 12:17:13 +0000 Subject: [PATCH 3/3] Ready to merge in to main branch --- src/KendallTau.jl | 9 - src/corkendall.jl | 50 ++-- src/corkendall_experimental.jl | 469 ------------------------------- src/corkendall_experimental2.jl | 480 -------------------------------- test/rankcorr.jl | 2 +- 5 files changed, 28 insertions(+), 982 deletions(-) delete mode 100644 src/corkendall_experimental.jl delete mode 100644 src/corkendall_experimental2.jl diff --git a/src/KendallTau.jl b/src/KendallTau.jl index f73d882..a8fb998 100644 --- a/src/KendallTau.jl +++ b/src/KendallTau.jl @@ -3,15 +3,6 @@ module KendallTau include("corkendall.jl") include("corkendall_fromfile.jl") -module Experimental -include("corkendall_experimental.jl") -end - -module Experimental2 -include("corkendall_experimental2.jl") -end - - export corkendall, corkendall_fromfile end \ No newline at end of file diff --git a/src/corkendall.jl b/src/corkendall.jl index f01dd24..42e218a 100644 --- a/src/corkendall.jl +++ b/src/corkendall.jl @@ -9,15 +9,12 @@ const RoMVector{T<:Real} = AbstractVector{<:Union{T,Missing}} const RoMMatrix{T<:Real} = AbstractMatrix{<:Union{T,Missing}} #= -TODO 16 Feb 2023 +TODO 17 Feb 2023 1) Should the default value of skipmissing be :none or :pairwise? :none forces the user to - address question of how missings should be handled, but at least for REPL use, it's + address the question of how missings should be handled, but at least for REPL use, it's rather inconvenient. -2) KendallTau.corkendall currently slower than StatsBase.corkendall for very small - vector\matrix output(4x4 or smaller, assuming vector length 1000) , presumably thanks to - overhead of threading and populating the "scratch-space" vectors. Does this matter? -3) Ask for code review? -4) Make PR to StatsBase? +2) Ask for code review? +3) Make PR to StatsBase? =# """ @@ -65,14 +62,6 @@ function corkendall(x::RoMVector, y::RoMMatrix; skipmissing::Symbol=:none) return (corkendall(reshape(x, (length(x), 1)), y; skipmissing)) end -""" - duplicate(x) - -Construct a vector with `Threads.nthreads()` elements, each a copy of `x`. -""" -function duplicate(x) - [copy(x) for _ in 1:Threads.nthreads()] -end function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:none) where {T,U} symmetric = x === y @@ -98,20 +87,35 @@ function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:non end C = ones(Float64, nr, nc) + #Avoid unnecessary allocation when nthreads is large but output matrix is small. + n_duplicates = min(Threads.nthreads(), symmetric ? nr - 1 : nr) + use_atomic = n_duplicates < Threads.nthreads() + + if use_atomic + a = Threads.Atomic{Int}(1) + end #= Create scratch vectors so that threaded code can be non-allocating. One vector per thread to avoid cross-talk between threads.=# - scratchyvectors = duplicate(similar(y, m)) - ycolis = duplicate(similar(y, m)) - xcoljsorteds = duplicate(similar(x, m)) - permxs = duplicate(zeros(Int, m)) - txs = duplicate(Vector{T}(undef, m)) - tys = duplicate(Vector{U}(undef, m)) - sortyspaces = duplicate(Vector{U}(undef, m)) + duplicate(x, n) = [copy(x) for _ in 1:n] + scratchyvectors = duplicate(similar(y, m), n_duplicates) + ycolis = duplicate(similar(y, m), n_duplicates) + xcoljsorteds = duplicate(similar(x, m), n_duplicates) + permxs = duplicate(zeros(Int, m), n_duplicates) + txs = duplicate(Vector{T}(undef, m), n_duplicates) + tys = duplicate(Vector{U}(undef, m), n_duplicates) + sortyspaces = duplicate(Vector{U}(undef, m), n_duplicates) Threads.@threads for j = (symmetric ? 2 : 1):nr - id = Threads.threadid() + if use_atomic + id = Threads.atomic_add!(a, 1)[] + #=Check that threads are using distinct scratch vectors=# + @assert permxs[id][1] == 0 + else + id = Threads.threadid() + end + scratchyvector = scratchyvectors[id] sortyspace = sortyspaces[id] ycoli = ycolis[id] diff --git a/src/corkendall_experimental.jl b/src/corkendall_experimental.jl deleted file mode 100644 index 620204a..0000000 --- a/src/corkendall_experimental.jl +++ /dev/null @@ -1,469 +0,0 @@ -####################################### -# -# Kendall correlation -# -####################################### - -#RoM stands for "Real or Missing" -const RoMVector{T<:Real} = AbstractVector{<:Union{T,Missing}} -const RoMMatrix{T<:Real} = AbstractMatrix{<:Union{T,Missing}} - -#= -TODO 16 Feb 2023 -1) Should the default value of skipmissing be :none or :pairwise? :none forces the user to - address question of how missings should be handled, but at least for REPL use, it's - rather inconvenient. -2) KendallTau.corkendall currently slower than StatsBase.corkendall for very small - vector\matrix output(4x4 or smaller, assuming vector length 1000) , presumably thanks to - overhead of threading and populating the "scratch-space" vectors. Does this matter? -3) Ask for code review? -4) Make PR to StatsBase? -=# - -""" - corkendall(x, y=x; skipmissing::Symbol=:none) - -Compute Kendall's rank correlation coefficient, τ. `x` and `y` must both be either -vectors or matrices, with elements that are either real numbers or missing values. - -# Keyword argument - -- `skipmissing::Symbol=:none`: if `:none`, missing values in either `x` or `y` - cause the function to raise an error. Use `:pairwise` to skip entries with a missing - value in either of the two vectors used to calculate (an element of) the return. Use - `:listwise` to skip entries where a missing value appears anywhere in a given row of `x` - or `y`; note that this might drop a high proportion of entries. -""" -function corkendall(x::RoMVector{T}, y::RoMVector{U}; skipmissing::Symbol=:none) where {T,U} - - length(x) == length(y) || throw(DimensionMismatch("x and y have inconsistent dimensions")) - - x, y = handlelistwise(x, y, skipmissing) - - if x isa Vector{Missing} || y isa Vector{Missing} - return (NaN) - end - - x = copy(x) - y = copy(y) - if missing isa eltype(x) || missing isa eltype(y) - x, y = handlepairwise(x, y) - end - permx = sortperm(x) - permute!(x, permx) - - return (corkendall_sorted!(x, y, permx, similar(y), similar(y), T[], U[])) -end - -#= Function returns a vector in this case, inconsistent with with Statistics.cor and -StatsBase.corspearman. Fixing that is a breaking change. =# -function corkendall(x::RoMMatrix, y::RoMVector; skipmissing::Symbol=:none) - return (vec(corkendall(x, reshape(y, (length(y), 1)); skipmissing))) -end - -function corkendall(x::RoMVector, y::RoMMatrix; skipmissing::Symbol=:none) - return (corkendall(reshape(x, (length(x), 1)), y; skipmissing)) -end - -function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:none) where {T,U} - symmetric = x === y - if size(x, 1) != size(y, 1) - throw(DimensionMismatch("x and y have inconsistent dimensions")) - end - - #Swap x and y for more efficient threaded loop. - if size(x, 2) < size(y, 2) - return (collect(transpose(corkendall(y, x; skipmissing)))) - end - - x, y = handlelistwise(x, y, skipmissing) - m, nr = size(x) - nc = size(y, 2) - - if x isa Matrix{Missing} || y isa Matrix{Missing} - if symmetric - return (ifelse.((1:nr) .== (1:nc)', 1.0, NaN)) - else - return (fill(NaN, nr, nc)) - end - end - - C = ones(Float64, nr, nc) - ndups = min(Threads.nthreads(), symmetric ? nr - 1 : nr) - use_atomic = ndups < Threads.nthreads() - - if use_atomic - a = Threads.Atomic{Int}(1) - end - - #= Create scratch vectors so that threaded code can be non-allocating. One vector per - thread to avoid cross-talk between threads.=# - - scratchyvectors = similar(y, m, ndups) - ycolis = similar(y, m, ndups) - xcoljsorteds = similar(x, m, ndups) - permxs = zeros(Int, m, ndups) - txs = Array{T}(undef, m, ndups) - tys = Array{U}(undef, m, ndups) - sortyspaces = Array{U}(undef, m, ndups) - - Threads.@threads for j = (symmetric ? 2 : 1):nr - - if use_atomic - id = Threads.atomic_add!(a, 1)[] - else - id = Threads.threadid() - end - - scratchyvector = view(scratchyvectors, :, id) - sortyspace = view(sortyspaces, :, id) - ycoli = view(ycolis, :, id) - xcoljsorted = view(xcoljsorteds, :, id) - permx = view(permxs, :, id) - tx = view(txs, :, id) - ty = view(tys, :, id) - - sortperm!(permx, view(x, :, j)) - @inbounds for k in eachindex(xcoljsorted) - xcoljsorted[k] = x[permx[k], j] - end - - for i = 1:(symmetric ? j - 1 : nc) - ycoli .= view(y, :, i) - C[j, i] = corkendall_sorted!(xcoljsorted, ycoli, permx, scratchyvector, - sortyspace, tx, ty) - symmetric && (C[i, j] = C[j, i]) - end - end - return C -end - -# Auxiliary functions for Kendall's rank correlation - -# Knight, William R. “A Computer Method for Calculating Kendall's Tau with Ungrouped Data.” -# Journal of the American Statistical Association, vol. 61, no. 314, 1966, pp. 436–439. -# JSTOR, www.jstor.org/stable/2282833. -""" - corkendall_sorted!(sortedx::RoMVector{T}, y::RoMVector{U}, - permx::AbstractVector{<:Integer}, scratchyvector::RoMVector, sortyspace::RoMVector, - tx::AbstractVector{T}, ty::AbstractVector{U}) where {T,U} - -Kendall correlation between two vectors but this function omits the initial sorting of -the first argument. So calculating Kendall correlation between `x` and `y` is a two stage -process: a) sort `x` to get `sortedx`; b) call this function on `sortedx` and `y`, with -subsequent arguments being: -- `permx::AbstractVector{<:Integer}`: the permutation that achieved the sorting of `x` to - yield `sortedx`. -- `scratchyvector::RoMVector`: a vector of the same element type and length as `y`; used - to permute `y` without allocation. -- `sortyspace::RoMVector`: a vector of the same element type and length as `y`; used - (in the call to `merge_sort!`) to avoid allocations. -- `tx, ty`: vectors of the same length as `x` and `y` whose element types match the types - of the non-missing elements of `x` and `y` respectively; used (in the call to - `handlepairwise!`) to avoid allocations. -""" -function corkendall_sorted!(sortedx::RoMVector{T}, y::RoMVector{U}, - permx::AbstractVector{<:Integer}, scratchyvector::RoMVector, sortyspace::RoMVector, - tx::AbstractVector{T}, ty::AbstractVector{U}) where {T,U} - - @inbounds for i in eachindex(y) - scratchyvector[i] = y[permx[i]] - end - if missing isa eltype(sortedx) || missing isa eltype(scratchyvector) - sortedx, scratchyvector = handlepairwise!(sortedx, scratchyvector, tx, ty) - end - length(sortedx) >= 2 || return (NaN) - - shuffledy = scratchyvector - - if any(isnan, sortedx) || any(isnan, shuffledy) - return NaN - end - n = length(sortedx) - - # Use widen to avoid overflows on both 32bit and 64bit - npairs = div(widen(n) * (n - 1), 2) - ntiesx = ndoubleties = nswaps = widen(0) - k = 0 - - @inbounds for i = 2:n - if sortedx[i-1] == sortedx[i] - k += 1 - elseif k > 0 - #= - Sort the corresponding chunk of shuffledy, so the rows of hcat(sortedx,shuffledy) - are sorted first on sortedx, then (where sortedx values are tied) on shuffledy. - Hence double ties can be counted by calling countties. - =# - sort!(view(shuffledy, (i-k-1):(i-1))) - ntiesx += div(widen(k) * (k + 1), 2) # Must use wide integers here - ndoubleties += countties(shuffledy, i - k - 1, i - 1) - k = 0 - end - end - if k > 0 - sort!(view(shuffledy, (n-k):n)) - ntiesx += div(widen(k) * (k + 1), 2) - ndoubleties += countties(shuffledy, n - k, n) - end - - nswaps = merge_sort!(shuffledy, 1, n, sortyspace) - ntiesy = countties(shuffledy, 1, n) - - # Calls to float below prevent possible overflow errors when - # length(sortedx) exceeds 77_936 (32 bit) or 5_107_605_667 (64 bit) - (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) / - sqrt(float(npairs - ntiesx) * float(npairs - ntiesy)) -end - -""" - countties(x::AbstractVector{<:Real}, lo::Integer, hi::Integer) - -Return the number of ties within `x[lo:hi]`. Assumes `x` is sorted. -""" -function countties(x::AbstractVector{<:Real}, lo::Integer, hi::Integer) - # Use of widen below prevents possible overflow errors when - # length(x) exceeds 2^16 (32 bit) or 2^32 (64 bit) - thistiecount = result = widen(0) - checkbounds(x, lo:hi) - @inbounds for i = (lo+1):hi - if x[i] == x[i-1] - thistiecount += 1 - elseif thistiecount > 0 - result += div(thistiecount * (thistiecount + 1), 2) - thistiecount = widen(0) - end - end - - if thistiecount > 0 - result += div(thistiecount * (thistiecount + 1), 2) - end - result -end - -# Tests appear to show that a value of 64 is optimal, -# but note that the equivalent constant in base/sort.jl is 20. -const SMALL_THRESHOLD = 64 - -# merge_sort! copied from Julia Base -# (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) -""" - merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, - t::AbstractVector=similar(v, 0)) - -Mutates `v` by sorting elements `x[lo:hi]` using the merge sort algorithm. -This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort -distance. -""" -function merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, - t::AbstractVector=similar(v, 0)) - # Use of widen below prevents possible overflow errors when - # length(v) exceeds 2^16 (32 bit) or 2^32 (64 bit) - nswaps = widen(0) - @inbounds if lo < hi - hi - lo <= SMALL_THRESHOLD && return insertion_sort!(v, lo, hi) - - m = midpoint(lo, hi) - (length(t) < m - lo + 1) && resize!(t, m - lo + 1) - - nswaps = merge_sort!(v, lo, m, t) - nswaps += merge_sort!(v, m + 1, hi, t) - - i, j = 1, lo - while j <= m - t[i] = v[j] - i += 1 - j += 1 - end - - i, k = 1, lo - while k < j <= hi - if v[j] < t[i] - v[k] = v[j] - j += 1 - nswaps += m - lo + 1 - (i - 1) - else - v[k] = t[i] - i += 1 - end - k += 1 - end - while k < j - v[k] = t[i] - k += 1 - i += 1 - end - end - return nswaps -end - -# insertion_sort! and midpoint copied from Julia Base -# (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) -midpoint(lo::T, hi::T) where {T<:Integer} = lo + ((hi - lo) >>> 0x01) -midpoint(lo::Integer, hi::Integer) = midpoint(promote(lo, hi)...) - -""" - insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) - -Mutates `v` by sorting elements `x[lo:hi]` using the insertion sort algorithm. -This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort -distance. -""" -function insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) - if lo == hi - return widen(0) - end - nswaps = widen(0) - @inbounds for i = lo+1:hi - j = i - x = v[i] - while j > lo - if x < v[j-1] - nswaps += 1 - v[j] = v[j-1] - j -= 1 - continue - end - break - end - v[j] = x - end - return nswaps -end - -""" - handlepairwise(x::RoMVector{T}, y::RoMVector{U}) where {T,U} - -Return a pair `(a,b)`, filtered copies of `x` and `y`, in which elements `x[i]` and `y[i]` -are filtered out if `ismissing(x[i])||ismissing(y[i])`. -""" -function handlepairwise(x::RoMVector{T}, y::RoMVector{U}) where {T,U} - - n = length(x) - a = Vector{T}(undef, n) - b = Vector{U}(undef, n) - j::Int = 0 - - @inbounds for i in eachindex(x) - if !(ismissing(x[i]) || ismissing(y[i])) - j += 1 - a[j] = x[i] - b[j] = y[i] - end - end - - return (resize!(a, j), resize!(b, j)) -end - -""" - handlepairwise!(x::RoMVector{T}, y::RoMVector{U}, - tx::AbstractVector{T}, ty::AbstractVector{U}) where {T,U} - -Return a pair `(a,b)`, filtered copies of `(x,y)`, in which elements `x[i]` and -`y[i]` are filtered out if `ismissing(x[i])||ismissing(y[i])`. -""" -function handlepairwise!(x::RoMVector{T}, y::RoMVector{U}, - tx::AbstractVector{T}, ty::AbstractVector{U}) where {T,U} - - j = 0 - - @inbounds for i in eachindex(x) - if !(ismissing(x[i]) || ismissing(y[i])) - j += 1 - tx[j] = x[i] - ty[j] = y[i] - end - end - - return (view(tx, 1:j), view(ty, 1:j)) -end - -""" - handlelistwise(x::AbstractArray,y::AbstractArray,skipmissing::Symbol) - -If `skipmissing` is `:listwise` and `x` and `y` are both matrices then do listwise filtering -of `x` and `y`. Otherwise merely validate `skipmissing` argument. -""" -function handlelistwise(x::AbstractArray, y::AbstractArray, skipmissing::Symbol) - if skipmissing == :listwise - if x isa Matrix && y isa Matrix - return (handlelistwise(x, y)) - end - elseif skipmissing == :pairwise - elseif skipmissing == :none - if missing isa eltype(x) || missing isa eltype(y) - throw(ArgumentError("When missing is an allowed element type \ - then keyword argument skipmissing must be either\ - `:pairwise` or `:listwise`, but got `:$skipmissing`")) - end - else - if missing isa eltype(x) || missing isa eltype(y) - throw(ArgumentError("keyword argument skipmissing must be either \ - `:pairwise` or `:listwise`, but got `:$skipmissing`")) - else - throw(ArgumentError("keyword argument skipmissing must be either \ - `:pairwise`, `:listwise` or `:none` but got \ - `:$skipmissing`")) - end - end - return (x, y) -end - -""" - handlelistwise(x::RoMMatrix{T}, y::RoMMatrix{U}) where {T,U} - -Return a pair `(a,b)`, filtered copies of `(x,y)`, in which the rows `x[i,:]` and -`y[i,:]` are both filtered out if `any(ismissing,x[i,:])||any(ismissing,y[i,:])`. -""" -function handlelistwise(x::RoMMatrix{T}, y::RoMMatrix{U}) where {T,U} - - nrx, ncx = size(x) - nry, ncy = size(y) - - nrx == nry || throw(DimensionMismatch("x and y have inconsistent dimensions")) - chooser = fill(true, nrx) - - nrout = nrx - @inbounds for i = 1:nrx - for j = 1:ncx - if ismissing(x[i, j]) - chooser[i] = false - nrout -= 1 - break - end - end - if chooser[i] - for j = 1:ncy - if ismissing(y[i, j]) - chooser[i] = false - nrout -= 1 - break - end - end - end - end - - a = Matrix{T}(undef, nrout, ncx) - @inbounds for j = 1:ncx - k = 0 - for i = 1:nrx - if chooser[i] - k += 1 - a[k, j] = x[i, j] - end - end - end - - b = Matrix{U}(undef, nrout, ncy) - @inbounds for j = 1:ncy - k = 0 - for i = 1:nrx - if chooser[i] - k += 1 - b[k, j] = y[i, j] - end - end - end - - return (a, b) -end \ No newline at end of file diff --git a/src/corkendall_experimental2.jl b/src/corkendall_experimental2.jl deleted file mode 100644 index 39ef629..0000000 --- a/src/corkendall_experimental2.jl +++ /dev/null @@ -1,480 +0,0 @@ -####################################### -# -# Kendall correlation -# -####################################### - -#RoM stands for "Real or Missing" -const RoMVector{T<:Real} = AbstractVector{<:Union{T,Missing}} -const RoMMatrix{T<:Real} = AbstractMatrix{<:Union{T,Missing}} - -#= -TODO 16 Feb 2023 -1) Should the default value of skipmissing be :none or :pairwise? :none forces the user to - address question of how missings should be handled, but at least for REPL use, it's - rather inconvenient. -2) KendallTau.corkendall currently slower than StatsBase.corkendall for very small - vector\matrix output(4x4 or smaller, assuming vector length 1000) , presumably thanks to - overhead of threading and populating the "scratch-space" vectors. Does this matter? -3) Ask for code review? -4) Make PR to StatsBase? -=# - -""" - corkendall(x, y=x; skipmissing::Symbol=:none) - -Compute Kendall's rank correlation coefficient, τ. `x` and `y` must both be either -vectors or matrices, with elements that are either real numbers or missing values. - -# Keyword argument - -- `skipmissing::Symbol=:none`: if `:none`, missing values in either `x` or `y` - cause the function to raise an error. Use `:pairwise` to skip entries with a missing - value in either of the two vectors used to calculate (an element of) the return. Use - `:listwise` to skip entries where a missing value appears anywhere in a given row of `x` - or `y`; note that this might drop a high proportion of entries. -""" -function corkendall(x::RoMVector{T}, y::RoMVector{U}; skipmissing::Symbol=:none) where {T,U} - - length(x) == length(y) || throw(DimensionMismatch("x and y have inconsistent dimensions")) - - x, y = handlelistwise(x, y, skipmissing) - - if x isa Vector{Missing} || y isa Vector{Missing} - return (NaN) - end - - x = copy(x) - y = copy(y) - if missing isa eltype(x) || missing isa eltype(y) - x, y = handlepairwise(x, y) - end - permx = sortperm(x) - permute!(x, permx) - - return (corkendall_sorted!(x, y, permx, similar(y), similar(y), T[], U[])) -end - -#= Function returns a vector in this case, inconsistent with with Statistics.cor and -StatsBase.corspearman. Fixing that is a breaking change. =# -function corkendall(x::RoMMatrix, y::RoMVector; skipmissing::Symbol=:none) - return (vec(corkendall(x, reshape(y, (length(y), 1)); skipmissing))) -end - -function corkendall(x::RoMVector, y::RoMMatrix; skipmissing::Symbol=:none) - return (corkendall(reshape(x, (length(x), 1)), y; skipmissing)) -end - -""" - duplicate(x, n) - -Construct a vector with `n` copies of `x`. -""" -function duplicate(x, n) - [copy(x) for _ in 1:n] -end - -function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:none) where {T,U} - symmetric = x === y - if size(x, 1) != size(y, 1) - throw(DimensionMismatch("x and y have inconsistent dimensions")) - end - - #Swap x and y for more efficient threaded loop. - if size(x, 2) < size(y, 2) - return (collect(transpose(corkendall(y, x; skipmissing)))) - end - - x, y = handlelistwise(x, y, skipmissing) - m, nr = size(x) - nc = size(y, 2) - - if x isa Matrix{Missing} || y isa Matrix{Missing} - if symmetric - return (ifelse.((1:nr) .== (1:nc)', 1.0, NaN)) - else - return (fill(NaN, nr, nc)) - end - end - - C = ones(Float64, nr, nc) - #Avoid unnecessary allocation when nthreads is large but output matrix is small - n_duplicates = min(Threads.nthreads(), symmetric ? nr - 1 : nr) - use_atomic = n_duplicates < Threads.nthreads() - - if use_atomic - a = Threads.Atomic{Int}(1) - end - - #= Create scratch vectors so that threaded code can be non-allocating. One vector per - thread to avoid cross-talk between threads.=# - scratchyvectors = duplicate(similar(y, m), n_duplicates) - ycolis = duplicate(similar(y, m), n_duplicates) - xcoljsorteds = duplicate(similar(x, m), n_duplicates) - permxs = duplicate(zeros(Int, m), n_duplicates) - txs = duplicate(Vector{T}(undef, m), n_duplicates) - tys = duplicate(Vector{U}(undef, m), n_duplicates) - sortyspaces = duplicate(Vector{U}(undef, m), n_duplicates) - - Threads.@threads for j = (symmetric ? 2 : 1):nr - - if use_atomic - id = Threads.atomic_add!(a, 1)[] - #=Check that threads are using distinct scratch vectors=# - @assert permxs[id][1] == 0 - else - id = Threads.threadid() - end - - scratchyvector = scratchyvectors[id] - sortyspace = sortyspaces[id] - ycoli = ycolis[id] - xcoljsorted = xcoljsorteds[id] - permx = permxs[id] - tx = txs[id] - ty = tys[id] - - sortperm!(permx, view(x, :, j)) - @inbounds for k in eachindex(xcoljsorted) - xcoljsorted[k] = x[permx[k], j] - end - - for i = 1:(symmetric ? j - 1 : nc) - ycoli .= view(y, :, i) - C[j, i] = corkendall_sorted!(xcoljsorted, ycoli, permx, scratchyvector, - sortyspace, tx, ty) - symmetric && (C[i, j] = C[j, i]) - end - end - return C -end - -# Auxiliary functions for Kendall's rank correlation - -# Knight, William R. “A Computer Method for Calculating Kendall's Tau with Ungrouped Data.” -# Journal of the American Statistical Association, vol. 61, no. 314, 1966, pp. 436–439. -# JSTOR, www.jstor.org/stable/2282833. -""" - corkendall_sorted!(sortedx::RoMVector{T}, y::RoMVector{U}, - permx::AbstractVector{<:Integer}, scratchyvector::RoMVector, sortyspace::RoMVector, - tx::AbstractVector{T}, ty::AbstractVector{U}) where {T,U} - -Kendall correlation between two vectors but this function omits the initial sorting of -the first argument. So calculating Kendall correlation between `x` and `y` is a two stage -process: a) sort `x` to get `sortedx`; b) call this function on `sortedx` and `y`, with -subsequent arguments being: -- `permx::AbstractVector{<:Integer}`: the permutation that achieved the sorting of `x` to - yield `sortedx`. -- `scratchyvector::RoMVector`: a vector of the same element type and length as `y`; used - to permute `y` without allocation. -- `sortyspace::RoMVector`: a vector of the same element type and length as `y`; used - (in the call to `merge_sort!`) to avoid allocations. -- `tx, ty`: vectors of the same length as `x` and `y` whose element types match the types - of the non-missing elements of `x` and `y` respectively; used (in the call to - `handlepairwise!`) to avoid allocations. -""" -function corkendall_sorted!(sortedx::RoMVector{T}, y::RoMVector{U}, - permx::AbstractVector{<:Integer}, scratchyvector::RoMVector, sortyspace::RoMVector, - tx::AbstractVector{T}, ty::AbstractVector{U}) where {T,U} - - @inbounds for i in eachindex(y) - scratchyvector[i] = y[permx[i]] - end - if missing isa eltype(sortedx) || missing isa eltype(scratchyvector) - sortedx, scratchyvector = handlepairwise!(sortedx, scratchyvector, tx, ty) - end - length(sortedx) >= 2 || return (NaN) - - shuffledy = scratchyvector - - if any(isnan, sortedx) || any(isnan, shuffledy) - return NaN - end - n = length(sortedx) - - # Use widen to avoid overflows on both 32bit and 64bit - npairs = div(widen(n) * (n - 1), 2) - ntiesx = ndoubleties = nswaps = widen(0) - k = 0 - - @inbounds for i = 2:n - if sortedx[i-1] == sortedx[i] - k += 1 - elseif k > 0 - #= - Sort the corresponding chunk of shuffledy, so the rows of hcat(sortedx,shuffledy) - are sorted first on sortedx, then (where sortedx values are tied) on shuffledy. - Hence double ties can be counted by calling countties. - =# - sort!(view(shuffledy, (i-k-1):(i-1))) - ntiesx += div(widen(k) * (k + 1), 2) # Must use wide integers here - ndoubleties += countties(shuffledy, i - k - 1, i - 1) - k = 0 - end - end - if k > 0 - sort!(view(shuffledy, (n-k):n)) - ntiesx += div(widen(k) * (k + 1), 2) - ndoubleties += countties(shuffledy, n - k, n) - end - - nswaps = merge_sort!(shuffledy, 1, n, sortyspace) - ntiesy = countties(shuffledy, 1, n) - - # Calls to float below prevent possible overflow errors when - # length(sortedx) exceeds 77_936 (32 bit) or 5_107_605_667 (64 bit) - (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) / - sqrt(float(npairs - ntiesx) * float(npairs - ntiesy)) -end - -""" - countties(x::AbstractVector{<:Real}, lo::Integer, hi::Integer) - -Return the number of ties within `x[lo:hi]`. Assumes `x` is sorted. -""" -function countties(x::AbstractVector{<:Real}, lo::Integer, hi::Integer) - # Use of widen below prevents possible overflow errors when - # length(x) exceeds 2^16 (32 bit) or 2^32 (64 bit) - thistiecount = result = widen(0) - checkbounds(x, lo:hi) - @inbounds for i = (lo+1):hi - if x[i] == x[i-1] - thistiecount += 1 - elseif thistiecount > 0 - result += div(thistiecount * (thistiecount + 1), 2) - thistiecount = widen(0) - end - end - - if thistiecount > 0 - result += div(thistiecount * (thistiecount + 1), 2) - end - result -end - -# Tests appear to show that a value of 64 is optimal, -# but note that the equivalent constant in base/sort.jl is 20. -const SMALL_THRESHOLD = 64 - -# merge_sort! copied from Julia Base -# (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) -""" - merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, - t::AbstractVector=similar(v, 0)) - -Mutates `v` by sorting elements `x[lo:hi]` using the merge sort algorithm. -This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort -distance. -""" -function merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, - t::AbstractVector=similar(v, 0)) - # Use of widen below prevents possible overflow errors when - # length(v) exceeds 2^16 (32 bit) or 2^32 (64 bit) - nswaps = widen(0) - @inbounds if lo < hi - hi - lo <= SMALL_THRESHOLD && return insertion_sort!(v, lo, hi) - - m = midpoint(lo, hi) - (length(t) < m - lo + 1) && resize!(t, m - lo + 1) - - nswaps = merge_sort!(v, lo, m, t) - nswaps += merge_sort!(v, m + 1, hi, t) - - i, j = 1, lo - while j <= m - t[i] = v[j] - i += 1 - j += 1 - end - - i, k = 1, lo - while k < j <= hi - if v[j] < t[i] - v[k] = v[j] - j += 1 - nswaps += m - lo + 1 - (i - 1) - else - v[k] = t[i] - i += 1 - end - k += 1 - end - while k < j - v[k] = t[i] - k += 1 - i += 1 - end - end - return nswaps -end - -# insertion_sort! and midpoint copied from Julia Base -# (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020) -midpoint(lo::T, hi::T) where {T<:Integer} = lo + ((hi - lo) >>> 0x01) -midpoint(lo::Integer, hi::Integer) = midpoint(promote(lo, hi)...) - -""" - insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) - -Mutates `v` by sorting elements `x[lo:hi]` using the insertion sort algorithm. -This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort -distance. -""" -function insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer) - if lo == hi - return widen(0) - end - nswaps = widen(0) - @inbounds for i = lo+1:hi - j = i - x = v[i] - while j > lo - if x < v[j-1] - nswaps += 1 - v[j] = v[j-1] - j -= 1 - continue - end - break - end - v[j] = x - end - return nswaps -end - -""" - handlepairwise(x::RoMVector{T}, y::RoMVector{U}) where {T,U} - -Return a pair `(a,b)`, filtered copies of `x` and `y`, in which elements `x[i]` and `y[i]` -are filtered out if `ismissing(x[i])||ismissing(y[i])`. -""" -function handlepairwise(x::RoMVector{T}, y::RoMVector{U}) where {T,U} - - n = length(x) - a = Vector{T}(undef, n) - b = Vector{U}(undef, n) - j::Int = 0 - - @inbounds for i in eachindex(x) - if !(ismissing(x[i]) || ismissing(y[i])) - j += 1 - a[j] = x[i] - b[j] = y[i] - end - end - - return (resize!(a, j), resize!(b, j)) -end - -""" - handlepairwise!(x::RoMVector{T}, y::RoMVector{U}, - tx::AbstractVector{T}, ty::AbstractVector{U}) where {T,U} - -Return a pair `(a,b)`, filtered copies of `(x,y)`, in which elements `x[i]` and -`y[i]` are filtered out if `ismissing(x[i])||ismissing(y[i])`. -""" -function handlepairwise!(x::RoMVector{T}, y::RoMVector{U}, - tx::AbstractVector{T}, ty::AbstractVector{U}) where {T,U} - - j = 0 - - @inbounds for i in eachindex(x) - if !(ismissing(x[i]) || ismissing(y[i])) - j += 1 - tx[j] = x[i] - ty[j] = y[i] - end - end - - return (view(tx, 1:j), view(ty, 1:j)) -end - -""" - handlelistwise(x::AbstractArray,y::AbstractArray,skipmissing::Symbol) - -If `skipmissing` is `:listwise` and `x` and `y` are both matrices then do listwise filtering -of `x` and `y`. Otherwise merely validate `skipmissing` argument. -""" -function handlelistwise(x::AbstractArray, y::AbstractArray, skipmissing::Symbol) - if skipmissing == :listwise - if x isa Matrix && y isa Matrix - return (handlelistwise(x, y)) - end - elseif skipmissing == :pairwise - elseif skipmissing == :none - if missing isa eltype(x) || missing isa eltype(y) - throw(ArgumentError("When missing is an allowed element type \ - then keyword argument skipmissing must be either\ - `:pairwise` or `:listwise`, but got `:$skipmissing`")) - end - else - if missing isa eltype(x) || missing isa eltype(y) - throw(ArgumentError("keyword argument skipmissing must be either \ - `:pairwise` or `:listwise`, but got `:$skipmissing`")) - else - throw(ArgumentError("keyword argument skipmissing must be either \ - `:pairwise`, `:listwise` or `:none` but got \ - `:$skipmissing`")) - end - end - return (x, y) -end - -""" - handlelistwise(x::RoMMatrix{T}, y::RoMMatrix{U}) where {T,U} - -Return a pair `(a,b)`, filtered copies of `(x,y)`, in which the rows `x[i,:]` and -`y[i,:]` are both filtered out if `any(ismissing,x[i,:])||any(ismissing,y[i,:])`. -""" -function handlelistwise(x::RoMMatrix{T}, y::RoMMatrix{U}) where {T,U} - - nrx, ncx = size(x) - nry, ncy = size(y) - - nrx == nry || throw(DimensionMismatch("x and y have inconsistent dimensions")) - chooser = fill(true, nrx) - - nrout = nrx - @inbounds for i = 1:nrx - for j = 1:ncx - if ismissing(x[i, j]) - chooser[i] = false - nrout -= 1 - break - end - end - if chooser[i] - for j = 1:ncy - if ismissing(y[i, j]) - chooser[i] = false - nrout -= 1 - break - end - end - end - end - - a = Matrix{T}(undef, nrout, ncx) - @inbounds for j = 1:ncx - k = 0 - for i = 1:nrx - if chooser[i] - k += 1 - a[k, j] = x[i, j] - end - end - end - - b = Matrix{U}(undef, nrout, ncy) - @inbounds for j = 1:ncy - k = 0 - for i = 1:nrx - if chooser[i] - k += 1 - b[k, j] = y[i, j] - end - end - end - - return (a, b) -end \ No newline at end of file diff --git a/test/rankcorr.jl b/test/rankcorr.jl index f7e2255..1c473be 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -28,7 +28,7 @@ x2 = x[:, 2] y = Y[:, 1] # corkendall and friends -for f in (KendallTau.corkendall, corkendall_naive,KendallTau.Experimental.corkendall,KendallTau.Experimental2.corkendall) +for f in (KendallTau.corkendall, corkendall_naive) println("f = $(Base.parentmodule(f)).$(Base.nameof(f))") # Check error, handling of NaN, Inf etc