Skip to content

Commit

Permalink
changed how corkendall handles skipmissing = :none
Browse files Browse the repository at this point in the history
  • Loading branch information
PGS62 committed Feb 4, 2024
1 parent 82c3388 commit 75c8701
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 75 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

# KendallTau.jl

This (unregistered) Julia package exposes a function `corkendall` that is a candidate to replace the function of the same name in the StatsBase package.
This (unregistered) Julia package exposes a function `corkendall` that is a candidate to replace the function of the same name in the StatsBase package.

The package also contains a function `speedtest` that prints a comparison of the execution speed of two (or more) implementations of Kendall Tau. `speedtest` demonstrates that the new version of `corkendall` is about ~~five~~ ~~six~~ seven times faster than the existing StatsBase version. See [# 634](https://github.com/JuliaStats/StatsBase.jl/issues/634).

Expand Down Expand Up @@ -50,7 +50,7 @@ julia> @time res_sb = StatsBase.corkendall(x);

julia> @time res_kt = KendallTau.corkendall(x);
1.780313 seconds (2.28 k allocations: 8.876 MiB, 0.14% gc time)

julia> 21.393938/1.780313
12.016953198679108

Expand Down Expand Up @@ -106,5 +106,5 @@ Environment:



Philip Swannell
Philip Swannell
20 February 2023
8 changes: 4 additions & 4 deletions scripts/performancetests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -395,16 +395,16 @@ function how_scaleable(fns, nr::Integer, ncs::Vector{<:Integer},
=#

#Syntax for Plots.jl
#cheatsheet: https://www.matecdev.com/posts/julia-plotting-linestyle-markers.html
#cheatsheet: https://www.matecdev.com/posts/julia-plotting-linestyle-markers.html
plot(ncs, datatoplot,
title=title,
xlabel="Num cols (num rows = $nr)",
ylabel="Seconds",
label=hcat([fullnameof(fn) for fn in fns]...),
marker = :circle,
marker=:circle,
xaxis=:log,
yaxis=:log,
size = (1500,1000),
grid = true)
size=(1500, 1000),
grid=true)

end
100 changes: 46 additions & 54 deletions src/corkendall.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,12 @@ multiple threads if they are available.
# Keyword argument
- `skipmissing::Symbol=:none`: If `:none` (the default), when `missing` is an allowed
element type of either `x` or `y` the function returns an error. If `:pairwise`, when
either of the `i`th entries of the vectors used to calculate an element of the return is
`missing`, both entries are skipped. If `:listwise`, when any entry in the `i`th row
of `x` or the `i`th row of `y` is `missing` then all entries in the `i`th rows are
skipped; note that this might skip a high proportion of entries. Only allowed when `x`
or `y` is a matrix.
- `skipmissing::Symbol=:none`: If `:none` (the default), then `missing` entries in `x` or
`y` lead to `missing` entries in the return. If `:pairwise`, when either of the `i`th
entries of the vectors used to calculate an element of the return is `missing`, both
entries are skipped. If `:listwise`, when any entry in the `i`th row of `x` or the `i`th
row of `y` is `missing` then all entries in the `i`th rows are skipped; note that this
might skip a high proportion of entries. Only allowed when `x` or `y` is a matrix.
"""
function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:none) where {T,U}

Expand All @@ -73,15 +72,16 @@ function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:non
symmetric = x === y

missing_allowed = missing isa eltype(x) || missing isa eltype(y)
validate_skipmissing(skipmissing, missing_allowed, true)
skipmissing in [:none, :pairwise, :listwise] || throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise"))

# Degenerate case - U and/or T not defined.
if x isa Matrix{Missing} || y isa Matrix{Missing}
offdiag = missing_allowed && skipmissing == :none ? missing : NaN
nr, nc = size(x, 2), size(y, 2)
if symmetric
return ifelse.((1:nr) .== (1:nc)', 1.0, NaN)
return ifelse.((1:nr) .== (1:nc)', 1.0, offdiag)
else
return fill(NaN, nr, nc)
return fill(offdiag, nr, nc)
end
end

Expand All @@ -90,14 +90,21 @@ function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:non
return collect(transpose(corkendall(y, x; skipmissing)))
end

if missing_allowed && skipmissing == :listwise
x, y = handle_listwise!(x, y)
if missing_allowed
if skipmissing == :listwise
x, y = handle_listwise!(x, y)
end
end

m, nr = size(x)
nc = size(y, 2)

C = ones(Float64, nr, nc)
if missing_allowed && skipmissing == :none
C = ones(Union{Missing,Float64}, nr, nc)
else
C = ones(Float64, nr, nc)
end

# Avoid unnecessary allocation when nthreads is large but output matrix is small.
n_duplicates = min(Threads.nthreads(), symmetric ? nr - 1 : nr)

Expand Down Expand Up @@ -128,12 +135,10 @@ function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:non

if use_atomic
id = Threads.atomic_add!(a, 1)[]
# Check that threads are using distinct scratch vectors
@assert permxs[id][1] == 0
else
id = Threads.threadid()
end

scratch_py = scratch_pys[id]
scratch_sy = scratch_sys[id]
ycoli = ycolis[id]
Expand All @@ -149,7 +154,7 @@ function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:non

for i = 1:(symmetric ? j - 1 : nc)
ycoli .= view(y, :, i)
C[j, i] = corkendall_sorted!(sortedxcolj, ycoli, permx, scratch_py, scratch_sy, scratch_fx, scratch_fy)
C[j, i] = corkendall_kernel!(sortedxcolj, ycoli, permx, scratch_py, scratch_sy, scratch_fx, scratch_fy, skipmissing)
symmetric && (C[i, j] = C[j, i])
end
end
Expand All @@ -163,10 +168,14 @@ function corkendall(x::RoMVector{T}, y::RoMVector{U}; skipmissing::Symbol=:none)
length(x) == length(y) || throw(DimensionMismatch("x and y have inconsistent dimensions"))

missing_allowed = missing isa eltype(x) || missing isa eltype(y)
validate_skipmissing(skipmissing, missing_allowed, false)
skipmissing in [:none, :pairwise] || throw(ArgumentError("skipmissing must be one of :none or :pairwise"))

# Degenerate case - U and/or T not defined.
if x isa Vector{Missing} || y isa Vector{Missing}
if missing_allowed && skipmissing == :none
if any(ismissing, x) || any(ismissing, y)
return missing
end
elseif x isa Vector{Missing} || y isa Vector{Missing}
# Degenerate case - U and/or T not defined.
return NaN
end

Expand All @@ -179,7 +188,7 @@ function corkendall(x::RoMVector{T}, y::RoMVector{U}; skipmissing::Symbol=:none)
permx = sortperm(x)
permute!(x, permx)

return corkendall_sorted!(x, y, permx, similar(y), similar(y), similar(x, T), similar(y, U))
return corkendall_kernel!(x, y, permx, similar(y), similar(y), similar(x, T), similar(y, U), skipmissing)
end

#= corkendall returns a vector in this case, inconsistent with with Statistics.cor and
Expand All @@ -199,12 +208,12 @@ end
# Journal of the American Statistical Association, vol. 61, no. 314, 1966, pp. 436–439.
# JSTOR, www.jstor.org/stable/2282833.
"""
corkendall_sorted!(sortedx::RoMVector{T}, y::RoMVector{U},
permx::AbstractVector{<:Integer}, scratch_py::RoMVector{U},
scratch_sy::RoMVector{U}, scratch_fx::AbstractVector{T},
corkendall_kernel!(sortedx::RoMVector{T}, y::RoMVector{U},
permx::AbstractVector{<:Integer}, scratch_py::RoMVector{U},
scratch_sy::RoMVector{U}, scratch_fx::AbstractVector{T},
scratch_fy::AbstractVector{U}) where {T,U}
Kendall correlation between two vectors but omitting the initial sorting of the first
Kendall correlation between two vectors but omitting the initial sorting of the first
argument. Calculating Kendall correlation between `x` and `y` is thus a two stage process:
a) sort `x` to get `sortedx`; b) call this function on `sortedx` and `y`, with
subsequent arguments:
Expand All @@ -214,13 +223,21 @@ subsequent arguments:
- `scratch_fx, scratch_fy`: Vectors used to filter `missing`s from `x` and `y` without
allocation.
"""
function corkendall_sorted!(sortedx::RoMVector{T}, y::RoMVector{U},
permx::AbstractVector{<:Integer}, scratch_py::RoMVector{U},
scratch_sy::RoMVector{U}, scratch_fx::AbstractVector{T},
scratch_fy::AbstractVector{U}) where {T,U}
function corkendall_kernel!(sortedx::RoMVector{T}, y::RoMVector{U},
permx::AbstractVector{<:Integer}, scratch_py::RoMVector{U},
scratch_sy::RoMVector{U}, scratch_fx::AbstractVector{T},
scratch_fy::AbstractVector{U}, skipmissing::Symbol) where {T,U}

length(sortedx) >= 2 || return NaN

if skipmissing == :none
if missing isa eltype(sortedx) && any(ismissing, sortedx)
return (missing)
elseif missing isa eltype(y) && any(ismissing, y)
return (missing)
end
end

@inbounds for i in eachindex(y)
scratch_py[i] = y[permx[i]]
end
Expand Down Expand Up @@ -450,29 +467,4 @@ function handle_listwise!(x::RoMMatrix{T}, y::RoMMatrix{U}) where {T,U}
end
end
return view(a, 1:k, :), view(b, 1:k, :)
end

function validate_skipmissing(skipmissing::Symbol, missing_allowed::Bool, listwise_allowed::Bool)
if skipmissing == :listwise && listwise_allowed
elseif skipmissing == :pairwise
elseif missing_allowed
if listwise_allowed
throw(ArgumentError("When missing is an allowed element type \
skipmissing must be either :pairwise or :listwise, \
but got :$skipmissing"))
else
throw(ArgumentError("When missing is an allowed element type \
skipmissing must be :pairwise, but got :$skipmissing"))
end

elseif skipmissing == :none
else
if listwise_allowed
throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise, \
but got :$skipmissing"))
else
throw(ArgumentError("skipmissing must be either :none or :pairwise, \
but got :$skipmissing"))
end
end
end
24 changes: 12 additions & 12 deletions src/corkendall_fromfile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -172,22 +172,22 @@ function csvread2(filename::String, ignorefirstrow::Bool, ignorefirstcol::Bool;
missingstring::Union{Nothing,String,Vector{String}}="")


contents = DelimitedFiles.readdlm(filename,',')
contents = ifelse.(contents .== missingstring,missing,contents)
contents = DelimitedFiles.readdlm(filename, ',')
contents = ifelse.(contents .== missingstring, missing, contents)
firstrow = ignorefirstrow ? 2 : 1
firstcol = ignorefirstcol ? 2 : 1
if firstrow==firstcol==1
data=contents
if firstrow == firstcol == 1
data = contents
else
data = contents[firstrow:end,firstcol:end]
data = contents[firstrow:end, firstcol:end]
end
data = identity.(data)
data = identity.(data)
if firstrow == 0
names = "Column".* string.(1:(size(data,2)))
names = "Column" .* string.(1:(size(data, 2)))
else
names = Symbol.(contents[1,firstcol:end])
names = Symbol.(contents[1, firstcol:end])
end
return(data,names)
return (data, names)

end

Expand All @@ -198,10 +198,10 @@ function testcsvread2()
#ISDA's calculation of KendallTau
isda_results_file = joinpath(base_folder, raw"9_correlations\eq_delta-kendall_recent_1-10d.csv")

@time res1=csvread2(isda_results_file,true,true,missingstring = "NA")
@time res2=csvread(isda_results_file,true,true,missingstring = "NA")
@time res1 = csvread2(isda_results_file, true, true, missingstring="NA")
@time res2 = csvread(isda_results_file, true, true, missingstring="NA")

res1==res2
res1 == res2

end

Expand Down
8 changes: 6 additions & 2 deletions test/rankcorr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,14 @@ include("compare_implementations.jl")
@test isequal(f(xmm, skipmissing=:pairwise), [1.0 NaN; NaN 1.0])
@test isequal(f(xmm, copy(xmm), skipmissing=:pairwise), [NaN NaN; NaN NaN])

# Test not-correct values of skipmissing
if !(f === corkendall_naive)
@test_throws ArgumentError f(Xm)
#TODO fix corkendall_naive to cope with skipmissing=:none
@test ismissing(f(1:5, xm, skipmissing=:none))
@test ismissing(f(1:5, xm, skipmissing=:none))
@test isequal(f(xmm, skipmissing=:none), [1.0 missing; missing 1.0])
@test isequal(f(xmm, copy(xmm), skipmissing=:none), [missing missing; missing missing])
end

@test_throws ArgumentError f(x; skipmissing=:foo)
@test_throws ArgumentError f(Xm; skipmissing=:foo)

Expand Down

0 comments on commit 75c8701

Please sign in to comment.