From 75c8701741f1f71439b3f18f00c5b4941a239ce7 Mon Sep 17 00:00:00 2001 From: PGS62 Date: Sun, 4 Feb 2024 09:59:34 +0000 Subject: [PATCH] changed how corkendall handles skipmissing = :none --- README.md | 6 +-- scripts/performancetests.jl | 8 +-- src/corkendall.jl | 100 +++++++++++++++++------------------- src/corkendall_fromfile.jl | 24 ++++----- test/rankcorr.jl | 8 ++- 5 files changed, 71 insertions(+), 75 deletions(-) diff --git a/README.md b/README.md index 4c8e790..2cbb2b0 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ # KendallTau.jl -This (unregistered) Julia package exposes a function `corkendall` that is a candidate to replace the function of the same name in the StatsBase package. +This (unregistered) Julia package exposes a function `corkendall` that is a candidate to replace the function of the same name in the StatsBase package. The package also contains a function `speedtest` that prints a comparison of the execution speed of two (or more) implementations of Kendall Tau. `speedtest` demonstrates that the new version of `corkendall` is about ~~five~~ ~~six~~ seven times faster than the existing StatsBase version. See [# 634](https://github.com/JuliaStats/StatsBase.jl/issues/634). @@ -50,7 +50,7 @@ julia> @time res_sb = StatsBase.corkendall(x); julia> @time res_kt = KendallTau.corkendall(x); 1.780313 seconds (2.28 k allocations: 8.876 MiB, 0.14% gc time) - + julia> 21.393938/1.780313 12.016953198679108 @@ -106,5 +106,5 @@ Environment: -Philip Swannell +Philip Swannell 20 February 2023 diff --git a/scripts/performancetests.jl b/scripts/performancetests.jl index 2e6c633..8d80fef 100644 --- a/scripts/performancetests.jl +++ b/scripts/performancetests.jl @@ -395,16 +395,16 @@ function how_scaleable(fns, nr::Integer, ncs::Vector{<:Integer}, =# #Syntax for Plots.jl - #cheatsheet: https://www.matecdev.com/posts/julia-plotting-linestyle-markers.html + #cheatsheet: https://www.matecdev.com/posts/julia-plotting-linestyle-markers.html plot(ncs, datatoplot, title=title, xlabel="Num cols (num rows = $nr)", ylabel="Seconds", label=hcat([fullnameof(fn) for fn in fns]...), - marker = :circle, + marker=:circle, xaxis=:log, yaxis=:log, - size = (1500,1000), - grid = true) + size=(1500, 1000), + grid=true) end \ No newline at end of file diff --git a/src/corkendall.jl b/src/corkendall.jl index 5e15640..febf6c9 100644 --- a/src/corkendall.jl +++ b/src/corkendall.jl @@ -56,13 +56,12 @@ multiple threads if they are available. # Keyword argument -- `skipmissing::Symbol=:none`: If `:none` (the default), when `missing` is an allowed - element type of either `x` or `y` the function returns an error. If `:pairwise`, when - either of the `i`th entries of the vectors used to calculate an element of the return is - `missing`, both entries are skipped. If `:listwise`, when any entry in the `i`th row - of `x` or the `i`th row of `y` is `missing` then all entries in the `i`th rows are - skipped; note that this might skip a high proportion of entries. Only allowed when `x` - or `y` is a matrix. +- `skipmissing::Symbol=:none`: If `:none` (the default), then `missing` entries in `x` or + `y` lead to `missing` entries in the return. If `:pairwise`, when either of the `i`th + entries of the vectors used to calculate an element of the return is `missing`, both + entries are skipped. If `:listwise`, when any entry in the `i`th row of `x` or the `i`th + row of `y` is `missing` then all entries in the `i`th rows are skipped; note that this + might skip a high proportion of entries. Only allowed when `x` or `y` is a matrix. """ function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:none) where {T,U} @@ -73,15 +72,16 @@ function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:non symmetric = x === y missing_allowed = missing isa eltype(x) || missing isa eltype(y) - validate_skipmissing(skipmissing, missing_allowed, true) + skipmissing in [:none, :pairwise, :listwise] || throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise")) # Degenerate case - U and/or T not defined. if x isa Matrix{Missing} || y isa Matrix{Missing} + offdiag = missing_allowed && skipmissing == :none ? missing : NaN nr, nc = size(x, 2), size(y, 2) if symmetric - return ifelse.((1:nr) .== (1:nc)', 1.0, NaN) + return ifelse.((1:nr) .== (1:nc)', 1.0, offdiag) else - return fill(NaN, nr, nc) + return fill(offdiag, nr, nc) end end @@ -90,14 +90,21 @@ function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:non return collect(transpose(corkendall(y, x; skipmissing))) end - if missing_allowed && skipmissing == :listwise - x, y = handle_listwise!(x, y) + if missing_allowed + if skipmissing == :listwise + x, y = handle_listwise!(x, y) + end end m, nr = size(x) nc = size(y, 2) - C = ones(Float64, nr, nc) + if missing_allowed && skipmissing == :none + C = ones(Union{Missing,Float64}, nr, nc) + else + C = ones(Float64, nr, nc) + end + # Avoid unnecessary allocation when nthreads is large but output matrix is small. n_duplicates = min(Threads.nthreads(), symmetric ? nr - 1 : nr) @@ -128,12 +135,10 @@ function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:non if use_atomic id = Threads.atomic_add!(a, 1)[] - # Check that threads are using distinct scratch vectors - @assert permxs[id][1] == 0 else id = Threads.threadid() end - + scratch_py = scratch_pys[id] scratch_sy = scratch_sys[id] ycoli = ycolis[id] @@ -149,7 +154,7 @@ function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:non for i = 1:(symmetric ? j - 1 : nc) ycoli .= view(y, :, i) - C[j, i] = corkendall_sorted!(sortedxcolj, ycoli, permx, scratch_py, scratch_sy, scratch_fx, scratch_fy) + C[j, i] = corkendall_kernel!(sortedxcolj, ycoli, permx, scratch_py, scratch_sy, scratch_fx, scratch_fy, skipmissing) symmetric && (C[i, j] = C[j, i]) end end @@ -163,10 +168,14 @@ function corkendall(x::RoMVector{T}, y::RoMVector{U}; skipmissing::Symbol=:none) length(x) == length(y) || throw(DimensionMismatch("x and y have inconsistent dimensions")) missing_allowed = missing isa eltype(x) || missing isa eltype(y) - validate_skipmissing(skipmissing, missing_allowed, false) + skipmissing in [:none, :pairwise] || throw(ArgumentError("skipmissing must be one of :none or :pairwise")) - # Degenerate case - U and/or T not defined. - if x isa Vector{Missing} || y isa Vector{Missing} + if missing_allowed && skipmissing == :none + if any(ismissing, x) || any(ismissing, y) + return missing + end + elseif x isa Vector{Missing} || y isa Vector{Missing} + # Degenerate case - U and/or T not defined. return NaN end @@ -179,7 +188,7 @@ function corkendall(x::RoMVector{T}, y::RoMVector{U}; skipmissing::Symbol=:none) permx = sortperm(x) permute!(x, permx) - return corkendall_sorted!(x, y, permx, similar(y), similar(y), similar(x, T), similar(y, U)) + return corkendall_kernel!(x, y, permx, similar(y), similar(y), similar(x, T), similar(y, U), skipmissing) end #= corkendall returns a vector in this case, inconsistent with with Statistics.cor and @@ -199,12 +208,12 @@ end # Journal of the American Statistical Association, vol. 61, no. 314, 1966, pp. 436–439. # JSTOR, www.jstor.org/stable/2282833. """ - corkendall_sorted!(sortedx::RoMVector{T}, y::RoMVector{U}, - permx::AbstractVector{<:Integer}, scratch_py::RoMVector{U}, - scratch_sy::RoMVector{U}, scratch_fx::AbstractVector{T}, + corkendall_kernel!(sortedx::RoMVector{T}, y::RoMVector{U}, + permx::AbstractVector{<:Integer}, scratch_py::RoMVector{U}, + scratch_sy::RoMVector{U}, scratch_fx::AbstractVector{T}, scratch_fy::AbstractVector{U}) where {T,U} -Kendall correlation between two vectors but omitting the initial sorting of the first +Kendall correlation between two vectors but omitting the initial sorting of the first argument. Calculating Kendall correlation between `x` and `y` is thus a two stage process: a) sort `x` to get `sortedx`; b) call this function on `sortedx` and `y`, with subsequent arguments: @@ -214,13 +223,21 @@ subsequent arguments: - `scratch_fx, scratch_fy`: Vectors used to filter `missing`s from `x` and `y` without allocation. """ -function corkendall_sorted!(sortedx::RoMVector{T}, y::RoMVector{U}, - permx::AbstractVector{<:Integer}, scratch_py::RoMVector{U}, - scratch_sy::RoMVector{U}, scratch_fx::AbstractVector{T}, - scratch_fy::AbstractVector{U}) where {T,U} +function corkendall_kernel!(sortedx::RoMVector{T}, y::RoMVector{U}, + permx::AbstractVector{<:Integer}, scratch_py::RoMVector{U}, + scratch_sy::RoMVector{U}, scratch_fx::AbstractVector{T}, + scratch_fy::AbstractVector{U}, skipmissing::Symbol) where {T,U} length(sortedx) >= 2 || return NaN + if skipmissing == :none + if missing isa eltype(sortedx) && any(ismissing, sortedx) + return (missing) + elseif missing isa eltype(y) && any(ismissing, y) + return (missing) + end + end + @inbounds for i in eachindex(y) scratch_py[i] = y[permx[i]] end @@ -450,29 +467,4 @@ function handle_listwise!(x::RoMMatrix{T}, y::RoMMatrix{U}) where {T,U} end end return view(a, 1:k, :), view(b, 1:k, :) -end - -function validate_skipmissing(skipmissing::Symbol, missing_allowed::Bool, listwise_allowed::Bool) - if skipmissing == :listwise && listwise_allowed - elseif skipmissing == :pairwise - elseif missing_allowed - if listwise_allowed - throw(ArgumentError("When missing is an allowed element type \ - skipmissing must be either :pairwise or :listwise, \ - but got :$skipmissing")) - else - throw(ArgumentError("When missing is an allowed element type \ - skipmissing must be :pairwise, but got :$skipmissing")) - end - - elseif skipmissing == :none - else - if listwise_allowed - throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise, \ - but got :$skipmissing")) - else - throw(ArgumentError("skipmissing must be either :none or :pairwise, \ - but got :$skipmissing")) - end - end end \ No newline at end of file diff --git a/src/corkendall_fromfile.jl b/src/corkendall_fromfile.jl index ef77051..6e9e131 100644 --- a/src/corkendall_fromfile.jl +++ b/src/corkendall_fromfile.jl @@ -172,22 +172,22 @@ function csvread2(filename::String, ignorefirstrow::Bool, ignorefirstcol::Bool; missingstring::Union{Nothing,String,Vector{String}}="") - contents = DelimitedFiles.readdlm(filename,',') - contents = ifelse.(contents .== missingstring,missing,contents) + contents = DelimitedFiles.readdlm(filename, ',') + contents = ifelse.(contents .== missingstring, missing, contents) firstrow = ignorefirstrow ? 2 : 1 firstcol = ignorefirstcol ? 2 : 1 - if firstrow==firstcol==1 - data=contents + if firstrow == firstcol == 1 + data = contents else - data = contents[firstrow:end,firstcol:end] + data = contents[firstrow:end, firstcol:end] end - data = identity.(data) + data = identity.(data) if firstrow == 0 - names = "Column".* string.(1:(size(data,2))) + names = "Column" .* string.(1:(size(data, 2))) else - names = Symbol.(contents[1,firstcol:end]) + names = Symbol.(contents[1, firstcol:end]) end -return(data,names) + return (data, names) end @@ -198,10 +198,10 @@ function testcsvread2() #ISDA's calculation of KendallTau isda_results_file = joinpath(base_folder, raw"9_correlations\eq_delta-kendall_recent_1-10d.csv") - @time res1=csvread2(isda_results_file,true,true,missingstring = "NA") -@time res2=csvread(isda_results_file,true,true,missingstring = "NA") + @time res1 = csvread2(isda_results_file, true, true, missingstring="NA") + @time res2 = csvread(isda_results_file, true, true, missingstring="NA") -res1==res2 + res1 == res2 end diff --git a/test/rankcorr.jl b/test/rankcorr.jl index 0a8ab29..9b5b883 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -73,10 +73,14 @@ include("compare_implementations.jl") @test isequal(f(xmm, skipmissing=:pairwise), [1.0 NaN; NaN 1.0]) @test isequal(f(xmm, copy(xmm), skipmissing=:pairwise), [NaN NaN; NaN NaN]) - # Test not-correct values of skipmissing if !(f === corkendall_naive) - @test_throws ArgumentError f(Xm) + #TODO fix corkendall_naive to cope with skipmissing=:none + @test ismissing(f(1:5, xm, skipmissing=:none)) + @test ismissing(f(1:5, xm, skipmissing=:none)) + @test isequal(f(xmm, skipmissing=:none), [1.0 missing; missing 1.0]) + @test isequal(f(xmm, copy(xmm), skipmissing=:none), [missing missing; missing missing]) end + @test_throws ArgumentError f(x; skipmissing=:foo) @test_throws ArgumentError f(Xm; skipmissing=:foo)