diff --git a/Test_of_reduced_allocations_prior_to_threaded_loop.svg b/Test_of_reduced_allocations_prior_to_threaded_loop.svg new file mode 100644 index 0000000..e92e981 --- /dev/null +++ b/Test_of_reduced_allocations_prior_to_threaded_loop.svg @@ -0,0 +1 @@ +246810121416789100μ234567890.0012345678StatsBase.corkendallKendallTau.corkendallKendallTau.Experimental.corkendallKendallTau.Experimental2.corkendallTime to evaluate corkendall(x) vs num cols in xNum cols (num rows = 1000)Seconds \ No newline at end of file diff --git a/plots/plot_1.svg b/plots/plot_1.svg new file mode 100644 index 0000000..7beb358 --- /dev/null +++ b/plots/plot_1.svg @@ -0,0 +1 @@ +234567891023100μ250.001250.012StatsBase.corkendallKendallTau.corkendallKendallTau.Experimental.corkendallKendallTau.Experimental2.corkendallTime to evaluate corkendall(x) vs num cols in xNum cols (num rows = 1000)Seconds \ No newline at end of file diff --git a/plots/plot_2.svg b/plots/plot_2.svg new file mode 100644 index 0000000..be7f940 --- /dev/null +++ b/plots/plot_2.svg @@ -0,0 +1 @@ +2345678789100μ234567890.0012StatsBase.corkendallKendallTau.corkendallKendallTau.Experimental.corkendallKendallTau.Experimental2.corkendallTime to evaluate corkendall(x) vs num cols in xNum cols (num rows = 1000)Seconds \ No newline at end of file diff --git a/plots/plot_24.svg b/plots/threading_with_too_much_allocation.svg similarity index 100% rename from plots/plot_24.svg rename to plots/threading_with_too_much_allocation.svg diff --git a/scripts/performancetests.jl b/scripts/performancetests.jl index 72946e9..6644dcb 100644 --- a/scripts/performancetests.jl +++ b/scripts/performancetests.jl @@ -376,7 +376,7 @@ function how_scaleable(fns, nr::Integer, ncs::Vector{<:Integer}, end plot([ - scatter(x=ncs, y=datatoplot[:, i], mode="line", name=fullnameof(fns[i])) for i in 1:length(fns)], Layout(; title="Time to evaluate corkendall(x) vs num cols in x", + scatter(x=ncs, y=datatoplot[:, i], mode="line", name=fullnameof(fns[i])) for i in 1:length(fns)], Layout(; title="Time to evaluate corkendall(x) vs num cols in x ($(Threads.nthreads()) threads)", xaxis=attr(title="Num cols (num rows = $nr)", type="log"), yaxis=attr(title="Seconds", type="log"))) end \ No newline at end of file diff --git a/src/corkendall.jl b/src/corkendall.jl index f01dd24..42e218a 100644 --- a/src/corkendall.jl +++ b/src/corkendall.jl @@ -9,15 +9,12 @@ const RoMVector{T<:Real} = AbstractVector{<:Union{T,Missing}} const RoMMatrix{T<:Real} = AbstractMatrix{<:Union{T,Missing}} #= -TODO 16 Feb 2023 +TODO 17 Feb 2023 1) Should the default value of skipmissing be :none or :pairwise? :none forces the user to - address question of how missings should be handled, but at least for REPL use, it's + address the question of how missings should be handled, but at least for REPL use, it's rather inconvenient. -2) KendallTau.corkendall currently slower than StatsBase.corkendall for very small - vector\matrix output(4x4 or smaller, assuming vector length 1000) , presumably thanks to - overhead of threading and populating the "scratch-space" vectors. Does this matter? -3) Ask for code review? -4) Make PR to StatsBase? +2) Ask for code review? +3) Make PR to StatsBase? =# """ @@ -65,14 +62,6 @@ function corkendall(x::RoMVector, y::RoMMatrix; skipmissing::Symbol=:none) return (corkendall(reshape(x, (length(x), 1)), y; skipmissing)) end -""" - duplicate(x) - -Construct a vector with `Threads.nthreads()` elements, each a copy of `x`. -""" -function duplicate(x) - [copy(x) for _ in 1:Threads.nthreads()] -end function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:none) where {T,U} symmetric = x === y @@ -98,20 +87,35 @@ function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:non end C = ones(Float64, nr, nc) + #Avoid unnecessary allocation when nthreads is large but output matrix is small. + n_duplicates = min(Threads.nthreads(), symmetric ? nr - 1 : nr) + use_atomic = n_duplicates < Threads.nthreads() + + if use_atomic + a = Threads.Atomic{Int}(1) + end #= Create scratch vectors so that threaded code can be non-allocating. One vector per thread to avoid cross-talk between threads.=# - scratchyvectors = duplicate(similar(y, m)) - ycolis = duplicate(similar(y, m)) - xcoljsorteds = duplicate(similar(x, m)) - permxs = duplicate(zeros(Int, m)) - txs = duplicate(Vector{T}(undef, m)) - tys = duplicate(Vector{U}(undef, m)) - sortyspaces = duplicate(Vector{U}(undef, m)) + duplicate(x, n) = [copy(x) for _ in 1:n] + scratchyvectors = duplicate(similar(y, m), n_duplicates) + ycolis = duplicate(similar(y, m), n_duplicates) + xcoljsorteds = duplicate(similar(x, m), n_duplicates) + permxs = duplicate(zeros(Int, m), n_duplicates) + txs = duplicate(Vector{T}(undef, m), n_duplicates) + tys = duplicate(Vector{U}(undef, m), n_duplicates) + sortyspaces = duplicate(Vector{U}(undef, m), n_duplicates) Threads.@threads for j = (symmetric ? 2 : 1):nr - id = Threads.threadid() + if use_atomic + id = Threads.atomic_add!(a, 1)[] + #=Check that threads are using distinct scratch vectors=# + @assert permxs[id][1] == 0 + else + id = Threads.threadid() + end + scratchyvector = scratchyvectors[id] sortyspace = sortyspaces[id] ycoli = ycolis[id]