From ba78580994b83ae25131146d243780caccdab89f Mon Sep 17 00:00:00 2001 From: Philip Swannell Date: Thu, 21 Jan 2021 15:03:56 +0000 Subject: [PATCH] comments only --- README.md | 11 ----- src/KendallTau.jl | 7 +-- src/rankcorr.jl | 3 +- src/speedtestresults.txt | 2 - src/speedtestresults2.txt | 6 +-- src/speedtestresults3.txt | 91 +++++++++++++++++++++++++++++++++++++++ src/speedtests.jl | 7 ++- src/threads_v3.jl | 1 - test/rankcorr.jl | 1 - 9 files changed, 97 insertions(+), 32 deletions(-) create mode 100644 src/speedtestresults3.txt diff --git a/README.md b/README.md index d64c476..60a4c46 100644 --- a/README.md +++ b/README.md @@ -176,25 +176,14 @@ Results from all 3 functions identical? true

- - - - ## Other features: A function `corkendallnaive` that implements the obvious order N^2 algorithm. This function is not exported, but is used in the function `compare_implementations` in `tests/rankcorr.jl` which is quite a thorough test harness, and could be copied over to `StatsBase/tests/rankcorr.jl`. Functions `corkendallthreads_v1`, `corkendallthreads_v2` and `corkendallthreads_v3` which are experimental for the time being. - ## To do In the event that either `x` or `y` contain `nan` values the function currently returns `nan`. The Kendall Tau calculators in both Python and R allow alternative (and often useful) handling of `nan` values, and I would like to implement something similar. See argument `nan_policy` to the Python function `scipy.stats.kendalltau` [here](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kendalltau.html) and argument `use` to the R function `cor` [here](https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/cor). For my own projects, I particularly need an equivalent of R's `use = "pairwise.complete.obs"` - - - Philip Swannell 18 Jan 2021 - - - diff --git a/src/KendallTau.jl b/src/KendallTau.jl index 7c80a7c..416b6f5 100644 --- a/src/KendallTau.jl +++ b/src/KendallTau.jl @@ -14,9 +14,4 @@ include("threads_v3.jl") export corkendall - -end # module - - - - +end # module \ No newline at end of file diff --git a/src/rankcorr.jl b/src/rankcorr.jl index 4aeef0e..224915c 100644 --- a/src/rankcorr.jl +++ b/src/rankcorr.jl @@ -161,13 +161,12 @@ function mergesort!(v::AbstractVector, lo::Integer, hi::Integer, small_threshold return nswaps end - """ countties(x::RealVector,lo::Int64,hi::Int64) Assumes `x` is sorted. Returns the number of ties within `x[lo:hi]`. 
""" -function countties(x::RealVector, lo::Int64, hi::Int64) +function countties(x::AbstractVector, lo::Integer, hi::Integer) thistiecount, result = 0, 0 for i ∈ (lo + 1):hi if x[i] == x[i - 1] diff --git a/src/speedtestresults.txt b/src/speedtestresults.txt index 20cc7e8..a1d45be 100644 --- a/src/speedtestresults.txt +++ b/src/speedtestresults.txt @@ -144,7 +144,6 @@ KendallTau.corkendallthreads_v1(vector1,vector2) all(myapprox.(results[2:end], results[1:end - 1], 1.0e-14)) = true ################################################################### - ################################################################### Executing speedtest 2021-01-17T10:44:51.955 -------------------------------------------------- @@ -175,7 +174,6 @@ KendallTau.corkendallthreads_v2(matrix1,matrix2) all(myapprox.(results[2:end], results[1:end - 1], 1.0e-14)) = true ################################################################### - ################################################################### Executing speedtest 2021-01-18T09:28:05.553 size(matrix1) = (2000, 10) diff --git a/src/speedtestresults2.txt b/src/speedtestresults2.txt index ac736fd..5320cf7 100644 --- a/src/speedtestresults2.txt +++ b/src/speedtestresults2.txt @@ -3,8 +3,6 @@ Recent changes: mergesort more memory efficient via correct use of buffer and resize! function. mergesort! & merge! refactored to be a bit more similar to functions in base/sort.jl - - julia> speedtest([StatsBase.corkendall,KendallTau.corkendall,KendallTau.corkendallthreads_v2],2000,10) ################################################################### Executing speedtest 2021-01-19T15:48:47.282 @@ -90,6 +88,4 @@ KendallTau.corkendallthreads_v2(manyrepeats1,manyrepeats2) Speed ratio KendallTau.corkendallthreads_v2 vs StatsBase.corkendall: 2.461948941847068 Ratio of memory allocated KendallTau.corkendallthreads_v2 vs StatsBase.corkendall: 2.275738940686372 Results from all 3 functions identical? 
true -################################################################### - - +################################################################### \ No newline at end of file diff --git a/src/speedtestresults3.txt b/src/speedtestresults3.txt new file mode 100644 index 0000000..142b783 --- /dev/null +++ b/src/speedtestresults3.txt @@ -0,0 +1,91 @@ +PGS 21 Jan 2021 +mergesort! and insertionsort! now very similar indeed to sort! in base/sort.jl +Note speed improvements and the fact that code (as reported by @btime) is now more +memory efficient than the StatsBase version, which is nice. + +julia> KendallTau.speedtest([StatsBase.corkendall,KendallTau.corkendall,KendallTau.corkendallthreads_v2],2000,10) +################################################################### +Executing speedtest 2021-01-21T14:56:19.489 +size(matrix1) = (2000, 10) +StatsBase.corkendall(matrix1) + 33.684 ms (451 allocations: 5.54 MiB) +Main.KendallTau.corkendall(matrix1) + 5.394 ms (298 allocations: 3.40 MiB) +Speed ratio Main.KendallTau.corkendall vs StatsBase.corkendall: 6.244948088546108 +Ratio of memory allocated Main.KendallTau.corkendall vs StatsBase.corkendall: 0.6130525086357451 +Main.KendallTau.corkendallthreads_v2(matrix1) + 1.706 ms (614 allocations: 3.44 MiB) +Speed ratio Main.KendallTau.corkendallthreads_v2 vs StatsBase.corkendall: 19.738646938177556 +Ratio of memory allocated Main.KendallTau.corkendallthreads_v2 vs StatsBase.corkendall: 0.6202723771851052 +Results from all 3 functions identical? 
true +-------------------------------------------------- +size(matrix1) = (2000, 10) +size(matrix2) = (2000, 10) +StatsBase.corkendall(matrix1,matrix2) + 76.453 ms (1001 allocations: 12.31 MiB) +Main.KendallTau.corkendall(matrix1,matrix2) + 11.200 ms (631 allocations: 7.24 MiB) +Speed ratio Main.KendallTau.corkendall vs StatsBase.corkendall: 6.826188109481081 +Ratio of memory allocated Main.KendallTau.corkendall vs StatsBase.corkendall: 0.5880152134243097 +Main.KendallTau.corkendallthreads_v2(matrix1,matrix2) + 3.925 ms (712 allocations: 7.25 MiB) +Speed ratio Main.KendallTau.corkendallthreads_v2 vs StatsBase.corkendall: 19.481024466550014 +Ratio of memory allocated Main.KendallTau.corkendallthreads_v2 vs StatsBase.corkendall: 0.588845802919708 +Results from all 3 functions identical? true +-------------------------------------------------- +size(vector1) = (2000,) +size(matrix1) = (2000, 10) +StatsBase.corkendall(vector1,matrix1) + 7.374 ms (103 allocations: 1.23 MiB) +Main.KendallTau.corkendall(vector1,matrix1) + 1.096 ms (65 allocations: 725.55 KiB) +Speed ratio Main.KendallTau.corkendall vs StatsBase.corkendall: 6.726540843328325 +Ratio of memory allocated Main.KendallTau.corkendall vs StatsBase.corkendall: 0.5755739005404333 +Main.KendallTau.corkendallthreads_v2(vector1,matrix1) + 464.500 μs (133 allocations: 734.48 KiB) +Speed ratio Main.KendallTau.corkendallthreads_v2 vs StatsBase.corkendall: 15.875780409041981 +Ratio of memory allocated Main.KendallTau.corkendallthreads_v2 vs StatsBase.corkendall: 0.5826639892904953 +Results from all 3 functions identical? 
true +-------------------------------------------------- +size(matrix1) = (2000, 10) +size(vector1) = (2000,) +StatsBase.corkendall(matrix1,vector1) + 7.379 ms (101 allocations: 1.23 MiB) +Main.KendallTau.corkendall(matrix1,vector1) + 1.097 ms (63 allocations: 725.45 KiB) +Speed ratio Main.KendallTau.corkendall vs StatsBase.corkendall: 6.725142622801422 +Ratio of memory allocated Main.KendallTau.corkendall vs StatsBase.corkendall: 0.5755423329614479 +Main.KendallTau.corkendallthreads_v2(matrix1,vector1) + 474.300 μs (134 allocations: 734.52 KiB) +Speed ratio Main.KendallTau.corkendallthreads_v2 vs StatsBase.corkendall: 15.558716002530044 +Ratio of memory allocated Main.KendallTau.corkendallthreads_v2 vs StatsBase.corkendall: 0.5827321185074997 +Results from all 3 functions identical? true +-------------------------------------------------- +size(vector1) = (2000,) +size(vector2) = (2000,) +StatsBase.corkendall(vector1,vector2) + 733.000 μs (10 allocations: 126.03 KiB) +Main.KendallTau.corkendall(vector1,vector2) + 180.999 μs (8 allocations: 86.72 KiB) +Speed ratio Main.KendallTau.corkendall vs StatsBase.corkendall: 4.049746131194095 +Ratio of memory allocated Main.KendallTau.corkendall vs StatsBase.corkendall: 0.6880733944954128 +Main.KendallTau.corkendallthreads_v2(vector1,vector2) + 183.900 μs (10 allocations: 118.22 KiB) +Speed ratio Main.KendallTau.corkendallthreads_v2 vs StatsBase.corkendall: 3.9858618814573137 +Ratio of memory allocated Main.KendallTau.corkendallthreads_v2 vs StatsBase.corkendall: 0.9380114059013142 +Results from all 3 functions identical? 
true +-------------------------------------------------- +size(manyrepeats1) = (2000,) +size(manyrepeats2) = (2000,) +StatsBase.corkendall(manyrepeats1,manyrepeats2) + 442.500 μs (12 allocations: 157.53 KiB) +Main.KendallTau.corkendall(manyrepeats1,manyrepeats2) + 148.201 μs (14 allocations: 126.38 KiB) +Speed ratio Main.KendallTau.corkendall vs StatsBase.corkendall: 2.9858098123494443 +Ratio of memory allocated Main.KendallTau.corkendall vs StatsBase.corkendall: 0.8022217813925808 +Main.KendallTau.corkendallthreads_v2(manyrepeats1,manyrepeats2) + 150.700 μs (16 allocations: 157.88 KiB) +Speed ratio Main.KendallTau.corkendallthreads_v2 vs StatsBase.corkendall: 2.936297279362973 +Ratio of memory allocated Main.KendallTau.corkendallthreads_v2 vs StatsBase.corkendall: 1.0021821067248562 +Results from all 3 functions identical? true +################################################################### \ No newline at end of file diff --git a/src/speedtests.jl b/src/speedtests.jl index 3538916..6a9f3fb 100644 --- a/src/speedtests.jl +++ b/src/speedtests.jl @@ -5,8 +5,9 @@ using Dates """ @btimed expression [other parameters...] -An amended version of BenchmarkTools.@btime. Identical except the return is a tuple of the result of the `expression` evaluation, the trialmin (of type BenchmarkTools.TrialEstimate) and the memory allocated (a number of bytes). - +An amended version of BenchmarkTools.@btime. Identical except the return is a tuple of +the result of the `expression` evaluation, the trialmin (of type BenchmarkTools.TrialEstimate) +and the memory allocated (a number of bytes). """ macro btimed(args...) _, params = BenchmarkTools.prunekwargs(args...) @@ -29,7 +30,6 @@ macro btimed(args...) 
end) end - """ speedtest(functions, nr::Int, nc::Int) @@ -204,7 +204,6 @@ function myapprox(x::Float64, y::Float64, abstol::Float64) end end - """ speedtest_repeatdensity(functions,nr) diff --git a/src/threads_v3.jl b/src/threads_v3.jl index 1a29349..1af965e 100644 --- a/src/threads_v3.jl +++ b/src/threads_v3.jl @@ -25,7 +25,6 @@ function corkendallthreads_v3(X::RealMatrix, Y::RealMatrix) return C end - #thinking here is that corkendall is more efficient if y argument has more columns than X (but that's only a hunch, haven't actually tested it.) function corkendallthreads_v4(X::RealMatrix, Y::RealMatrix) nr = size(X, 2) diff --git a/test/rankcorr.jl b/test/rankcorr.jl index 6485447..30e3b61 100644 --- a/test/rankcorr.jl +++ b/test/rankcorr.jl @@ -94,7 +94,6 @@ function corkendallnaive(X::RealMatrix) return C end - """ compare_implementations(fn1, fn2, abstol::Float64=1e-14, maxcols=10, maxrows=500, numtests=100)