diff --git a/README.md b/README.md index 4fc9e70..4c8e790 100644 --- a/README.md +++ b/README.md @@ -68,31 +68,28 @@ julia> Threads.nthreads()#12 cores, 20 logical processors I wish to compute Kendall Tau for a set of 32,000 time series, each having observations every weekday over a four year period. Such a calculation takes some 42 minutes on my PC (Windows 11, 12th Gen Intel(R) Core(TM) i7-12700, 2100 Mhz, 12 Core(s), 20 Logical Processors), with Julia 1.8.5. ```julia +julia> Threads.nthreads() +20 + julia> x = rand(1040,32000); julia> @time KendallTau.corkendall(x); 2524.754279 seconds (64.28 k allocations: 7.633 GiB, 0.00% gc time) ``` - diff --git a/data/smallcorrels.csv b/data/smallcorrels.csv new file mode 100644 index 0000000..cacac4e --- /dev/null +++ b/data/smallcorrels.csv @@ -0,0 +1,5 @@ +,"a","b","c","d" +"a",7.27546190987574E-02,0.573761529289951,0.567573346744397,0.353022413642343 +"b",0.630748492296052,0.882538005391427,0.387604390922472,0.771954267245282 +"c",4.41291538720918E-02,6.41242010748875E-02,0.429207828355124,0.445919790376363 +"d",0.13933271971074,0.838523913013751,0.641453185911449,0.151142763005595 diff --git a/data/smallcorrels_shuffled.csv b/data/smallcorrels_shuffled.csv new file mode 100644 index 0000000..9774f87 --- /dev/null +++ b/data/smallcorrels_shuffled.csv @@ -0,0 +1,5 @@ +,"c","b","d","a" +"c",0.429207828355124,6.41242010748875E-02,0.445919790376363,4.41291538720918E-02 +"b",0.387604390922472,0.882538005391427,0.771954267245282,0.630748492296052 +"d",0.641453185911449,0.838523913013751,0.151142763005595,0.13933271971074 +"a",0.567573346744397,0.573761529289951,0.353022413642343,7.27546190987574E-02 diff --git a/scripts/for_aiden.jl b/scripts/for_aiden.jl index 8bd5a26..c2b5efd 100644 --- a/scripts/for_aiden.jl +++ b/scripts/for_aiden.jl @@ -4,41 +4,56 @@ using Statistics """ test_corkendall_fromfile() -Compare KendallTau calculation by ISDA (Python implementation?) with `corkendall_fromfile`. +Compare KendallTau calculation by ISDA (Python implementation) with `corkendall_fromfile`. Returns maximum absolute difference and median absolute difference. Data for -ISDA SIMM calibration VIII, 5853 time series, with 1044 observations in each series. +ISDA SIMM 2023 calibration: 5,853 time series, each with 1,044 observations. +2024 calibration: 5,021 time series, each with 1,041 observations. -# Example +# Examples ``` -julia> @time test_corkendall_fromfile() -67.175902 seconds (215.12 M allocations: 7.915 GiB, 2.59% gc time, 0.01% compilation time) +julia> @time test_corkendall_fromfile(2023) + 67.109173 seconds (216.00 M allocations: 8.827 GiB, 1.33% gc time) (1.683168734664675e-5, 1.734723475976807e-18) + +julia> @time test_corkendall_fromfile(2024) + 51.649551 seconds (160.94 M allocations: 7.011 GiB, 1.12% gc time, 4.74% compilation time) +(3.3306690738754696e-16, 0.0) ``` """ -function test_corkendall_fromfile() - - base_folder = "C:/Users/phili/OneDrive/ISDA SIMM/Solum Validation C-VIII 2023/EQ_delta" +function test_corkendall_fromfile(year) + + if year == 2023 + #ISDA's file for 2023 is kendall tau, not converted to Pearson + converttopearson = false + base_folder = "C:/Users/phili/OneDrive/ISDA SIMM/Solum Validation C-VIII 2023/EQ_delta" + #ISDA's calculation of KendallTau + isda_results_file = joinpath(base_folder, "9_correlations/eq_delta-kendall_recent_1-10d.csv") + #ISDA's returns data + file1 = joinpath(base_folder, "7_returns_relevant_period/returns-10d_recent_1.csv") + julia_results_file = "c:/temp/correlations_2023.csv" + elseif year == 2024 + #ISDA's file for 2024 is kendall tau, converted to Pearson ρ = sin(τπ/2) + converttopearson = true + base_folder = raw"C:\Users\phili\OneDrive\ISDA SIMM\Solum Validation C-IX 2024\EQ_Delta" + #ISDA's calculation of KendallTau + isda_results_file = joinpath(base_folder, raw"9_correlations\eq_delta-individual_recent_0-10d.csv") + #ISDA's returns data + file1 = joinpath(base_folder, raw"7_returns_relevant_period\returns-10d_recent_0.csv") + julia_results_file = "c:/temp/correlations_2024.csv" + else + throw("year must be 2023 or 2024 but got $year") + end - #ISDA's calculation of KendallTau - file_isda = joinpath(base_folder, "9_correlations/eq_delta-kendall_recent_1-10d.csv") - - #ISDA's raw data - file1 = joinpath(base_folder, "7_returns_relevant_period/returns-10d_recent_1.csv") file2 = "" - outputfile = "c:/temp/test.csv" inputshaveheaderrow = true inputshaveheadercol = true writeheaders = true - converttopearson = false missingstring = "" whattoreturn = "filename" - corkendall_fromfile(file1, file2, outputfile, inputshaveheaderrow, inputshaveheadercol, + corkendall_fromfile(file1, file2, julia_results_file, inputshaveheaderrow, inputshaveheadercol, writeheaders, converttopearson, missingstring, whattoreturn) - absdiffs = (abs.(KendallTau.readfromcsv(outputfile, true, true)[1] .- - KendallTau.readfromcsv(file_isda, true, true)[1])) - - maximum(absdiffs), median(absdiffs) + KendallTau.comparecorrelationfiles(julia_results_file, isda_results_file) end diff --git a/scripts/performancetests.jl b/scripts/performancetests.jl index 54c21c7..c80f2ce 100644 --- a/scripts/performancetests.jl +++ b/scripts/performancetests.jl @@ -1,7 +1,7 @@ using BenchmarkTools using Dates -using Plotly -using PlotlyJS +#using Plotly +#using PlotlyJS using Random using NamedArrays diff --git a/scripts/quickcheckallocations.jl b/scripts/quickcheckallocations.jl index 4ba1c88..9904123 100644 --- a/scripts/quickcheckallocations.jl +++ b/scripts/quickcheckallocations.jl @@ -15,17 +15,21 @@ xm = ifelse.(x .< 0.05, missing, x); println("="^100) @show(Dates.now()) @show ENV["COMPUTERNAME"] +println("Julia Version $VERSION") @show Threads.nthreads() @show size(x) @show typeof(x) +@show size(xm) +@show typeof(xm) + print("KendallTau.corkendall(x)$(" "^26)") -@time res_kt = KendallTau.corkendall(x) +@time res_1 = KendallTau.corkendall(x) print("KendallTau.corkendall(xm; skipmissing = :pairwise)") -@time res_kt = KendallTau.corkendall(xm; skipmissing=:pairwise) +@time res_2 = KendallTau.corkendall(xm; skipmissing=:pairwise) print("KendallTau.corkendall(xm; skipmissing = :listwise)") -@time res_kt = KendallTau.corkendall(xm; skipmissing=:listwise) +@time res_3 = KendallTau.corkendall(xm; skipmissing=:listwise) println("="^100) -nothing + #= ==================================================================================================== @@ -38,4 +42,19 @@ KendallTau.corkendall(x) 1.752958 seconds (2.28 k all KendallTau.corkendall(xm; skipmissing = :pairwise) 1.770887 seconds (2.28 k allocations: 8.938 MiB) KendallTau.corkendall(xm; skipmissing = :listwise) 0.003478 seconds (2.29 k allocations: 7.747 MiB) ==================================================================================================== + +==================================================================================================== +Dates.now() = DateTime("2024-01-23T16:25:13.336") +ENV["COMPUTERNAME"] = "DESKTOP-HSGAM5S" +Julia Version 1.10.0 +Threads.nthreads() = 20 +size(x) = (1000, 1000) +typeof(x) = Matrix{Float64} +size(xm) = (1000, 1000) +typeof(xm) = Matrix{Union{Missing, Float64}} +KendallTau.corkendall(x) 1.716595 seconds (1.26 k allocations: 16.528 MiB) +KendallTau.corkendall(xm; skipmissing = :pairwise) 1.738789 seconds (1.26 k allocations: 16.236 MiB) +KendallTau.corkendall(xm; skipmissing = :listwise) 0.002477 seconds (268 allocations: 22.915 MiB) +==================================================================================================== + =# \ No newline at end of file diff --git a/src/corkendall.jl b/src/corkendall.jl index c77cb4c..adc8b4c 100644 --- a/src/corkendall.jl +++ b/src/corkendall.jl @@ -73,8 +73,9 @@ function corkendall(x::RoMVector{T}, y::RoMVector{U}; skipmissing::Symbol=:none) length(x) == length(y) || throw(DimensionMismatch("x and y have inconsistent dimensions")) (x isa Vector{Missing} || y isa Vector{Missing}) && return NaN + x = copy(x) x, y = handlelistwise(x, y, skipmissing) - x, y = handlepairwise(copy(x), copy(y)) + x, y = handlepairwise(x, y) permx = sortperm(x) permute!(x, permx) @@ -128,7 +129,11 @@ function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:non tys = duplicate(Vector{U}(undef, m), n_duplicates) sortyspaces = duplicate(Vector{U}(undef, m), n_duplicates) - Threads.@threads for j = (symmetric ? 2 : 1):nr + #= Use the "static scheduler". This is the "quickfix, but not recommended longterm" + way of avoiding concurrency bugs. See https://julialang.org/blog/2023/07/PSA-dont-use-threadid/#fixing_buggy_code_which_uses_this_pattern + TODO Adopt a "better fix" as outlined in that blog. + =# + Threads.@threads :static for j = (symmetric ? 2 : 1):nr if use_atomic id = Threads.atomic_add!(a, 1)[] @@ -161,8 +166,8 @@ function corkendall(x::RoMMatrix{T}, y::RoMMatrix{U}=x; skipmissing::Symbol=:non return C end -# Function returns a vector in this case, inconsistent with with Statistics.cor and -# StatsBase.corspearman. Fixing that is a breaking change. +# corkendall returns a vector in this case, inconsistent with with Statistics.cor and +# StatsBase.corspearman, but consistent with StatsBase.corkendall. function corkendall(x::RoMMatrix, y::RoMVector; skipmissing::Symbol=:none) return vec(corkendall(x, reshape(y, (length(y), 1)); skipmissing)) end @@ -366,7 +371,7 @@ end handlepairwise(x::RoMVector{T}, y::RoMVector{U}) where {T,U} Return a pair `(a,b)`, filtered copies of `x` and `y`, in which elements `x[i]` and `y[i]` -are filtered out if `ismissing(x[i])||ismissing(y[i])`. +are filtered out if `ismissing(x[i])||ismissing(y[i])`. """ handlepairwise(x::AbstractVector{<:Real}, y::AbstractVector{<:Real}) = x, y diff --git a/src/corkendall_fromfile.jl b/src/corkendall_fromfile.jl index b4883f1..38f63fe 100644 --- a/src/corkendall_fromfile.jl +++ b/src/corkendall_fromfile.jl @@ -25,7 +25,7 @@ files, writing the result to another csv file. `file2` if they exist or be `Column1,Column2,...` otherwise. - `converttopearson::Bool`: if `true` then the equivalent Pearson correlations ρ = sin(τ*π/2) are written to the output file. -- `missingstring::Union{Nothing,String,Vector{String}}=nothing`: Used to indicate how +- `missingstring::Union{Nothing,String,Vector{String}}=""`: Used to indicate how missing values are represented in `file1` and `file2` See `CSV.File`. - `whattoreturn::String="filename"` controls what the function returns. If "filename" then the function the name of the outputfile. If "median", the function returns the median of @@ -35,16 +35,16 @@ files, writing the result to another csv file. function corkendall_fromfile(file1::String, file2::String, outputfile::String, inputshaveheaderrow::Bool=false, inputshaveheadercol::Bool=false, writeheaders::Bool=false, converttopearson::Bool=false, - missingstring::Union{Nothing,String,Vector{String}}=nothing, whattoreturn::String="filename") + missingstring::Union{Nothing,String,Vector{String}}="", whattoreturn::String="filename") - data1, names1 = readfromcsv(file1, inputshaveheaderrow, inputshaveheadercol, missingstring=missingstring) + data1, names1 = csvread(file1, inputshaveheaderrow, inputshaveheadercol, missingstring=missingstring) symmetric = false if file2 == "" || file1 == file2 symmetric = true data2, names2 = data1, names1 else - data2, names2 = readfromcsv(file2, inputshaveheaderrow, inputshaveheadercol, missingstring=missingstring) + data2, names2 = csvread(file2, inputshaveheaderrow, inputshaveheadercol, missingstring=missingstring) end res = corkendall(data1, data2, skipmissing=:pairwise) @@ -73,37 +73,97 @@ function corkendall_fromfile(file1::String, file2::String, outputfile::String, return median(res) end else - throw("whattoreturn value '$whattoreturn' was not recognised") + throw("whattoreturn my be either 'filename' or 'median', but got '$whattoreturn'") end end -function readfromcsv(filename::String, ignorefirstrow::Bool, ignorefirstcol::Bool; +""" + csvread(filename::String, ignorefirstrow::Bool, ignorefirstcol::Bool; + missingstring::Union{Nothing,String,Vector{String}}="") + +Returns a pair `(data,names)` where +""" +function csvread(filename::String, ignorefirstrow::Bool, ignorefirstcol::Bool; missingstring::Union{Nothing,String,Vector{String}}="") header = ignorefirstrow ? 1 : 0 drop = ignorefirstcol ? [1] : [0] + types = Union{Missing,Float64} + strict = true - filedata = CSV.File(filename; header, drop, missingstring) + filedata = CSV.File(filename; header, drop, missingstring, types, strict) data = Tables.matrix(filedata) - names = CSV.getnames(filedata) - if !(data isa RoMMatrix) - throw("Data read from file '$filename' is of type $(typeof(data)) so the file seems \ - to contains data that is neither missing nor numeric. Check the following arguments \ - to function readfromcsv: ignorefirstrow was passed as $ignorefirstrow, ignorefirstcol \ - was passed as $ignorefirstcol and missingstring was passed as '$missingstring'") + #Convert to Array{Float64} if there are in fact no missings + if isnothing(findfirst(ismissing, data)) + data = identity.(data) end + names = CSV.getnames(filedata) + return data, names end -function testreadfromcsv() - #readfromcsv(raw"C:\Users\phili\OneDrive\ISDA SIMM\SolumWorking\2023\StressBalance\EQ\correlation\EQ_returns_1.csv",true,true) +function testcsvread() + #csvread(raw"C:\Users\phili\OneDrive\ISDA SIMM\SolumWorking\2023\StressBalance\EQ\correlation\EQ_returns_1.csv",true,true) - readfromcsv(raw"C:\Users\phili\OneDrive\ISDA SIMM\SolumWorking\2023\StressBalance\CRQ\correlation\CRQ_returns_1.csv", false, false, missingstring="NA") + csvread(raw"C:\Users\phili\OneDrive\ISDA SIMM\SolumWorking\2023\CRQ\correlation\CRQ_returns_1.csv", false, false, missingstring="NA") end function offdiag(A::AbstractMatrix) [A[ι] for ι in CartesianIndices(A) if ι[1] ≠ ι[2]] end + + +""" + comparecorrelationfiles(file1::String, file2::String) + +Compares two csv files, each representing a correlation matrix. Both files assumed to have +header top row and header left column. The function does not examine the left headers but assumes +they are the transpose of the top headers. The set of headers in file1 and file2 must be the +same but they needn't be arranged in the same order. The function returns a tuple, the +first element is the maximum absolute difference of the correlations, the second element is +the median absolute difference. +""" +function comparecorrelationfiles(file1::String, file2::String) + data1, names1 = csvread(file1, true, true) + data2, names2 = csvread(file2, true, true) + + size(data1) == size(data2) || throw("incompatible file sizes $(size(data1).+1) vs $(size(data2).+1)") + size(data1)[1] == size(data1)[2] || throw("Expected files to have equal number of rows and columns, but got $(size(data1).+1)") + + #Treat missing on-diagonal terms as 1.0 - fixes wierd problem with ISDA's files + for i in 1:size(data1)[1] + if ismissing(data1[i, i]) + data1[i, i] = 1.0 + end + if ismissing(data2[i, i]) + data2[i, i] = 1.0 + end + end + + names1 == unique(names1)||throw("Headers in file '$file1' are not unique") + + if names1 != names2 + if sort(names1) != sort(names2) + throw("The two files must have the same headers (though not necessarily in the same order)") + end + sp1 = sortperm(names1) + sp2 = sortperm(names2) + + data1 = data1[sp1, sp1] + data2 = data2[sp2, sp2] + + names1 = names1[sp1] + names2 = names2[sp2] + + end + + absdiffs = abs.(data1 .- data2) + + return (maximum(absdiffs), median(absdiffs)) + +end + +