diff --git a/Project.toml b/Project.toml
index 4408dd3..c39b71b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,11 +10,11 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
-julia = "1"
 CSV = "0.10 - 0"
 DataFrames = "1"
 Statistics = "1"
 Tables = "1"
+julia = "1"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/src/KendallTau.jl b/src/KendallTau.jl
index a8fb998..a94eedf 100644
--- a/src/KendallTau.jl
+++ b/src/KendallTau.jl
@@ -1,5 +1,6 @@
 module KendallTau
 
+
 include("corkendall.jl")
 include("corkendall_fromfile.jl")
 
diff --git a/src/corkendall.jl b/src/corkendall.jl
index 457325b..a9c0707 100644
--- a/src/corkendall.jl
+++ b/src/corkendall.jl
@@ -44,8 +44,8 @@ function corkendall(x::AbstractMatrix, y::AbstractMatrix=x;
 
 end
 
-function _corkendall(x::AbstractMatrix, y::AbstractMatrix,
-    C::AbstractMatrix, skipmissing::Symbol)
+function _corkendall(x::AbstractMatrix{T}, y::AbstractMatrix{U},
+    C::AbstractMatrix, skipmissing::Symbol) where {T,U}
 
     symmetric = x === y
 
@@ -56,47 +56,25 @@ function _corkendall(x::AbstractMatrix, y::AbstractMatrix,
 
     (m, nr), nc = size(x), size(y, 2)
 
-    # Avoid unnecessary allocation when nthreads is large but output matrix is small.
-    n_duplicates = min(Threads.nthreads(), symmetric ? nr - 1 : nr)
+    Threads.@threads for j = (symmetric ? 2 : 1):nr
 
-    use_atomic = n_duplicates < Threads.nthreads()
-    if use_atomic
-        a = Threads.Atomic{Int}(1)
-    end
-
-    #= Create scratch vectors so that threaded code can be non-allocating, a requirement for
-    good multi-threaded performance. One vector per thread to avoid cross-talk between
-    threads.
-    =#
-    duplicate(x, n) = [copy(x) for _ in 1:n]
-    scratch_pys = duplicate(similar(y, m), n_duplicates)
-    ycolis = duplicate(similar(y, m), n_duplicates)
-    sortedxcoljs = duplicate(similar(x, m), n_duplicates)
-    permxs = duplicate(zeros(Int, m), n_duplicates)
-    scratch_fxs = duplicate(similar(x, m), n_duplicates)
-    scratch_fys = duplicate(similar(y, m), n_duplicates)
-    scratch_sys = duplicate(similar(y, m), n_duplicates)
-
-    #= Use the "static scheduler". This is the "quickfix, but not recommended longterm" way
-    of avoiding concurrency bugs when using Threads.threadid().
-    https://julialang.org/blog/2023/07/PSA-dont-use-threadid/#fixing_buggy_code_which_uses_this_pattern
-    TODO Adopt a "better fix" as outlined in that blog.
-    =#
-    Threads.@threads :static for j = (symmetric ? 2 : 1):nr
-
-        if use_atomic
-            id = Threads.atomic_add!(a, 1)[]
-        else
-            id = Threads.threadid()
+        if !haskey(task_local_storage(), :scratch_py)
+            task_local_storage(:scratch_py, similar(y, m))
+            task_local_storage(:scratch_sy, similar(y, m))
+            task_local_storage(:ycoli, similar(y, m))
+            task_local_storage(:sortedxcolj, similar(x, m))
+            task_local_storage(:permx, zeros(Int, m))
+            task_local_storage(:scratch_fx, similar(x, m))
+            task_local_storage(:scratch_fy, similar(y, m))
         end
 
-        scratch_py = scratch_pys[id]
-        scratch_sy = scratch_sys[id]
-        ycoli = ycolis[id]
-        sortedxcolj = sortedxcoljs[id]
-        permx = permxs[id]
-        scratch_fx = scratch_fxs[id]
-        scratch_fy = scratch_fys[id]
+        scratch_py::Vector{U} = task_local_storage(:scratch_py)
+        scratch_sy::Vector{U} = task_local_storage(:scratch_sy)
+        ycoli::Vector{U} = task_local_storage(:ycoli)
+        sortedxcolj::Vector{T} = task_local_storage(:sortedxcolj)
+        permx::Vector{Int} = task_local_storage(:permx)
+        scratch_fx::Vector{T} = task_local_storage(:scratch_fx)
+        scratch_fy::Vector{U} = task_local_storage(:scratch_fy)
 
         sortperm!(permx, view(x, :, j))
         @inbounds for k in eachindex(sortedxcolj)