some tests for new healpix ring workspace

xzackli · xzackli · commit 2d8237576205 · 2025-08-30T20:54:37.000+09:00
diff --git a/Project.toml b/Project.toml
@@ -1,10 +1,11 @@
 name = "XGPaint"
 uuid = "af630e4a-6754-4ec2-ab8a-f9f8b9ebafbf"
 authors = ["Zack Li"]
-version = "0.4"
+version = "0.4.0"
 
 [deps]
 CSVFiles = "5d742f6a-9f54-50ce-8119-2520741973ca"
+ChunkSplitters = "ae650224-84b6-46f8-82ea-d812ca08434e"
 Cosmology = "76746363-e552-5dba-9a5a-cef6fa9cc5ab"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DataInterpolations = "82cc6244-b520-54b8-b5a6-8a565e85f1d0"
diff --git a/src/profiles.jl b/src/profiles.jl
@@ -1,10 +1,10 @@
 
 
-# RECTANGULAR WORKSPACES
+# Import ChunkSplitters for better threading
+using ChunkSplitters: chunks
 
+# RECTANGULAR WORKSPACES
 
-# default workspaces are immutable, so just forward the type
-wrapserialworkspace(w, tid) = w
 
 struct CarClenshawCurtisProfileWorkspace{T,A<:AbstractArray{T,2}} <: AbstractProfileWorkspace{T}
     sin_α::A
@@ -50,13 +50,16 @@ function profile_grid(model::AbstractGNFW{T}, logθs, redshifts, logMs) where T
     N_logθ, N_z, N_logM = length(logθs), length(redshifts), length(logMs)
     A = zeros(T, (N_logθ, N_z, N_logM))
 
-    Threads.@threads :static for im in 1:N_logM
-        logM = logMs[im]
-        M = 10^(logM)
-        for (iz, z) in enumerate(redshifts)
-            for iθ in 1:N_logθ
-                θ = exp(logθs[iθ])
-                A[iθ, iz, im] = max(zero(T), model(θ, M, z))
+    # Split the mass indices into one contiguous chunk per thread (ChunkSplitters)
+    Threads.@threads for chunk in chunks(1:N_logM; n=Threads.nthreads())
+        for im in chunk
+            logM = logMs[im]
+            M = 10^(logM)
+            for (iz, z) in enumerate(redshifts)
+                for iθ in 1:N_logθ
+                    θ = exp(logθs[iθ])
+                    A[iθ, iz, im] = max(zero(T), model(θ, M, z))
+                end
             end
         end
     end
@@ -325,7 +328,7 @@ end
 function profile_paint!(m::Enmap{T, 2, Matrix{T}, Gnomonic{T}}, 
                         workspace::GnomonicProfileWorkspace, model, Mh, z, α₀, δ₀, 
                         θmax, normalization=1) where T
-    profile_paint_generic!(m, model, workspace, Mh, z, α₀, δ₀, θmax, normalization)
+    profile_paint_generic!(m, workspace, model, Mh, z, α₀, δ₀, θmax, normalization)
 end
 
 
@@ -417,25 +420,15 @@ function paint!(m, workspace, model, masses, redshifts, αs, δs;
     zerobeforepainting && _fillzero!(m)
 
     N_sources = length(masses)
-    chunksize = ceil(Int, N_sources / (2Threads.nthreads()))
-    chunks = chunk(N_sources, chunksize)
-
+    
     if N_sources < 2Threads.nthreads()  # don't thread if there are not many sources
-        return paintrange!(1:N_sources, m, wrapserialworkspace(workspace, 1), 
+        return paintrange!(1:N_sources, m, workspace, 
             model, masses, redshifts, αs, δs)
     end
 
-    Threads.@threads for ti in 1:Threads.nthreads()
-        chunk_i = 2ti
-        i1, i2 = chunks[chunk_i]
-        paintrange!(i1:i2, m, wrapserialworkspace(workspace, ti), 
-            model, masses, redshifts, αs, δs)
-    end
-
-    Threads.@threads for ti in 1:Threads.nthreads()
-        chunk_i = 2ti - 1
-        i1, i2 = chunks[chunk_i]
-        paintrange!(i1:i2, m, wrapserialworkspace(workspace, ti), 
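+    # Two chunks per thread, all painted in one threaded pass. NOTE(review): the replaced code ran even and odd chunks in separate passes; confirm neighbouring chunks may paint `m` concurrently.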
+    Threads.@threads for chunk in chunks(1:N_sources; n=2*Threads.nthreads())
+        paintrange!(chunk, m, workspace, 
             model, masses, redshifts, αs, δs)
     end
 end
@@ -460,25 +453,15 @@ function paint!(m, workspace, model, masses, redshifts, αs, δs, proj_v_over_c;
     zerobeforepainting && _fillzero!(m)
 
     N_sources = length(masses)
-    chunksize = ceil(Int, N_sources / (2Threads.nthreads()))
-    chunks = chunk(N_sources, chunksize)
-
+    
     if N_sources < 2Threads.nthreads()  # don't thread if there are not many sources
-        return paintrange!(1:N_sources, m, wrapserialworkspace(workspace, 1), 
-            model, masses, redshifts, αs, δs, proj_v_over_c)
-    end
-
-    Threads.@threads for ti in 1:Threads.nthreads()
-        chunk_i = 2ti
-        i1, i2 = chunks[chunk_i]
-        paintrange!(i1:i2, m, wrapserialworkspace(workspace, ti), 
+        return paintrange!(1:N_sources, m, workspace, 
             model, masses, redshifts, αs, δs, proj_v_over_c)
     end
 
-    Threads.@threads for ti in 1:Threads.nthreads()
-        chunk_i = 2ti - 1
-        i1, i2 = chunks[chunk_i]
-        paintrange!(i1:i2, m, wrapserialworkspace(workspace, ti), 
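+    # Two chunks per thread, all painted in one threaded pass. NOTE(review): the replaced code ran even and odd chunks in separate passes; confirm neighbouring chunks may paint `m` concurrently.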
+    Threads.@threads for chunk in chunks(1:N_sources; n=2*Threads.nthreads())
+        paintrange!(chunk, m, workspace, 
             model, masses, redshifts, αs, δs, proj_v_over_c)
     end
 end
@@ -488,6 +471,6 @@ end
 # function paint!(m, workspace::HealpixSerialProfileWorkspace, model, masses, redshifts, αs, δs; 
 #         zerobeforepainting=true)
 #     zerobeforepainting && _fillzero!(m)
-#     return paintrange!(1:length(masses), m, wrapserialworkspace(workspace, 1), 
+#     return paintrange!(1:length(masses), m, workspace, 
 #         model, masses, redshifts, αs, δs)
 # end
diff --git a/src/util.jl b/src/util.jl
@@ -3,6 +3,7 @@ using Healpix
 using Random
 using Random: MersenneTwister
 using LazyArtifacts
+using ChunkSplitters: chunks
 
 """
 Utility function to read an HDF5 table with x, y, z, M_h as the four rows.
@@ -96,22 +97,12 @@ function chunk(arr_len, chunksize::Integer)
 end
 
 
-function getrange(n)
-    tid = Threads.threadid()
-    nt = Threads.nthreads()
-    d , r = divrem(n, nt)
-    from = (tid - 1) * d + min(r, tid - 1) + 1
-    to = from + d - 1 + (tid ≤ r ? 1 : 0)
-    from:to
-end
-
-
-function threaded_rand!(random_number_generators, arr::Array{T,1};
-      chunksize=4096) where T
+function threaded_rand!(random_number_generators, arr::Array{T,1}) where T
 
-   num = size(arr,1)
-   Threads.@threads :static for (i1, i2) in chunk(num, chunksize)
-      @views rand!(random_number_generators[Threads.threadid()], arr[i1:i2])
+   # Pick the RNG by chunk index, not threadid(): without :static a task is not pinned to
+   # a thread, and chunk indices pair each chunk with the same RNG on every run.
+   Threads.@threads for (irng, inds) in enumerate(chunks(eachindex(arr); n=Threads.nthreads()))
+      @views rand!(random_number_generators[irng], arr[inds])
    end
 end
 
diff --git a/test/test_ringworkspace.jl b/test/test_ringworkspace.jl
@@ -6,7 +6,10 @@ using Test
 """
 Brute-force reference implementation that loops over all pixels
 """
-function profile_paint_bruteforce!(m::HealpixMap{T, RingOrder}, model, Mh, z, α₀, δ₀, θmax, normalization=1) where T
+function profile_paint_bruteforce!(
+    m::HealpixMap{T, RingOrder}, model, Mh, z, α₀, δ₀, θmax, normalization=1
+) where T
+
     ϕ₀ = α₀  
     θ₀ = T(π)/2 - δ₀
     x₀, y₀, z₀ = ang2vec(θ₀, ϕ₀)
@@ -107,42 +110,6 @@ XGPaint.compute_θmin(::TestModel{T}) where T = eps(T)
         end
     end
 
-    @testset "RingWorkspace Performance" begin
-        # Simple performance check - should be faster than brute force
-        nside = 128  # Smaller than before for faster testing
-        res = Healpix.Resolution(nside)
-        workspace = RingWorkspace(res)
-        
-        map_ringworkspace = HealpixMap{Float64, RingOrder}(zeros(Float64, nside2npix(nside)))
-        map_bruteforce = HealpixMap{Float64, RingOrder}(zeros(Float64, nside2npix(nside)))
-        
-        model = TestModel(1.0)
-        
-        # Test case
-        α₀, δ₀, θmax = 1.5708, 0.7854, 0.1
-        Mh, z = 1e14, 0.5
-        
-        # Warm up
-        XGPaint.profile_paint_generic!(map_ringworkspace, workspace, model, Mh, z, α₀, δ₀, θmax)
-        profile_paint_bruteforce!(map_bruteforce, model, Mh, z, α₀, δ₀, θmax)
-        
-        # Time RingWorkspace
-        fill!(map_ringworkspace.pixels, 0.0)
-        time_ringworkspace = @elapsed XGPaint.profile_paint_generic!(map_ringworkspace, workspace, model, Mh, z, α₀, δ₀, θmax)
-        
-        # Time brute force
-        fill!(map_bruteforce.pixels, 0.0)
-        time_bruteforce = @elapsed profile_paint_bruteforce!(map_bruteforce, model, Mh, z, α₀, δ₀, θmax)
-        
-        speedup = time_bruteforce / time_ringworkspace
-        
-        # Should be significantly faster
-        @test speedup > 5.0  # Conservative threshold
-        
-        # Should also give same results
-        @test map_ringworkspace.pixels ≈ map_bruteforce.pixels
-    end
-
     @testset "RingWorkspace vs Brute Force with Real Y Profile" begin
         # Setup with real y profile model
         nside = 32  # Smaller for faster testing with real profile