@@ -47,9 +47,12 @@ function Lux.initialparameters(
4747 Ew = init_weight (rng, T, emb_size, patch_size * patch_size * d),
4848 Eb = zeros (T, emb_size),
4949 # then the multihead attention output matrix
50- U = init_weight (rng, T, N * N * d, n_patches * n_heads * dh),
51- # and the positional embedding
50+ # U = init_weight(rng, T, N * N * d, n_patches * n_heads * dh),
51+ U = init_weight (rng, T, emb_size, emb_size), # i.e., 128 × 128
52+ # the positional embedding
5253 pos_emb = init_weight (rng, T, emb_size, div (N, patch_size), div (N, patch_size)),
54+ # and a final decoder
55+ dec = init_weight (rng, T, patch_size * patch_size * d, emb_size), # (2738, 128)
5356 )
5457end
5558
@@ -78,12 +81,15 @@ function Lux.parameterlength(
7881 size_wV = n_heads * dh * emb_size
7982 size_Ew = emb_size * patch_size * patch_size * d
8083 size_Eb = emb_size
81- size_U = N * N * d * n_patches * n_heads * dh
82-
83- total_size = size_wQ + size_wK + size_wV + size_Ew + size_Eb + size_U
84+ # size_U = N * N * d * n_patches * n_heads * dh
85+ size_U = emb_size * emb_size
86+ size_dec = patch_size * patch_size * d * emb_size
87+ size_pos_emb = emb_size * div (N, patch_size) * div (N, patch_size)
88+ total_size =
89+ size_wQ + size_wK + size_wV + size_Ew + size_Eb + size_U + size_dec + size_pos_emb
8490 return total_size
8591end
86- Lux. statelength (:: attention ) = 11
92+ Lux. statelength (:: attention ) = 12  # bumped 11 → 12: forward pass now also reads `state.emb_size`
8793
8894# This is what each layer does:
8995# expected input shape: [N, N, d, batch]
@@ -97,6 +103,7 @@ function ((;)::attention)(x, params, state)
97103 sqrtDh = state. sqrtDh
98104 n_heads = state. n_heads
99105 num_patches_1d = state. num_patches_1d
106+ emb_size = state. emb_size
100107
101108 Ew = params. Ew
102109 Eb = params. Eb
@@ -105,6 +112,7 @@ function ((;)::attention)(x, params, state)
105112 wV = params. wV
106113 U = params. U
107114 pos_emb = params. pos_emb
115+ dec = params. dec
108116
109117 batch = size (x, ndims (x))
110118
@@ -133,10 +141,55 @@ function ((;)::attention)(x, params, state)
133141 A = attention_scores (A, V)
134142
135143 # (7) multihead attention
136- MSA = reshape (A, n_heads * np * dh, size (x, ndims (x)))
137- MSA = U * MSA
138- MSA = reshape (MSA, size (x)... )
144+ # MSA = reshape(A, n_heads * np * dh, size(x, ndims(x)))
145+ # MSA = U * MSA
146+ # MSA = reshape(MSA, size(x)...)
147+
148+
149+ # A = reshape(A, n_heads * dh, np, batch) # (emb_size, np, batch)
150+ # A_flat = reshape(A, n_heads * dh, :)
151+ # MSA = U * A_flat # U ∈ (emb_size, emb_size)
152+ # MSA = reshape(MSA, n_heads * dh, np, batch)
153+ # @info "***********************"
154+ # @info "x shape: $(size(x))"
155+ # @info "MSA size: $(size(MSA))"
156+
157+ # # A ∈ (n_heads * dh, np, batch) == (128, 16, batch)
158+ # MSA = reshape(MSA, emb_size, np, batch)
159+
160+ # # Flatten across np × batch
161+ # A_flat = reshape(MSA, emb_size, :) # (128, 16 * batch)
162+
163+ # # Decode each patch embedding into flattened image patch
164+ # decoded_patches = dec * A_flat # (2738, 16 * batch)
165+
166+ # # Reshape back into patch layout: (ps, ps, d, np, batch)
167+ # decoded_patches = reshape(decoded_patches, ps, ps, d, np, batch)
168+
169+ # # Reshape np = 4 × 4 back into grid layout
170+ # patches_grid = reshape(decoded_patches, ps, ps, d, num_patches_1d, num_patches_1d, batch)
171+
172+ # # Reorder axes to reconstruct full image
173+ # output = permutedims(patches_grid, (1, 4, 2, 5, 3, 6)) # (ps, np1d, ps, np1d, d, batch)
174+ # output = reshape(output, N, N, d, batch) # (148, 148, 2, batch)
175+
176+ # # Attention layer does not modify state
177+ # output, state
178+
179+
180+ # (7) multihead attention
181+ # Combine reshapes and matrix multiplications
182+ A_flat = reshape (A, n_heads * dh, :) # (emb_size, np * batch)
183+ MSA = U * A_flat # Apply U (U ∈ (emb_size, emb_size)) -> (emb_size, np * batch)
184+
185+ # (8) Decode each patch and reshape directly into the final image layout
186+ output = reshape (dec * MSA, ps, ps, d, num_patches_1d, num_patches_1d, batch)
187+
188+ # (9) Reorder to reconstruct the full image
189+ output = permutedims (output, (1 , 4 , 2 , 5 , 3 , 6 )) # (ps, np1d, ps, np1d, d, batch)
190+ output = reshape (output, N, N, d, batch) # (148, 148, 2, batch)
139191
140192 # Attention layer does not modify state
141- MSA, state
193+ output, state
194+
142195end
0 commit comments