11using Lux: Lux
22using LuxCore: AbstractLuxLayer
3+ using CUDA
34using Random: AbstractRNG
5+ using NNlib: batched_mul
46
57struct attention{F} <: AbstractLuxLayer
68 T:: Type
@@ -28,7 +30,7 @@ function attention(
2830 end
2931 @assert N % patch_size == 0 " N must be divisible by patch_size"
3032 n_patches = (div (N, patch_size))^ d
31- dh = div (emb_size, n_heads)
33+ dh = div (emb_size, n_heads) # dimension of each head (scale down the embedding size)
3234 attention (T, N, d, emb_size, patch_size, n_patches, n_heads, dh, init_weight)
3335end
3436
@@ -40,14 +42,16 @@ function Lux.initialparameters(
4042)
4143 (;
4244 # the attention weights have this size
43- wQ = init_weight (rng, T, n_heads, dh, emb_size + 1 ),
44- wK = init_weight (rng, T, n_heads, dh, emb_size + 1 ),
45- wV = init_weight (rng, T, n_heads, dh, emb_size + 1 ),
45+ wQ = init_weight (rng, T, n_heads, dh, emb_size),
46+ wK = init_weight (rng, T, n_heads, dh, emb_size),
47+ wV = init_weight (rng, T, n_heads, dh, emb_size),
4648 # then the embedding operator
4749 Ew = init_weight (rng, T, emb_size, patch_size * patch_size * d),
4850 Eb = zeros (T, emb_size),
49- # then the multihead attention
51+ # then the multihead attention output matrix
5052 U = init_weight (rng, T, N * N * d, n_patches * n_heads * dh),
53+ # and the positional embedding
54+ pos_emb = init_weight (rng, T, emb_size, div (N, patch_size), div (N, patch_size)),
5155 )
5256end
5357
@@ -61,26 +65,27 @@ function Lux.initialstates(
6165 d = d,
6266 emb_size = emb_size,
6367 patch_size = patch_size,
64- n_patches = n_patches,
68+ n_patches = n_patches, # total number of patches
6569 n_heads = n_heads,
6670 dh = dh,
6771 sqrtDh = T (sqrt (dh)),
72+ num_patches_1d = div (N, patch_size),
6873 )
6974end
function Lux.parameterlength(
    (; N, d, n_heads, dh, emb_size, patch_size, n_patches)::attention,
)
    # Total number of trainable scalars; must stay in sync with the arrays
    # returned by `Lux.initialparameters`.
    # Q/K/V projections: one (dh × emb_size) matrix per head.
    size_wQ = n_heads * dh * emb_size
    size_wK = n_heads * dh * emb_size
    size_wV = n_heads * dh * emb_size
    # Patch embedding: dense map from a flattened (ps × ps × d) patch, plus bias.
    size_Ew = emb_size * patch_size * patch_size * d
    size_Eb = emb_size
    # Multihead output projection back to the flattened image.
    size_U = N * N * d * n_patches * n_heads * dh
    # Learnable positional embedding (emb_size × n1d × n1d); this array is
    # created in `initialparameters` but was previously missing from the total.
    size_pos_emb = emb_size * div(N, patch_size)^2
    return size_wQ + size_wK + size_wV + size_Ew + size_Eb + size_U + size_pos_emb
end
# Number of leaf entries in the NamedTuple returned by `Lux.initialstates`.
# NOTE(review): the previous value was 9 and this change adds exactly one state
# field (`num_patches_1d` — the initialstates hunk grows by a single line), so
# the count becomes 10, not 11. Confirm against `initialstates` if more fields
# were added outside this diff.
Lux.statelength(::attention) = 10
8489
8590# This is what each layer does:
8691# expected input shape: [N, N, d, batch]
@@ -93,52 +98,44 @@ function ((;)::attention)(x, params, state)
9398 dh = state. dh
9499 sqrtDh = state. sqrtDh
95100 n_heads = state. n_heads
101+ num_patches_1d = state. num_patches_1d
96102
97103 Ew = params. Ew
98104 Eb = params. Eb
99105 wQ = params. wQ
100106 wK = params. wK
101107 wV = params. wV
102108 U = params. U
109+ pos_emb = params. pos_emb
110+
111+ batch = size (x, ndims (x))
103112
104113 # (1) Split the image into patches
105- num_patches = div (N, ps)
106- # The subarray of x here is by default a copy, but it can be a view (its not edited)
107- x_patches = [
108- @view (x[(i* ps+ 1 ): (i* ps+ ps), (j* ps+ 1 ): (j* ps+ ps), :, :]) for
109- i = 0 : (num_patches- 1 ), j = 0 : (num_patches- 1 )
110- ]
114+ x_patches = reshape (x, ps, num_patches_1d, ps, num_patches_1d, d, batch)
115+ x_patches = permutedims (x_patches, (1 , 3 , 5 , 2 , 4 , 6 ))
111116 # (2) flatten the patches
112- # reshape is fine and will not create a copy here, as only the first dims are merged, and because julia
113- # is column order, this does not change the shape of the underlying data, this is true for all following reshapes
114- x_pflat = [reshape (p, ps * ps * d, size (p, ndims (p))) for p in x_patches]
115-
117+ x_patches = reshape (x_patches, ps * ps * d, :)
116118 # (3) project the patches onto the embedding space
117- x_emb = [Ew * p .+ Eb for p in x_pflat]
119+ x_emb = Ew * x_patches .+ Eb
120+ x_emb = reshape (x_emb, size (x_emb, 1 ), num_patches_1d, num_patches_1d, batch)
118121
119- # (4) positional embedding
122+ # (4) add the positional embedding
120123 # notice that we use 1D positional embedding, as suggested [here](https://arxiv.org/pdf/2010.11929)
121- x_lemb = [
122- cat (p, ones (state. T, 1 , size (p)[2 : end ]. .. ) * i; dims = 1 ) for
123- (i, p) in enumerate (x_emb)
124- ]
124+ x_lemb = x_emb .+ pos_emb
125+ x_lemb = reshape (x_lemb, size (x_lemb, 1 ), num_patches_1d * num_patches_1d, batch)
125126
126127 # (5) compute the attention scores
127- # [!] notice that you can not reuse some variable names otherwise Zygote gets confused
128- Q0 = [wQ[i, :, :] * x_lemb[patchi] for i = 1 : n_heads, patchi = 1 : np]
129- K0 = [wK[i, :, :] * x_lemb[patchi] for i = 1 : n_heads, patchi = 1 : np]
130- V0 = [wV[i, :, :] * x_lemb[patchi] for i = 1 : n_heads, patchi = 1 : np]
131- # Reshape Q, K, V to match desired output dimensions
132- Q = reshape (vcat (Q0... ), (n_heads, np, dh, size (x, ndims (x))))
133- K = reshape (vcat (K0... ), (n_heads, np, dh, size (x, ndims (x))))
134- V = reshape (vcat (V0... ), (n_heads, np, dh, size (x, ndims (x))))
135- # (6) Compute attention scores without mutations
136- A = [Lux. softmax (Q[i, p, :, :] .* K[i, p, :, :] / sqrtDh) for i = 1 : n_heads, p = 1 : np]
137- A = reshape (vcat (A... ), (n_heads, np, dh, size (x, ndims (x))))
138- SA = A .* V
128+ Q = compute_QKV (x_lemb, wQ)
129+ K = compute_QKV (x_lemb, wK)
130+ V = compute_QKV (x_lemb, wV)
131+
132+ # (6) Compute attention scores
133+ A = attention_weights (Q, K)
134+ A = Lux. softmax (A / sqrtDh, dims = 3 )
135+ A = attention_scores (A, V)
139136
140137 # (7) multihead attention
141- MSA = reshape (SA , n_heads * np * dh, size (x, ndims (x)))
138+ MSA = reshape (A , n_heads * np * dh, size (x, ndims (x)))
142139 MSA = U * MSA
143140 MSA = reshape (MSA, size (x)... )
144141
# NOTE: source truncated here — trailing web-scrape artifact ("0 commit comments") removed; the closing `end` of the forward pass is not shown.