
Commit c7494e0

KshitijLakhani and pre-commit-ci[bot] authored and committed
[JAX] Add support for Fused Attn MLA head_dim_qk != head_dim_v (NVIDIA#1851)
* Add support for Fused Attn MLA head_dim_qk != head_dim_v
  - Modify is_fused_attn_kernel_available() to accept different head_dims for qk and v
  - Modify FusedAttnHelper to accept different head_dims for qk and v, and update the dim-check asserts in parse_qkv_aval()
  - Modify FusedAttnFwdPrimitive and FusedAttnBwdPrimitive to accept different head_dims for qk and v
  - Modify the Fused Attn related cpp and csrc extension API calls to accept different head_dims for qk and v
  - Modify DotProductAttention call() to extract head dims separately for qk and v
  - Modify the Fused Attn tests to accommodate the FusedAttn API changes
  - Add a test case for head_dim_qk != head_dim_v (failing)
  - Modify the baseline JAX attention to reshape the output based on the v dims rather than the q dims

  Signed-off-by: Kshitij Janardan Lakhani <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  For more information, see https://pre-commit.ci

* Fix context dims in general DPA in test_fused_attn

  Signed-off-by: Kshitij Janardan Lakhani <[email protected]>

* Fix the dim of the output tensor by using the v head dim rather than the q head dim
  Add test cases for JAX fused attn where head_dim_qk != head_dim_v across a combination of data types and attention types

  Signed-off-by: Kshitij Janardan Lakhani <[email protected]>

* Modify the fused attn JAX unit test case for head_dim_qk != head_dim_v

  Signed-off-by: Kshitij Janardan Lakhani <[email protected]>

* Use the new FusedAttnRunner function signature with separate hidden dims for qk and v in the Fused Attn distributed tests
  Code clean-up

  Signed-off-by: Kshitij Janardan Lakhani <[email protected]>

* Fix usage of the is_fused_attn signature in the distributed tests

  Signed-off-by: Kshitij Janardan Lakhani <[email protected]>

* Remove an unnecessary assert

  Signed-off-by: Kshitij Janardan Lakhani <[email protected]>

---------

Signed-off-by: Kshitij Janardan Lakhani <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent f39fd7a commit c7494e0
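
The core of the change described above is that with MLA-style attention the query/key head dim and the value head dim may differ, and the attention output is shaped by the value head dim. A minimal plain-JAX sketch of that shape behaviour (all sizes below are hypothetical illustrations, not values from the PR):

```python
import jax
import jax.numpy as jnp

# Hypothetical MLA-style sizes: Q/K share head_dim_qk, V uses a different head_dim_v.
b, s_q, s_kv, h, d_qk, d_v = 2, 8, 8, 4, 64, 32
kq, kk, kv = jax.random.split(jax.random.PRNGKey(0), 3)
q = jax.random.normal(kq, (b, s_q, h, d_qk))
k = jax.random.normal(kk, (b, s_kv, h, d_qk))
v = jax.random.normal(kv, (b, s_kv, h, d_v))

# Attention scores reduce over head_dim_qk ...
scores = jnp.einsum("bqhd,bkhd->bhqk", q, k) / jnp.sqrt(d_qk)
probs = jax.nn.softmax(scores, axis=-1)
# ... while the context inherits its trailing dim from V.
context = jnp.einsum("bhqk,bkhd->bqhd", probs, v)
assert context.shape == (b, s_q, h, d_v)  # shaped by head_dim_v, not head_dim_qk
```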

File tree

7 files changed: +220 -110 lines changed

tests/jax/test_distributed_fused_attn.py

Lines changed: 6 additions & 0 deletions

@@ -80,6 +80,7 @@ def impl_test_self_attn(
             seqlen,
             seqlen,
             hidden,
+            hidden,
             None,  # no window
         ):
             pytest.skip("No FusedAttn backend found")
@@ -99,6 +100,7 @@ def impl_test_self_attn(
         num_head,
         num_head,
         hidden,
+        hidden,
         attn_bias_type,
         attn_mask_type,
         dropout_prob,
@@ -227,6 +229,7 @@ def test_cross_attn(
             seqlen,
             seqlen,
             hidden,
+            hidden,
             None,  # no window
         ):
             pytest.skip("No FusedAttn backend found")
@@ -239,6 +242,7 @@ def test_cross_attn(
         num_head,
         num_head,
         hidden,
+        hidden,
         attn_bias_type,
         attn_mask_type,
         dropout_prob,
@@ -329,6 +333,7 @@ def impl_test_context_parallel_attn(
         num_head,
         num_kv_heads,
         hidden,
+        hidden,
         attn_bias_type,
         attn_mask_type,
         dropout_prob,
@@ -360,6 +365,7 @@ def check_has_backend_for_mask(mask_type):
             seqlen,
             seqlen,
             hidden,
+            hidden,
             None,
         )  # no SWA for CP
tests/jax/test_fused_attn.py

Lines changed: 55 additions & 20 deletions

@@ -106,7 +106,8 @@ def general_dot_product_attention(
     softmax_out = softmax_out * multiplier

     context = jnp.einsum("...hgqk,...khd->...qhgd", softmax_out, value)
-    context = jnp.reshape(context, query.shape)
+    context_shape = query.shape[:-1] + (value.shape[-1],)
+    context = jnp.reshape(context, context_shape)
     return context

@@ -294,7 +295,8 @@ class FusedAttnRunner:
     max_seqlen_kv: int
     num_heads_q: int
     num_heads_kv: int
-    head_dim: int
+    head_dim_qk: int
+    head_dim_v: int
     attn_bias_type: AttnBiasType
     attn_mask_type: AttnMaskType
     dropout_prob: float
@@ -346,6 +348,14 @@ def _check_configs(self):
                 "seqlen_q > seqlen_kv is not supported with sliding window attention in cuDNN"
             )

+        # Test the MLA case where head dims for qk differ from head dims for v, only if the tensors
+        # are provided in BSHD_BSHD_BSHD or THD_THD_THD formats
+        if self.head_dim_qk != self.head_dim_v and not self.qkv_layout.is_separate():
+            pytest.skip(
+                "For head_dim_qk != head_dim_v, it is necessary that the QKV layout "
+                "is either BSHD_BSHD_BSHD or THD_THD_THD"
+            )
+
         self.backend = FusedAttnHelper(
             self.is_training,
             self.dtype,
@@ -358,7 +368,8 @@ def _check_configs(self):
             self.num_heads_kv,
             self.max_seqlen_q,
             self.max_seqlen_kv,
-            self.head_dim,
+            self.head_dim_qk,
+            self.head_dim_v,
             (-1, -1) if self.window_size is None else self.window_size,
         ).get_fused_attn_backend()
         if self.backend == NVTE_Fused_Attn_Backend.NVTE_No_Backend:
@@ -391,13 +402,9 @@ def _setup_inputs(self):
         key = jax.random.PRNGKey(0)
         q_key, k_key, v_key, bias_key, dropout_key = jax.random.split(key, 5)

-        q_shape = (self.batch_size, self.max_seqlen_q, self.num_heads_q, self.head_dim)
-        k_shape = v_shape = (
-            self.batch_size,
-            self.max_seqlen_kv,
-            self.num_heads_kv,
-            self.head_dim,
-        )
+        q_shape = (self.batch_size, self.max_seqlen_q, self.num_heads_q, self.head_dim_qk)
+        k_shape = (self.batch_size, self.max_seqlen_kv, self.num_heads_kv, self.head_dim_qk)
+        v_shape = (self.batch_size, self.max_seqlen_kv, self.num_heads_kv, self.head_dim_v)

         if self.attn_bias_type == AttnBiasType.NO_BIAS:
             bias_shape = None
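For concreteness, here is what these shapes evaluate to for one of the new MLA parametrizations added further below (the 4-128-128-16-16-64-32-FP16-SELF case):

```python
# Values taken from the 4-128-128-16-16-64-32-FP16-SELF pytest.param below.
batch_size, max_seqlen_q, max_seqlen_kv = 4, 128, 128
num_heads_q = num_heads_kv = 16
head_dim_qk, head_dim_v = 64, 32

q_shape = (batch_size, max_seqlen_q, num_heads_q, head_dim_qk)    # (4, 128, 16, 64)
k_shape = (batch_size, max_seqlen_kv, num_heads_kv, head_dim_qk)  # (4, 128, 16, 64)
v_shape = (batch_size, max_seqlen_kv, num_heads_kv, head_dim_v)   # (4, 128, 16, 32)
```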
@@ -616,7 +623,7 @@ def generate_random_segment_ids(
             raise ValueError(f"Unknown {self.seq_desc_format=}")

         self.dropout_rng = dropout_key if self.dropout_prob > 0 else None
-        self.scaling_factor = 1.0 / sqrt(self.head_dim)
+        self.scaling_factor = 1.0 / sqrt(self.head_dim_qk)

         # Setup distributed sharding specs
         # Setup shardings for distributed tests
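The scaling switches to head_dim_qk because the 1/sqrt(d) factor normalizes the Q·K^T dot product, whose reduction length is the QK head dim; head_dim_v never enters the softmax. A one-line sketch with hypothetical dims:

```python
from math import sqrt

head_dim_qk, head_dim_v = 64, 32  # hypothetical MLA dims
scaling_factor = 1.0 / sqrt(head_dim_qk)  # == 0.125; head_dim_v is irrelevant here
```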
@@ -935,21 +942,45 @@ def check_dqkv(primitive, reference, pad, idx):
     ],
 )
 @pytest.mark.parametrize(
-    "b, s_q, s_kv, h_q, h_kv, d, dtype",
+    "b, s_q, s_kv, h_q, h_kv, d_qk, d_v, dtype",
     [
-        pytest.param(2, 2048, 2048, 12, 12, 64, jnp.bfloat16, id="2-2048-2048-12-12-64-BF16-SELF"),
+        pytest.param(
+            2, 2048, 2048, 12, 12, 64, 64, jnp.bfloat16, id="2-2048-2048-12-12-64-64-BF16-SELF"
+        ),
+        pytest.param(
+            2,
+            2048,
+            1024,
+            12,
+            12,
+            64,
+            64,
+            jnp.bfloat16,
+            id="2-2048-1024-12-12-64-64-BF16-CROSS",
+        ),
+        pytest.param(
+            2, 2048, 2048, 12, 6, 64, 64, jnp.bfloat16, id="2-2048-2048-12-6-64-64-BF16-GQA"
+        ),
+        pytest.param(
+            4, 128, 128, 16, 16, 64, 64, jnp.float16, id="4-128-128-16-16-64-64-FP16-SELF"
+        ),
+        pytest.param(
+            4, 128, 128, 16, 16, 64, 32, jnp.float16, id="4-128-128-16-16-64-32-FP16-SELF"
+        ),
         pytest.param(
             2,
             2048,
             1024,
             12,
             12,
             64,
+            32,
             jnp.bfloat16,
-            id="2-2048-1024-12-12-64-BF16-CROSS",
+            id="2-2048-1024-12-12-64-32-BF16-CROSS",
+        ),
+        pytest.param(
+            2, 2048, 2048, 12, 6, 128, 64, jnp.float16, id="2-2048-2048-12-6-128-64-FP16-GQA"
         ),
-        pytest.param(2, 2048, 2048, 12, 6, 64, jnp.bfloat16, id="2-2048-2048-12-6-64-BF16-GQA"),
-        pytest.param(4, 128, 128, 16, 16, 64, jnp.float16, id="4-128-128-16-16-64-FP16-SELF"),
     ],
 )
 @pytest.mark.parametrize(
@@ -1003,7 +1034,8 @@ def _test_forward(
     s_kv,
     h_q,
     h_kv,
-    d,
+    d_qk,
+    d_v,
     attn_bias_type,
     attn_mask_type,
     dropout_prob,
@@ -1028,7 +1060,8 @@ def _test_forward(
         s_kv,
         h_q,
         h_kv,
-        d,
+        d_qk,
+        d_v,
         attn_bias_type,
         attn_mask_type,
         dropout_prob,
@@ -1055,7 +1088,8 @@ def test_backward(
     s_kv,
     h_q,
     h_kv,
-    d,
+    d_qk,
+    d_v,
     attn_bias_type,
     attn_mask_type,
     dropout_prob,
@@ -1077,7 +1111,8 @@ def test_backward(
         s_kv,
         h_q,
         h_kv,
-        d,
+        d_qk,
+        d_v,
         attn_bias_type,
         attn_mask_type,
         dropout_prob,

transformer_engine/jax/attention.py

Lines changed: 6 additions & 4 deletions

@@ -188,7 +188,7 @@ class ReorderStrategy(Enum):

     - DualChunkSwap: This strategy splits each query into two chunks and do the mirror swap between
       GPUs. This is currently used for non-THD load balance. It requires the max_seqlens be the
-      mulitple of 2 * cp_size.
+      multiple of 2 * cp_size.
     Examples:
     - Before reorder: GPU0: [0, 1, 2, 3]; GPU1: [4, 5, 6, 7]; GPU2: [8, 9, 10, 11]; GPU3: [12, 13, 14, 15];
     - After reorder: GPU0: [0, 1, 14, 15]; GPU1: [4, 5, 10, 11]; GPU2: [8, 9, 6, 7]; GPU3: [12, 13, 2, 3]
@@ -288,7 +288,8 @@ def is_fused_attn_kernel_available(
     kv_num_heads,
     q_max_seqlen,
     kv_max_seqlen,
-    head_dim,
+    head_dim_qk,
+    head_dim_v,
     window_size: Optional[Tuple[int, int]] = None,
 ):
     """
@@ -308,7 +309,8 @@ def make_helper(attn_mask_type):
             kv_num_heads,
             q_max_seqlen,
             kv_max_seqlen,
-            head_dim,
+            head_dim_qk,
+            head_dim_v,
             (-1, -1) if window_size is None else window_size,
         )

@@ -491,7 +493,7 @@ def _segment_ids_to_seqlens(segment_ids_q, segment_ids_kv, attn_mask_type):

 @jax.tree_util.register_pytree_node_class
 class SequenceDescriptor:
-    """A class to descibe the sequences with flexible initialization.
+    """A class to describe the sequences with flexible initialization.
     - SequenceDescriptor.from_seqlens
       For non-THD (non-packed) cases, where each batch has only 1 sequence.
     - SequenceDescriptor.from_seqlens_and_offsets