Dear Flax community 😄
I want to scale up `MultiHeadDotProductAttention` and `nn.LayerNorm` as below. The query, key, and value kernels have shape (32, 8, 4), but their sharding names are `(None, 'model')`, which does not have the same length as the shape, and I am not sure why. Moreover, I'm confused about the sharding behavior of the last axis of $Q$, $K$, and $V$: the 4 seems to be the axis that gets sharded over `'model'`. Is this right?

There is one more strange behavior: the result below changes at the 7th decimal place when I use `y = sharding_check(1, (1, 1))`.

Please help me 🙏 and let me know what the right way to do this is.
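For reference, here is a minimal sketch of the kind of setup I mean (my actual snippet is not reproduced above; the `Block` module and the input shapes are only illustrative, chosen to match the numbers in this question). It uses `flax.linen.with_partitioning` to attach the `(None, 'model')` names to the attention kernels:

```python
# Illustrative sketch only -- not the exact snippet referenced above.
# With embed dim 32, 8 heads and qkv_features=32, each Q/K/V projection
# kernel has shape (in_features, num_heads, head_dim) = (32, 8, 4),
# while the partition names attached below are only (None, 'model').
import jax
import jax.numpy as jnp
import flax.linen as nn


class Block(nn.Module):  # hypothetical wrapper module, for illustration
    num_heads: int = 8
    qkv_features: int = 32

    @nn.compact
    def __call__(self, x):
        x = nn.LayerNorm()(x)
        # Recent Flax API: a single input means self-attention (q = k = v).
        x = nn.MultiHeadDotProductAttention(
            num_heads=self.num_heads,
            qkv_features=self.qkv_features,
            kernel_init=nn.with_partitioning(
                nn.initializers.lecun_normal(), (None, 'model')),
        )(x)
        return x


x = jnp.ones((2, 16, 32))  # (batch, seq, embed)
variables = Block().init(jax.random.PRNGKey(0), x)

# The kernels are boxed as nn.Partitioned values carrying the names above.
q_kernel = variables['params']['MultiHeadDotProductAttention_0']['query']['kernel']
print(q_kernel.value.shape, q_kernel.names)   # (32, 8, 4) (None, 'model')
print(nn.get_partition_spec(variables))       # PartitionSpec per parameter
```

Here the names tuple has two entries while each Q/K/V kernel has three axes, which is exactly the mismatch I'm asking about.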
Thanks!