Commit 38607ae

Fix causal attention mask (#306)
1 parent: 0d0d84c

5 files changed: 62 additions & 30 deletions

megatron/model/fused_softmax.py
Lines changed: 9 additions & 3 deletions

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+from functools import lru_cache
 
 import torch
 import torch.nn as nn
@@ -201,6 +201,12 @@ def forward_fused_softmax(self, input, mask):
         else:
             return ScaledSoftmax.apply(input, scale)
 
+    @staticmethod
+    @lru_cache(maxsize=1)
+    def get_causal_mask(sequence_length: int):
+        mask = torch.ones(1, 1, sequence_length, sequence_length, dtype=torch.bool, device=torch.cuda.current_device())
+        return torch.triu(mask, diagonal=1)
+
     def forward_torch_softmax(self, input, mask):
         if self.input_in_float16 and self.softmax_in_fp32:
             input = input.float()
@@ -210,8 +216,8 @@ def forward_torch_softmax(self, input, mask):
 
         if self.attn_mask_type == AttnMaskType.causal:
             assert mask is None
-            mask = torch.ones_like(input, dtype=torch.bool)
-            mask = torch.triu(mask, diagonal=1, out=mask)
+            assert input.shape[2] == input.shape[3]
+            mask = self.get_causal_mask(input.shape[2])
 
         mask_output = self.mask_func(input, mask) if mask is not None else input
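What changed here: forward_torch_softmax used to allocate a full ones_like(input) boolean mask on every call and triu it in place; it now asserts that the attention scores are square and reuses a single [1, 1, seq, seq] upper-triangular mask memoized with lru_cache. A minimal standalone sketch of the same pattern (the helper names below are illustrative, not part of the commit):

# Sketch only: a cached causal mask applied in a plain PyTorch softmax path,
# mirroring get_causal_mask / forward_torch_softmax above.
from functools import lru_cache

import torch
import torch.nn.functional as F


@lru_cache(maxsize=1)
def causal_mask(sequence_length: int, device: str = "cpu") -> torch.Tensor:
    # True marks "future" positions (column > row) that must not be attended to.
    ones = torch.ones(1, 1, sequence_length, sequence_length, dtype=torch.bool, device=device)
    return torch.triu(ones, diagonal=1)


def causal_softmax(scores: torch.Tensor) -> torch.Tensor:
    # scores: [batch, heads, seq, seq] attention logits; must be square in the last two dims.
    assert scores.shape[-2] == scores.shape[-1]
    mask = causal_mask(scores.shape[-1], str(scores.device))
    scores = scores.masked_fill(mask, torch.finfo(scores.dtype).min)
    return F.softmax(scores, dim=-1)


probs = causal_softmax(torch.randn(2, 4, 8, 8))
# Each query position i only puts probability mass on key positions j <= i.
assert torch.allclose(probs.tril().sum(dim=-1), torch.ones(2, 4, 8))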

megatron/model/gpt_model.py
Lines changed: 3 additions & 3 deletions

@@ -232,13 +232,13 @@ def _to_float16(inputs):
                                         tied_weight_attr='word_embeddings_weight'))
 
         if args.fp32_residual_connection:
-            if hasattr(args, 'attn_mask'):
+            if getattr(args, 'pretrain_causal_attention', False):
                 self.specs.append(lambda x: x.transpose(0, 1).contiguous().float())
             else:
                 # EmbeddingPipe returns attention mask as well
                 self.specs.append(lambda x: (x[0].transpose(0, 1).contiguous().float(), *x[1:]))
         else:
-            if hasattr(args, 'attn_mask'):
+            if getattr(args, 'pretrain_causal_attention', False):
                 self.specs.append(lambda x: x.transpose(0, 1).contiguous())
             else:
                 # EmbeddingPipe returns attention mask as well
@@ -256,7 +256,7 @@ def _to_float16(inputs):
 
         # Undo data format change
         def undo(x):
-            if not hasattr(args, 'attn_mask'):
+            if not getattr(args, 'pretrain_causal_attention', False):
                 x = x[0]
             return x.transpose(0, 1).contiguous()
         self.specs.append(undo)
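The pipeline specs in GPTModelPipe now branch on an explicit pretrain_causal_attention flag, read with getattr and a False default, instead of on the presence of the old args.attn_mask attribute. The behavior of the two branches is unchanged: with the flag set, only the hidden states flow between pipeline stages; without it, the attention mask rides along in a tuple. A small sketch of that behavior (illustrative only, not commit code):

# Illustrative sketch: the transpose spec with and without the attention mask
# travelling alongside the activations between pipeline stages.
import torch

def make_transpose_spec(pretrain_causal_attention: bool):
    if pretrain_causal_attention:
        # Only hidden states are passed; the causal mask is implicit and rebuilt later.
        return lambda x: x.transpose(0, 1).contiguous()
    # EmbeddingPipe also returns the attention mask, so preserve the tuple tail.
    return lambda x: (x[0].transpose(0, 1).contiguous(), *x[1:])

hidden = torch.randn(2, 5, 8)                     # [batch, seq, hidden]
mask = torch.zeros(2, 1, 5, 5, dtype=torch.bool)  # [batch, 1, seq, seq]

print(make_transpose_spec(True)(hidden).shape)    # torch.Size([5, 2, 8])
out = make_transpose_spec(False)((hidden, mask))
print(out[0].shape, out[1].shape)                 # torch.Size([5, 2, 8]) torch.Size([2, 1, 5, 5])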

megatron/model/language_model.py
Lines changed: 2 additions & 2 deletions

@@ -274,7 +274,7 @@ def forward(self, inputs, **kwargs):
 
         input_ids = inputs[0]
         position_ids = inputs[1]
-        if hasattr(self._args, 'attn_mask'):
+        if getattr(self._args, 'pretrain_causal_attention', False):
             attention_mask = None
         else:
             attention_mask = inputs[2]
@@ -287,7 +287,7 @@ def forward(self, inputs, **kwargs):
         embeddings = super().forward(input_ids, position_ids, tokentype_ids=tokentype_ids)
 
         # If cmd args has attn_mask, we don't forward it as an activation.
-        if hasattr(self._args, 'attn_mask'):
+        if getattr(self._args, 'pretrain_causal_attention', False):
             return embeddings
         else:
             return embeddings, attention_mask

pretrain_gpt.py
Lines changed: 1 addition & 3 deletions

@@ -54,9 +54,7 @@ def model_provider(pre_process=True, post_process=True):
                              enabled=args.zero_stage == 3,
                              mpu=mpu):
         if args.deepspeed:
-            # Hack @thomasw21 to get fused_softmax.forward_torch_softmax working
-            args.attn_mask = None
-
+            args.pretrain_causal_attention = True
             model = GPTModelPipe(
                 num_tokentypes=0,
                 parallel_output=True,
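This is where the old sentinel was set: the hack relied on the mere presence of args.attn_mask (assigned None) to switch fused_softmax and the pipeline specs into causal mode, which a hasattr check cannot distinguish from a real mask. The explicit boolean flag states the intent. A tiny illustration of the difference (not from the repo):

# Illustrative only: presence-of-attribute sentinel vs. explicit flag.
from types import SimpleNamespace

old_args = SimpleNamespace()
old_args.attn_mask = None                         # old hack: the attribute existing is the signal
print(hasattr(old_args, "attn_mask"))             # True, even though the value is None

new_args = SimpleNamespace()
new_args.pretrain_causal_attention = True         # new: a self-describing boolean
print(getattr(new_args, "pretrain_causal_attention", False))           # True
print(getattr(SimpleNamespace(), "pretrain_causal_attention", False))  # False when never set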

tests/test_model.py
Lines changed: 47 additions & 19 deletions

@@ -4,14 +4,17 @@
 
 import deepspeed
 import torch
+from parameterized import parameterized
 from torch import nn
 import torch.nn.functional as F
 
+from megatron.enums import AttnMaskType
 from megatron.model.fused_layer_norm import MixedFusedLayerNorm
 from packaging import version
 
 from megatron import initialize_megatron, get_args, get_tokenizer, global_vars
-from megatron.model.fused_softmax import ScaledMaskedSoftmax
+from megatron.model.fused_softmax import ScaledMaskedSoftmax, FusedScaleMaskSoftmax
+from megatron.model.utils import attention_mask_func
 from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, torch_assert_equal, \
     torch_assert_close, require_torch_bf16
 from megatron.training import setup_model_and_optimizer
@@ -366,7 +369,8 @@ def test_fused_layer_norm(self):
 
                 torch_assert_equal(mfln_output, torch_layer_norm_output)
 
-    def test_fused_masked_softmax(self):
+    @parameterized.expand([(attn_mask_type,) for attn_mask_type in AttnMaskType])
+    def test_fused_masked_softmax(self, attn_mask_type: AttnMaskType):
         command_args = get_default_args(self.test_file_dir_str)
 
         with patch('sys.argv', flatten_arguments(command_args)):
@@ -382,30 +386,54 @@ def test_fused_masked_softmax(self):
                     device="cuda",
                     dtype=args.params_dtype
                 )
-                dummy_attention_mask = torch.randn(
-                    args.micro_batch_size,
-                    1, # `args.num_attention_heads` not implemented in our cuda kernel
-                    args.seq_length,
-                    args.seq_length,
-                    device="cuda",
-                    dtype=args.params_dtype
-                ) < 0
+                if attn_mask_type == AttnMaskType.causal:
+                    dummy_attention_mask = None
+                else:
+                    dummy_attention_mask = torch.randn(
+                        args.micro_batch_size,
+                        1, # `args.num_attention_heads` not implemented in our cuda kernel
+                        args.seq_length,
+                        args.seq_length,
+                        device="cuda",
+                        dtype=args.params_dtype
+                    ) < 0
                 scale = torch.rand(())
 
-                fused_scaled_softmax = ScaledMaskedSoftmax
-
-                fused_output = fused_scaled_softmax.apply(dummy_input, dummy_attention_mask, scale)
+                fused_scaled_softmax = FusedScaleMaskSoftmax(
+                    input_in_fp16=args.params_dtype == torch.float16,
+                    input_in_bf16=args.params_dtype == torch.bfloat16,
+                    attn_mask_type=attn_mask_type,
+                    scaled_masked_softmax_fusion=True,
+                    mask_func=attention_mask_func,
+                    softmax_in_fp32=True,
+                    scale=scale,
+                )
+                unfused_scaled_softmax = FusedScaleMaskSoftmax(
+                    input_in_fp16=args.params_dtype == torch.float16,
+                    input_in_bf16=args.params_dtype == torch.bfloat16,
+                    attn_mask_type=attn_mask_type,
+                    scaled_masked_softmax_fusion=False,
+                    mask_func=attention_mask_func,
+                    softmax_in_fp32=True,
+                    scale=scale,
+                )
 
-                # mimick the same via torch
-                output = scale * dummy_input
-                output = output.masked_fill(dummy_attention_mask, torch.finfo(args.params_dtype).min)
-                output = F.softmax(output, dim=-1)
+                self.assertTrue(fused_scaled_softmax.is_kernel_available(dummy_attention_mask, *dummy_input.size()))
+                fused_output = fused_scaled_softmax(dummy_input, dummy_attention_mask)
+                self.assertFalse(unfused_scaled_softmax.is_kernel_available(dummy_attention_mask, *dummy_input.size()))
+                unfused_output = unfused_scaled_softmax(dummy_input, dummy_attention_mask)
 
                 # Test that the nonzeros are the same with the mask
                 for i in range(args.num_attention_heads):
-                    torch_assert_equal(torch.nonzero(fused_output[:, i]), torch.nonzero(~dummy_attention_mask[:, 0]))
+                    if dummy_attention_mask is None:
+                        # Make sure it's causal, values in the lower triangle should be not zero.
+                        non_zero_values = torch.tril(torch.ones_like(fused_output[:, i]))
+                        torch_assert_equal(torch.nonzero(fused_output[:, i]), torch.nonzero(non_zero_values))
+                    else:
+                        torch_assert_equal(torch.nonzero(fused_output[:, i]), torch.nonzero(~dummy_attention_mask[:, 0]))
+
                 # Cuda kernel produces slightly different results
-                torch_assert_close(fused_output, output)
+                torch_assert_close(fused_output, unfused_output)
 
 
     def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_is_not_causal_across_segments(self):
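The rewritten test runs once per AttnMaskType via parameterized.expand, builds a fused and an unfused FusedScaleMaskSoftmax with identical settings, and checks that the CUDA kernel and the PyTorch fallback agree. The causal case's sanity check in miniature (a standalone sketch, not test code): a causal softmax output must have exactly the nonzero pattern of a lower-triangular matrix.

# Standalone sketch of the causality check used in the test above.
import torch

seq = 8
scores = torch.randn(seq, seq)
future = torch.triu(torch.ones(seq, seq, dtype=torch.bool), diagonal=1)
probs = torch.softmax(scores.masked_fill(future, torch.finfo(scores.dtype).min), dim=-1)

# Nonzero entries of the output must coincide exactly with the lower triangle.
expected = torch.tril(torch.ones_like(probs))
assert torch.equal(torch.nonzero(probs), torch.nonzero(expected))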
