huggingface · frasermince · Mar 19, 2024 · Apr 9, 2024 · Apr 9, 2024 · Apr 24, 2024
diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py
@@ -30,6 +30,7 @@
 from transformers.testing_utils import (
     require_bitsandbytes,
     require_torch,
+    require_torch_fp16,
     require_torch_gpu,
     require_vision,
     slow,
@@ -186,6 +187,11 @@ def setUp(self):
         self.model_tester = LlavaVisionText2TextModelTester(self)
         self.config_tester = ConfigTester(self, config_class=LlavaConfig, has_text_modality=False)
 
+    @require_torch_fp16
+    def test_llava_model_fp16_forward(self):  # delegates to the model tester's fp16 forward check
+        model_config, model_inputs = self.model_tester.prepare_config_and_inputs_for_common()
+        self.model_tester.create_and_check_llava_model_fp16_forward(model_config, **model_inputs)
+
     @unittest.skip(
         reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
     )

diff --git a/tests/models/llava/test_trainer_llava.py b/tests/models/llava/test_trainer_llava.py
@@ -0,0 +1,132 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Llava model trainer."""
+
+import gc
+
+import requests
+from datasets import Dataset
+
+from transformers import (
+    AutoProcessor,
+    BitsAndBytesConfig,
+    DataCollatorForLanguageModeling,
+    LlavaForConditionalGeneration,
+    Trainer,
+    TrainingArguments,
+    is_torch_available,
+    is_vision_available,
+)
+from transformers.testing_utils import TestCasePlus, require_bitsandbytes, require_peft, require_torch, slow
+
+
+if is_vision_available():  # PIL is only imported when the vision extras are installed
+    from PIL import Image
+
+if is_torch_available():
+    import torch
+else:
+    is_torch_greater_or_equal_than_2_0 = False  # NOTE(review): unused in this file; likely leftover
+
+
+# Integration test confirming that autocast with Trainer and accelerate works
+# correctly. Confirms that the type error found in
+# https://github.com/huggingface/transformers/pull/29721 is fixed.
+@require_torch
+class LlavaForConditionalGenerationIntegrationTest(TestCasePlus):
+    """Slow end-to-end check: fp16 `Trainer` fine-tuning of a 4-bit, LoRA-wrapped Llava checkpoint."""
+
+    def setUp(self):
+        super().setUp()
+        self.processor = AutoProcessor.from_pretrained(
+            "llava-hf/bakLlava-v1-hf", padding_side="left", truncation_side="right"
+        )
+
+    def tearDown(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+        # Run the base-class teardown too: it removes the `get_auto_remove_tmp_dir()` directories.
+        super().tearDown()
+
+    @slow
+    @require_bitsandbytes
+    @require_peft
+    def test_model_trainer_integration_test(self):
+        """Train one epoch with `fp16=True`, then compare a slice of the logits against reference values."""
+        from peft import LoraConfig, PeftModelForCausalLM
+
+        # Shared by the training-data generator and the post-training forward pass.
+        prompts = [
+            "USER: <image>\nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:",
+            "USER: <image>\nWhat is this?\nASSISTANT:",
+        ]
+        image_urls = [
+            "https://llava-vl.github.io/static/images/view.jpg",
+            "http://images.cocodataset.org/val2017/000000039769.jpg",
+        ]
+
+        def image_prompt_generator():
+            for prompt, image_url in zip(prompts, image_urls):
+                image = Image.open(requests.get(image_url, stream=True).raw)
+                yield {"image": image, "prompt": prompt}
+
+        def process_image_prompt(data):
+            processed = self.processor(
+                data["prompt"], images=data["image"], return_tensors="pt", padding=True, max_length=512
+            )
+            return {
+                "input_ids": processed["input_ids"].squeeze(),
+                "attention_mask": processed["attention_mask"].squeeze(),
+                "pixel_values": processed["pixel_values"].squeeze(),
+            }
+
+        train_dataset = Dataset.from_generator(image_prompt_generator).map(process_image_prompt)
+        bits_and_bytes_config = BitsAndBytesConfig(load_in_4bit=True)
+        model = LlavaForConditionalGeneration.from_pretrained(
+            "llava-hf/bakLlava-v1-hf", quantization_config=bits_and_bytes_config
+        )
+        peft_config = LoraConfig(
+            r=16,
+            lora_alpha=16,
+            bias="none",
+            task_type="CAUSAL_LM",
+            lora_dropout=0.0,
+            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+        )
+        model = PeftModelForCausalLM(model, peft_config, adapter_name="lora_default")
+        data_collator = DataCollatorForLanguageModeling(self.processor.tokenizer, mlm=False)
+
+        output_dir = self.get_auto_remove_tmp_dir()
+        trainer = Trainer(
+            model=model,
+            train_dataset=train_dataset,
+            tokenizer=self.processor.tokenizer,
+            args=TrainingArguments(output_dir, fp16=True, learning_rate=2e-5, num_train_epochs=1),
+            data_collator=data_collator,
+        )
+        trainer.train()
+
+        images = [Image.open(requests.get(image_url, stream=True).raw) for image_url in image_urls]
+        inputs = self.processor(prompts, images=images, return_tensors="pt", padding=True)
+
+        output = model(**inputs)
+        expected_slice = torch.tensor(
+            [[-3.5664, -3.5625, -0.4309], [-5.8242, -5.6914, -1.3242], [-5.4805, -5.9375, 1.1465]],
+            dtype=torch.float32,
+        )
+
+        # Compare on CPU in float32 so the check is independent of the model's device and compute dtype.
+        logits_slice = output["logits"][0, :3, :3].detach().float().cpu()
+        self.assertTrue(torch.allclose(logits_slice, expected_slice, atol=1e-3))
diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py
@@ -27,12 +27,7 @@
     is_torch_available,
     is_vision_available,
 )
-from transformers.testing_utils import (
-    require_bitsandbytes,
-    require_torch,
-    slow,
-    torch_device,
-)
+from transformers.testing_utils import require_bitsandbytes, require_torch, require_torch_fp16, slow, torch_device
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -270,6 +265,16 @@ def test_sdpa_can_compile_dynamic(self):
     @unittest.skip(reason="Compile not yet supported because in LLava models")
     def test_sdpa_can_dispatch_on_flash(self):
         pass
+
+    @require_torch_fp16
+    def test_llava_next_model_fp16_forward(self):  # delegates to the model tester's fp16 forward check
+        config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
+        self.model_tester.create_and_check_llava_next_model_fp16_forward(config, **inputs)
+
+    @require_torch_fp16
+    def test_llava_next_model_fp16_autocast_forward(self):  # delegates to the tester's fp16 autocast check
+        model_config, model_inputs = self.model_tester.prepare_config_and_inputs_for_common()
+        self.model_tester.create_and_check_llava_next_model_fp16_autocast_forward(model_config, **model_inputs)
 
 
 @require_torch