import torch
from PIL import Image
import warnings
from .base import BaseModel
from ..smp import splitlen, get_cache_path
from transformers import AutoTokenizer, AutoConfig
from torchvision.transforms import Compose, Resize, Lambda, ToTensor, Normalize
try:
    from torchvision.transforms import InterpolationMode
    BICUBIC = InterpolationMode.BICUBIC
except ImportError:
    BICUBIC = Image.BICUBIC


class AKI(BaseModel):
    INSTALL_REQ = True
    INTERLEAVE = False

    def __init__(self,
                 name,
                 ckpt_pth=None,
                 **kwargs):

        self.name = name
        try:
            from open_flamingo.src.modeling_aki import AKI
        except ImportError:
            raise ImportError('Please first install AKIVLM from https://github.com/sony/aki')

        # replace GenerationMixin to modify attention mask handling
        from transformers.generation.utils import GenerationMixin
        from open_flamingo import _aki_update_model_kwargs_for_generation
        GenerationMixin._update_model_kwargs_for_generation = _aki_update_model_kwargs_for_generation

        config = AutoConfig.from_pretrained(ckpt_pth)
        tokenizer = AutoTokenizer.from_pretrained(ckpt_pth)
        model = AKI.from_pretrained(ckpt_pth, tokenizer=tokenizer)

        n_px = getattr(config, "n_px", 384)
        norm_mean = getattr(config, "norm_mean", 0.5)
        norm_std = getattr(config, "norm_std", 0.5)

        image_processor = Compose([
            Resize((n_px, n_px), interpolation=BICUBIC, antialias=True),
            Lambda(lambda x: x.convert('RGB')),
            ToTensor(),
            Normalize(mean=(norm_mean, norm_mean, norm_mean), std=(norm_std, norm_std, norm_std))
        ])
        self.model = model.eval().cuda()

        tokenizer.padding_side = 'left'
        tokenizer.add_eos_token = False
        self.tokenizer = tokenizer
        self.image_proc = image_processor

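        # Default generation config below amounts to greedy decoding
        # (do_sample=False, temperature=0.0); any extra keyword arguments
        # passed to the constructor override these defaults.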
        kwargs_default = {
            'max_new_tokens': 512,
            'temperature': 0.0,
            'do_sample': False,
            'eos_token_id': tokenizer.eos_token_id,
        }
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default

    def apply_prompt_template(self, query):
        SYSTEM_BASE = "A chat between a curious user and an artificial intelligence assistant."
        SYSTEM_DETAIL = "The assistant gives helpful, detailed, and polite answers to the user's questions."
        SYSTEM_MESSAGE = SYSTEM_BASE + " " + SYSTEM_DETAIL
        SYSTEM_MESSAGE_ROLE = '<|system|>' + '\n' + SYSTEM_MESSAGE + '<|end|>\n'

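        # For illustration (derived from the template below), a query such as
        # "<image>What is in this picture?" renders to:
        #   <|system|>
        #   A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>
        #   <|user|>
        #   <image>What is in this picture?<|end|>
        #   <|assistant|>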
        s = (
            f'{SYSTEM_MESSAGE_ROLE}'
            f'<|user|>\n{query}<|end|>\n<|assistant|>\n'
        )
        return s

    def generate_inner(self, message, dataset=None):
        vision_x, prompt = [], ''
        for msg in message:
            if msg['type'] == 'image':
                img = Image.open(msg['value']).convert('RGB')

                # [NOTE]: only the first image is used when a sample contains multiple images
                if len(vision_x) == 0:
                    vision_x.append(self.image_proc(img).unsqueeze(0))
                    prompt += '<image>'
                else:
                    warnings.warn('======Only the first image is used in the input.')
            elif msg['type'] == 'text':
                prompt += msg['value']
                # prompt += f"\nAnswer the question using a single word or phrase. {msg['value']}"  # for YorN

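        # The unsqueeze calls below presumably build the (B, T_img, F, C, H, W)
        # layout expected by open_flamingo: unsqueeze(1) adds the per-sample
        # image dimension and unsqueeze(0) adds the batch dimension.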
        vision_x = torch.cat(vision_x, dim=0) if len(vision_x) > 1 else vision_x[0]
        vision_x = vision_x.unsqueeze(1).unsqueeze(0)
        prompt = self.apply_prompt_template(prompt)
        lang_x = self.tokenizer([prompt], return_tensors='pt')

        generated_text = self.model.generate(
            vision_x=vision_x.cuda(),
            lang_x=lang_x['input_ids'].cuda(),
            attention_mask=lang_x['attention_mask'].cuda(),
            **self.kwargs)
        generated_text = self.tokenizer.decode(generated_text[0], skip_special_tokens=True)
        return generated_text
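

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only): the checkpoint path is a
# placeholder, and the message format follows the {'type': ..., 'value': ...}
# convention consumed by generate_inner above.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    model = AKI(name='AKI', ckpt_pth='/path/to/aki_checkpoint')  # hypothetical path
    message = [
        {'type': 'image', 'value': 'demo.jpg'},            # hypothetical image file
        {'type': 'text', 'value': 'Describe this image.'},
    ]
    print(model.generate_inner(message))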