From c081228e0cb9736e20548d3939bbf11ca1fb007c Mon Sep 17 00:00:00 2001 From: root <403644786@qq.com> Date: Mon, 2 Sep 2024 08:37:51 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E5=A4=84=E7=90=86=E6=96=B9=E6=B3=95=E4=BB=A5=E9=80=82?= =?UTF-8?q?=E9=85=8Dminicpmv2.6=E7=9A=84awq?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- awq/models/__init__.py | 1 + awq/models/auto.py | 1 + awq/models/base.py | 1 + awq/models/minicpmv.py | 406 +++++++++++++++++ examples/minicpmv2.6_quantize.py | 718 +++++++++++++++++++++++++++++++ 5 files changed, 1127 insertions(+) create mode 100644 awq/models/minicpmv.py create mode 100644 examples/minicpmv2.6_quantize.py diff --git a/awq/models/__init__.py b/awq/models/__init__.py index 2f1a88e2..cd85ae4d 100644 --- a/awq/models/__init__.py +++ b/awq/models/__init__.py @@ -24,3 +24,4 @@ from .deepseek_v2 import DeepseekV2AWQForCausalLM from .minicpm import MiniCPMAWQForCausalLM from .internlm2 import InternLM2AWQForCausalLM +from .minicpmv import MiniCPMVAWQForCausalLM diff --git a/awq/models/auto.py b/awq/models/auto.py index 3a6416f1..201fc3dd 100644 --- a/awq/models/auto.py +++ b/awq/models/auto.py @@ -34,6 +34,7 @@ "deepseek_v2": DeepseekV2AWQForCausalLM, "minicpm": MiniCPMAWQForCausalLM, "internlm2": InternLM2AWQForCausalLM, + "minicpmv": MiniCPMVAWQForCausalLM } diff --git a/awq/models/base.py b/awq/models/base.py index 1d376fc0..86a98f01 100644 --- a/awq/models/base.py +++ b/awq/models/base.py @@ -84,6 +84,7 @@ "cohere": "AutoModelForCausalLM", "deepseek_v2": "AutoModelForCausalLM", "minicpm": "AutoModelForCausalLM", + "minicpmv":"AutoModelForCausalLM", "internlm2": "AutoModelForCausalLM", } diff --git a/awq/models/minicpmv.py b/awq/models/minicpmv.py new file mode 100644 index 00000000..bf12ebd7 --- /dev/null +++ b/awq/models/minicpmv.py @@ -0,0 +1,406 @@ +import tqdm +from typing import List, Tuple +from .base import BaseAWQForCausalLM +from awq.utils.fused_utils import fuse_qkv +from awq.modules.fused.block import LlamaLikeBlock +from awq.modules.fused.model import LlamaLikeModel +from transformers.models.llama.modeling_llama import ( + LlamaDecoderLayer as OldLlamaDecoderLayer, +) +from transformers.models.qwen2.modeling_qwen2 import ( + Qwen2DecoderLayer as OldQwen2DecoderLayer, + Qwen2ForCausalLM as OldQwen2ForCausalLM, +) +from .base import ( + Annotated, + AwqConfig, + BaseAWQForCausalLM, + Dict, + Doc, + List, + PreTrainedTokenizer, + Union, +) + +from transformers.models.llava.modeling_llava import ( + LlavaForConditionalGeneration as OldLlavaForConditionalGeneration, +) +from awq.modules.fused.norm import FasterTransformerRMSNorm +import torch +from transformers import AutoProcessor +import json +from copy import deepcopy +from PIL import Image +from awq.modules.fused.attn import QuantAttentionFused +from torch import nn +from ..quantize.quantizer import AwqQuantizer, clear_memory, get_best_device + +class CPMVAwqQuantizer(AwqQuantizer): + def init_quant(self, n_samples=None, max_seq_len=None): + modules = self.awq_model.get_model_layers(self.model) + samples = self.calib_data + + inps = [] + layer_kwargs = {} + + best_device = get_best_device() + modules[0] = modules[0].to(best_device) + self.awq_model.move_embed(self.model, best_device) + + # get input and kwargs to layer 0 + # with_kwargs is only supported in PyTorch 2.0 + # use this Catcher hack for now + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = 
module + + def forward(self, *args, **kwargs): + # assume first input to forward is hidden states + if len(args) > 0: + hidden_states = args[0] + del args + else: + first_key = list(kwargs.keys())[0] + hidden_states = kwargs.pop(first_key) + + inps.append(hidden_states) + layer_kwargs.update(kwargs) + raise ValueError # early exit to break later inference + + def move_to_device(obj: torch.Tensor | nn.Module, device: torch.device): + def get_device(obj: torch.Tensor | nn.Module): + if isinstance(obj, torch.Tensor): + return obj.device + return next(obj.parameters()).device + + if get_device(obj) != device: + obj = obj.to(device) + return obj + + # patch layer 0 to catch input and kwargs + modules[0] = Catcher(modules[0]) + for k, v in samples.items(): + if isinstance(v, (torch.Tensor, nn.Module)): + samples[k] = move_to_device(v, best_device) + try: + self.model(**samples) + except ValueError: # work with early exit + pass + finally: + for k, v in samples.items(): + if isinstance(v, (torch.Tensor, nn.Module)): + samples[k] = move_to_device(v, "cpu") + modules[0] = modules[0].module # restore + + del samples + inps = inps[0] + + modules[0] = modules[0].cpu() + self.awq_model.move_embed(self.model, "cpu") + + clear_memory() + + return modules, layer_kwargs, inps + +class MiniCPMVAWQForCausalLM(BaseAWQForCausalLM): + layer_type = "Qwen2DecoderLayer" + max_seq_len_key = "max_position_embeddings" + + def chat( + self, + image, + msgs, + tokenizer, + processor=None, + vision_hidden_states=None, + max_new_tokens=2048, + min_new_tokens=0, + sampling=True, + max_inp_length=8192, + system_prompt='', + stream=False, + max_slice_nums=None, + use_image_id=None, + **kwargs + ): + if isinstance(msgs[0], list): + batched = True + else: + batched = False + msgs_list = msgs + images_list = image + + if batched is False: + images_list, msgs_list = [images_list], [msgs_list] + else: + assert images_list is None, "Please integrate image to msgs when using batch inference." + images_list = [None] * len(msgs_list) + assert len(images_list) == len(msgs_list), "The batch dim of images_list and msgs_list should be the same." + + if processor is None: + if self.processor is None: + self.processor = AutoProcessor.from_pretrained(self.config._name_or_path, trust_remote_code=True) + processor = self.processor + + assert self.config.query_num == processor.image_processor.image_feature_size, "These two values should be the same. Check `config.json` and `preprocessor_config.json`." + assert self.config.patch_size == processor.image_processor.patch_size, "These two values should be the same. Check `config.json` and `preprocessor_config.json`." + assert self.config.use_image_id == processor.image_processor.use_image_id, "These two values should be the same. Check `config.json` and `preprocessor_config.json`." + assert self.config.slice_config.max_slice_nums == processor.image_processor.max_slice_nums, "These two values should be the same. Check `config.json` and `preprocessor_config.json`." + assert self.config.slice_mode == processor.image_processor.slice_mode, "These two values should be the same. Check `config.json` and `preprocessor_config.json`." 
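+        # Build a chat prompt for each sample and collect its PIL images for the processor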
+ + prompts_lists = [] + input_images_lists = [] + for image, msgs in zip(images_list, msgs_list): + if isinstance(msgs, str): + msgs = json.loads(msgs) + copy_msgs = deepcopy(msgs) + + assert len(msgs) > 0, "msgs is empty" + assert sampling or not stream, "if use stream mode, make sure sampling=True" + + if image is not None and isinstance(copy_msgs[0]["content"], str): + copy_msgs[0]["content"] = [image, copy_msgs[0]["content"]] + + images = [] + for i, msg in enumerate(copy_msgs): + role = msg["role"] + content = msg["content"] + assert role in ["user", "assistant"] + if i == 0: + assert role == "user", "The role of first msg should be user" + if isinstance(content, str): + content = [content] + cur_msgs = [] + for c in content: + if isinstance(c, Image.Image): + images.append(c) + cur_msgs.append("(./)") + elif isinstance(c, str): + cur_msgs.append(c) + msg["content"] = "\n".join(cur_msgs) + + if system_prompt: + sys_msg = {'role': 'system', 'content': system_prompt} + copy_msgs = [sys_msg] + copy_msgs + + prompts_lists.append(processor.tokenizer.apply_chat_template(copy_msgs, tokenize=False, add_generation_prompt=True)) + input_images_lists.append(images) + + inputs = processor( + prompts_lists, + input_images_lists, + max_slice_nums=max_slice_nums, + use_image_id=use_image_id, + return_tensors="pt", + max_length=max_inp_length + ).to('cuda:0') + + if sampling: + generation_config = { + "top_p": 0.8, + "top_k": 100, + "temperature": 0.7, + "do_sample": True, + "repetition_penalty": 1.05 + } + else: + generation_config = { + "num_beams": 3, + "repetition_penalty": 1.2, + } + + if min_new_tokens > 0: + generation_config['min_new_tokens'] = min_new_tokens + + generation_config.update( + (k, kwargs[k]) for k in generation_config.keys() & kwargs.keys() + ) + + inputs.pop("image_sizes") + with torch.inference_mode(): + res = self.generate( + **inputs, + tokenizer=tokenizer, + max_new_tokens=max_new_tokens, + vision_hidden_states=vision_hidden_states, + stream=stream, + decode_text=True, + **generation_config + ) + + if stream: + def stream_gen(): + for text in res: + for term in self.terminators: + text = text.replace(term, '') + yield text + return stream_gen() + + else: + if batched: + answer = res + else: + answer = res[0] + return answer + # @staticmethod + # def fuse_layers(model: OldQwen2ForCausalLM): + # fuser = MiniCPMVFuser(model) # 这里是算子融合 + # fuser.fuse_transformer() + + # hack to use `Qwen2VLAwqQuantizer` as quantizer + @torch.no_grad() + def quantize( + self, + tokenizer: Annotated[ + PreTrainedTokenizer, Doc("The tokenizer to use for quantization.") + ] = None, + quant_config: Annotated[ + Dict, Doc("The quantization config you want to use.") + ] = {}, + calib_data: Annotated[ + Union[str, List[str]], + Doc( + "The calibration dataset. Either a string pointing to Huggingface or a list of preloaded examples." + ), + ] = "pileval", + split: Annotated[str, Doc("The split of calib_data.")] = "train", + text_column: Annotated[str, Doc("The text column of calib_data.")] = "text", + duo_scaling: Annotated[ + bool, Doc("Whether to scale using both w/x or just x.") + ] = True, + export_compatible: Annotated[ + bool, + Doc( + "This argument avoids real quantization by only applying the scales without quantizing down to FP16." + ), + ] = False, + apply_clip: Annotated[ + bool, + Doc( + "Whether to apply clipping to the model during quantization. Some models may perform better with this set to False." 
+ ), + ] = True, + n_parallel_calib_samples: Annotated[ + int, + Doc( + "The number of parallel samples to run through the model. " + "A high number of parallel samples can result in OOM during quantization if max_calib_samples is high enough. " + "If None, runs through all samples at the same time. " + "You can set this to a low number for more memory efficient quantization." + ), + ] = None, + max_calib_samples: Annotated[ + int, Doc("The maximum number of samples to run through the model.") + ] = 128, + max_calib_seq_len: Annotated[ + int, + Doc( + "The maximum sequence length of the calibration dataset. Discard samples greater than max_calib_seq_len." + ), + ] = 512, + max_chunk_memory: Annotated[ + int, + Doc( + "The loss computation and per-channel mean is optimized into chunked computations." + " Adjust this parameter to increase or decrease memory usage for these computations." + " Default is 1GB (1024 * 1024 * 1024)." + ), + ] = 1024 + * 1024 + * 1024, + ): + self.quant_config: AwqConfig = AwqConfig.from_dict(quant_config) + + if hasattr(self, "modules_to_not_convert"): + self.quant_config.modules_to_not_convert = self.modules_to_not_convert + + self.quantizer = CPMVAwqQuantizer( + self, + self.model, + tokenizer, + self.quant_config.w_bit, + self.quant_config.q_group_size, + self.quant_config.zero_point, + self.quant_config.version, + calib_data, + split, + text_column, + duo_scaling, + modules_to_not_convert=self.quant_config.modules_to_not_convert, + export_compatible=export_compatible, + apply_clip=apply_clip, + n_parallel_calib_samples=n_parallel_calib_samples, + max_calib_samples=max_calib_samples, + max_calib_seq_len=max_calib_seq_len, + max_chunk_memory=max_chunk_memory, + ) + self.quantizer.quantize() + + self.is_quantized = True + + @staticmethod + def get_model_layers(model: OldQwen2ForCausalLM): + return model.llm.model.layers + + @staticmethod + def get_act_for_scaling(module: OldQwen2DecoderLayer): + return dict(is_scalable=False) + + @staticmethod + def move_embed(model: OldQwen2DecoderLayer, device: str): + model.llm.model.embed_tokens = model.get_input_embeddings().to( + device + ) + + @staticmethod + def get_layers_for_scaling(module: OldQwen2DecoderLayer, input_feat, module_kwargs): + layers = [] + + # attention input + layers.append( + dict( + prev_op=module.input_layernorm, + layers=[ + module.self_attn.q_proj, + module.self_attn.k_proj, + module.self_attn.v_proj, + ], + inp=input_feat["self_attn.q_proj"], + module2inspect=module.self_attn, + kwargs=module_kwargs, + ) + ) + + # attention out + # Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696 + if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape: + layers.append( + dict( + prev_op=module.self_attn.v_proj, + layers=[module.self_attn.o_proj], + inp=input_feat["self_attn.o_proj"], + ) + ) + + # linear 1 + layers.append( + dict( + prev_op=module.post_attention_layernorm, + layers=[module.mlp.gate_proj, module.mlp.up_proj], + inp=input_feat["mlp.gate_proj"], + module2inspect=module.mlp, + ) + ) + + # linear 2 + layers.append( + dict( + prev_op=module.mlp.up_proj, + layers=[module.mlp.down_proj], + inp=input_feat["mlp.down_proj"], + ) + ) + + return layers + diff --git a/examples/minicpmv2.6_quantize.py b/examples/minicpmv2.6_quantize.py new file mode 100644 index 00000000..6677d9c0 --- /dev/null +++ b/examples/minicpmv2.6_quantize.py @@ -0,0 +1,718 @@ +from __future__ import annotations + +import logging +from awq import AutoAWQForCausalLM +from 
transformers import AutoTokenizer
+from torchvision import transforms
+import argparse
+import json
+import copy
+import math
+import os
+import re
+import random
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+import requests
+from io import BytesIO
+import numpy as np
+import torch
+from PIL import Image
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data import Dataset
+def main():
+
+    # Set up the argument parser
+    parser = argparse.ArgumentParser(description="Quantize and save a model.")
+
+    # Add arguments
+    parser.add_argument('--model-path', type=str, default="/root/ld/ld_model_pretrained/Minicpmv2_6",
+                        help='Path to the model directory.')
+    parser.add_argument('--quant-path', type=str, default="/root/ld/ld_model_pretrained/Minicpmv2_6_awq_new",
+                        help='Path to save the quantized model.')
+    parser.add_argument('--zero-point', action='store_true',
+                        help='Enable zero point quantization.')
+    parser.add_argument('--q-group-size', type=int, default=128,
+                        help='Quantization group size.')
+    parser.add_argument('--w-bit', type=int, default=4,
+                        help='Weight bit size.')
+    parser.add_argument('--version', type=str, default="GEMM",
+                        help='Quantization version.')
+    parser.add_argument('--batch-size', type=int, default=8,
+                        help='Calibration forward batch size; an RTX 4090 can handle a batch of 8, an A100 a batch of 32.')
+    args = parser.parse_args()
+
+    quant_config = {"zero_point": args.zero_point, "q_group_size": args.q_group_size, "w_bit": args.w_bit, "version": args.version}
+    batch = args.batch_size  # calibration forward batch size; an RTX 4090 can handle 8, an A100 32
+
+    # Here we use a caption dataset **only for demonstration**; replace it with your own SFT dataset.
+    def prepare_dataset(n_sample: int = 8) -> list[list[dict]]:
+        from datasets import load_dataset
+        dataset = load_dataset("laion/220k-GPT4Vision-captions-from-LIVIS", split=f"train[:{n_sample}]")
+        return [
+            [
+                {'id': str(index),
+                 'conversations': [{'content': '\n能否协助我辨认这张图片展示的内容?', 'role': 'user'},
+                                   {'content': sample["caption"], 'role': 'assistant'}],
+                 'image': sample["url"]},
+            ]
+            for index, sample in enumerate(dataset)
+        ]
+    logger = logging.getLogger(__name__)
+    # Load the calibration dataset
+    dataset = prepare_dataset(n_sample=32)
+    # Then you need to prepare your data for calibration. Just put samples into a list,
+    # each of which is a typical chat message as shown below. 
you can specify text and image in `content` field: + # dataset = [ + # first_line + # {'id': '0', + # 'conversations': [{'content': '\n能否协助我辨认这张图片展示的内容?', 'role': 'user'}, + # {'content': '2017 中共文山市委执政纪要\n政策向符合条件的职业农民倾斜.....', 'role': 'assistant'}], + # 'image': '/root/ld/ld_dataset/30k_data/63677831/198.jpg'}, + # second_line + # {'id': '1', + # 'conversations': [{'content': '\n我需要帮忙确定这张图片里呈现的是什么。', 'role': 'user'}, + # {'content': '大学计算机是国家最重要的课程,我们需要从娃娃抓起....', 'role': 'assistant'}], + # 'image': '/root/ld/ld_dataset/30k_data/61520120/3.jpg'} + # ] + + ## The following is a script for data processing, no need to modify it + llama3_chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}" + + class SupervisedDataset(Dataset): + """Dataset for supervised fine-tuning.""" + + def __init__( + self, + raw_data, + transform, + tokenizer, + slice_config, + llm_type="minicpm", + patch_size=14, + query_nums=64, + batch_vision=False, + max_length=2048, + ): + super(SupervisedDataset, self).__init__() + self.raw_data = raw_data + self.tokenizer = tokenizer + self.transform = transform + self.slice_config = slice_config + self.llm_type = llm_type + self.patch_size = patch_size + self.query_nums=query_nums + self.batch_vision = batch_vision + self.max_length = max_length + + def __len__(self): + return len(self.raw_data) + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + try: + if isinstance(self.raw_data[i]["image"], str): + if self.raw_data[i]["image"].startswith("http"): + yzmdata = requests.get(self.raw_data[i]["image"]) + tempIm = BytesIO(yzmdata.content) + image = Image.open(tempIm).convert('RGB') + images_dict = { "" : image } + else: + images_dict = { "" : Image.open(self.raw_data[i]["image"]).convert("RGB") } + elif isinstance(self.raw_data[i]["image"], Dict): + ### for multi-images input, the template for every image is , such as , + images_dict = {img_name : Image.open(img_path).convert("RGB") for img_name, img_path in self.raw_data[i]["image"].items()} + + ret = preprocess( + images_dict, + self.raw_data[i]["conversations"], + self.tokenizer, + self.transform, + query_nums=self.query_nums, + slice_config=self.slice_config, + llm_type=self.llm_type, + patch_size=self.patch_size, + batch_vision=self.batch_vision, + max_length=self.max_length + ) + ret = dict( + input_ids=ret["input_ids"], + position_ids=ret["position_ids"], + labels=ret["target"], + attention_mask=torch.ones_like(ret["input_ids"], dtype=torch.bool), + pixel_values=ret["pixel_values"], + tgt_sizes=ret["tgt_sizes"], + image_bound=ret["image_bound"], + ) + except: + logger.error(f"data fetch error") + return self.__getitem__(random.randint(0, len(self))) + return ret + + + def data_collator(examples, padding_value=0, max_length=2048): + def trim_and_pad(seq, batch_first, padding_value): + return pad_sequence([s[:max_length] for s in seq], batch_first=True, padding_value=padding_value) + + input_ids = trim_and_pad( + [example["input_ids"] for example in examples], + batch_first=True, + padding_value=padding_value, + ) + position_ids = trim_and_pad( + [example["position_ids"] for example in examples], + batch_first=True, + padding_value=padding_value, + ) + targets = trim_and_pad( + [example["labels"] for example in examples], + batch_first=True, + padding_value=-100, 
+ ) + attention_mask = trim_and_pad( + [example["attention_mask"] for example in examples], + batch_first=True, + padding_value=padding_value, + ) + pixel_values = [example["pixel_values"] for example in examples] + image_bound = [example["image_bound"] for example in examples] + tgt_sizes = [example["tgt_sizes"] for example in examples] + return { + "input_ids": torch.tensor(input_ids), + "position_ids": torch.tensor(position_ids), + "labels": torch.tensor(targets), + "attention_mask": torch.tensor(attention_mask), + "image_bound": image_bound, + "tgt_sizes": tgt_sizes, + "pixel_values": pixel_values, + } + + + def conversation_to_ids(conversation, tokenizer, llm_type=None, new_schema=False, max_length=2048): + """ + for single image multi-turn conversation + conversation: [{'role': 'user', 'content': 'Describe this image'}, + {'role': 'assistant', 'content': 'This is a cat.'}] + """ + if llm_type == "llama3": + input_ids, context, raw_msg = conversation_to_ids_llama3( + conversation, tokenizer + ) + elif llm_type == "qwen2": + input_ids, context, raw_msg = conversation_to_ids_qwen2( + conversation, tokenizer + ) + else: + input_ids, context, raw_msg = conversation_to_ids_minicpm( + conversation, tokenizer + ) + + ids = torch.from_numpy(np.hstack(input_ids, dtype=np.int32)) + context = torch.from_numpy(np.hstack(context, dtype=np.int8)) + if input_ids.shape[-1] > max_length: + ids =ids[:max_length] + context = context[:max_length] + logger.warning(f"The input length ({input_ids.shape[-1]}) exceeds the model's maximum length ({max_length}), so it has been truncated") + + if torch.all(context): + logger.error("No tokens available to compute loss.") + raise Exception("No tokens available to compute loss.") + + # build target + target = torch.full_like(ids, -100, dtype=torch.int32) + + for i in range(1, len(ids)): + if context[i] == 0: + target[i - 1] = ids[i] + if context[i] == 1 and context[i - 1] == 0: + if hasattr(tokenizer, "eot_id"): + target[i - 1] = tokenizer.eot_id + else: + target[i - 1] = tokenizer.eos_id + + # build image bound + if new_schema: + start_cond = (ids == tokenizer.im_start_id) | (ids == tokenizer.slice_start_id) + end_cond = (ids == tokenizer.im_end_id) | (ids == tokenizer.slice_end_id) + image_start_tokens = torch.where(start_cond)[0] + image_start_tokens += 1 + image_end_tokens = torch.where(end_cond)[0] + else: + image_start_tokens = torch.where(ids == tokenizer.im_start_id)[0] + image_start_tokens += 1 + image_end_tokens = torch.where(ids == tokenizer.im_end_id)[0] + if len(image_start_tokens) != len(image_end_tokens): + logger.error("image start token != image end tokens") + raise Exception("image start token != image end tokens") + + if len(image_start_tokens) > 0: + image_bound = torch.hstack( + [image_start_tokens.unsqueeze(-1), image_end_tokens.unsqueeze(-1)] + ) + else: + image_bound = [] + + position_ids = torch.arange(ids.size(0)).long() + return { + "input_ids": ids, + "target": target, + "image_bound": image_bound, + "raw_msg": raw_msg, + "position_ids": position_ids + } + + + def conversation_to_ids_minicpm(conversation, tokenizer): + raw_msg = "" + input_ids = [] + context = [] + for idx, msg in enumerate(conversation): + role = msg["role"] + message = msg["content"] + assert role in ["user", "assistant"] + if role == "user": + prefix = "<用户>" + else: + prefix = "" + # append eos + if idx == len(conversation) - 1: + message = message + tokenizer.eos_token + prefix_ids = tokenizer.encode(prefix)[1:] # remove bos + message_ids = 
tokenizer.encode(message)[1:] + + input_ids.append(prefix_ids) + input_ids.append(message_ids) + + context.append(np.ones((len(prefix_ids),), dtype=np.int8)) + if role == "assistant": + context.append(np.zeros((len(message_ids),), dtype=np.int8)) + else: + context.append(np.ones((len(message_ids),), dtype=np.int8)) + + raw_msg += prefix + message + + return input_ids, context, raw_msg + + + def conversation_to_ids_llama3(conversation, tokenizer): + raw_msg = "" + input_ids = [] + context = [] + raw_msg = tokenizer.apply_chat_template( + conversation, tokenize=False, add_generation_prompt=False, chat_template=llama3_chat_template, + ) + input_ids = tokenizer.apply_chat_template( + conversation, tokenize=True, add_generation_prompt=False, chat_template=llama3_chat_template, + ) + input_ids = np.array(input_ids) + + start_header_idxs = np.where( + input_ids == tokenizer.convert_tokens_to_ids("<|start_header_id|>") + )[0] + assistant_idxs = np.where( + input_ids == tokenizer.convert_tokens_to_ids("assistant") + )[0] + end_header_idxs = np.where( + input_ids == tokenizer.convert_tokens_to_ids("<|end_header_id|>") + )[0] + eot_idxs = np.where( + input_ids == tokenizer.convert_tokens_to_ids("<|eot_id|>"))[0] + + context = np.ones_like(input_ids, dtype=np.int8) + + for assistant_idx in assistant_idxs: + if assistant_idx in set((start_header_idxs + end_header_idxs) / 2): + st = assistant_idx + 3 # assistant<|end_header_id|>\n\n + for eot_idx in eot_idxs: + if eot_idx > st: + context[st: eot_idx + 1] = 0 + break + + input_ids = np.hstack(input_ids) + context = np.hstack(context) + + return input_ids, context, raw_msg + + + def conversation_to_ids_qwen2(conversation, tokenizer): + raw_msg = "" + chat = [] + context = [] + for idx, msg in enumerate(conversation): + role = msg["role"] + message = msg["content"] + assert role in ["user", "assistant"] + if role == "user": + prefix = "user" + else: + prefix = "assistant" + chat.append({"role":prefix, "content":message}) + raw_msg += prefix + message + assert set([i['role'] for i in chat]) & set(['assistant']) + + ret = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False) + input_ids = tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=False) + input_ids = np.array(input_ids) + + start_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('<|im_start|>'))[0] + assistant_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('assistant'))[0] + end_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('<|im_end|>'))[0] + + context = np.ones_like(input_ids, dtype=np.int8) + + for assistant_idx in assistant_idxs: + if assistant_idx-1 in set(start_idxs): + st = assistant_idx + 1 + for end_idx in end_idxs: + if end_idx > st: + context[st: end_idx + 1] = 0 + break + + input_ids = np.hstack(input_ids) + context = np.hstack(context) + return input_ids, context, raw_msg + + + def preprocess( + images_dict, + conversations, + tokenizer, + transform, + query_nums=64, + slice_config=None, + llm_type=None, + patch_size=14, + batch_vision=False, + max_length=2048, + ): + """ + single(multi) image(s) preprocess, the image(s) will be placed at the top of the conversation + """ + conversations = copy.deepcopy(conversations) + assert len(conversations) > 1, "conversations length must large than 2" + assert conversations[0]["role"] == "user", "the first role must be user" + + if slice_config is not None: + assert isinstance(slice_config, Dict) + assert "patch_size" in slice_config + assert "max_slice_nums" 
in slice_config + assert "scale_resolution" in slice_config + default_image_placeholder = ( + tokenizer.im_start + tokenizer.unk_token * query_nums + tokenizer.im_end + ) + new_schema = False + use_image_id = False + if llm_type=='qwen2': + new_schema = True + use_image_id = True + image_placeholder_dict = {} + images = [] + image_id_cnt = 0 + for img_name, image in images_dict.items(): + if slice_config: + source_image, patches, best_grid = slice_image( + image, + slice_config["max_slice_nums"], + slice_config["scale_resolution"], + slice_config["patch_size"], + ) + images.append(source_image) + image_placeholder = default_image_placeholder + if len(patches) > 0: + for i in range(len(patches)): + for j in range(len(patches[0])): + images.append(patches[i][j]) + if use_image_id: + image_placeholder = f'{tokenizer.im_id_start}{image_id_cnt}{tokenizer.im_id_end}' + image_placeholder + image_id_cnt += 1 + image_placeholder += get_grid_placeholder( + tokenizer, best_grid, query_nums, new_schema = new_schema) + image_placeholder_dict[img_name] = image_placeholder + else: + images.append(image) + if use_image_id: + image_placeholder = f'{tokenizer.im_id_start}{image_id_cnt}{tokenizer.im_id_end}' + image_placeholder + image_id_cnt += 1 + else: + image_placeholder = default_image_placeholder + image_placeholder_dict[img_name] = image_placeholder + + images = [transform(i) for i in images] + + if len(images_dict) == 1 and "" in images_dict: + if "" in conversations[0]["content"]: + conversations[0]["content"] = conversations[0]["content"].replace( + "", image_placeholder + ) + else: + conversations[0]["content"] = ( + image_placeholder + "\n" + conversation[0]["content"] + ) + input_dict = conversation_to_ids(conversations, tokenizer, llm_type, new_schema, max_length) + else: + pattern = r'' + new_conversations = [] + for conversation in conversations: + content = conversation['content'] + parts = re.split(f'({pattern})', content) + for i, part in enumerate(parts): + if not part.strip(): + continue + if re.match(pattern, part): + if part in image_placeholder_dict: + parts[i] = image_placeholder_dict[part] + else: + raise Exception(f"not found {part} in image dict") + conversation['content'] = '\n'.join(parts) + new_conversations.append(conversation) + conversations = new_conversations + + input_dict = conversation_to_ids(conversations, tokenizer, llm_type, new_schema, max_length) + + if batch_vision: + tgt_sizes = [] + reshape_images = [] + for image in images: + H, W = image.shape[1:] + reshape_image = reshape_by_patch(image, patch_size) + reshape_images.append(reshape_image) + tgt_sizes.append([H // patch_size, W // patch_size]) + if tgt_sizes: + tgt_sizes = torch.Tensor(tgt_sizes).type(torch.int32) + + input_dict["pixel_values"] = reshape_images + input_dict["tgt_sizes"] = tgt_sizes + + else: + input_dict["pixel_values"] = images + input_dict["tgt_sizes"] = [] + + return input_dict + + + def slice_image( + image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False + ): + original_size = image.size + original_width, original_height = original_size + log_ratio = math.log(original_width / original_height) + ratio = original_width * original_height / \ + (scale_resolution * scale_resolution) + multiple = min(math.ceil(ratio), max_slice_nums) + + source_image = None + best_grid = None + patches = [] + + if multiple <= 1 or never_split: + # dont need to slice, upsample + best_size = find_best_resize( + original_size, scale_resolution, patch_size, allow_upscale=True + ) + 
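+            # find_best_resize rounds the target size to a multiple of patch_size via ensure_divide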
source_image = image.resize(best_size, Image.Resampling.BICUBIC) + else: + candidate_split_grids_nums = [] + for i in [multiple - 1, multiple, multiple + 1]: + if i == 1 or i > max_slice_nums: + continue + candidate_split_grids_nums.append(i) + + # source image, down-sampling and ensure divided by patch_size + best_resize = find_best_resize( + original_size, scale_resolution, patch_size) + source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) + candidate_grids = [] + + # find best grid + for split_grids_nums in candidate_split_grids_nums: + m = 1 + while m <= split_grids_nums: + if split_grids_nums % m == 0: + candidate_grids.append([m, split_grids_nums // m]) + m += 1 + + best_grid = [1, 1] + min_error = float("inf") + for grid in candidate_grids: + error = abs(log_ratio - math.log(grid[0] / grid[1])) + if error < min_error: + best_grid = grid + min_error = error + + refine_size = get_refine_size( + original_size, best_grid, scale_resolution, patch_size, allow_upscale=True + ) + + refine_image = image.resize(refine_size, Image.Resampling.BICUBIC) + patches = split_to_patches(refine_image, best_grid) + + return source_image, patches, best_grid + + + def ensure_divide(length, patch_size): + return max(round(length / patch_size) * patch_size, patch_size) + + + def find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=False): + width, height = original_size + if (width * height > scale_resolution * scale_resolution) or allow_upscale: + r = width / height + height = int(scale_resolution / math.sqrt(r)) + width = int(height * r) + best_width = ensure_divide(width, patch_size) + best_height = ensure_divide(height, patch_size) + return (best_width, best_height) + + + def get_refine_size( + original_size, grid, scale_resolution, patch_size, allow_upscale=False + ): + width, height = original_size + grid_x, grid_y = grid + + refine_width = ensure_divide(width, grid_x) + refine_height = ensure_divide(height, grid_y) + + grid_width = refine_width / grid_x + grid_height = refine_height / grid_y + + best_grid_size = find_best_resize( + (grid_width, grid_height), + scale_resolution, + patch_size, + allow_upscale=allow_upscale, + ) + + refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y) + + return refine_size + + + def split_to_patches(image, grid): + patches = [] + width, height = image.size + grid_x = int(width / grid[0]) + grid_y = int(height / grid[1]) + + for i in range(0, height, grid_y): + images = [] + for j in range(0, width, grid_x): + box = (j, i, j + grid_x, i + grid_y) + patch = image.crop(box) + images.append(patch) + patches.append(images) + + return patches + + + def get_grid_placeholder(tokenizer, grid, query_num, new_schema=False): + if new_schema: + image_placeholder = ( + tokenizer.slice_start + tokenizer.unk_token * query_num + tokenizer.slice_end + ) + else: + image_placeholder = ( + tokenizer.im_start + tokenizer.unk_token * query_num + tokenizer.im_end + ) + + cols = grid[0] + rows = grid[1] + slices = [] + for i in range(rows): + lines = [] + for j in range(cols): + lines.append(image_placeholder) + slices.append("".join(lines)) + if new_schema: + slice_placeholder = '\n'.join(slices) + else: + slice_placeholder = tokenizer.slice_start + \ + "\n".join(slices) + tokenizer.slice_end + return slice_placeholder + + + def reshape_by_patch(image_tensor, patch_size): + """ + :param image_tensor: shape [3, H, W] + :param patch_size: + :return: [3, patch_size, HW/patch_size] + """ + patches = torch.nn.functional.unfold( + 
image_tensor, (patch_size, patch_size), stride=(patch_size, patch_size) + ) + + patches = patches.reshape(image_tensor.size(0), patch_size, patch_size, -1) + patches = patches.permute(0, 1, 3, 2).reshape( + image_tensor.size(0), patch_size, -1) + patches=patches.cuda() + return patches + # Load your tokenizer and model with AutoAWQ + model = AutoAWQForCausalLM.from_pretrained(args.model_path).cuda() + tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True,device_map={"": "cuda:0"}) + + # set some parameters + if hasattr(model.config, "slice_config"): + model.config.slice_config.max_slice_nums = 1 + slice_config = model.config.slice_config.to_dict() + else: + model.config.max_slice_nums = 1 + slice_config = model.config.to_dict() + + if hasattr(model.config, "batch_vision_input"): + batch_vision = model.config.batch_vision_input + else: + batch_vision = False + def build_transform(): + IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) # timm.data.IMAGENET_INCEPTION_MEAN + IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) # timm.data.IMAGENET_INCEPTION_STD + return transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize( + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD + ), + ] + ) + transform_func = build_transform() + + + + + calib_data = SupervisedDataset( + dataset, + transform_func, + tokenizer, + slice_config=slice_config, + llm_type="qwen2", + patch_size=model.config.patch_size, + query_nums=model.config.query_num, + batch_vision=batch_vision, + max_length=2048, + ) + + + out_data=[] + batch_data=[] + for index in range(len(calib_data)//batch): + batch_data=[] + for j in range(batch): + batch_data.append(calib_data[j+index*batch]) + out_data.append(data_collator(batch_data)) + + + # Then just run the calibration process by one line of code: + model.quantize(calib_data=out_data[0], quant_config=quant_config) + + # remove pos_embed + if hasattr(model.model, 'resampler') and hasattr(model.model.resampler, 'pos_embed'): + del model.model.resampler.pos_embed + # Finally, save the quantized model: + model.model.config.use_cache = model.model.generation_config.use_cache = True + model.save_quantized(args.quant_path, safetensors=True, shard_size="4GB") + +if __name__ == "__main__": + main() + + From c5b28d6bde630b00dc5a3eb013d159d0fc9b0219 Mon Sep 17 00:00:00 2001 From: root <403644786@qq.com> Date: Fri, 6 Sep 2024 19:05:51 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E9=80=82=E9=85=8D=E4=BA=86minicpm3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- awq/models/__init__.py | 2 +- awq/models/auto.py | 2 +- awq/models/base.py | 2 +- awq/models/minicpm3.py | 265 ++++++++++++ awq/models/minicpmv.py | 406 ----------------- examples/minicpmv2.6_quantize.py | 718 ------------------------------- 6 files changed, 268 insertions(+), 1127 deletions(-) create mode 100644 awq/models/minicpm3.py delete mode 100644 awq/models/minicpmv.py delete mode 100644 examples/minicpmv2.6_quantize.py diff --git a/awq/models/__init__.py b/awq/models/__init__.py index cd85ae4d..615ed25c 100644 --- a/awq/models/__init__.py +++ b/awq/models/__init__.py @@ -24,4 +24,4 @@ from .deepseek_v2 import DeepseekV2AWQForCausalLM from .minicpm import MiniCPMAWQForCausalLM from .internlm2 import InternLM2AWQForCausalLM -from .minicpmv import MiniCPMVAWQForCausalLM +from .minicpm3 import MiniCPM3AWQForCausalLM diff --git a/awq/models/auto.py b/awq/models/auto.py index 201fc3dd..df4d580f 100644 --- a/awq/models/auto.py +++ b/awq/models/auto.py @@ -34,7 +34,7 @@ 
"deepseek_v2": DeepseekV2AWQForCausalLM, "minicpm": MiniCPMAWQForCausalLM, "internlm2": InternLM2AWQForCausalLM, - "minicpmv": MiniCPMVAWQForCausalLM + "minicpm3": MiniCPM3AWQForCausalLM } diff --git a/awq/models/base.py b/awq/models/base.py index 86a98f01..e8512ca7 100644 --- a/awq/models/base.py +++ b/awq/models/base.py @@ -84,7 +84,7 @@ "cohere": "AutoModelForCausalLM", "deepseek_v2": "AutoModelForCausalLM", "minicpm": "AutoModelForCausalLM", - "minicpmv":"AutoModelForCausalLM", + "minicpm3":"AutoModelForCausalLM", "internlm2": "AutoModelForCausalLM", } diff --git a/awq/models/minicpm3.py b/awq/models/minicpm3.py new file mode 100644 index 00000000..0ea87b38 --- /dev/null +++ b/awq/models/minicpm3.py @@ -0,0 +1,265 @@ + +from .base import BaseAWQForCausalLM +from ..quantize.quantizer import AwqQuantizer +import torch +from .base import ( + Annotated, + AwqConfig, + BaseAWQForCausalLM, + Dict, + Doc, + List, + PreTrainedTokenizer, + Union, + ) +from ..quantize.quantizer import AwqQuantizer, clear_memory, get_best_device + +class CPM3AwqQuantizer(AwqQuantizer): + @torch.no_grad() + def _compute_best_clip( + self, + w: torch.Tensor, + input_feat: torch.Tensor, + n_grid=20, + max_shrink=0.5, + n_sample_token=512, + ): + assert w.dim() == 2 + org_w_shape = w.shape + # w [co, ci] -> [co, 1, n_group, group size] + # input_feat [n_token, ci] -> [1, n_token, n_group, group size] + group_size = self.group_size if self.group_size > 0 else org_w_shape[1] + input_feat = input_feat.view(-1, input_feat.shape[-1]) + input_feat = input_feat.reshape(1, input_feat.shape[0], -1, group_size) + + # Compute input feature step size (minimum 1) + step_size = max(1, input_feat.shape[1] // n_sample_token) + input_feat = input_feat[:, ::step_size] + + w = w.reshape(org_w_shape[0], 1, -1, group_size) + + oc_batch_size = 256 if org_w_shape[0] % 256 == 0 else 64 # prevent OOM + if org_w_shape[0] % oc_batch_size != 0: + oc_batch_size = org_w_shape[0] + assert org_w_shape[0] % oc_batch_size == 0 + w_all = w + best_max_val_all = [] + + for i_b in range(org_w_shape[0] // oc_batch_size): + w = w_all[i_b * oc_batch_size : (i_b + 1) * oc_batch_size] + + org_max_val = w.abs().amax(dim=-1, keepdim=True) # co, 1, n_group, 1 + + best_max_val = org_max_val.clone() + min_errs = torch.ones_like(org_max_val) * 1e9 + input_feat = input_feat.to(w.device) + org_out = (input_feat * w).sum(dim=-1) # co, n_token, n_group + + for i_s in range(int(max_shrink * n_grid)): + max_val = org_max_val * (1 - i_s / n_grid) + min_val = -max_val + cur_w = torch.clamp(w, min_val, max_val) + q_w = self.pseudo_quantize_tensor(cur_w)[0] + cur_out = (input_feat * q_w).sum(dim=-1) + + # co, 1, n_group, 1 + err = (cur_out - org_out).pow(2).mean(dim=1).view(min_errs.shape) + del cur_w + del cur_out + cur_best_idx = err < min_errs + min_errs[cur_best_idx] = err[cur_best_idx] + best_max_val[cur_best_idx] = max_val[cur_best_idx] + best_max_val_all.append(best_max_val) + + best_max_val = torch.cat(best_max_val_all, dim=0) + + clear_memory(input_feat) + clear_memory(org_out) + + return best_max_val.squeeze(1) + +class MiniCPM3AWQForCausalLM(BaseAWQForCausalLM): + layer_type = "MiniCPMDecoderLayer" + max_seq_len_key = "max_position_embeddings" + @torch.no_grad() + def quantize( + self, + tokenizer: Annotated[ + PreTrainedTokenizer, Doc("The tokenizer to use for quantization.") + ] = None, + quant_config: Annotated[ + Dict, Doc("The quantization config you want to use.") + ] = {}, + calib_data: Annotated[ + Union[str, List[str]], + Doc( + "The calibration dataset. 
Either a string pointing to Huggingface or a list of preloaded examples." + ), + ] = "pileval", + split: Annotated[str, Doc("The split of calib_data.")] = "train", + text_column: Annotated[str, Doc("The text column of calib_data.")] = "text", + duo_scaling: Annotated[ + bool, Doc("Whether to scale using both w/x or just x.") + ] = True, + export_compatible: Annotated[ + bool, + Doc( + "This argument avoids real quantization by only applying the scales without quantizing down to FP16." + ), + ] = False, + apply_clip: Annotated[ + bool, + Doc( + "Whether to apply clipping to the model during quantization. Some models may perform better with this set to False." + ), + ] = True, + n_parallel_calib_samples: Annotated[ + int, + Doc( + "The number of parallel samples to run through the model. " + "A high number of parallel samples can result in OOM during quantization if max_calib_samples is high enough. " + "If None, runs through all samples at the same time. " + "You can set this to a low number for more memory efficient quantization." + ), + ] = None, + max_calib_samples: Annotated[ + int, Doc("The maximum number of samples to run through the model.") + ] = 128, + max_calib_seq_len: Annotated[ + int, + Doc( + "The maximum sequence length of the calibration dataset. Discard samples greater than max_calib_seq_len." + ), + ] = 512, + max_chunk_memory: Annotated[ + int, + Doc( + "The loss computation and per-channel mean is optimized into chunked computations." + " Adjust this parameter to increase or decrease memory usage for these computations." + " Default is 1GB (1024 * 1024 * 1024)." + ), + ] = 1024 + * 1024 + * 1024, + ): + """ + The main quantization function that you can use to quantize your model. + + Example: + + ```python + from awq import AutoAWQForCausalLM + from transformers import AutoTokenizer + + model_path = "..." 
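+        # model_path can be a local MiniCPM3 checkpoint directory or a Hugging Face model id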
+ model = AutoAWQForCausalLM.from_pretrained(model_path) + tokenizer = AutoTokenizer.from_pretrained(model_path) + + quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } + model.quantize(tokenizer, quant_config) + ``` + """ + self.quant_config: AwqConfig = AwqConfig.from_dict(quant_config) + + if hasattr(self, "modules_to_not_convert"): + self.quant_config.modules_to_not_convert = self.modules_to_not_convert + + self.quantizer = CPM3AwqQuantizer( + self, + self.model, + tokenizer, + self.quant_config.w_bit, + self.quant_config.q_group_size, + self.quant_config.zero_point, + self.quant_config.version, + calib_data, + split, + text_column, + duo_scaling, + modules_to_not_convert=self.quant_config.modules_to_not_convert, + export_compatible=export_compatible, + apply_clip=apply_clip, + n_parallel_calib_samples=n_parallel_calib_samples, + max_calib_samples=max_calib_samples, + max_calib_seq_len=max_calib_seq_len, + max_chunk_memory=max_chunk_memory, + ) + self.quantizer.quantize() + + self.is_quantized = True + @staticmethod + def get_model_layers(model): + print(model.model.layers) + return model.model.layers + + @staticmethod + def get_act_for_scaling(module): + return dict(is_scalable=False) + + @staticmethod + def move_embed(model, device: str): + model.model.embed_tokens = model.model.embed_tokens.to(device) + + @staticmethod + def get_layers_for_scaling(module, input_feat, module_kwargs): + layers = [] + + # layers.append( + # dict( + # prev_op=module.input_layernorm, + # layers=[ + # module.self_attn.q_a_proj, + # ], + # inp=input_feat["self_attn.q_a_proj"], + # module2inspect=module.self_attn.q_a_proj, + # kwargs=module_kwargs, + # ) + # ) + # mlp + layers.append( + dict( + prev_op=module.self_attn.q_a_layernorm, + layers=[ + module.self_attn.q_b_proj, + + ], + inp=input_feat["self_attn.q_b_proj"], + module2inspect=module.self_attn.q_b_proj, + kwargs=module_kwargs, + ) + ) + + layers.append( + dict( + prev_op=module.self_attn.kv_a_layernorm, + layers=[ + module.self_attn.kv_b_proj, + ], + inp=input_feat["self_attn.kv_b_proj"], + module2inspect=module.self_attn.kv_b_proj, + kwargs=module_kwargs, + ) + ) + + + # linear 2 + layers.append( + dict( + prev_op=module.mlp.up_proj, + layers=[module.mlp.down_proj], + inp=input_feat["mlp.down_proj"], + ) + ) + + layers.append( + dict( + prev_op=module.post_attention_layernorm, + layers=[module.mlp.gate_proj,module.mlp.up_proj], + inp=input_feat["mlp.gate_proj"], + module2inspect=module.mlp + ) + ) + + return layers + + diff --git a/awq/models/minicpmv.py b/awq/models/minicpmv.py deleted file mode 100644 index bf12ebd7..00000000 --- a/awq/models/minicpmv.py +++ /dev/null @@ -1,406 +0,0 @@ -import tqdm -from typing import List, Tuple -from .base import BaseAWQForCausalLM -from awq.utils.fused_utils import fuse_qkv -from awq.modules.fused.block import LlamaLikeBlock -from awq.modules.fused.model import LlamaLikeModel -from transformers.models.llama.modeling_llama import ( - LlamaDecoderLayer as OldLlamaDecoderLayer, -) -from transformers.models.qwen2.modeling_qwen2 import ( - Qwen2DecoderLayer as OldQwen2DecoderLayer, - Qwen2ForCausalLM as OldQwen2ForCausalLM, -) -from .base import ( - Annotated, - AwqConfig, - BaseAWQForCausalLM, - Dict, - Doc, - List, - PreTrainedTokenizer, - Union, -) - -from transformers.models.llava.modeling_llava import ( - LlavaForConditionalGeneration as OldLlavaForConditionalGeneration, -) -from awq.modules.fused.norm import FasterTransformerRMSNorm -import torch -from transformers 
import AutoProcessor -import json -from copy import deepcopy -from PIL import Image -from awq.modules.fused.attn import QuantAttentionFused -from torch import nn -from ..quantize.quantizer import AwqQuantizer, clear_memory, get_best_device - -class CPMVAwqQuantizer(AwqQuantizer): - def init_quant(self, n_samples=None, max_seq_len=None): - modules = self.awq_model.get_model_layers(self.model) - samples = self.calib_data - - inps = [] - layer_kwargs = {} - - best_device = get_best_device() - modules[0] = modules[0].to(best_device) - self.awq_model.move_embed(self.model, best_device) - - # get input and kwargs to layer 0 - # with_kwargs is only supported in PyTorch 2.0 - # use this Catcher hack for now - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - - def forward(self, *args, **kwargs): - # assume first input to forward is hidden states - if len(args) > 0: - hidden_states = args[0] - del args - else: - first_key = list(kwargs.keys())[0] - hidden_states = kwargs.pop(first_key) - - inps.append(hidden_states) - layer_kwargs.update(kwargs) - raise ValueError # early exit to break later inference - - def move_to_device(obj: torch.Tensor | nn.Module, device: torch.device): - def get_device(obj: torch.Tensor | nn.Module): - if isinstance(obj, torch.Tensor): - return obj.device - return next(obj.parameters()).device - - if get_device(obj) != device: - obj = obj.to(device) - return obj - - # patch layer 0 to catch input and kwargs - modules[0] = Catcher(modules[0]) - for k, v in samples.items(): - if isinstance(v, (torch.Tensor, nn.Module)): - samples[k] = move_to_device(v, best_device) - try: - self.model(**samples) - except ValueError: # work with early exit - pass - finally: - for k, v in samples.items(): - if isinstance(v, (torch.Tensor, nn.Module)): - samples[k] = move_to_device(v, "cpu") - modules[0] = modules[0].module # restore - - del samples - inps = inps[0] - - modules[0] = modules[0].cpu() - self.awq_model.move_embed(self.model, "cpu") - - clear_memory() - - return modules, layer_kwargs, inps - -class MiniCPMVAWQForCausalLM(BaseAWQForCausalLM): - layer_type = "Qwen2DecoderLayer" - max_seq_len_key = "max_position_embeddings" - - def chat( - self, - image, - msgs, - tokenizer, - processor=None, - vision_hidden_states=None, - max_new_tokens=2048, - min_new_tokens=0, - sampling=True, - max_inp_length=8192, - system_prompt='', - stream=False, - max_slice_nums=None, - use_image_id=None, - **kwargs - ): - if isinstance(msgs[0], list): - batched = True - else: - batched = False - msgs_list = msgs - images_list = image - - if batched is False: - images_list, msgs_list = [images_list], [msgs_list] - else: - assert images_list is None, "Please integrate image to msgs when using batch inference." - images_list = [None] * len(msgs_list) - assert len(images_list) == len(msgs_list), "The batch dim of images_list and msgs_list should be the same." - - if processor is None: - if self.processor is None: - self.processor = AutoProcessor.from_pretrained(self.config._name_or_path, trust_remote_code=True) - processor = self.processor - - assert self.config.query_num == processor.image_processor.image_feature_size, "These two values should be the same. Check `config.json` and `preprocessor_config.json`." - assert self.config.patch_size == processor.image_processor.patch_size, "These two values should be the same. Check `config.json` and `preprocessor_config.json`." 
- assert self.config.use_image_id == processor.image_processor.use_image_id, "These two values should be the same. Check `config.json` and `preprocessor_config.json`." - assert self.config.slice_config.max_slice_nums == processor.image_processor.max_slice_nums, "These two values should be the same. Check `config.json` and `preprocessor_config.json`." - assert self.config.slice_mode == processor.image_processor.slice_mode, "These two values should be the same. Check `config.json` and `preprocessor_config.json`." - - prompts_lists = [] - input_images_lists = [] - for image, msgs in zip(images_list, msgs_list): - if isinstance(msgs, str): - msgs = json.loads(msgs) - copy_msgs = deepcopy(msgs) - - assert len(msgs) > 0, "msgs is empty" - assert sampling or not stream, "if use stream mode, make sure sampling=True" - - if image is not None and isinstance(copy_msgs[0]["content"], str): - copy_msgs[0]["content"] = [image, copy_msgs[0]["content"]] - - images = [] - for i, msg in enumerate(copy_msgs): - role = msg["role"] - content = msg["content"] - assert role in ["user", "assistant"] - if i == 0: - assert role == "user", "The role of first msg should be user" - if isinstance(content, str): - content = [content] - cur_msgs = [] - for c in content: - if isinstance(c, Image.Image): - images.append(c) - cur_msgs.append("(./)") - elif isinstance(c, str): - cur_msgs.append(c) - msg["content"] = "\n".join(cur_msgs) - - if system_prompt: - sys_msg = {'role': 'system', 'content': system_prompt} - copy_msgs = [sys_msg] + copy_msgs - - prompts_lists.append(processor.tokenizer.apply_chat_template(copy_msgs, tokenize=False, add_generation_prompt=True)) - input_images_lists.append(images) - - inputs = processor( - prompts_lists, - input_images_lists, - max_slice_nums=max_slice_nums, - use_image_id=use_image_id, - return_tensors="pt", - max_length=max_inp_length - ).to('cuda:0') - - if sampling: - generation_config = { - "top_p": 0.8, - "top_k": 100, - "temperature": 0.7, - "do_sample": True, - "repetition_penalty": 1.05 - } - else: - generation_config = { - "num_beams": 3, - "repetition_penalty": 1.2, - } - - if min_new_tokens > 0: - generation_config['min_new_tokens'] = min_new_tokens - - generation_config.update( - (k, kwargs[k]) for k in generation_config.keys() & kwargs.keys() - ) - - inputs.pop("image_sizes") - with torch.inference_mode(): - res = self.generate( - **inputs, - tokenizer=tokenizer, - max_new_tokens=max_new_tokens, - vision_hidden_states=vision_hidden_states, - stream=stream, - decode_text=True, - **generation_config - ) - - if stream: - def stream_gen(): - for text in res: - for term in self.terminators: - text = text.replace(term, '') - yield text - return stream_gen() - - else: - if batched: - answer = res - else: - answer = res[0] - return answer - # @staticmethod - # def fuse_layers(model: OldQwen2ForCausalLM): - # fuser = MiniCPMVFuser(model) # 这里是算子融合 - # fuser.fuse_transformer() - - # hack to use `Qwen2VLAwqQuantizer` as quantizer - @torch.no_grad() - def quantize( - self, - tokenizer: Annotated[ - PreTrainedTokenizer, Doc("The tokenizer to use for quantization.") - ] = None, - quant_config: Annotated[ - Dict, Doc("The quantization config you want to use.") - ] = {}, - calib_data: Annotated[ - Union[str, List[str]], - Doc( - "The calibration dataset. Either a string pointing to Huggingface or a list of preloaded examples." 
- ), - ] = "pileval", - split: Annotated[str, Doc("The split of calib_data.")] = "train", - text_column: Annotated[str, Doc("The text column of calib_data.")] = "text", - duo_scaling: Annotated[ - bool, Doc("Whether to scale using both w/x or just x.") - ] = True, - export_compatible: Annotated[ - bool, - Doc( - "This argument avoids real quantization by only applying the scales without quantizing down to FP16." - ), - ] = False, - apply_clip: Annotated[ - bool, - Doc( - "Whether to apply clipping to the model during quantization. Some models may perform better with this set to False." - ), - ] = True, - n_parallel_calib_samples: Annotated[ - int, - Doc( - "The number of parallel samples to run through the model. " - "A high number of parallel samples can result in OOM during quantization if max_calib_samples is high enough. " - "If None, runs through all samples at the same time. " - "You can set this to a low number for more memory efficient quantization." - ), - ] = None, - max_calib_samples: Annotated[ - int, Doc("The maximum number of samples to run through the model.") - ] = 128, - max_calib_seq_len: Annotated[ - int, - Doc( - "The maximum sequence length of the calibration dataset. Discard samples greater than max_calib_seq_len." - ), - ] = 512, - max_chunk_memory: Annotated[ - int, - Doc( - "The loss computation and per-channel mean is optimized into chunked computations." - " Adjust this parameter to increase or decrease memory usage for these computations." - " Default is 1GB (1024 * 1024 * 1024)." - ), - ] = 1024 - * 1024 - * 1024, - ): - self.quant_config: AwqConfig = AwqConfig.from_dict(quant_config) - - if hasattr(self, "modules_to_not_convert"): - self.quant_config.modules_to_not_convert = self.modules_to_not_convert - - self.quantizer = CPMVAwqQuantizer( - self, - self.model, - tokenizer, - self.quant_config.w_bit, - self.quant_config.q_group_size, - self.quant_config.zero_point, - self.quant_config.version, - calib_data, - split, - text_column, - duo_scaling, - modules_to_not_convert=self.quant_config.modules_to_not_convert, - export_compatible=export_compatible, - apply_clip=apply_clip, - n_parallel_calib_samples=n_parallel_calib_samples, - max_calib_samples=max_calib_samples, - max_calib_seq_len=max_calib_seq_len, - max_chunk_memory=max_chunk_memory, - ) - self.quantizer.quantize() - - self.is_quantized = True - - @staticmethod - def get_model_layers(model: OldQwen2ForCausalLM): - return model.llm.model.layers - - @staticmethod - def get_act_for_scaling(module: OldQwen2DecoderLayer): - return dict(is_scalable=False) - - @staticmethod - def move_embed(model: OldQwen2DecoderLayer, device: str): - model.llm.model.embed_tokens = model.get_input_embeddings().to( - device - ) - - @staticmethod - def get_layers_for_scaling(module: OldQwen2DecoderLayer, input_feat, module_kwargs): - layers = [] - - # attention input - layers.append( - dict( - prev_op=module.input_layernorm, - layers=[ - module.self_attn.q_proj, - module.self_attn.k_proj, - module.self_attn.v_proj, - ], - inp=input_feat["self_attn.q_proj"], - module2inspect=module.self_attn, - kwargs=module_kwargs, - ) - ) - - # attention out - # Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696 - if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape: - layers.append( - dict( - prev_op=module.self_attn.v_proj, - layers=[module.self_attn.o_proj], - inp=input_feat["self_attn.o_proj"], - ) - ) - - # linear 1 - layers.append( - dict( - prev_op=module.post_attention_layernorm, - 
layers=[module.mlp.gate_proj, module.mlp.up_proj], - inp=input_feat["mlp.gate_proj"], - module2inspect=module.mlp, - ) - ) - - # linear 2 - layers.append( - dict( - prev_op=module.mlp.up_proj, - layers=[module.mlp.down_proj], - inp=input_feat["mlp.down_proj"], - ) - ) - - return layers - diff --git a/examples/minicpmv2.6_quantize.py b/examples/minicpmv2.6_quantize.py deleted file mode 100644 index 6677d9c0..00000000 --- a/examples/minicpmv2.6_quantize.py +++ /dev/null @@ -1,718 +0,0 @@ -from __future__ import annotations - -import logging -from awq import AutoAWQForCausalLM -from transformers import AutoTokenizer -from torchvision import transforms -import argparse -import json -import copy -import math -import os -import re -import random -from dataclasses import dataclass, field -from typing import Dict, List, Optional -import requests -from io import BytesIO -import numpy as np -import torch -from PIL import Image -from torch.nn.utils.rnn import pad_sequence -from torch.utils.data import Dataset -def main(): -# print the loaded JSON list - - # define the argparse parser - parser = argparse.ArgumentParser(description="Quantize and save a model.") - - # add the arguments - parser.add_argument('--model-path', type=str, default="/root/ld/ld_model_pretrained/Minicpmv2_6", - help='Path to the model directory.') - parser.add_argument('--quant-path', type=str, default="/root/ld/ld_model_pretrained/Minicpmv2_6_awq_new", - help='Path to save the quantized model.') - parser.add_argument('--zero-point', action='store_true', - help='Enable zero point quantization.') - parser.add_argument('--q-group-size', type=int, default=128, - help='Quantization group size.') - parser.add_argument('--w-bit', type=int, default=4, - help='Weight bit size.') - parser.add_argument('--version', type=str, default="GEMM", - help='Quantization version.') - parser.add_argument('--batch-size', type=int, default=8, - help='Calibration forward batch size: a 4090 can typically run a batch of 8, an A100 a batch of 32.') - args = parser.parse_args() - - quant_config = {"zero_point": args.zero_point, "q_group_size": args.q_group_size, "w_bit": args.w_bit, "version": args.version} - batch = args.batch_size # calibration forward batch size: a 4090 can typically run a batch of 8, an A100 a batch of 32 - - # here, we use a caption dataset **only for demonstration**. You should replace it with your own sft dataset. - def prepare_dataset(n_sample: int = 8) -> list[list[dict]]: - from datasets import load_dataset - dataset = load_dataset("laion/220k-GPT4Vision-captions-from-LIVIS", split=f"train[:{n_sample}]") - return [ - [ - {'id': str(index), - 'conversations': [{'content': '<image>\n能否协助我辨认这张图片展示的内容?', 'role': 'user'}, - {'content': sample["caption"], 'role': 'assistant'}], - 'image': sample["url"]}, - ] - for index, sample in enumerate(dataset) - ] - logger = logging.getLogger(__name__) - # load the calibration dataset - dataset = prepare_dataset(n_sample=32) - # Then you need to prepare your data for calibration. What you need to do is just put samples into a list, - # each of which is a typical chat message as shown below. 
you can specify text and image in `content` field: - # dataset = [ - # first_line - # {'id': '0', - # 'conversations': [{'content': '<image>\n能否协助我辨认这张图片展示的内容?', 'role': 'user'}, - # {'content': '2017 中共文山市委执政纪要\n政策向符合条件的职业农民倾斜.....', 'role': 'assistant'}], - # 'image': '/root/ld/ld_dataset/30k_data/63677831/198.jpg'}, - # second_line - # {'id': '1', - # 'conversations': [{'content': '<image>\n我需要帮忙确定这张图片里呈现的是什么。', 'role': 'user'}, - # {'content': '大学计算机是国家最重要的课程,我们需要从娃娃抓起....', 'role': 'assistant'}], - # 'image': '/root/ld/ld_dataset/30k_data/61520120/3.jpg'} - # ] - - ## The following is a script for data processing, no need to modify it - llama3_chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}" - - class SupervisedDataset(Dataset): - """Dataset for supervised fine-tuning.""" - - def __init__( - self, - raw_data, - transform, - tokenizer, - slice_config, - llm_type="minicpm", - patch_size=14, - query_nums=64, - batch_vision=False, - max_length=2048, - ): - super(SupervisedDataset, self).__init__() - self.raw_data = raw_data - self.tokenizer = tokenizer - self.transform = transform - self.slice_config = slice_config - self.llm_type = llm_type - self.patch_size = patch_size - self.query_nums = query_nums - self.batch_vision = batch_vision - self.max_length = max_length - - def __len__(self): - return len(self.raw_data) - - def __getitem__(self, i) -> Dict[str, torch.Tensor]: - try: - if isinstance(self.raw_data[i]["image"], str): - if self.raw_data[i]["image"].startswith("http"): - yzmdata = requests.get(self.raw_data[i]["image"]) - tempIm = BytesIO(yzmdata.content) - image = Image.open(tempIm).convert('RGB') - images_dict = { "<image>" : image } - else: - images_dict = { "<image>" : Image.open(self.raw_data[i]["image"]).convert("RGB") } - elif isinstance(self.raw_data[i]["image"], Dict): - ### for multi-image input, the placeholder for every image is <image_xx>, such as <image_00>, <image_01> - images_dict = {img_name : Image.open(img_path).convert("RGB") for img_name, img_path in self.raw_data[i]["image"].items()} - - ret = preprocess( - images_dict, - self.raw_data[i]["conversations"], - self.tokenizer, - self.transform, - query_nums=self.query_nums, - slice_config=self.slice_config, - llm_type=self.llm_type, - patch_size=self.patch_size, - batch_vision=self.batch_vision, - max_length=self.max_length - ) - ret = dict( - input_ids=ret["input_ids"], - position_ids=ret["position_ids"], - labels=ret["target"], - attention_mask=torch.ones_like(ret["input_ids"], dtype=torch.bool), - pixel_values=ret["pixel_values"], - tgt_sizes=ret["tgt_sizes"], - image_bound=ret["image_bound"], - ) - except Exception as e: - logger.error(f"data fetch error: {e}") - return self.__getitem__(random.randint(0, len(self) - 1)) - return ret - - - def data_collator(examples, padding_value=0, max_length=2048): - def trim_and_pad(seq, batch_first, padding_value): - return pad_sequence([s[:max_length] for s in seq], batch_first=True, padding_value=padding_value) - - input_ids = trim_and_pad( - [example["input_ids"] for example in examples], - batch_first=True, - padding_value=padding_value, - ) - position_ids = trim_and_pad( - [example["position_ids"] for example in examples], - batch_first=True, - padding_value=padding_value, - ) - targets = trim_and_pad( - [example["labels"] for example in examples], - batch_first=True, - padding_value=-100, 
- ) - attention_mask = trim_and_pad( - [example["attention_mask"] for example in examples], - batch_first=True, - padding_value=padding_value, - ) - pixel_values = [example["pixel_values"] for example in examples] - image_bound = [example["image_bound"] for example in examples] - tgt_sizes = [example["tgt_sizes"] for example in examples] - return { - "input_ids": torch.tensor(input_ids), - "position_ids": torch.tensor(position_ids), - "labels": torch.tensor(targets), - "attention_mask": torch.tensor(attention_mask), - "image_bound": image_bound, - "tgt_sizes": tgt_sizes, - "pixel_values": pixel_values, - } - - - def conversation_to_ids(conversation, tokenizer, llm_type=None, new_schema=False, max_length=2048): - """ - for single image multi-turn conversation - conversation: [{'role': 'user', 'content': 'Describe this image'}, - {'role': 'assistant', 'content': 'This is a cat.'}] - """ - if llm_type == "llama3": - input_ids, context, raw_msg = conversation_to_ids_llama3( - conversation, tokenizer - ) - elif llm_type == "qwen2": - input_ids, context, raw_msg = conversation_to_ids_qwen2( - conversation, tokenizer - ) - else: - input_ids, context, raw_msg = conversation_to_ids_minicpm( - conversation, tokenizer - ) - - ids = torch.from_numpy(np.hstack(input_ids, dtype=np.int32)) - context = torch.from_numpy(np.hstack(context, dtype=np.int8)) - if input_ids.shape[-1] > max_length: - ids =ids[:max_length] - context = context[:max_length] - logger.warning(f"The input length ({input_ids.shape[-1]}) exceeds the model's maximum length ({max_length}), so it has been truncated") - - if torch.all(context): - logger.error("No tokens available to compute loss.") - raise Exception("No tokens available to compute loss.") - - # build target - target = torch.full_like(ids, -100, dtype=torch.int32) - - for i in range(1, len(ids)): - if context[i] == 0: - target[i - 1] = ids[i] - if context[i] == 1 and context[i - 1] == 0: - if hasattr(tokenizer, "eot_id"): - target[i - 1] = tokenizer.eot_id - else: - target[i - 1] = tokenizer.eos_id - - # build image bound - if new_schema: - start_cond = (ids == tokenizer.im_start_id) | (ids == tokenizer.slice_start_id) - end_cond = (ids == tokenizer.im_end_id) | (ids == tokenizer.slice_end_id) - image_start_tokens = torch.where(start_cond)[0] - image_start_tokens += 1 - image_end_tokens = torch.where(end_cond)[0] - else: - image_start_tokens = torch.where(ids == tokenizer.im_start_id)[0] - image_start_tokens += 1 - image_end_tokens = torch.where(ids == tokenizer.im_end_id)[0] - if len(image_start_tokens) != len(image_end_tokens): - logger.error("image start token != image end tokens") - raise Exception("image start token != image end tokens") - - if len(image_start_tokens) > 0: - image_bound = torch.hstack( - [image_start_tokens.unsqueeze(-1), image_end_tokens.unsqueeze(-1)] - ) - else: - image_bound = [] - - position_ids = torch.arange(ids.size(0)).long() - return { - "input_ids": ids, - "target": target, - "image_bound": image_bound, - "raw_msg": raw_msg, - "position_ids": position_ids - } - - - def conversation_to_ids_minicpm(conversation, tokenizer): - raw_msg = "" - input_ids = [] - context = [] - for idx, msg in enumerate(conversation): - role = msg["role"] - message = msg["content"] - assert role in ["user", "assistant"] - if role == "user": - prefix = "<用户>" - else: - prefix = "" - # append eos - if idx == len(conversation) - 1: - message = message + tokenizer.eos_token - prefix_ids = tokenizer.encode(prefix)[1:] # remove bos - message_ids = 
tokenizer.encode(message)[1:] - - input_ids.append(prefix_ids) - input_ids.append(message_ids) - - context.append(np.ones((len(prefix_ids),), dtype=np.int8)) - if role == "assistant": - context.append(np.zeros((len(message_ids),), dtype=np.int8)) - else: - context.append(np.ones((len(message_ids),), dtype=np.int8)) - - raw_msg += prefix + message - - return input_ids, context, raw_msg - - - def conversation_to_ids_llama3(conversation, tokenizer): - raw_msg = "" - input_ids = [] - context = [] - raw_msg = tokenizer.apply_chat_template( - conversation, tokenize=False, add_generation_prompt=False, chat_template=llama3_chat_template, - ) - input_ids = tokenizer.apply_chat_template( - conversation, tokenize=True, add_generation_prompt=False, chat_template=llama3_chat_template, - ) - input_ids = np.array(input_ids) - - start_header_idxs = np.where( - input_ids == tokenizer.convert_tokens_to_ids("<|start_header_id|>") - )[0] - assistant_idxs = np.where( - input_ids == tokenizer.convert_tokens_to_ids("assistant") - )[0] - end_header_idxs = np.where( - input_ids == tokenizer.convert_tokens_to_ids("<|end_header_id|>") - )[0] - eot_idxs = np.where( - input_ids == tokenizer.convert_tokens_to_ids("<|eot_id|>"))[0] - - context = np.ones_like(input_ids, dtype=np.int8) - - for assistant_idx in assistant_idxs: - if assistant_idx in set((start_header_idxs + end_header_idxs) / 2): - st = assistant_idx + 3 # assistant<|end_header_id|>\n\n - for eot_idx in eot_idxs: - if eot_idx > st: - context[st: eot_idx + 1] = 0 - break - - input_ids = np.hstack(input_ids) - context = np.hstack(context) - - return input_ids, context, raw_msg - - - def conversation_to_ids_qwen2(conversation, tokenizer): - raw_msg = "" - chat = [] - context = [] - for idx, msg in enumerate(conversation): - role = msg["role"] - message = msg["content"] - assert role in ["user", "assistant"] - if role == "user": - prefix = "user" - else: - prefix = "assistant" - chat.append({"role":prefix, "content":message}) - raw_msg += prefix + message - assert set([i['role'] for i in chat]) & set(['assistant']) - - ret = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False) - input_ids = tokenizer.apply_chat_template(chat, tokenize=True, add_generation_prompt=False) - input_ids = np.array(input_ids) - - start_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('<|im_start|>'))[0] - assistant_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('assistant'))[0] - end_idxs = np.where(input_ids == tokenizer.convert_tokens_to_ids('<|im_end|>'))[0] - - context = np.ones_like(input_ids, dtype=np.int8) - - for assistant_idx in assistant_idxs: - if assistant_idx-1 in set(start_idxs): - st = assistant_idx + 1 - for end_idx in end_idxs: - if end_idx > st: - context[st: end_idx + 1] = 0 - break - - input_ids = np.hstack(input_ids) - context = np.hstack(context) - return input_ids, context, raw_msg - - - def preprocess( - images_dict, - conversations, - tokenizer, - transform, - query_nums=64, - slice_config=None, - llm_type=None, - patch_size=14, - batch_vision=False, - max_length=2048, - ): - """ - single(multi) image(s) preprocess, the image(s) will be placed at the top of the conversation - """ - conversations = copy.deepcopy(conversations) - assert len(conversations) > 1, "conversations length must large than 2" - assert conversations[0]["role"] == "user", "the first role must be user" - - if slice_config is not None: - assert isinstance(slice_config, Dict) - assert "patch_size" in slice_config - assert "max_slice_nums" 
in slice_config - assert "scale_resolution" in slice_config - default_image_placeholder = ( - tokenizer.im_start + tokenizer.unk_token * query_nums + tokenizer.im_end - ) - new_schema = False - use_image_id = False - if llm_type=='qwen2': - new_schema = True - use_image_id = True - image_placeholder_dict = {} - images = [] - image_id_cnt = 0 - for img_name, image in images_dict.items(): - if slice_config: - source_image, patches, best_grid = slice_image( - image, - slice_config["max_slice_nums"], - slice_config["scale_resolution"], - slice_config["patch_size"], - ) - images.append(source_image) - image_placeholder = default_image_placeholder - if len(patches) > 0: - for i in range(len(patches)): - for j in range(len(patches[0])): - images.append(patches[i][j]) - if use_image_id: - image_placeholder = f'{tokenizer.im_id_start}{image_id_cnt}{tokenizer.im_id_end}' + image_placeholder - image_id_cnt += 1 - image_placeholder += get_grid_placeholder( - tokenizer, best_grid, query_nums, new_schema = new_schema) - image_placeholder_dict[img_name] = image_placeholder - else: - images.append(image) - if use_image_id: - image_placeholder = f'{tokenizer.im_id_start}{image_id_cnt}{tokenizer.im_id_end}' + default_image_placeholder - image_id_cnt += 1 - else: - image_placeholder = default_image_placeholder - image_placeholder_dict[img_name] = image_placeholder - - images = [transform(i) for i in images] - - if len(images_dict) == 1 and "<image>" in images_dict: - if "<image>" in conversations[0]["content"]: - conversations[0]["content"] = conversations[0]["content"].replace( - "<image>", image_placeholder - ) - else: - conversations[0]["content"] = ( - image_placeholder + "\n" + conversations[0]["content"] - ) - input_dict = conversation_to_ids(conversations, tokenizer, llm_type, new_schema, max_length) - else: - pattern = r'<image_\d+>' - new_conversations = [] - for conversation in conversations: - content = conversation['content'] - parts = re.split(f'({pattern})', content) - for i, part in enumerate(parts): - if not part.strip(): - continue - if re.match(pattern, part): - if part in image_placeholder_dict: - parts[i] = image_placeholder_dict[part] - else: - raise Exception(f"not found {part} in image dict") - conversation['content'] = '\n'.join(parts) - new_conversations.append(conversation) - conversations = new_conversations - - input_dict = conversation_to_ids(conversations, tokenizer, llm_type, new_schema, max_length) - - if batch_vision: - tgt_sizes = [] - reshape_images = [] - for image in images: - H, W = image.shape[1:] - reshape_image = reshape_by_patch(image, patch_size) - reshape_images.append(reshape_image) - tgt_sizes.append([H // patch_size, W // patch_size]) - if tgt_sizes: - tgt_sizes = torch.Tensor(tgt_sizes).type(torch.int32) - - input_dict["pixel_values"] = reshape_images - input_dict["tgt_sizes"] = tgt_sizes - - else: - input_dict["pixel_values"] = images - input_dict["tgt_sizes"] = [] - - return input_dict - - - def slice_image( - image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False - ): - original_size = image.size - original_width, original_height = original_size - log_ratio = math.log(original_width / original_height) - ratio = original_width * original_height / \ - (scale_resolution * scale_resolution) - multiple = min(math.ceil(ratio), max_slice_nums) - - source_image = None - best_grid = None - patches = [] - - if multiple <= 1 or never_split: - # don't need to slice, upsample - best_size = find_best_resize( - original_size, scale_resolution, patch_size, allow_upscale=True - ) - 
source_image = image.resize(best_size, Image.Resampling.BICUBIC) - else: - candidate_split_grids_nums = [] - for i in [multiple - 1, multiple, multiple + 1]: - if i == 1 or i > max_slice_nums: - continue - candidate_split_grids_nums.append(i) - - # source image, down-sampling and ensure divided by patch_size - best_resize = find_best_resize( - original_size, scale_resolution, patch_size) - source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) - candidate_grids = [] - - # find best grid - for split_grids_nums in candidate_split_grids_nums: - m = 1 - while m <= split_grids_nums: - if split_grids_nums % m == 0: - candidate_grids.append([m, split_grids_nums // m]) - m += 1 - - best_grid = [1, 1] - min_error = float("inf") - for grid in candidate_grids: - error = abs(log_ratio - math.log(grid[0] / grid[1])) - if error < min_error: - best_grid = grid - min_error = error - - refine_size = get_refine_size( - original_size, best_grid, scale_resolution, patch_size, allow_upscale=True - ) - - refine_image = image.resize(refine_size, Image.Resampling.BICUBIC) - patches = split_to_patches(refine_image, best_grid) - - return source_image, patches, best_grid - - - def ensure_divide(length, patch_size): - return max(round(length / patch_size) * patch_size, patch_size) - - - def find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=False): - width, height = original_size - if (width * height > scale_resolution * scale_resolution) or allow_upscale: - r = width / height - height = int(scale_resolution / math.sqrt(r)) - width = int(height * r) - best_width = ensure_divide(width, patch_size) - best_height = ensure_divide(height, patch_size) - return (best_width, best_height) - - - def get_refine_size( - original_size, grid, scale_resolution, patch_size, allow_upscale=False - ): - width, height = original_size - grid_x, grid_y = grid - - refine_width = ensure_divide(width, grid_x) - refine_height = ensure_divide(height, grid_y) - - grid_width = refine_width / grid_x - grid_height = refine_height / grid_y - - best_grid_size = find_best_resize( - (grid_width, grid_height), - scale_resolution, - patch_size, - allow_upscale=allow_upscale, - ) - - refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y) - - return refine_size - - - def split_to_patches(image, grid): - patches = [] - width, height = image.size - grid_x = int(width / grid[0]) - grid_y = int(height / grid[1]) - - for i in range(0, height, grid_y): - images = [] - for j in range(0, width, grid_x): - box = (j, i, j + grid_x, i + grid_y) - patch = image.crop(box) - images.append(patch) - patches.append(images) - - return patches - - - def get_grid_placeholder(tokenizer, grid, query_num, new_schema=False): - if new_schema: - image_placeholder = ( - tokenizer.slice_start + tokenizer.unk_token * query_num + tokenizer.slice_end - ) - else: - image_placeholder = ( - tokenizer.im_start + tokenizer.unk_token * query_num + tokenizer.im_end - ) - - cols = grid[0] - rows = grid[1] - slices = [] - for i in range(rows): - lines = [] - for j in range(cols): - lines.append(image_placeholder) - slices.append("".join(lines)) - if new_schema: - slice_placeholder = '\n'.join(slices) - else: - slice_placeholder = tokenizer.slice_start + \ - "\n".join(slices) + tokenizer.slice_end - return slice_placeholder - - - def reshape_by_patch(image_tensor, patch_size): - """ - :param image_tensor: shape [3, H, W] - :param patch_size: - :return: [3, patch_size, HW/patch_size] - """ - patches = torch.nn.functional.unfold( - 
image_tensor, (patch_size, patch_size), stride=(patch_size, patch_size) - ) - - patches = patches.reshape(image_tensor.size(0), patch_size, patch_size, -1) - patches = patches.permute(0, 1, 3, 2).reshape( - image_tensor.size(0), patch_size, -1) - patches=patches.cuda() - return patches - # Load your tokenizer and model with AutoAWQ - model = AutoAWQForCausalLM.from_pretrained(args.model_path).cuda() - tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True,device_map={"": "cuda:0"}) - - # set some parameters - if hasattr(model.config, "slice_config"): - model.config.slice_config.max_slice_nums = 1 - slice_config = model.config.slice_config.to_dict() - else: - model.config.max_slice_nums = 1 - slice_config = model.config.to_dict() - - if hasattr(model.config, "batch_vision_input"): - batch_vision = model.config.batch_vision_input - else: - batch_vision = False - def build_transform(): - IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) # timm.data.IMAGENET_INCEPTION_MEAN - IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) # timm.data.IMAGENET_INCEPTION_STD - return transforms.Compose( - [ - transforms.ToTensor(), - transforms.Normalize( - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD - ), - ] - ) - transform_func = build_transform() - - - - - calib_data = SupervisedDataset( - dataset, - transform_func, - tokenizer, - slice_config=slice_config, - llm_type="qwen2", - patch_size=model.config.patch_size, - query_nums=model.config.query_num, - batch_vision=batch_vision, - max_length=2048, - ) - - - out_data=[] - batch_data=[] - for index in range(len(calib_data)//batch): - batch_data=[] - for j in range(batch): - batch_data.append(calib_data[j+index*batch]) - out_data.append(data_collator(batch_data)) - - - # Then just run the calibration process by one line of code: - model.quantize(calib_data=out_data[0], quant_config=quant_config) - - # remove pos_embed - if hasattr(model.model, 'resampler') and hasattr(model.model.resampler, 'pos_embed'): - del model.model.resampler.pos_embed - # Finally, save the quantized model: - model.model.config.use_cache = model.model.generation_config.use_cache = True - model.save_quantized(args.quant_path, safetensors=True, shard_size="4GB") - -if __name__ == "__main__": - main() - -
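After `save_quantized`, the produced checkpoint can be loaded back with AutoAWQ for inference. The lines below are a minimal sketch only, not part of the patch: they assume AutoAWQ's `from_quantized` entry point works for this model type and reuse the `--quant-path` default from the script above; actual generation would go through MiniCPM-V 2.6's own chat interface and is not shown here.

    # minimal sketch (assumptions: from_quantized supports this checkpoint, path matches --quant-path above)
    from awq import AutoAWQForCausalLM
    from transformers import AutoTokenizer

    quant_path = "/root/ld/ld_model_pretrained/Minicpmv2_6_awq_new"
    model = AutoAWQForCausalLM.from_quantized(quant_path, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)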