diff --git a/awq/models/base.py b/awq/models/base.py
index 66dc02e6..e0938b95 100644
--- a/awq/models/base.py
+++ b/awq/models/base.py
@@ -75,6 +75,7 @@
     "baichuan": "AutoModelForCausalLM",
     "llava": "AutoModelForVision2Seq",
     "qwen2": "AutoModelForCausalLM",
+    "qwen2_vl": "AutoModelForVision2Seq",
     "gemma": "AutoModelForCausalLM",
     "stablelm": "AutoModelForCausalLM",
     "starcoder2": "AutoModelForCausalLM",
diff --git a/awq/models/qwen2vl.py b/awq/models/qwen2vl.py
new file mode 100644
index 00000000..dbd483a2
--- /dev/null
+++ b/awq/models/qwen2vl.py
@@ -0,0 +1,255 @@
+"""hack to use qwen2vl model with awq"""
+import torch
+from torch import nn
+from typing_extensions import TYPE_CHECKING
+
+from ..quantize.quantizer import AwqQuantizer, clear_memory, get_best_device
+from .base import (
+    Annotated,
+    AwqConfig,
+    BaseAWQForCausalLM,
+    Dict,
+    Doc,
+    List,
+    PreTrainedTokenizer,
+    Union,
+)
+
+
+if TYPE_CHECKING:
+    from transformers import Qwen2VLForConditionalGeneration
+    from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLDecoderLayer
+
+
+# hack to
+# 1. use `self.calib_data` as processed input data
+# 2. set the `layer_kwargs` and `inps` correctly
+class Qwen2VLAwqQuantizer(AwqQuantizer):
+    def init_quant(self, n_samples=None, max_seq_len=None):
+        modules = self.awq_model.get_model_layers(self.model)
+        samples = self.calib_data
+
+        inps = []
+        layer_kwargs = {}
+
+        best_device = get_best_device()
+        modules[0] = modules[0].to(best_device)
+        self.awq_model.move_embed(self.model, best_device)
+
+        # get input and kwargs to layer 0
+        # with_kwargs is only supported in PyTorch 2.0
+        # use this Catcher hack for now
+        class Catcher(nn.Module):
+            def __init__(self, module):
+                super().__init__()
+                self.module = module
+
+            def forward(self, *args, **kwargs):
+                # assume first input to forward is hidden states
+                if len(args) > 0:
+                    hidden_states = args[0]
+                    del args
+                else:
+                    first_key = list(kwargs.keys())[0]
+                    hidden_states = kwargs.pop(first_key)
+
+                inps.append(hidden_states)
+                layer_kwargs.update(kwargs)
+                raise ValueError  # early exit to break later inference
+
+        def move_to_device(obj: torch.Tensor | nn.Module, device: torch.device):
+            def get_device(obj: torch.Tensor | nn.Module):
+                if isinstance(obj, torch.Tensor):
+                    return obj.device
+                return next(obj.parameters()).device
+
+            if get_device(obj) != device:
+                obj = obj.to(device)
+            return obj
+
+        # patch layer 0 to catch input and kwargs
+        modules[0] = Catcher(modules[0])
+        for k, v in samples.items():
+            if isinstance(v, (torch.Tensor, nn.Module)):
+                samples[k] = move_to_device(v, best_device)
+        try:
+            self.model(**samples)
+        except ValueError:  # work with early exit
+            pass
+        finally:
+            for k, v in samples.items():
+                if isinstance(v, (torch.Tensor, nn.Module)):
+                    samples[k] = move_to_device(v, "cpu")
+        modules[0] = modules[0].module  # restore
+
+        del samples
+        inps = inps[0]
+
+        modules[0] = modules[0].cpu()
+        self.awq_model.move_embed(self.model, "cpu")
+
+        clear_memory()
+
+        return modules, layer_kwargs, inps
+
+
+class Qwen2VLAWQForConditionalGeneration(BaseAWQForCausalLM):
+    layer_type = "Qwen2VLDecoderLayer"
+    max_seq_len_key = "max_position_embeddings"
+    modules_to_not_convert = ["visual"]
+
+    @staticmethod
+    def get_model_layers(model: "Qwen2VLForConditionalGeneration"):
+        return model.model.layers
+
+    @staticmethod
+    def get_act_for_scaling(module: "Qwen2VLForConditionalGeneration"):
+        return dict(is_scalable=False)
+
+    @staticmethod
+    def move_embed(model: "Qwen2VLForConditionalGeneration", device: str):
+        model.model.embed_tokens = model.model.embed_tokens.to(device)
+        model.visual = model.visual.to(device)
+
+    @staticmethod
+    def get_layers_for_scaling(module: "Qwen2VLDecoderLayer", input_feat, module_kwargs):
+        layers = []
+
+        # attention input
+        layers.append(
+            dict(
+                prev_op=module.input_layernorm,
+                layers=[
+                    module.self_attn.q_proj,
+                    module.self_attn.k_proj,
+                    module.self_attn.v_proj,
+                ],
+                inp=input_feat["self_attn.q_proj"],
+                module2inspect=module.self_attn,
+                kwargs=module_kwargs,
+            )
+        )
+
+        # attention out
+        # Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696
+        if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
+            layers.append(
+                dict(
+                    prev_op=module.self_attn.v_proj,
+                    layers=[module.self_attn.o_proj],
+                    inp=input_feat["self_attn.o_proj"],
+                )
+            )
+
+        # linear 1
+        layers.append(
+            dict(
+                prev_op=module.post_attention_layernorm,
+                layers=[module.mlp.gate_proj, module.mlp.up_proj],
+                inp=input_feat["mlp.gate_proj"],
+                module2inspect=module.mlp,
+            )
+        )
+
+        # linear 2
+        layers.append(
+            dict(
+                prev_op=module.mlp.up_proj,
+                layers=[module.mlp.down_proj],
+                inp=input_feat["mlp.down_proj"],
+            )
+        )
+
+        return layers
+
+    # hack to use `Qwen2VLAwqQuantizer` as quantizer
+    @torch.no_grad()
+    def quantize(
+        self,
+        tokenizer: Annotated[
+            PreTrainedTokenizer, Doc("The tokenizer to use for quantization.")
+        ] = None,
+        quant_config: Annotated[
+            Dict, Doc("The quantization config you want to use.")
+        ] = {},
+        calib_data: Annotated[
+            Union[str, List[str]],
+            Doc(
+                "The calibration dataset. Either a string pointing to Huggingface or a list of preloaded examples."
+            ),
+        ] = "pileval",
+        split: Annotated[str, Doc("The split of calib_data.")] = "train",
+        text_column: Annotated[str, Doc("The text column of calib_data.")] = "text",
+        duo_scaling: Annotated[
+            bool, Doc("Whether to scale using both w/x or just x.")
+        ] = True,
+        export_compatible: Annotated[
+            bool,
+            Doc(
+                "This argument avoids real quantization by only applying the scales without quantizing down to FP16."
+            ),
+        ] = False,
+        apply_clip: Annotated[
+            bool,
+            Doc(
+                "Whether to apply clipping to the model during quantization. Some models may perform better with this set to False."
+            ),
+        ] = True,
+        n_parallel_calib_samples: Annotated[
+            int,
+            Doc(
+                "The number of parallel samples to run through the model. "
+                "A high number of parallel samples can result in OOM during quantization if max_calib_samples is high enough. "
+                "If None, runs through all samples at the same time. "
+                "You can set this to a low number for more memory efficient quantization."
+            ),
+        ] = None,
+        max_calib_samples: Annotated[
+            int, Doc("The maximum number of samples to run through the model.")
+        ] = 128,
+        max_calib_seq_len: Annotated[
+            int,
+            Doc(
+                "The maximum sequence length of the calibration dataset. Discard samples greater than max_calib_seq_len."
+            ),
+        ] = 512,
+        max_chunk_memory: Annotated[
+            int,
+            Doc(
+                "The loss computation and per-channel mean is optimized into chunked computations."
+                " Adjust this parameter to increase or decrease memory usage for these computations."
+                " Default is 1GB (1024 * 1024 * 1024)."
+            ),
+        ] = 1024
+        * 1024
+        * 1024,
+    ):
+        self.quant_config: AwqConfig = AwqConfig.from_dict(quant_config)
+
+        if hasattr(self, "modules_to_not_convert"):
+            self.quant_config.modules_to_not_convert = self.modules_to_not_convert
+
+        self.quantizer = Qwen2VLAwqQuantizer(
+            self,
+            self.model,
+            tokenizer,
+            self.quant_config.w_bit,
+            self.quant_config.q_group_size,
+            self.quant_config.zero_point,
+            self.quant_config.version,
+            calib_data,
+            split,
+            text_column,
+            duo_scaling,
+            modules_to_not_convert=self.quant_config.modules_to_not_convert,
+            export_compatible=export_compatible,
+            apply_clip=apply_clip,
+            n_parallel_calib_samples=n_parallel_calib_samples,
+            max_calib_samples=max_calib_samples,
+            max_calib_seq_len=max_calib_seq_len,
+            max_chunk_memory=max_chunk_memory,
+        )
+        self.quantizer.quantize()
+
+        self.is_quantized = True
diff --git a/examples/quantize_qwen2vl.py b/examples/quantize_qwen2vl.py
new file mode 100644
index 00000000..08f0964d
--- /dev/null
+++ b/examples/quantize_qwen2vl.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import logging
+
+from qwen_vl_utils import process_vision_info
+from transformers import Qwen2VLProcessor
+
+from awq.models.qwen2vl import Qwen2VLAWQForConditionalGeneration
+
+
+logging.basicConfig(
+    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
+    level=logging.INFO,
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+
+# Specify paths and hyperparameters for quantization
+model_path = "your_model_path"
+quant_path = "your_quantized_model_path"
+quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
+
+# Load your processor and model with AutoAWQ
+processor = Qwen2VLProcessor.from_pretrained(model_path)
+model = Qwen2VLAWQForConditionalGeneration.from_pretrained(
+    model_path, model_type="qwen2_vl", use_cache=False, attn_implementation="flash_attention_2"
+)
+
+
+# Then you need to prepare your data for calibration. All you need to do is put the samples into a list,
+# each of which is a typical chat message as shown below. You can specify text and image in the `content` field:
+# dataset = [
+#     # message 0
+#     [
+#         {"role": "system", "content": "You are a helpful assistant."},
+#         {"role": "user", "content": "Tell me who you are."},
+#         {"role": "assistant", "content": "I am a large language model named Qwen..."},
+#     ],
+#     # message 1
+#     [
+#         {
+#             "role": "user",
+#             "content": [
+#                 {"type": "image", "image": "file:///path/to/your/image.jpg"},
+#                 {"type": "text", "text": "Output all text in the image"},
+#             ],
+#         },
+#         {"role": "assistant", "content": "The text in the image is blah blah..."},
+#     ],
+#     # other messages...
+#     ...,
+# ]
+# Here, we use a caption dataset **only for demonstration**. You should replace it with your own SFT dataset.
+def prepare_dataset(n_sample: int = 8) -> list[list[dict]]:
+    from datasets import load_dataset
+
+    dataset = load_dataset("laion/220k-GPT4Vision-captions-from-LIVIS", split=f"train[:{n_sample}]")
+    return [
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": sample["url"]},
+                    {"type": "text", "text": "generate a caption for this image"},
+                ],
+            },
+            {"role": "assistant", "content": sample["caption"]},
+        ]
+        for sample in dataset
+    ]
+
+
+dataset = prepare_dataset()
+
+# Process the dataset into tensors
+text = processor.apply_chat_template(dataset, tokenize=False, add_generation_prompt=True)
+image_inputs, video_inputs = process_vision_info(dataset)
+inputs = processor(text=text, images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+
+# Then just run the calibration process with one line of code:
+model.quantize(calib_data=inputs, quant_config=quant_config)
+
+# Finally, save the quantized model:
+model.model.config.use_cache = model.model.generation_config.use_cache = True
+model.save_quantized(quant_path, safetensors=True, shard_size="4GB")
+processor.save_pretrained(quant_path)
+
+# You now have your own AWQ-quantized model ready for deployment. Enjoy!
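For reference, and not part of the patch above: a minimal sketch of how the saved checkpoint might be loaded for inference afterwards. It assumes a transformers version that ships Qwen2-VL and can read AWQ checkpoints (with autoawq installed), plus the same qwen_vl_utils helper used in the example; the image path and prompt are placeholders.

from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor

quant_path = "your_quantized_model_path"

# Load the AWQ checkpoint saved by the example script.
model = Qwen2VLForConditionalGeneration.from_pretrained(quant_path, torch_dtype="auto", device_map="auto")
processor = Qwen2VLProcessor.from_pretrained(quant_path)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "file:///path/to/your/image.jpg"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to(model.device)

output_ids = model.generate(**inputs, max_new_tokens=128)
# Strip the prompt tokens before decoding the generated answer.
print(processor.batch_decode(output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0])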