diff --git a/awq/models/__init__.py b/awq/models/__init__.py
index e2dfb0d7..4ea17bfb 100644
--- a/awq/models/__init__.py
+++ b/awq/models/__init__.py
@@ -20,5 +20,6 @@
 from .llava_next import LlavaNextAWQForCausalLM
 from .phi3 import Phi3AWQForCausalLM
 from .cohere import CohereAWQForCausalLM
+from .deepseek import DeepseekAWQForCausalLM
 from .deepseek_v2 import DeepseekV2AWQForCausalLM
 from .minicpm import MiniCPMAWQForCausalLM
diff --git a/awq/models/auto.py b/awq/models/auto.py
index 92a69b63..b9afb34e 100644
--- a/awq/models/auto.py
+++ b/awq/models/auto.py
@@ -30,6 +30,7 @@
     "llava_next": LlavaNextAWQForCausalLM,
     "phi3": Phi3AWQForCausalLM,
     "cohere": CohereAWQForCausalLM,
+    "deepseek": DeepseekAWQForCausalLM,
     "deepseek_v2": DeepseekV2AWQForCausalLM,
     "minicpm": MiniCPMAWQForCausalLM,
 }
diff --git a/awq/models/base.py b/awq/models/base.py
index da7117d8..d1ab8134 100644
--- a/awq/models/base.py
+++ b/awq/models/base.py
@@ -81,6 +81,7 @@
     "llava_next": "AutoModelForVision2Seq",
     "phi3": "AutoModelForCausalLM",
     "cohere": "AutoModelForCausalLM",
+    "deepseek": "AutoModelForCausalLM",
     "deepseek_v2": "AutoModelForCausalLM",
     "minicpm": "AutoModelForCausalLM",
 }
diff --git a/awq/models/deepseek.py b/awq/models/deepseek.py
new file mode 100644
index 00000000..f5f1c4a8
--- /dev/null
+++ b/awq/models/deepseek.py
@@ -0,0 +1,103 @@
+import tqdm
+from typing import List, Tuple
+from .base import BaseAWQForCausalLM
+
+
+class DeepseekAWQForCausalLM(BaseAWQForCausalLM):
+    layer_type = "DeepseekDecoderLayer"
+    max_seq_len_key = "max_position_embeddings"
+
+    @staticmethod
+    def get_model_layers(model):
+        return model.model.layers
+
+    @staticmethod
+    def get_act_for_scaling(module):
+        return dict(is_scalable=False)
+
+    @staticmethod
+    def move_embed(model, device: str):
+        model.model.embed_tokens = model.model.embed_tokens.to(device)
+
+    @staticmethod
+    def get_layers_for_scaling(
+        module, input_feat, module_kwargs
+    ):
+        layers = []
+
+        # attention input
+        layers.append(
+            dict(
+                prev_op=module.input_layernorm,
+                layers=[
+                    module.self_attn.q_proj,
+                    module.self_attn.k_proj,
+                    module.self_attn.v_proj,
+                ],
+                inp=input_feat["self_attn.q_proj"],
+                module2inspect=module.self_attn,
+                kwargs=module_kwargs,
+            )
+        )
+
+        if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
+            layers.append(
+                dict(
+                    prev_op=module.self_attn.v_proj,
+                    layers=[module.self_attn.o_proj],
+                    inp=input_feat["self_attn.o_proj"],
+                )
+            )
+
+        if hasattr(module.mlp, "gate"):
+            # linear in
+            layers.append(
+                dict(
+                    prev_op=module.post_attention_layernorm,
+                    layers=[
+                        w
+                        for expert in module.mlp.experts
+                        for w in [expert.gate_proj, expert.up_proj]
+                    ] + [module.mlp.shared_experts.gate_proj, module.mlp.shared_experts.up_proj],
+                    inp=input_feat["mlp"],
+                    module2inspect=module.mlp,
+                )
+            )
+
+            # linear out
+            for i, expert in enumerate(module.mlp.experts):
+                layers.append(
+                    dict(
+                        prev_op=expert.up_proj,
+                        layers=[expert.down_proj],
+                        inp=input_feat[f"mlp.experts.{i}.down_proj"],
+                    )
+                )
+            layers.append(
+                dict(
+                    prev_op=module.mlp.shared_experts.up_proj,
+                    layers=[module.mlp.shared_experts.down_proj],
+                    inp=input_feat[f"mlp.shared_experts.down_proj"],
+                )
+            )
+        else:
+            # linear 1
+            layers.append(
+                dict(
+                    prev_op=module.post_attention_layernorm,
+                    layers=[module.mlp.gate_proj, module.mlp.up_proj],
+                    inp=input_feat["mlp.gate_proj"],
+                    module2inspect=module.mlp,
+                )
+            )
+
+            # linear 2
+            layers.append(
+                dict(
+                    prev_op=module.mlp.up_proj,
+                    layers=[module.mlp.down_proj],
+                    inp=input_feat["mlp.down_proj"],
+                )
+            )
+
+        return layers
diff --git a/awq/quantize/quantizer.py b/awq/quantize/quantizer.py
index cd9fb0dd..f58288fa 100644
--- a/awq/quantize/quantizer.py
+++ b/awq/quantize/quantizer.py
@@ -610,7 +610,7 @@ def cache_input_hook(m, x, y, name, feat_dict):
                 "block_sparse_moe": layer.block_sparse_moe,
             }
 
-        if self.awq_model.model_type == "deepseek_v2":
+        if self.awq_model.model_type in ["deepseek_v2", "deepseek"]:
             named_linears = {
                 **named_linears,
                 "mlp": layer.mlp,
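
For context, a minimal, untested usage sketch of the new "deepseek" mapping with AutoAWQ's standard quantization flow; the checkpoint path, output directory, and quant_config values below are illustrative and not part of this patch:

from transformers import AutoTokenizer
from awq import AutoAWQForCausalLM

# Illustrative checkpoint: any model whose config reports model_type == "deepseek"
# should now be routed to DeepseekAWQForCausalLM via awq/models/auto.py.
model_path = "deepseek-ai/deepseek-moe-16b-base"
quant_path = "deepseek-moe-16b-awq"
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

model = AutoAWQForCausalLM.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Runs AWQ scaling/clipping search per decoder layer (including the MoE expert
# and shared-expert projections registered in get_layers_for_scaling above).
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)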