diff --git a/awq/models/auto.py b/awq/models/auto.py
index 02051d96..e21cb954 100644
--- a/awq/models/auto.py
+++ b/awq/models/auto.py
@@ -43,7 +43,7 @@ def from_pretrained(self, model_path, trust_remote_code=True, safetensors=False,
     @classmethod
     def from_quantized(self, quant_path, quant_filename='', max_new_tokens=None,
                        trust_remote_code=True, fuse_layers=True,
-                       batch_size=1, safetensors=False,
+                       batch_size=1, safetensors=True,
                        max_memory=None, offload_folder=None) -> BaseAWQForCausalLM:
         os.environ["AWQ_BATCH_SIZE"] = str(batch_size)
         model_type = check_and_get_model_type(quant_path, trust_remote_code)
diff --git a/awq/models/base.py b/awq/models/base.py
index 619ca17a..0d9e0bd5 100644
--- a/awq/models/base.py
+++ b/awq/models/base.py
@@ -53,7 +53,7 @@ def quantize(self, tokenizer=None, quant_config={},
     def fuse_layers(model):
         pass
 
-    def save_quantized(self, save_dir, safetensors=False, shard_size="10GB"):
+    def save_quantized(self, save_dir, safetensors=True, shard_size="10GB"):
         save_dir = save_dir[:-1] if save_dir[-1] == '/' else save_dir
 
         # Save model
@@ -67,7 +67,9 @@ def forward(self, x): return x
         self.quant_config.save_pretrained(save_dir)
 
         # Remove empty state dict
-        os.remove(f'{save_dir}/pytorch_model.bin')
+        default_path = f'{save_dir}/model.safetensors'
+        if os.path.exists(default_path):
+            os.remove(default_path)
 
         # model_name has no extension, add it when saving state_dict
         model_name = 'model.safetensors' if safetensors else 'pytorch_model.bin'
@@ -130,7 +132,7 @@ def from_pretrained(self, model_path, model_type, torch_dtype: torch.dtype = tor
     @classmethod
     def from_quantized(self, model_path, model_type, model_filename='',
                        max_new_tokens=None, torch_dtype=torch.float16,
-                       trust_remote_code=True, safetensors=False, is_quantized=True,
+                       trust_remote_code=True, safetensors=True, is_quantized=True,
                        fuse_layers=False, version='GEMM',
                        max_memory=None, offload_folder=None):
         # [STEP 1-2] Load weights path and configs
@@ -180,11 +182,11 @@
         return self(model, model_type, is_quantized=is_quantized, quant_config=quant_config)
 
-    def _load_config(self, model_path, model_filename, safetensors=False,
+    def _load_config(self, model_path, model_filename, safetensors=True,
                      version="GEMM", trust_remote_code=True, max_new_tokens=4096):
         # [STEP 1] Download model if path is not a directory
         if not os.path.isdir(model_path):
-            ignore_patterns = ["*msgpack*", "*h5*"]
+            ignore_patterns = ["*msgpack*", "*h5*", "optimizer.pt"]
             if safetensors:
                 ignore_patterns.extend(["*.pt*", "*.bin*"])
             else:
diff --git a/examples/basic_generate.py b/examples/basic_generate.py
index 89342524..e9d9cf4f 100644
--- a/examples/basic_generate.py
+++ b/examples/basic_generate.py
@@ -4,7 +4,7 @@
 quant_path = "TheBloke/Mistral-7B-OpenOrca-AWQ"
 
 # Load model
-model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True, safetensors=True)
+model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
 tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)
 streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
diff --git a/examples/basic_quant.py b/examples/basic_quant.py
index 0a83c98c..a5ee859f 100644
--- a/examples/basic_quant.py
+++ b/examples/basic_quant.py
@@ -14,7 +14,6 @@
 model.quantize(tokenizer, quant_config=quant_config)
 
 # Save quantized model
-# NOTE: pass safetensors=True to save quantized model weights as safetensors
 model.save_quantized(quant_path)
 tokenizer.save_pretrained(quant_path)
diff --git a/examples/basic_safetensors_generate.py b/examples/basic_safetensors_generate.py
deleted file mode 100644
index 93fcc096..00000000
--- a/examples/basic_safetensors_generate.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from awq import AutoAWQForCausalLM
-from transformers import AutoTokenizer, TextStreamer
-
-quant_path = "casperhansen/opt-125m-awq"
-
-# Load model
-model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True, safetensors=True)
-tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)
-streamer = TextStreamer(tokenizer, skip_special_tokens=True)
-
-# Convert prompt to tokens
-prompt_template = """\
-A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
-
-USER: {prompt}
-ASSISTANT:"""
-
-tokens = tokenizer(
-    prompt_template.format(prompt="How are you today?"),
-    return_tensors='pt'
-).input_ids.cuda()
-
-# Generate output
-generation_output = model.generate(
-    tokens,
-    streamer=streamer,
-    max_new_tokens=512
-)
diff --git a/examples/benchmark.py b/examples/benchmark.py
index 1b5bcabf..7a4f3bdb 100644
--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -129,7 +129,7 @@ def main(args):
     parser.add_argument("--model_path", type=str, default="casperhansen/mistral-7b-instruct-v0.1-awq", help="path to the model")
     parser.add_argument("--quant_file", type=str, default="", help="weights filename")
     parser.add_argument("--batch_size", type=int, default=1, help="Batch size for cache and generation")
-    parser.add_argument("--safetensors", default=False, action="store_true", help="Use for enabling safetensors")
+    parser.add_argument("--safetensors", default=True, action="store_false", help="Use for disabling safetensors")
     args = parser.parse_args()
 
     main(args)
diff --git a/setup.py b/setup.py
index bcebec31..6de5f5b9 100644
--- a/setup.py
+++ b/setup.py
@@ -44,7 +44,7 @@ requirements = [
     "torch>=2.1.0",
-    "transformers>=4.34.0",
+    "transformers>=4.35.0",
     "tokenizers>=0.12.1",
     "accelerate",
     "sentencepiece",
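
Taken together, these changes make safetensors the default for loading, saving, and the example scripts. The snippet below is a minimal sketch (not part of the diff) of the resulting behaviour, reusing the model path from examples/basic_generate.py; with the new defaults, no explicit safetensors argument is needed.

# Minimal sketch, assuming the new safetensors=True defaults introduced above.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_path = "TheBloke/Mistral-7B-OpenOrca-AWQ"  # model path taken from examples/basic_generate.py

# Loads model.safetensors by default; pass safetensors=False to fall back to pytorch_model.bin.
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)

Saving follows the same convention: after model.quantize(...), model.save_quantized(save_dir) now writes model.safetensors unless safetensors=False is passed explicitly, which is why the NOTE comment in examples/basic_quant.py is removed.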