Merge pull request #20 from microsoft/hjiang/support_gptq
Feature (LLMLingua): support GPTQ

Co-authored-by: Qianhui Wu <[email protected]>
Co-authored-by: Xufang Luo <[email protected]>
3 people authored Nov 22, 2023
2 parents 70bbd02 + 9585048 commit 3c3942c
Showing 4 changed files with 24 additions and 16 deletions.
4 changes: 2 additions & 2 deletions DOCUMENT.md
@@ -15,15 +15,15 @@ from llmlingua import PromptCompressor
llm_lingua = PromptCompressor(
model_name: str = "NousResearch/Llama-2-7b-hf",
device_map: str = "cuda",
use_auth_token: bool = False,
model_config: dict = {},
open_api_config: dict = {},
)
```
### Parameters

- model_name(str), the name of the small language model from Hugging Face. Defaults to "NousResearch/Llama-2-7b-hf";
- device_map(str), the device environment for running the small model, e.g. 'cuda', 'cpu', 'balanced', 'balanced_low_0', 'auto'. Defaults to "cuda";
- use_auth_token(bool, optional), controls the use of the Hugging Face auth token. Defaults to False;
- model_config(dict, optional), the config of the Hugging Face model, passed through to from_pretrained. Defaults to {};
- open_api_config(dict, optional), the OpenAI config used by OpenAI Embedding in coarse-level prompt compression. Defaults to {};

## Function Call
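For reference, a minimal construction sketch using the parameters listed above; the model_config value here is illustrative and, per the prompt_compressor.py change in this commit, is forwarded to the underlying Hugging Face from_pretrained call.

```python
from llmlingua import PromptCompressor

# Illustrative values; model_config keys are passed through to from_pretrained.
llm_lingua = PromptCompressor(
    model_name="NousResearch/Llama-2-7b-hf",
    device_map="cuda",
    model_config={"revision": "main"},
    open_api_config={},
)
```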
4 changes: 4 additions & 0 deletions README.md
@@ -106,6 +106,10 @@ compressed_prompt = llm_lingua.compress_prompt(prompt, instruction="", question=
# 'compressed_tokens': 211,
# 'ratio': '11.2x',
# 'saving': ', Saving $0.1 in GPT-4.'}

## Or use a quantized model, such as TheBloke/Llama-2-7b-Chat-GPTQ, which needs less than 8GB of GPU memory.
## Before that, you need to run: pip install optimum auto-gptq
llm_lingua = PromptCompressor("TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"})
```

You can refer to the [**examples**](./examples) to understand how to use **LLMLingua** and **LongLLMLingua** in practical scenarios, such as RAG, Online Meeting, CoT, Code, and RAG using LlamaIndex. Additionally, you can refer to the [**document**](./DOCUMENT.md) for more recommendations on how to use LLMLingua effectively.
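Putting the README snippets together, a minimal sketch of compressing with the quantized model; it assumes optimum and auto-gptq are installed and uses placeholder values for prompt and question.

```python
from llmlingua import PromptCompressor

prompt = "..."    # the long context to compress (placeholder)
question = "..."  # the question that guides compression (placeholder)

# Quantized GPTQ checkpoint; per the README note above it fits in <8GB of GPU memory.
llm_lingua = PromptCompressor(
    "TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"}
)
compressed = llm_lingua.compress_prompt(prompt, instruction="", question=question)
print(compressed["compressed_tokens"], compressed["ratio"])
```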
30 changes: 17 additions & 13 deletions llmlingua/prompt_compressor.py
@@ -18,36 +18,41 @@ def __init__(
self,
model_name: str = "NousResearch/Llama-2-7b-hf",
device_map: str = "cuda",
use_auth_token: bool = False,
model_config: dict = {},
open_api_config: dict = {},
):
self.load_model(model_name, device_map, use_auth_token)
self.load_model(model_name, device_map, model_config)
self.retrieval_model = None
self.retrieval_model_name = None
self.open_api_config = open_api_config
self.cache_bos_num = 10
self.prefix_bos_num = 100

def load_model(
self, model_name: str, device_map: str = "cuda", use_auth_token: bool = False
self, model_name: str, device_map: str = "cuda", model_config: dict = {}
):
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "left"
tokenizer.pad_token_id = (
config.pad_token_id if config.pad_token_id else tokenizer.eos_token_id
)
trust_remote_code = model_config.get("trust_remote_code", True)
if "trust_remote_code" not in model_config:
model_config["trust_remote_code"] = trust_remote_code
config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code)
if model_config.get("pad_to_left", True):
tokenizer.padding_side = "left"
tokenizer.pad_token_id = (
config.pad_token_id if config.pad_token_id else tokenizer.eos_token_id
)
self.device = (
device_map if any(key in device_map for key in ["cuda", "cpu"]) else "cuda"
)
if "cuda" in device_map or "cpu" in device_map:
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype="auto" if device_map == "cuda" else torch.float32,
device_map=device_map,
config=config,
ignore_mismatched_sizes=True,
trust_remote_code=True,
).to(device_map)
**model_config
)
else:
model = AutoModelForCausalLM.from_pretrained(
model_name,
@@ -57,8 +62,7 @@ def load_model(
offload_folder="/tmp/offload",
offload_state_dict=True,
cache_dir="/tmp/cache",
use_auth_token=use_auth_token,
trust_remote_code=True,
**model_config
)
self.tokenizer = tokenizer
self.model = model
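As an aside, the trust_remote_code handling in the hunk above can be read as a setdefault; this is a hedged sketch of the equivalent logic, not the committed code.

```python
from transformers import AutoConfig, AutoTokenizer

model_name = "NousResearch/Llama-2-7b-hf"
model_config = {}  # caller-supplied; may or may not contain "trust_remote_code"

# Equivalent to the two-step check in the diff: default trust_remote_code to True
# unless the caller set it, and keep it in model_config so it is also forwarded
# to from_pretrained via **model_config.
trust_remote_code = model_config.setdefault("trust_remote_code", True)
config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code)
```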
2 changes: 1 addition & 1 deletion llmlingua/version.py
@@ -2,7 +2,7 @@
_MINOR = "1"
# On master and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "3"
_PATCH = "4"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""
