diff --git a/DOCUMENT.md b/DOCUMENT.md
index 08538f2..bffe9c1 100644
--- a/DOCUMENT.md
+++ b/DOCUMENT.md
@@ -15,7 +15,7 @@ from llmlingua import PromptCompressor
 llm_lingua = PromptCompressor(
     model_name: str = "NousResearch/Llama-2-7b-hf",
     device_map: str = "cuda",
-    use_auth_token: bool = False,
+    model_config: dict = {},
     open_api_config: dict = {},
 )
 ```
@@ -23,7 +23,7 @@ llm_lingua = PromptCompressor(
 
 - model_name(str), the name of small language model from huggingface. Default set to "NousResearch/Llama-2-7b-hf";
 - device_map(str), the device environment for using small models, like 'cuda', 'cpu', 'balanced', 'balanced_low_0', 'auto'. Default set to "cuda";
-- use_auth_token(bool, optional), controls the usage of huggingface auto_token. Default set to False;
+- model_config(dict, optional), the config of the huggingface model. Default set to {};
 - open_api_config(dict, optional), the config of openai which use in OpenAI Embedding in coarse-level prompt compression. Default set to {};
 
 ## Function Call
diff --git a/README.md b/README.md
index 9caab35..1527fdc 100644
--- a/README.md
+++ b/README.md
@@ -106,6 +106,10 @@ compressed_prompt = llm_lingua.compress_prompt(prompt, instruction="", question=
 # 'compressed_tokens': 211,
 # 'ratio': '11.2x',
 # 'saving': ', Saving $0.1 in GPT-4.'}
+
+## Or use a quantized model, like TheBloke/Llama-2-7b-Chat-GPTQ, which needs only <8GB of GPU memory.
+## Before that, you need to run: pip install optimum auto-gptq
+llm_lingua = PromptCompressor("TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"})
 ```
 
 You can refer to the [**examples**](./examples) to understand how to use **LLMLingua** and **LongLLMLingua** in practical scenarios, such as RAG, Online Meeting, CoT, Code, and RAG using LlamaIndex. Additionally, you can refer to the [**document**](./DOCUMENT.md) for more recommendations on how to use LLMLingua effectively.
diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py
index 68a0c4d..9eb2243 100644
--- a/llmlingua/prompt_compressor.py
+++ b/llmlingua/prompt_compressor.py
@@ -18,10 +18,10 @@ def __init__(
         self,
         model_name: str = "NousResearch/Llama-2-7b-hf",
         device_map: str = "cuda",
-        use_auth_token: bool = False,
+        model_config: dict = {},
         open_api_config: dict = {},
     ):
-        self.load_model(model_name, device_map, use_auth_token)
+        self.load_model(model_name, device_map, model_config)
         self.retrieval_model = None
         self.retrieval_model_name = None
         self.open_api_config = open_api_config
@@ -29,14 +29,18 @@ def __init__(
         self.prefix_bos_num = 100
 
     def load_model(
-        self, model_name: str, device_map: str = "cuda", use_auth_token: bool = False
+        self, model_name: str, device_map: str = "cuda", model_config: dict = {}
     ):
-        config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        tokenizer.padding_side = "left"
-        tokenizer.pad_token_id = (
-            config.pad_token_id if config.pad_token_id else tokenizer.eos_token_id
-        )
+        trust_remote_code = model_config.get("trust_remote_code", True)
+        if "trust_remote_code" not in model_config:
+            model_config["trust_remote_code"] = trust_remote_code
+        config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code)
+        if model_config.get("pad_to_left", True):
+            tokenizer.padding_side = "left"
+            tokenizer.pad_token_id = (
+                config.pad_token_id if config.pad_token_id else tokenizer.eos_token_id
+            )
         self.device = (
             device_map if any(key in device_map for key in ["cuda", "cpu"]) else "cuda"
         )
@@ -44,10 +48,11 @@
             model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 torch_dtype="auto" if device_map == "cuda" else torch.float32,
+                device_map=device_map,
                 config=config,
                 ignore_mismatched_sizes=True,
-                trust_remote_code=True,
-            ).to(device_map)
+                **model_config
+            )
         else:
             model = AutoModelForCausalLM.from_pretrained(
                 model_name,
@@ -57,8 +62,7 @@
                 offload_folder="/tmp/offload",
                 offload_state_dict=True,
                 cache_dir="/tmp/cache",
-                use_auth_token=use_auth_token,
-                trust_remote_code=True,
+                **model_config
             )
         self.tokenizer = tokenizer
         self.model = model
diff --git a/llmlingua/version.py b/llmlingua/version.py
index dfa1829..c0b20f2 100644
--- a/llmlingua/version.py
+++ b/llmlingua/version.py
@@ -2,7 +2,7 @@
 _MINOR = "1"
 # On master and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "3"
+_PATCH = "4"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
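Taken together, this change replaces the boolean `use_auth_token` flag with a general `model_config` dict that is forwarded to `AutoModelForCausalLM.from_pretrained` via `**model_config`. Below is a minimal usage sketch of the new interface; the model names and the `revision` value simply mirror the README example above, and `optimum`/`auto-gptq` are assumed to be installed for the quantized checkpoint.

```python
from llmlingua import PromptCompressor

# Default small model; extra from_pretrained kwargs now travel through
# model_config instead of the removed use_auth_token flag.
llm_lingua = PromptCompressor(
    model_name="NousResearch/Llama-2-7b-hf",
    device_map="cuda",
    model_config={"trust_remote_code": True},
)

# Quantized variant from the README example; needs `pip install optimum auto-gptq`.
llm_lingua_gptq = PromptCompressor(
    "TheBloke/Llama-2-7b-Chat-GPTQ",
    model_config={"revision": "main"},
)
```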