Merge pull request #20 from microsoft/hjiang/support_gptq
Feature (LLMLingua): support GPTQ

Co-authored-by: Qianhui Wu <[email protected]>
Co-authored-by: Xufang Luo <[email protected]>
3 people authored Nov 22, 2023
2 parents 70bbd02 + 9585048 commit 3c3942c
Showing 4 changed files with 24 additions and 16 deletions.
4 changes: 2 additions & 2 deletions DOCUMENT.md
@@ -15,15 +15,15 @@ from llmlingua import PromptCompressor
llm_lingua = PromptCompressor(
model_name: str = "NousResearch/Llama-2-7b-hf",
device_map: str = "cuda",
use_auth_token: bool = False,
model_config: dict = {},
open_api_config: dict = {},
)
```
### Parameters

- model_name(str), the name of the small language model from Hugging Face. Defaults to "NousResearch/Llama-2-7b-hf";
- device_map(str), the device environment for running the small model, e.g. 'cuda', 'cpu', 'balanced', 'balanced_low_0', 'auto'. Defaults to "cuda";
- use_auth_token(bool, optional), controls the use of the Hugging Face auth token. Defaults to False;
- model_config(dict, optional), the config of the Hugging Face model, passed through to from_pretrained. Defaults to {};
- open_api_config(dict, optional), the OpenAI config used by OpenAI Embedding in coarse-level prompt compression. Defaults to {};

## Function Call
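For reference, a minimal construction sketch using the parameters listed above; the model_config value here is illustrative and, per the prompt_compressor.py change in this commit, is forwarded to the underlying Hugging Face from_pretrained call.

```python
from llmlingua import PromptCompressor

# Illustrative values; model_config keys are passed through to from_pretrained.
llm_lingua = PromptCompressor(
    model_name="NousResearch/Llama-2-7b-hf",
    device_map="cuda",
    model_config={"revision": "main"},
    open_api_config={},
)
```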
4 changes: 4 additions & 0 deletions README.md
@@ -106,6 +106,10 @@ compressed_prompt = llm_lingua.compress_prompt(prompt, instruction="", question=
# 'compressed_tokens': 211,
# 'ratio': '11.2x',
# 'saving': ', Saving $0.1 in GPT-4.'}

## Or use a quantized model, such as TheBloke/Llama-2-7b-Chat-GPTQ, which needs less than 8GB of GPU memory.
## Before that, you need to run: pip install optimum auto-gptq
llm_lingua = PromptCompressor("TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"})
```

You can refer to the [**examples**](./examples) to understand how to use **LLMLingua** and **LongLLMLingua** in practical scenarios, such as RAG, Online Meeting, CoT, Code, and RAG using LlamaIndex. Additionally, you can refer to the [**document**](./DOCUMENT.md) for more recommendations on how to use LLMLingua effectively.
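Putting the README snippets together, a minimal sketch of compressing with the quantized model; it assumes optimum and auto-gptq are installed and uses placeholder values for prompt and question.

```python
from llmlingua import PromptCompressor

prompt = "..."    # the long context to compress (placeholder)
question = "..."  # the question that guides compression (placeholder)

# Quantized GPTQ checkpoint; per the README note above it fits in <8GB of GPU memory.
llm_lingua = PromptCompressor(
    "TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"}
)
compressed = llm_lingua.compress_prompt(prompt, instruction="", question=question)
print(compressed["compressed_tokens"], compressed["ratio"])
```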
30 changes: 17 additions & 13 deletions llmlingua/prompt_compressor.py
@@ -18,36 +18,41 @@ def __init__(
self,
model_name: str = "NousResearch/Llama-2-7b-hf",
device_map: str = "cuda",
use_auth_token: bool = False,
model_config: dict = {},
open_api_config: dict = {},
):
self.load_model(model_name, device_map, use_auth_token)
self.load_model(model_name, device_map, model_config)
self.retrieval_model = None
self.retrieval_model_name = None
self.open_api_config = open_api_config
self.cache_bos_num = 10
self.prefix_bos_num = 100

def load_model(
self, model_name: str, device_map: str = "cuda", use_auth_token: bool = False
self, model_name: str, device_map: str = "cuda", model_config: dict = {}
):
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "left"
tokenizer.pad_token_id = (
config.pad_token_id if config.pad_token_id else tokenizer.eos_token_id
)
trust_remote_code = model_config.get("trust_remote_code", True)
if "trust_remote_code" not in model_config:
model_config["trust_remote_code"] = trust_remote_code
config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code)
if model_config.get("pad_to_left", True):
tokenizer.padding_side = "left"
tokenizer.pad_token_id = (
config.pad_token_id if config.pad_token_id else tokenizer.eos_token_id
)
self.device = (
device_map if any(key in device_map for key in ["cuda", "cpu"]) else "cuda"
)
if "cuda" in device_map or "cpu" in device_map:
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype="auto" if device_map == "cuda" else torch.float32,
device_map=device_map,
config=config,
ignore_mismatched_sizes=True,
trust_remote_code=True,
).to(device_map)
**model_config
)
else:
model = AutoModelForCausalLM.from_pretrained(
model_name,
@@ -57,8 +62,7 @@ def load_model(
offload_folder="/tmp/offload",
offload_state_dict=True,
cache_dir="/tmp/cache",
use_auth_token=use_auth_token,
trust_remote_code=True,
**model_config
)
self.tokenizer = tokenizer
self.model = model
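As an aside, the trust_remote_code handling in the hunk above can be read as a setdefault; this is a hedged sketch of the equivalent logic, not the committed code.

```python
from transformers import AutoConfig, AutoTokenizer

model_name = "NousResearch/Llama-2-7b-hf"
model_config = {}  # caller-supplied; may or may not contain "trust_remote_code"

# Equivalent to the two-step check in the diff: default trust_remote_code to True
# unless the caller set it, and keep it in model_config so it is also forwarded
# to from_pretrained via **model_config.
trust_remote_code = model_config.setdefault("trust_remote_code", True)
config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code)
```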
2 changes: 1 addition & 1 deletion llmlingua/version.py
@@ -2,7 +2,7 @@
_MINOR = "1"
# On master and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "3"
_PATCH = "4"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""
