-
Notifications
You must be signed in to change notification settings - Fork 204
/
generate.py
37 lines (33 loc) · 1.09 KB
/
generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import torch
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer
# Example script: load the quantized checkpoint named by `model_id`, render a
# two-message chat through the tokenizer's chat template, and generate a
# sampled reply that is handed to a TextStreamer as it is produced.
model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"

# The conversation is plain data, so it is assembled before anything is loaded.
# Adjacent string literals inside the parentheses are joined at compile time.
riddle = (
    "You're standing on the surface of the Earth. "
    "You walk one mile south, one mile west and one mile north. "
    "You end up exactly where you started. Where are you?"
)
prompt = [
    {"role": "system", "content": "You are a helpful assistant, that responds as a pirate."},
    {"role": "user", "content": riddle},
]

# Tokenizer first: both the streamer and the chat template depend on it.
tokenizer = AutoTokenizer.from_pretrained(model_id)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Keyword options forwarded unchanged to the quantized-model loader.
load_options = {
    "torch_dtype": torch.float16,
    "low_cpu_mem_usage": True,
    "device_map": "auto",
}
model = AutoAWQForCausalLM.from_quantized(model_id, **load_options)

# Render the chat into tensors (dict form, with the generation prompt
# appended), then move the encoding to the CUDA device.
chat_encoding = tokenizer.apply_chat_template(
    prompt,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
)
inputs = chat_encoding.to("cuda")

# Generation may stop on either the tokenizer's EOS id or the id of the
# "<|eot_id|>" token.
stop_token_ids = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
outputs = model.generate(
    **inputs,
    do_sample=True,
    max_new_tokens=256,
    streamer=streamer,
    eos_token_id=stop_token_ids,
)