Description
My code:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # pin the process to GPU 0 before torch initializes

import torch
from accelerate import Accelerator
from accelerate.utils import set_seed
from transformers import LlamaForCausalLM, LlamaTokenizer

set_seed(1234)

prefix_path = '/home/xsong/llama/Llama-2-7b-hf'
accelerator = Accelerator()

tokenizer = LlamaTokenizer.from_pretrained(prefix_path)
model = LlamaForCausalLM.from_pretrained(prefix_path,
                                         # torch_dtype=torch.float16,
                                         device_map=0)

# LLaMA has no pad token; reuse <unk> and pad on the left for generation
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = 'left'

prompt = 'What is the capital of China?'
batch = tokenizer.batch_encode_plus([prompt], padding=True, return_tensors='pt')

generation_config = {'do_sample': True,
                     'num_beams': 1,
                     'temperature': 0.6,
                     'top_p': 0.9,
                     'use_cache': True,
                     'num_return_sequences': 1,
                     'max_length': 200,
                     'eos_token_id': [2]}  # 2 is LLaMA's </s> token id

b_out = model.generate(batch['input_ids'].cuda(),
                       attention_mask=batch['attention_mask'].cuda(),
                       **generation_config)
print(tokenizer.decode(b_out[0]))
Output:
What is the capital of China?OOOOOOOOOOOOOOOOOOOOOOO2OOOOOOOOOOOOOOOOOOOO0OOOOOOOOOOOOOOOOOOOOOO0OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO0OOOOOtOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO0OOOOOOOOOOOOO0OOOOOOOtOOOOOOOOOOOOO
The generated text is almost entirely the letter 'O'. What is going wrong?
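In case it helps triage, here is a small diagnostic I can append to the script above (this check is my own addition, not part of the original repro): it prints the dtype the model was actually loaded in and inspects the logits of a single forward pass for NaN/Inf, since numerical overflow in the logits is one way to get this kind of degenerate repeated token.

import torch

# Diagnostic sketch (my addition): run one forward pass on the same batch
# and inspect the logits for numerical problems.
with torch.no_grad():
    out = model(batch['input_ids'].cuda(),
                attention_mask=batch['attention_mask'].cuda())

print('model dtype:', model.dtype)  # should be float32 here, since torch_dtype was not passed
print('NaN in logits:', torch.isnan(out.logits).any().item())
print('Inf in logits:', torch.isinf(out.logits).any().item())

# Greedy next token for the prompt, bypassing sampling entirely
next_id = out.logits[0, -1].argmax().item()
print('greedy next token:', repr(tokenizer.decode([next_id])))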