
Error when loading Chatglm3-6b on two GPU cards, how can this be solved? #514

Open
jamesjiangxing opened this issue Dec 2, 2023 · 1 comment

Comments


Describe the bug
Loading Chatglm3-6b on two GPU cards fails.

To Reproduce
Steps to reproduce the behavior:

  1. In the config file, set strategy: "cuda:0 fp16 *14 -> cuda:1 fp16"
  2. Run the GLM6B service
  3. Loading the model fails

Expected behavior
A clear and concise description of what you expected to happen.

Screenshots

[Screenshot: Wenda-chatglm3 screenshot, 2023-12-02 19:13:20]


hekang26 commented Jan 30, 2024

from plugins.settings import settings

import os
from typing import Dict, Tuple, Union, Optional

from torch.nn import Module
from transformers import AutoModel

def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
    # transformer.word_embeddings takes 1 layer
    # transformer.final_layernorm and lm_head take 1 layer
    # transformer.layers takes 28 layers
    # distribute the 30 layers in total across num_gpus cards
    num_trans_layers = 28
    per_gpu_layers = 30 / num_gpus

    # bugfix: on Linux, the weight and input passed to torch.embedding can end up on
    # different devices, which raises a RuntimeError
    # on Windows, model.device is set to transformer.word_embeddings.device
    # on Linux, model.device is set to lm_head.device
    # when chat or stream_chat is called, input_ids is placed on model.device
    # if transformer.word_embeddings.device differs from model.device, a RuntimeError follows
    # so transformer.word_embeddings, transformer.final_layernorm and lm_head are all
    # pinned to the first card here
    device_map = {'transformer.word_embeddings': 0,
                  'transformer.final_layernorm': 0, 'lm_head': 0}

    used = 2
    gpu_target = 0
    for i in range(num_trans_layers):
        if used >= per_gpu_layers:
            gpu_target += 1
            used = 0
        assert gpu_target < num_gpus
        device_map[f'transformer.layers.{i}'] = gpu_target
        used += 1

    return device_map

def load_model_on_gpus(checkpoint_path: Union[str, os.PathLike], num_gpus: int = 2,
                       device_map: Optional[Dict[str, int]] = None, **kwargs) -> Module:
    if num_gpus < 2 and device_map is None:
        model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half().cuda()
    else:
        from accelerate import dispatch_model

        model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half()

        if device_map is None:
            device_map = auto_configure_device_map(num_gpus)

        model = dispatch_model(model, device_map=device_map)

    return model

def chat_init(history):
    history_formatted = None
    if history is not None:
        history_formatted = []
        tmp = []
        for i, old_chat in enumerate(history):
            if len(tmp) == 0 and old_chat['role'] == "user":
                tmp.append(old_chat['content'])
            elif old_chat['role'] == "AI" or old_chat['role'] == 'assistant':
                tmp.append(old_chat['content'])
                history_formatted.append(tuple(tmp))
                tmp = []
            else:
                continue
    return history_formatted

def chat_one(prompt, history_formatted, max_length, top_p, temperature, zhishiku=False):
    for response, history in model.stream_chat(tokenizer, prompt, history_formatted,
                                               max_length=max_length, top_p=top_p, temperature=temperature):
        yield response

def load_model():
    global model, tokenizer
    from transformers import AutoModel, AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        settings.Path, local_files_only=True, trust_remote_code=True)
    # model = AutoModel.from_pretrained(
    #     settings.Path, local_files_only=True, trust_remote_code=True)
    model = load_model_on_gpus(settings.Path, num_gpus=2)
    if not (settings.Lora == '' or settings.Lora is None):
        print('Lora model path', settings.Lora)
        from peft import PeftModel
        model = PeftModel.from_pretrained(model, settings.Lora)
    device, precision = settings.Strategy.split()
    # act according to the device
    if device == 'cpu':
        # cpu: nothing to do
        pass
    elif device == 'cuda':
        # gpu: move the model onto the card
        import torch
        if not (precision.startswith('fp16i') and torch.cuda.get_device_properties(0).total_memory < 1.4e+10):
            model = model.cuda()
    else:
        # any other device: report an error and exit
        print('Error: unsupported device')
        exit()
    # act according to the precision
    if precision == 'fp16':
        # fp16: convert the model to half precision
        model = model.half()
    elif precision == 'fp32':
        # fp32: convert the model to full precision
        model = model.float()
    elif precision.startswith('fp16i'):
        # fp16i prefix: quantize the model to the requested bit width
        # extract the number of bits from the string
        bits = int(precision[5:])
        # call quantize with the bit width
        model = model.quantize(bits)
        if device == 'cuda':
            model = model.cuda()
        model = model.half()
    elif precision.startswith('fp32i'):
        # fp32i prefix: quantize the model to the requested bit width
        # extract the number of bits from the string
        bits = int(precision[5:])
        # call quantize with the bit width
        model = model.quantize(bits)
        if device == 'cuda':
            model = model.cuda()
        model = model.float()
    else:
        # any other precision: report an error and exit
        print('Error: unsupported precision')
        exit()
    # model = model.eval()
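
For reference, with the split logic above, auto_configure_device_map(2) places transformer.word_embeddings, transformer.final_layernorm, lm_head and transformer.layers.0 through transformer.layers.12 on cuda:0, and transformer.layers.13 through transformer.layers.27 on cuda:1. A minimal sketch for inspecting the map before dispatching (the __main__ guard is only illustrative and not part of the plugin):

if __name__ == '__main__':
    # Illustrative check only: print which GPU each module would land on
    # for a 2-GPU split, before handing the map to dispatch_model.
    dm = auto_configure_device_map(2)
    for name, gpu in sorted(dm.items(), key=lambda kv: (kv[1], kv[0])):
        print(f'{name} -> cuda:{gpu}')

Note that these module names follow the original ChatGLM-6B layout; a ChatGLM3 checkpoint may expose different module names (for example under transformer.encoder.layers), so it is worth checking model.named_modules() before relying on this map.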
