
Error when loading Chatglm3-6b on two GPU cards, how can this be solved? #514

Open
jamesjiangxing opened this issue Dec 2, 2023 · 1 comment

Comments


Describe the bug
Loading Chatglm3-6b on two GPU cards fails.

To Reproduce
Steps to reproduce the behavior:

  1. In the config file, set strategy: "cuda:0 fp16 *14 -> cuda:1 fp16"
  2. Run the GLM6B service
  3. Loading the model fails

Expected behavior
A clear and concise description of what you expected to happen.

Screenshots

[Screenshot: Wenda-chatglm3 screenshot, 2023-12-02 19:13:20]


hekang26 commented Jan 30, 2024

from plugins.settings import settings

import os
from typing import Dict, Tuple, Union, Optional

from torch.nn import Module
from transformers import AutoModel

def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
    # transformer.word_embeddings takes 1 layer
    # transformer.final_layernorm and lm_head take 1 layer
    # transformer.layers takes 28 layers
    # distribute the 30 layers in total across num_gpus cards
    num_trans_layers = 28
    per_gpu_layers = 30 / num_gpus

    # bugfix: on Linux, the weight and input passed to torch.embedding can end up on
    # different devices, which raises a RuntimeError
    # on Windows, model.device is set to transformer.word_embeddings.device
    # on Linux, model.device is set to lm_head.device
    # when chat or stream_chat is called, input_ids is placed on model.device
    # if transformer.word_embeddings.device differs from model.device, a RuntimeError follows
    # so transformer.word_embeddings, transformer.final_layernorm and lm_head are all
    # pinned to the first card here
    device_map = {'transformer.word_embeddings': 0,
                  'transformer.final_layernorm': 0, 'lm_head': 0}

    used = 2
    gpu_target = 0
    for i in range(num_trans_layers):
        if used >= per_gpu_layers:
            gpu_target += 1
            used = 0
        assert gpu_target < num_gpus
        device_map[f'transformer.layers.{i}'] = gpu_target
        used += 1

    return device_map

def load_model_on_gpus(checkpoint_path: Union[str, os.PathLike], num_gpus: int = 2,
                       device_map: Optional[Dict[str, int]] = None, **kwargs) -> Module:
    if num_gpus < 2 and device_map is None:
        model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half().cuda()
    else:
        from accelerate import dispatch_model

        model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half()

        if device_map is None:
            device_map = auto_configure_device_map(num_gpus)

        model = dispatch_model(model, device_map=device_map)

    return model

def chat_init(history):
    history_formatted = None
    if history is not None:
        history_formatted = []
        tmp = []
        for i, old_chat in enumerate(history):
            if len(tmp) == 0 and old_chat['role'] == "user":
                tmp.append(old_chat['content'])
            elif old_chat['role'] == "AI" or old_chat['role'] == 'assistant':
                tmp.append(old_chat['content'])
                history_formatted.append(tuple(tmp))
                tmp = []
            else:
                continue
    return history_formatted

def chat_one(prompt, history_formatted, max_length, top_p, temperature, zhishiku=False):
    for response, history in model.stream_chat(tokenizer, prompt, history_formatted,
                                               max_length=max_length, top_p=top_p, temperature=temperature):
        yield response

def load_model():
    global model, tokenizer
    from transformers import AutoModel, AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        settings.Path, local_files_only=True, trust_remote_code=True)
    # model = AutoModel.from_pretrained(
    #     settings.Path, local_files_only=True, trust_remote_code=True)
    model = load_model_on_gpus(settings.Path, num_gpus=2)
    if not (settings.Lora == '' or settings.Lora is None):
        print('Lora model path', settings.Lora)
        from peft import PeftModel
        model = PeftModel.from_pretrained(model, settings.Lora)
    device, precision = settings.Strategy.split()
    # act according to the device
    if device == 'cpu':
        # cpu: nothing to do
        pass
    elif device == 'cuda':
        # gpu: move the model onto the card
        import torch
        if not (precision.startswith('fp16i') and torch.cuda.get_device_properties(0).total_memory < 1.4e+10):
            model = model.cuda()
    else:
        # any other device: report an error and exit
        print('Error: unsupported device')
        exit()
    # act according to the precision
    if precision == 'fp16':
        # fp16: convert the model to half precision
        model = model.half()
    elif precision == 'fp32':
        # fp32: convert the model to full precision
        model = model.float()
    elif precision.startswith('fp16i'):
        # fp16i prefix: quantize the model to the requested bit width
        # extract the number of bits from the string
        bits = int(precision[5:])
        # call quantize with the bit width
        model = model.quantize(bits)
        if device == 'cuda':
            model = model.cuda()
        model = model.half()
    elif precision.startswith('fp32i'):
        # fp32i prefix: quantize the model to the requested bit width
        # extract the number of bits from the string
        bits = int(precision[5:])
        # call quantize with the bit width
        model = model.quantize(bits)
        if device == 'cuda':
            model = model.cuda()
        model = model.float()
    else:
        # any other precision: report an error and exit
        print('Error: unsupported precision')
        exit()
    # model = model.eval()
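
For reference, with the split logic above, auto_configure_device_map(2) places transformer.word_embeddings, transformer.final_layernorm, lm_head and transformer.layers.0 through transformer.layers.12 on cuda:0, and transformer.layers.13 through transformer.layers.27 on cuda:1. A minimal sketch for inspecting the map before dispatching (the __main__ guard is only illustrative and not part of the plugin):

if __name__ == '__main__':
    # Illustrative check only: print which GPU each module would land on
    # for a 2-GPU split, before handing the map to dispatch_model.
    dm = auto_configure_device_map(2)
    for name, gpu in sorted(dm.items(), key=lambda kv: (kv[1], kv[0])):
        print(f'{name} -> cuda:{gpu}')

Note that these module names follow the original ChatGLM-6B layout; a ChatGLM3 checkpoint may expose different module names (for example under transformer.encoder.layers), so it is worth checking model.named_modules() before relying on this map.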
