From 414815f44ca0741fa6bdbf50fbe44184e8ed5ff0 Mon Sep 17 00:00:00 2001 From: Tuchuanhuhuhu Date: Tue, 5 Mar 2024 15:16:21 +0800 Subject: [PATCH] bugfix: Improvements on GPT4V --- modules/models/OpenAIVision.py | 101 +++++++++------------------------ modules/models/base_model.py | 4 +- modules/presets.py | 1 + 3 files changed, 32 insertions(+), 74 deletions(-) diff --git a/modules/models/OpenAIVision.py b/modules/models/OpenAIVision.py index 2ad39c79..563c4462 100644 --- a/modules/models/OpenAIVision.py +++ b/modules/models/OpenAIVision.py @@ -43,7 +43,6 @@ def __init__( self.api_key = api_key self.need_api_key = True self.max_generation_token = 4096 - self.images = [] self._refresh_header() def get_answer_stream_iter(self): @@ -64,68 +63,6 @@ def get_answer_at_once(self): total_token_count = response["usage"]["total_tokens"] return content, total_token_count - def try_read_image(self, filepath): - def is_image_file(filepath): - # 判断文件是否为图片 - valid_image_extensions = [ - ".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff"] - file_extension = os.path.splitext(filepath)[1].lower() - return file_extension in valid_image_extensions - def image_to_base64(image_path): - # 打开并加载图片 - img = Image.open(image_path) - - # 获取图片的宽度和高度 - width, height = img.size - - # 计算压缩比例,以确保最长边小于4096像素 - max_dimension = 2048 - scale_ratio = min(max_dimension / width, max_dimension / height) - - if scale_ratio < 1: - # 按压缩比例调整图片大小 - width = int(width * scale_ratio) - height = int(height * scale_ratio) - img = img.resize((width, height), Image.LANCZOS) - # 使用新的宽度和高度计算图片的token数量 - self.image_token = self.count_image_tokens(width, height) - - # 将图片转换为jpg格式的二进制数据 - buffer = BytesIO() - if img.mode == "RGBA": - img = img.convert("RGB") - img.save(buffer, format='JPEG') - binary_image = buffer.getvalue() - - # 对二进制数据进行Base64编码 - base64_image = base64.b64encode(binary_image).decode('utf-8') - - return base64_image - - if is_image_file(filepath): - logging.info(f"读取图片文件: {filepath}") - base64_image = image_to_base64(filepath) - self.images.append({ - "path": filepath, - "base64": base64_image, - }) - - def handle_file_upload(self, files, chatbot, language): - """if the model accepts multi modal input, implement this function""" - if files: - for file in files: - if file.name: - self.try_read_image(file.name) - if self.images is not None: - chatbot = chatbot + [([image["path"] for image in self.images], None)] - return None, chatbot, None - - def prepare_inputs(self, real_inputs, use_websearch, files, reply_language, chatbot): - fake_inputs = real_inputs - display_append = "" - limited_context = False - return limited_context, fake_inputs, display_append, real_inputs, chatbot - def count_token(self, user_input): input_token_count = count_token(construct_user(user_input)) @@ -185,20 +122,38 @@ def billing_info(self): logging.error(i18n("获取API使用情况失败:") + str(e)) return STANDARD_ERROR_MSG + ERROR_RETRIEVE_MSG + def _get_gpt4v_style_history(self): + history = [] + image_buffer = [] + for message in self.history: + if message["role"] == "user": + content = [] + if image_buffer: + for image in image_buffer: + content.append( + { + "type": "image_url", + "image_url": f"data:image/{self.get_image_type(image)};base64,{self.get_base64_image(image)}" + }, + ) + if content: + content.insert(0, {"type": "text", "text": message["content"]}) + history.append(construct_user(content)) + image_buffer = [] + else: + history.append(message) + elif message["role"] == "assistant": + history.append(message) + elif message["role"] == "image": + image_buffer.append(message["content"]) + return history + + @shared.state.switching_api_key # 在不开启多账号模式的时候,这个装饰器不会起作用 def _get_response(self, stream=False): openai_api_key = self.api_key system_prompt = self.system_prompt - history = self.history - if self.images: - self.history[-1]["content"] = [ - {"type": "text", "text": self.history[-1]["content"]}, - *[{"type": "image_url", "image_url": "data:image/jpeg;base64,"+image["base64"]} for image in self.images] - ] - self.images = [] - # 添加图片token到总计数中 - self.all_token_counts[-1] += self.image_token - self.image_token = 0 + history = self._get_gpt4v_style_history() logging.debug(colorama.Fore.YELLOW + f"{history}" + colorama.Fore.RESET) diff --git a/modules/models/base_model.py b/modules/models/base_model.py index a2541378..06411632 100644 --- a/modules/models/base_model.py +++ b/modules/models/base_model.py @@ -423,7 +423,9 @@ def handle_file_upload(self, files, chatbot, language): import traceback traceback.print_exc() status = i18n("索引构建失败!") + str(e) - if not other_files: + if other_files: + other_files = [f.name for f in other_files] + else: other_files = None return gr.File.update(value=other_files), chatbot, status diff --git a/modules/presets.py b/modules/presets.py index d8d41371..67fd3a0c 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -160,6 +160,7 @@ "GPT4 Vision": { "model_name": "gpt-4-vision-preview", "token_limit": 128000, + "multimodal": True }, "Claude": { "model_name": "Claude",