From 9c4bcbe13cb8e704bc90d07ff750a23bdaf2f756 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Sat, 31 Aug 2024 09:35:50 -0700
Subject: [PATCH 01/15] moved fine tuning to meta tools

---
 vision_agent/tools/meta_tools.py  |  43 ++++++
 vision_agent/tools/tools.py       | 214 +++++++++++++-----------------
 vision_agent/tools/tools_types.py |  14 +-
 3 files changed, 141 insertions(+), 130 deletions(-)

diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 833ad542..e04a055d 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -4,14 +4,18 @@
 import tempfile
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
+from uuid import UUID
 
 from IPython.display import display
 
 import vision_agent as va
+from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
+from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
 from vision_agent.utils.execute import Execution, MimeType
+from vision_agent.utils.image_utils import convert_to_b64
 
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
@@ -397,6 +401,45 @@ def get_tool_descriptions() -> str:
     return TOOL_DESCRIPTIONS
 
 
+def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
+    """'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
+    objects in an image based on a given dataset. It returns the fine tuning job id.
+
+    Parameters:
+        bboxes (List[BboxInput]): A list of BboxInput containing the
+            image path, labels and bounding boxes.
+        task (str): The florencev2 fine-tuning task. The options are
+            'phrase_grounding'.
+
+    Returns:
+        UUID: The fine tuning job id, this id will used to retrieve the fine
+            tuned model.
+
+    Example
+    -------
+        >>> fine_tuning_job_id = florencev2_fine_tuning(
+            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+            "OBJECT_DETECTION"
+        )
+    """
+    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+    task_type = PromptTask(task.upper())
+    fine_tuning_request = [
+        BboxInputBase64(
+            image=convert_to_b64(bbox_input.image_path),
+            filename=bbox_input.image_path.split("/")[-1],
+            labels=bbox_input.labels,
+            bboxes=bbox_input.bboxes,
+        )
+        for bbox_input in bboxes_input
+    ]
+    landing_api = LandingPublicAPI()
+    return landing_api.launch_fine_tuning_job(
+        "florencev2", task_type, fine_tuning_request
+    )
+
+
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 0695b547..92a47a99 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -29,7 +29,7 @@
     BboxInput,
     BboxInputBase64,
     FineTuning,
-    Florencev2FtRequest,
+    Florence2FtRequest,
     JobStatus,
     PromptTask,
 )
@@ -762,7 +762,7 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore
 
 
-def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+def florence2_phrase_grounding(prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
     """'florence2_phrase_grounding' is a tool that can detect multiple objects given a
     text prompt which can be object names or caption.
You can optionally separate the object names in the text with commas. It returns a list @@ -790,14 +790,31 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, """ image_size = image.shape[:2] image_b64 = convert_to_b64(image) - data = { - "image": image_b64, - "task": "", - "prompt": prompt, - "function_name": "florence2_phrase_grounding", - } - detections = send_inference_request(data, "florence2", v2=True) + if fine_tune_id is not None: + landing_api = LandingPublicAPI() + status = landing_api.check_fine_tuning_job(UUID(fine_tune_id)) + if status is not JobStatus.SUCCEEDED: + raise FineTuneModelIsNotReady(f"Fine-tuned model {fine_tune_id} is not ready yet") + + data_obj = Florence2FtRequest( + image=image_b64, + task=PromptTask.PHRASE_GROUNDING, + tool="florence2_fine_tuning", + prompt=prompt, + fine_tuning=FineTuning(job_id=UUID(fine_tune_id)) + ) + data = data_obj.model_dump(by_alias=True) + detections = send_inference_request(data, "tools", v2=False) + else: + data = { + "image": image_b64, + "task": "", + "prompt": prompt, + "function_name": "florence2_phrase_grounding", + } + detections = send_inference_request(data, "florence2", v2=True) + detections = detections[""] return_data = [] for i in range(len(detections["bboxes"])): @@ -1560,116 +1577,75 @@ def overlay_heat_map( # TODO: add this function to the imports so that is picked in the agent -def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: - """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able - to detect objects in an image based on a given dataset. It returns the fine - tuning job id. - - Parameters: - bboxes (List[BboxInput]): A list of BboxInput containing the - image path, labels and bounding boxes. - task (PromptTask): The florencev2 fine-tuning task. The options are - CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. - - Returns: - UUID: The fine tuning job id, this id will used to retrieve the fine - tuned model. - - Example - ------- - >>> fine_tuning_job_id = florencev2_fine_tuning( - [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]}, - {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}], - "OBJECT_DETECTION" - ) - """ - bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes] - task_input = PromptTask[task] - fine_tuning_request = [ - BboxInputBase64( - image=convert_to_b64(bbox_input.image_path), - filename=bbox_input.image_path.split("/")[-1], - labels=bbox_input.labels, - bboxes=bbox_input.bboxes, - ) - for bbox_input in bboxes_input - ] - landing_api = LandingPublicAPI() - return landing_api.launch_fine_tuning_job( - "florencev2", task_input, fine_tuning_request - ) - - -# TODO: add this function to the imports so that is picked in the agent -def florencev2_fine_tuned_object_detection( - image: np.ndarray, prompt: str, model_id: UUID, task: str -) -> List[Dict[str, Any]]: - """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model - to detect objects given a text prompt such as a phrase or class names separated by - commas. It returns a list of detected objects as labels and their location as - bounding boxes with score of 1.0. - - Parameters: - image (np.ndarray): The image to used to detect objects. - prompt (str): The prompt to help find objects in the image. - model_id (UUID): The fine-tuned model id. - task (PromptTask): The florencev2 fine-tuning task. 
The options are - CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. - - Returns: - List[Dict[str, Any]]: A list of dictionaries containing the score, label, and - bounding box of the detected objects with normalized coordinates between 0 - and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the - top-left and xmax and ymax are the coordinates of the bottom-right of the - bounding box. The scores are always 1.0 and cannot be thresholded - - Example - ------- - >>> florencev2_fine_tuned_object_detection( - image, - 'person looking at a coyote', - UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83") - ) - [ - {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]}, - {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5}, - ] - """ - # check if job succeeded first - landing_api = LandingPublicAPI() - status = landing_api.check_fine_tuning_job(model_id) - if status is not JobStatus.SUCCEEDED: - raise FineTuneModelIsNotReady() - - task = PromptTask[task] - if task is PromptTask.OBJECT_DETECTION: - prompt = "" - - data_obj = Florencev2FtRequest( - image=convert_to_b64(image), - task=task, - tool="florencev2_fine_tuning", - prompt=prompt, - fine_tuning=FineTuning(job_id=model_id), - ) - data = data_obj.model_dump(by_alias=True) - metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"} - detections = send_inference_request( - data, "tools", v2=False, metadata_payload=metadata_payload - ) - - detections = detections[task.value] - return_data = [] - image_size = image.shape[:2] - for i in range(len(detections["bboxes"])): - return_data.append( - { - "score": 1.0, - "label": detections["labels"][i], - "bbox": normalize_bbox(detections["bboxes"][i], image_size), - } - ) - return return_data +# def florencev2_fine_tuned_object_detection( +# image: np.ndarray, prompt: str, model_id: UUID, task: str +# ) -> List[Dict[str, Any]]: +# """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model +# to detect objects given a text prompt such as a phrase or class names separated by +# commas. It returns a list of detected objects as labels and their location as +# bounding boxes with score of 1.0. + +# Parameters: +# image (np.ndarray): The image to used to detect objects. +# prompt (str): The prompt to help find objects in the image. +# model_id (UUID): The fine-tuned model id. +# task (PromptTask): The florencev2 fine-tuning task. The options are +# CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. + +# Returns: +# List[Dict[str, Any]]: A list of dictionaries containing the score, label, and +# bounding box of the detected objects with normalized coordinates between 0 +# and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the +# top-left and xmax and ymax are the coordinates of the bottom-right of the +# bounding box. 
The scores are always 1.0 and cannot be thresholded + +# Example +# ------- +# >>> florencev2_fine_tuned_object_detection( +# image, +# 'person looking at a coyote', +# UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83") +# ) +# [ +# {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]}, +# {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5}, +# ] +# """ +# # check if job succeeded first +# landing_api = LandingPublicAPI() +# status = landing_api.check_fine_tuning_job(model_id) +# if status is not JobStatus.SUCCEEDED: +# raise FineTuneModelIsNotReady() + +# task = PromptTask[task] +# if task is PromptTask.OBJECT_DETECTION: +# prompt = "" + +# data_obj = Florencev2FtRequest( +# image=convert_to_b64(image), +# task=task, +# tool="florencev2_fine_tuning", +# prompt=prompt, +# fine_tuning=FineTuning(job_id=model_id), +# ) +# data = data_obj.model_dump(by_alias=True) +# metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"} +# detections = send_inference_request( +# data, "tools", v2=False, metadata_payload=metadata_payload +# ) + +# detections = detections[task.value] +# return_data = [] +# image_size = image.shape[:2] +# for i in range(len(detections["bboxes"])): +# return_data.append( +# { +# "score": 1.0, +# "label": detections["labels"][i], +# "bbox": normalize_bbox(detections["bboxes"][i], image_size), +# } +# ) +# return return_data FUNCTION_TOOLS = [ diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py index 7b640adb..20d178d7 100644 --- a/vision_agent/tools/tools_types.py +++ b/vision_agent/tools/tools_types.py @@ -19,16 +19,8 @@ class BboxInputBase64(BaseModel): class PromptTask(str, Enum): - """ - Valid task prompts options for the Florencev2 model. - """ - - CAPTION = "" - """""" - CAPTION_TO_PHRASE_GROUNDING = "" - """""" - OBJECT_DETECTION = "" - """""" + """Valid task prompts options for the Florence2 model.""" + PHRASE_GROUNDING = "" class FineTuning(BaseModel): @@ -41,7 +33,7 @@ def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str: return str(job_id) -class Florencev2FtRequest(BaseModel): +class Florence2FtRequest(BaseModel): model_config = ConfigDict(populate_by_name=True) image: str From 4f32079ae9a47d46d226d34f5865e5ef1f5fee23 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 07:49:51 -0700 Subject: [PATCH 02/15] fix error messages --- vision_agent/agent/vision_agent.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 2bb04343..77237954 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -76,11 +76,16 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: def run_code_action( code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str -) -> Execution: - return code_interpreter.exec_isolation( +) -> Tuple[Execution, str]: + result = code_interpreter.exec_isolation( BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path) ) + obs = str(result.logs) + if result.error: + obs += f"\n{result.error}" + return result, obs + def parse_execution(response: str) -> Optional[str]: code = None @@ -260,10 +265,9 @@ def chat_with_code( code_action = parse_execution(response["response"]) if code_action is not None: - result = run_code_action( + result, obs = run_code_action( code_action, code_interpreter, str(remote_artifacts_path) ) - obs = str(result.logs) if self.verbosity >= 1: 
_LOGGER.info(obs) From b9e7541f66afc961776abb5846c1035739520306 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 07:50:41 -0700 Subject: [PATCH 03/15] move get_diff and add use_florence2_fine_tuning --- vision_agent/agent/vision_agent_coder.py | 10 +--- vision_agent/tools/meta_tools.py | 71 ++++++++++++++++++++++-- 2 files changed, 66 insertions(+), 15 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index c8488902..dd893d1d 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -1,5 +1,4 @@ import copy -import difflib import logging import os import sys @@ -29,6 +28,7 @@ USER_REQ, ) from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM +from vision_agent.tools.meta_tools import get_diff from vision_agent.utils import CodeInterpreterFactory, Execution from vision_agent.utils.execute import CodeInterpreter from vision_agent.utils.image_utils import b64_to_pil @@ -63,14 +63,6 @@ def prepend_imports(code: str) -> str: return DefaultImports.to_code_string() + "\n\n" + code -def get_diff(before: str, after: str) -> str: - return "".join( - difflib.unified_diff( - before.splitlines(keepends=True), after.splitlines(keepends=True) - ) - ) - - def format_memory(memory: List[Dict[str, str]]) -> str: output_str = "" for i, m in enumerate(memory): diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index e04a055d..ee2e7c30 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -1,5 +1,7 @@ +import difflib import os import pickle as pkl +import re import subprocess import tempfile from pathlib import Path @@ -394,6 +396,13 @@ def write_media_artifact(artifacts: Artifacts, local_path: str) -> str: return f"[Media {Path(local_path).name} saved]" +def list_artifacts(artifacts: Artifacts) -> str: + """Lists all the artifacts that have been loaded into the artifacts object.""" + output_str = artifacts.show() + print(output_str) + return output_str + + def get_tool_descriptions() -> str: """Returns a description of all the tools that `generate_vision_code` has access to. Helpful for answering questions about what types of vision tasks you can do with @@ -401,7 +410,7 @@ def get_tool_descriptions() -> str: return TOOL_DESCRIPTIONS -def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: +def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str: """'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect objects in an image based on a given dataset. It returns the fine tuning job id. 
@@ -420,26 +429,73 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
         >>> fine_tuning_job_id = florencev2_fine_tuning(
             [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
              {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
-            "OBJECT_DETECTION"
+            "phrase_grounding"
         )
     """
     bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
-    task_type = PromptTask(task.upper())
+    task_type = PromptTask[task.upper()]
     fine_tuning_request = [
         BboxInputBase64(
             image=convert_to_b64(bbox_input.image_path),
-            filename=bbox_input.image_path.split("/")[-1],
+            filename=Path(bbox_input.image_path).name,
             labels=bbox_input.labels,
             bboxes=bbox_input.bboxes,
         )
         for bbox_input in bboxes_input
     ]
     landing_api = LandingPublicAPI()
-    return landing_api.launch_fine_tuning_job(
-        "florencev2", task_type, fine_tuning_request
+    # fine_tune_id = str(landing_api.launch_fine_tuning_job(
+    #     "florencev2", task_type, fine_tuning_request
+    # ))
+    fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf"
+    print(f"[Florence2 fine tuning id: {fine_tune_id}]")
+    return fine_tune_id
+
+
+def get_diff(before: str, after: str) -> str:
+    return "".join(
+        difflib.unified_diff(
+            before.splitlines(keepends=True), after.splitlines(keepends=True)
+        )
     )
 
 
+def use_florence2_fine_tuning(
+    artifacts: Artifacts, name: str, task: str, fine_tune_id: str
+) -> str:
+    """Replaces florence2 calls with the fine tuning id. This ensures that the code
+    utilizes the fine-tuned florence2 model. Returns the diff between the original
+    code and the new code.
+
+    Parameters:
+        artifacts (Artifacts): The artifacts object to edit the code from.
+        name (str): The name of the artifact to edit.
+        task (str): The task to fine tune the model for. The options are
+            'phrase_grounding'.
+        fine_tune_id (str): The fine tuning job id.
+ + Examples + -------- + >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf") + """ + code = artifacts[name] + if task.lower() == "phrase_grounding": + pattern = r'florence2_phrase_grounding\((".*?", .*?)\)' + + def replacer(match): + return f'florence2_phrase_grounding({match.group(1)}, "{fine_tune_id}")' + + else: + raise ValueError(f"Task {task} is not supported.") + + new_code = re.sub(pattern, replacer, code) + artifacts[name] = new_code + + diff = get_diff(code, new_code) + print(diff) + return diff + + META_TOOL_DOCSTRING = get_tool_documentation( [ get_tool_descriptions, @@ -449,5 +505,8 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: generate_vision_code, edit_vision_code, write_media_artifact, + florence2_fine_tuning, + use_florence2_fine_tuning, + list_artifacts, ] ) From d0bf79e2e4916153fba652187703fc45a8903cec Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 07:52:27 -0700 Subject: [PATCH 04/15] add fine tuning arg to florence2 --- vision_agent/tools/tools.py | 14 ++++++++++---- vision_agent/tools/tools_types.py | 1 + 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 92a47a99..1ea19f62 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -762,7 +762,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s return answer[task] # type: ignore -def florence2_phrase_grounding(prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]: +def florence2_phrase_grounding( + prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None +) -> List[Dict[str, Any]]: """'florence2_phrase_grounding' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list @@ -772,6 +774,8 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray, fine_tune_id: Opt Parameters: prompt (str): The prompt to ground to the image. image (np.ndarray): The image to used to detect objects + fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the + fine-tuned model ID here to use it. 
Returns: List[Dict[str, Any]]: A list of dictionaries containing the score, label, and @@ -795,14 +799,16 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray, fine_tune_id: Opt landing_api = LandingPublicAPI() status = landing_api.check_fine_tuning_job(UUID(fine_tune_id)) if status is not JobStatus.SUCCEEDED: - raise FineTuneModelIsNotReady(f"Fine-tuned model {fine_tune_id} is not ready yet") + raise FineTuneModelIsNotReady( + f"Fine-tuned model {fine_tune_id} is not ready yet" + ) data_obj = Florence2FtRequest( image=image_b64, task=PromptTask.PHRASE_GROUNDING, - tool="florence2_fine_tuning", + tool="florencev2_fine_tuning", prompt=prompt, - fine_tuning=FineTuning(job_id=UUID(fine_tune_id)) + fine_tuning=FineTuning(job_id=UUID(fine_tune_id)), ) data = data_obj.model_dump(by_alias=True) detections = send_inference_request(data, "tools", v2=False) diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py index 20d178d7..eb436d94 100644 --- a/vision_agent/tools/tools_types.py +++ b/vision_agent/tools/tools_types.py @@ -20,6 +20,7 @@ class BboxInputBase64(BaseModel): class PromptTask(str, Enum): """Valid task prompts options for the Florence2 model.""" + PHRASE_GROUNDING = "" From c56d73b8de0e2b7b9e16ac95818c5d05e7e80e8d Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 08:57:42 -0700 Subject: [PATCH 05/15] set notebook execute path to remote path' --- vision_agent/utils/execute.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index 37c8d260..447743d1 100644 --- a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -564,7 +564,12 @@ def __init__( ) -> None: super().__init__(timeout=timeout) self.nb = nbformat.v4.new_notebook() - self.nb_client = NotebookClient(self.nb, timeout=self.timeout) + # Set the notebook execution path to the remote path + self.nb_client = NotebookClient( + self.nb, + timeout=self.timeout, + resources={"metadata": {"path": str(self.remote_path)}}, + ) _LOGGER.info( f"""Local code interpreter initialized Python version: {sys.version} From 1802d698d32ab45f8f3eeb3bd32f09466e6c40f3 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 08:57:55 -0700 Subject: [PATCH 06/15] remove comments --- vision_agent/tools/tools.py | 74 ------------------------------------- 1 file changed, 74 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 1ea19f62..828b1ba9 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -26,8 +26,6 @@ send_inference_request, ) from vision_agent.tools.tools_types import ( - BboxInput, - BboxInputBase64, FineTuning, Florence2FtRequest, JobStatus, @@ -1582,78 +1580,6 @@ def overlay_heat_map( return np.array(combined) -# TODO: add this function to the imports so that is picked in the agent -# def florencev2_fine_tuned_object_detection( -# image: np.ndarray, prompt: str, model_id: UUID, task: str -# ) -> List[Dict[str, Any]]: -# """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model -# to detect objects given a text prompt such as a phrase or class names separated by -# commas. It returns a list of detected objects as labels and their location as -# bounding boxes with score of 1.0. - -# Parameters: -# image (np.ndarray): The image to used to detect objects. -# prompt (str): The prompt to help find objects in the image. -# model_id (UUID): The fine-tuned model id. 
-# task (PromptTask): The florencev2 fine-tuning task. The options are -# CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. - -# Returns: -# List[Dict[str, Any]]: A list of dictionaries containing the score, label, and -# bounding box of the detected objects with normalized coordinates between 0 -# and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the -# top-left and xmax and ymax are the coordinates of the bottom-right of the -# bounding box. The scores are always 1.0 and cannot be thresholded - -# Example -# ------- -# >>> florencev2_fine_tuned_object_detection( -# image, -# 'person looking at a coyote', -# UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83") -# ) -# [ -# {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]}, -# {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5}, -# ] -# """ -# # check if job succeeded first -# landing_api = LandingPublicAPI() -# status = landing_api.check_fine_tuning_job(model_id) -# if status is not JobStatus.SUCCEEDED: -# raise FineTuneModelIsNotReady() - -# task = PromptTask[task] -# if task is PromptTask.OBJECT_DETECTION: -# prompt = "" - -# data_obj = Florencev2FtRequest( -# image=convert_to_b64(image), -# task=task, -# tool="florencev2_fine_tuning", -# prompt=prompt, -# fine_tuning=FineTuning(job_id=model_id), -# ) -# data = data_obj.model_dump(by_alias=True) -# metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"} -# detections = send_inference_request( -# data, "tools", v2=False, metadata_payload=metadata_payload -# ) - -# detections = detections[task.value] -# return_data = [] -# image_size = image.shape[:2] -# for i in range(len(detections["bboxes"])): -# return_data.append( -# { -# "score": 1.0, -# "label": detections["labels"][i], -# "bbox": normalize_bbox(detections["bboxes"][i], image_size), -# } -# ) -# return return_data - - FUNCTION_TOOLS = [ owl_v2, extract_frames, From c8453e75ba3388c8e36b0f174fabb7e7fb6377a4 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 09:30:14 -0700 Subject: [PATCH 07/15] fix bug exec isolation wasn't setting resources --- vision_agent/utils/execute.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index 447743d1..33667f17 100644 --- a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -565,10 +565,11 @@ def __init__( super().__init__(timeout=timeout) self.nb = nbformat.v4.new_notebook() # Set the notebook execution path to the remote path + self.resources = {"metadata": {"path": str(self.remote_path)}} self.nb_client = NotebookClient( self.nb, timeout=self.timeout, - resources={"metadata": {"path": str(self.remote_path)}}, + resources=self.resources, ) _LOGGER.info( f"""Local code interpreter initialized @@ -611,7 +612,9 @@ def close(self) -> None: def restart_kernel(self) -> None: self.close() self.nb = nbformat.v4.new_notebook() - self.nb_client = NotebookClient(self.nb, timeout=self.timeout) + self.nb_client = NotebookClient( + self.nb, timeout=self.timeout, resources=self.resources + ) sleep(1) self._new_kernel() @@ -677,7 +680,8 @@ def get_default_instance() -> CodeInterpreter: @staticmethod def new_instance( - code_sandbox_runtime: Optional[str] = None, remote_path: Optional[str] = None + code_sandbox_runtime: Optional[str] = None, + remote_path: Optional[Union[str, Path]] = None, ) -> CodeInterpreter: if not code_sandbox_runtime: code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local") From 
5b3c0f01059df0b8cbf3bdae59e2701d40207172 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 09:49:28 -0700 Subject: [PATCH 08/15] ensure agent uses print to view results --- vision_agent/agent/vision_agent_prompts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 85e34cd5..bf9fac80 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -48,7 +48,7 @@ 4| return dogs [End of artifact] -AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false} +AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false} OBSERVATION: ----- stdout ----- @@ -75,7 +75,7 @@ 4| return dogs [End of artifact] -AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))", "let_user_respond": false} +AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))", "let_user_respond": false} OBSERVATION: ----- stdout ----- @@ -126,7 +126,7 @@ 15| return count [End of artifact] -AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')", "let_user_respond": false} +AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')", "let_user_respond": false} OBSERVATION: ----- stdout ----- From cc0e866d55cfed6bea6acfb36c0f699411bb6fd4 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 14:09:06 -0700 Subject: [PATCH 09/15] fixed bug with edit code errors --- vision_agent/tools/meta_tools.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index ee2e7c30..50c2c873 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -6,7 +6,6 @@ import tempfile from pathlib import Path from typing import Any, Dict, List, Optional, Union -from uuid import UUID from IPython.display import display @@ -105,13 +104,14 @@ def load(self, file_path: Union[str, Path]) -> None: def show(self) -> str: """Shows the artifacts that have been loaded and their remote save 
paths.""" - out_str = "[Artifacts loaded]\n" + output_str = "[Artifacts loaded]\n" for k in self.artifacts.keys(): - out_str += ( + output_str += ( f"Artifact {k} loaded to {str(self.remote_save_path.parent / k)}\n" ) - out_str += "[End of artifacts]\n" - return out_str + output_str += "[End of artifacts]\n" + print(output_str) + return output_str def save(self, local_path: Optional[Union[str, Path]] = None) -> None: save_path = ( @@ -237,7 +237,7 @@ def edit_code_artifact( new_content_lines = [ line if line.endswith("\n") else line + "\n" for line in new_content_lines ] - lines = artifacts[name].splitlines() + lines = artifacts[name].splitlines(keepends=True) edited_lines = lines[:start] + new_content_lines + lines[end:] cur_line = start + len(content.split("\n")) // 2 @@ -274,6 +274,7 @@ def edit_code_artifact( ) error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}" + print(error_msg) return error_msg artifacts[name] = "".join(edited_lines) @@ -478,6 +479,16 @@ def use_florence2_fine_tuning( -------- >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf") """ + + task_to_fn = { + "phrase_grounding": "florence2_phrase_grounding" + } + + if name not in artifacts: + output_str = f"[Artifact {name} does not exist]" + print(output_str) + return output_str + code = artifacts[name] if task.lower() == "phrase_grounding": pattern = r'florence2_phrase_grounding\((".*?", .*?)\)' @@ -489,6 +500,12 @@ def replacer(match): raise ValueError(f"Task {task} is not supported.") new_code = re.sub(pattern, replacer, code) + + if new_code == code: + output_str = f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]" + print(output_str) + return output_str + artifacts[name] = new_code diff = get_diff(code, new_code) From 7125e620f0971ce2163103498cca095ec0869431 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 14:28:25 -0700 Subject: [PATCH 10/15] fixed bug with edit code errors, and fixed replace code for fine tune --- vision_agent/tools/meta_tools.py | 33 +++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 50c2c873..93cdccf2 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -141,7 +141,12 @@ def format_lines(lines: List[str], start_idx: int) -> str: def view_lines( - lines: List[str], line_num: int, window_size: int, name: str, total_lines: int + lines: List[str], + line_num: int, + window_size: int, + name: str, + total_lines: int, + print_output: bool = True, ) -> str: start = max(0, line_num - window_size) end = min(len(lines), line_num + window_size) @@ -154,7 +159,9 @@ def view_lines( else f"[{len(lines) - end} more lines]" ) ) - print(return_str) + + if print_output: + print(return_str) return return_str @@ -267,10 +274,16 @@ def edit_code_artifact( DEFAULT_WINDOW_SIZE, name, total_lines, + print_output=False, ) total_lines_edit = sum(1 for _ in edited_lines) edited_view = view_lines( - edited_lines, cur_line, DEFAULT_WINDOW_SIZE, name, total_lines_edit + edited_lines, + cur_line, + DEFAULT_WINDOW_SIZE, + name, + total_lines_edit, + print_output=False, ) error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}" @@ -480,9 +493,7 @@ def 
use_florence2_fine_tuning( >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf") """ - task_to_fn = { - "phrase_grounding": "florence2_phrase_grounding" - } + task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"} if name not in artifacts: output_str = f"[Artifact {name} does not exist]" @@ -491,10 +502,12 @@ def use_florence2_fine_tuning( code = artifacts[name] if task.lower() == "phrase_grounding": - pattern = r'florence2_phrase_grounding\((".*?", .*?)\)' + pattern = r"florence2_phrase_grounding\(([^,]+),\s*([^\)]+)\)" def replacer(match): - return f'florence2_phrase_grounding({match.group(1)}, "{fine_tune_id}")' + arg1 = match.group(1) + arg2 = match.group(2) + return f'florence2_phrase_grounding({arg1}, {arg2}, "{fine_tune_id}")' else: raise ValueError(f"Task {task} is not supported.") @@ -502,7 +515,9 @@ def replacer(match): new_code = re.sub(pattern, replacer, code) if new_code == code: - output_str = f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]" + output_str = ( + f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]" + ) print(output_str) return output_str From 42cf17237fcd15b50422735a8cea9ce79120cd8b Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 15:13:12 -0700 Subject: [PATCH 11/15] add imports for new meta tools --- vision_agent/agent/vision_agent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 77237954..4733bb24 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -30,7 +30,7 @@ class BoilerplateCode: pre_code = [ "from typing import *", "from vision_agent.utils.execute import CodeInterpreter", - "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact", + "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning", "artifacts = Artifacts('{remote_path}')", "artifacts.load('{remote_path}')", ] @@ -197,7 +197,7 @@ def chat_with_code( artifacts = Artifacts(WORKSPACE / "artifacts.pkl") with CodeInterpreterFactory.new_instance( - code_sandbox_runtime=self.code_sandbox_runtime + code_sandbox_runtime=self.code_sandbox_runtime, ) as code_interpreter: orig_chat = copy.deepcopy(chat) int_chat = copy.deepcopy(chat) From 619780eb839b1c195451e420eab3094c2dc965e9 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 15:14:38 -0700 Subject: [PATCH 12/15] fixed type errors --- vision_agent/tools/meta_tools.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 93cdccf2..aa809818 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -458,10 +458,9 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str: for bbox_input in bboxes_input ] landing_api = LandingPublicAPI() - # fine_tune_id = str(landing_api.launch_fine_tuning_job( - # "florencev2", task_type, fine_tuning_request - # )) - fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf" + fine_tune_id = str(landing_api.launch_fine_tuning_job( + "florencev2", task_type, 
fine_tuning_request
-    ))
+    fine_tune_id = str(
+        landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
+    )
     print(f"[Florence2 fine tuning id: {fine_tune_id}]")
     return fine_tune_id
 
From 7dac4cc9cea6b268518affb76ce312ad4caf5d60 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Tue, 3 Sep 2024 15:40:10 -0700
Subject: [PATCH 14/15] fixed regex

---
 vision_agent/tools/meta_tools.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 7129a94c..3670e600 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -501,12 +501,11 @@ def use_florence2_fine_tuning(
 
     code = artifacts[name]
     if task.lower() == "phrase_grounding":
-        pattern = r"florence2_phrase_grounding\(([^,]+),\s*([^\)]+)\)"
+        pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\)"
 
         def replacer(match: re.Match) -> str:
-            arg1 = match.group(1)
-            arg2 = match.group(2)
-            return f'florence2_phrase_grounding({arg1}, {arg2}, "{fine_tune_id}")'
+            arg = match.group(1)  # capture all initial arguments
+            return f'florence2_phrase_grounding({arg}, "{fine_tune_id}")'
 
     else:
         raise ValueError(f"Task {task} is not supported.")

From 534d674114c4429a0012919837a9613fa7fcbcb6 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Tue, 3 Sep 2024 20:32:43 -0700
Subject: [PATCH 15/15] fix bug with upload return path

---
 vision_agent/utils/execute.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py
index 33667f17..0de10335 100644
--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -644,7 +644,7 @@ def upload_file(self, file_path: Union[str, Path]) -> Path:
             f.write(contents)
 
         _LOGGER.info(f"File ({file_path}) is uploaded to: {str(self.remote_path)}")
-        return Path(self.remote_path / file_path)
+        return Path(self.remote_path / Path(file_path).name)
 
     def download_file(
         self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]