diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 2bb04343..4733bb24 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -30,7 +30,7 @@ class BoilerplateCode:
     pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning",
         "artifacts = Artifacts('{remote_path}')",
         "artifacts.load('{remote_path}')",
     ]
@@ -76,11 +76,16 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
 
 def run_code_action(
     code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
-) -> Execution:
-    return code_interpreter.exec_isolation(
+) -> Tuple[Execution, str]:
+    result = code_interpreter.exec_isolation(
         BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
     )
+    obs = str(result.logs)
+    if result.error:
+        obs += f"\n{result.error}"
+    return result, obs
+
 
 def parse_execution(response: str) -> Optional[str]:
     code = None
@@ -192,7 +197,7 @@ def chat_with_code(
         artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
 
         with CodeInterpreterFactory.new_instance(
-            code_sandbox_runtime=self.code_sandbox_runtime
+            code_sandbox_runtime=self.code_sandbox_runtime,
         ) as code_interpreter:
             orig_chat = copy.deepcopy(chat)
             int_chat = copy.deepcopy(chat)
@@ -260,10 +265,9 @@ def chat_with_code(
 
                 code_action = parse_execution(response["response"])
                 if code_action is not None:
-                    result = run_code_action(
+                    result, obs = run_code_action(
                         code_action, code_interpreter, str(remote_artifacts_path)
                     )
-                    obs = str(result.logs)
 
                 if self.verbosity >= 1:
                     _LOGGER.info(obs)
diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py
index c8488902..dd893d1d 100644
--- a/vision_agent/agent/vision_agent_coder.py
+++ b/vision_agent/agent/vision_agent_coder.py
@@ -1,5 +1,4 @@
 import copy
-import difflib
 import logging
 import os
 import sys
@@ -29,6 +28,7 @@
     USER_REQ,
 )
 from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM
+from vision_agent.tools.meta_tools import get_diff
 from vision_agent.utils import CodeInterpreterFactory, Execution
 from vision_agent.utils.execute import CodeInterpreter
 from vision_agent.utils.image_utils import b64_to_pil
@@ -63,14 +63,6 @@ def prepend_imports(code: str) -> str:
     return DefaultImports.to_code_string() + "\n\n" + code
 
 
-def get_diff(before: str, after: str) -> str:
-    return "".join(
-        difflib.unified_diff(
-            before.splitlines(keepends=True), after.splitlines(keepends=True)
-        )
-    )
-
-
 def format_memory(memory: List[Dict[str, str]]) -> str:
     output_str = ""
     for i, m in enumerate(memory):
diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index 85e34cd5..bf9fac80 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -48,7 +48,7 @@
 4|     return dogs
 [End of artifact]
 
-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n    print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "from dog_detector import detect_dogs\n    print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -75,7 +75,7 @@
 4|     return dogs
 [End of artifact]
 
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n    print(detect_dogs('/path/to/images/dog.jpg'))", "let_user_respond": false}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "from dog_detector import detect_dogs\n    print(detect_dogs('/path/to/images/dog.jpg'))", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -126,7 +126,7 @@
 15|     return count
 [End of artifact]
 
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "from code import count_workers_with_helmets\n    print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n    write_media_artifact(artifacts, 'workers_viz.png')", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visualization.", "response": "from code import count_workers_with_helmets\n    print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n    write_media_artifact(artifacts, 'workers_viz.png')", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 833ad542..3670e600 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -1,5 +1,7 @@
+import difflib
 import os
 import pickle as pkl
+import re
 import subprocess
 import tempfile
 from pathlib import Path
@@ -8,10 +10,13 @@
 from IPython.display import display
 
 import vision_agent as va
+from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
+from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
 from vision_agent.utils.execute import Execution, MimeType
+from vision_agent.utils.image_utils import convert_to_b64
 
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
@@ -99,13 +104,14 @@ def load(self, file_path: Union[str, Path]) -> None:
 
     def show(self) -> str:
         """Shows the artifacts that have been loaded and their remote save paths."""
-        out_str = "[Artifacts loaded]\n"
+        output_str = "[Artifacts loaded]\n"
         for k in self.artifacts.keys():
-            out_str += (
+            output_str += (
                 f"Artifact {k} loaded to {str(self.remote_save_path.parent / k)}\n"
             )
-        out_str += "[End of artifacts]\n"
-        return out_str
+        output_str += "[End of artifacts]\n"
+        print(output_str)
+        return output_str
 
     def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
         save_path = (
@@ -135,7 +141,12 @@ def format_lines(lines: List[str], start_idx: int) -> str:
 
 def view_lines(
-    lines: List[str], line_num: int, window_size: int, name: str, total_lines: int
+    lines: List[str],
+    line_num: int,
+    window_size: int,
+    name: str,
+    total_lines: int,
+    print_output: bool = True,
 ) -> str:
     start = max(0, line_num - window_size)
     end = min(len(lines), line_num + window_size)
@@ -148,7 +159,9 @@ def view_lines(
             else f"[{len(lines) - end} more lines]"
         )
     )
-    print(return_str)
+
+    if print_output:
+        print(return_str)
     return return_str
 
 
@@ -231,7 +244,7 @@ def edit_code_artifact(
     new_content_lines = [
         line if line.endswith("\n") else line + "\n" for line in new_content_lines
     ]
-    lines = artifacts[name].splitlines()
+    lines = artifacts[name].splitlines(keepends=True)
     edited_lines = lines[:start] + new_content_lines + lines[end:]
 
     cur_line = start + len(content.split("\n")) // 2
@@ -261,13 +274,20 @@ def edit_code_artifact(
             DEFAULT_WINDOW_SIZE,
             name,
             total_lines,
+            print_output=False,
         )
         total_lines_edit = sum(1 for _ in edited_lines)
         edited_view = view_lines(
-            edited_lines, cur_line, DEFAULT_WINDOW_SIZE, name, total_lines_edit
+            edited_lines,
+            cur_line,
+            DEFAULT_WINDOW_SIZE,
+            name,
+            total_lines_edit,
+            print_output=False,
        )
 
         error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}"
+        print(error_msg)
         return error_msg
 
     artifacts[name] = "".join(edited_lines)
@@ -390,6 +410,13 @@ def write_media_artifact(artifacts: Artifacts, local_path: str) -> str:
     return f"[Media {Path(local_path).name} saved]"
 
 
+def list_artifacts(artifacts: Artifacts) -> str:
+    """Lists all the artifacts that have been loaded into the artifacts object."""
+    output_str = artifacts.show()
+    print(output_str)
+    return output_str
+
+
 def get_tool_descriptions() -> str:
     """Returns a description of all the tools that `generate_vision_code` has access to.
     Helpful for answering questions about what types of vision tasks you can do with
@@ -397,6 +424,108 @@ def get_tool_descriptions() -> str:
     return TOOL_DESCRIPTIONS
 
 
+def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
+    """'florence2_fine_tuning' is a tool that fine-tunes florence2 to be able to detect
+    objects in an image based on a given dataset. It returns the fine-tuning job id.
+
+    Parameters:
+        bboxes (List[BboxInput]): A list of BboxInput containing the
+            image path, labels and bounding boxes.
+        task (str): The florence2 fine-tuning task. The only option is
+            'phrase_grounding'.
+
+    Returns:
+        str: The fine-tuning job id. This id will be used to retrieve the
+        fine-tuned model.
+
+    Example
+    -------
+        >>> fine_tuning_job_id = florence2_fine_tuning(
+            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+            "phrase_grounding"
+        )
+    """
+    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+    task_type = PromptTask[task.upper()]
+    fine_tuning_request = [
+        BboxInputBase64(
+            image=convert_to_b64(bbox_input.image_path),
+            filename=Path(bbox_input.image_path).name,
+            labels=bbox_input.labels,
+            bboxes=bbox_input.bboxes,
+        )
+        for bbox_input in bboxes_input
+    ]
+    landing_api = LandingPublicAPI()
+    fine_tune_id = str(
+        landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
+    )
+    print(f"[Florence2 fine tuning id: {fine_tune_id}]")
+    return fine_tune_id
+
+
+def get_diff(before: str, after: str) -> str:
+    return "".join(
+        difflib.unified_diff(
+            before.splitlines(keepends=True), after.splitlines(keepends=True)
+        )
+    )
+
+
+def use_florence2_fine_tuning(
+    artifacts: Artifacts, name: str, task: str, fine_tune_id: str
+) -> str:
+    """Replaces florence2 calls with calls that use the fine-tuning id. This ensures
+    that the code utilizes the fine-tuned florence2 model. Returns the diff between
+    the original code and the new code.
+
+    Parameters:
+        artifacts (Artifacts): The artifacts object to edit the code from.
+        name (str): The name of the artifact to edit.
+        task (str): The task to fine-tune the model for. The only option is
+            'phrase_grounding'.
+        fine_tune_id (str): The fine-tuning job id.
+
+    Examples
+    --------
+        >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")
+    """
+
+    task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"}
+
+    if name not in artifacts:
+        output_str = f"[Artifact {name} does not exist]"
+        print(output_str)
+        return output_str
+
+    code = artifacts[name]
+    if task.lower() == "phrase_grounding":
+        pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\)"
+
+        def replacer(match: re.Match) -> str:
+            arg = match.group(1)  # capture all initial arguments
+            return f'florence2_phrase_grounding({arg}, "{fine_tune_id}")'
+
+    else:
+        raise ValueError(f"Task {task} is not supported.")
+
+    new_code = re.sub(pattern, replacer, code)
+
+    if new_code == code:
+        output_str = (
+            f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]"
+        )
+        print(output_str)
+        return output_str
+
+    artifacts[name] = new_code
+
+    diff = get_diff(code, new_code)
+    print(diff)
+    return diff
+
+
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
@@ -406,5 +535,8 @@ def get_tool_descriptions() -> str:
         generate_vision_code,
         edit_vision_code,
         write_media_artifact,
+        florence2_fine_tuning,
+        use_florence2_fine_tuning,
+        list_artifacts,
     ]
 )
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 8012e60d..958b2cf6 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -28,10 +28,8 @@
     filter_bboxes_by_threshold,
 )
 from vision_agent.tools.tools_types import (
-    BboxInput,
-    BboxInputBase64,
     FineTuning,
-    Florencev2FtRequest,
+    Florence2FtRequest,
     JobStatus,
     PromptTask,
     ODResponseData,
@@ -867,7 +865,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore
 
 
-def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+def florence2_phrase_grounding(
+    prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+) -> List[Dict[str, Any]]:
     """'florence2_phrase_grounding' is a tool that can detect multiple
     objects given a text prompt which can be object names or caption. You
     can optionally separate the object names in the text with commas. It returns a list
@@ -877,6 +877,8 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
     Parameters:
         prompt (str): The prompt to ground to the image.
         image (np.ndarray): The image to used to detect objects
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -895,14 +897,33 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
-    data = {
-        "image": image_b64,
-        "task": "<CAPTION_TO_PHRASE_GROUNDING>",
-        "prompt": prompt,
-        "function_name": "florence2_phrase_grounding",
-    }
-    detections = send_inference_request(data, "florence2", v2=True)
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        data_obj = Florence2FtRequest(
+            image=image_b64,
+            task=PromptTask.PHRASE_GROUNDING,
+            tool="florencev2_fine_tuning",
+            prompt=prompt,
+            fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+        )
+        data = data_obj.model_dump(by_alias=True)
+        detections = send_inference_request(data, "tools", v2=False)
+    else:
+        data = {
+            "image": image_b64,
+            "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+            "prompt": prompt,
+            "function_name": "florence2_phrase_grounding",
+        }
+        detections = send_inference_request(data, "florence2", v2=True)
+
+    detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
     return_data = []
     for i in range(len(detections["bboxes"])):
@@ -1732,119 +1753,6 @@ def overlay_counting_results(
     return np.array(pil_image)
 
 
-# TODO: add this function to the imports so that is picked in the agent
-def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
-    """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
-    to detect objects in an image based on a given dataset. It returns the fine
-    tuning job id.
-
-    Parameters:
-        bboxes (List[BboxInput]): A list of BboxInput containing the
-            image path, labels and bounding boxes.
-        task (PromptTask): The florencev2 fine-tuning task. The options are
-            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
-    Returns:
-        UUID: The fine tuning job id, this id will used to retrieve the fine
-        tuned model.
-
-    Example
-    -------
-        >>> fine_tuning_job_id = florencev2_fine_tuning(
-            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
-             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
-            "OBJECT_DETECTION"
-        )
-    """
-    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
-    task_input = PromptTask[task]
-    fine_tuning_request = [
-        BboxInputBase64(
-            image=convert_to_b64(bbox_input.image_path),
-            filename=bbox_input.image_path.split("/")[-1],
-            labels=bbox_input.labels,
-            bboxes=bbox_input.bboxes,
-        )
-        for bbox_input in bboxes_input
-    ]
-    landing_api = LandingPublicAPI()
-    return landing_api.launch_fine_tuning_job(
-        "florencev2", task_input, fine_tuning_request
-    )
-
-
-# TODO: add this function to the imports so that is picked in the agent
-def florencev2_fine_tuned_object_detection(
-    image: np.ndarray, prompt: str, model_id: UUID, task: str
-) -> List[Dict[str, Any]]:
-    """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
-    to detect objects given a text prompt such as a phrase or class names separated by
-    commas. It returns a list of detected objects as labels and their location as
-    bounding boxes with score of 1.0.
-
-    Parameters:
-        image (np.ndarray): The image to used to detect objects.
-        prompt (str): The prompt to help find objects in the image.
-        model_id (UUID): The fine-tuned model id.
-        task (PromptTask): The florencev2 fine-tuning task. The options are
-            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
-        bounding box of the detected objects with normalized coordinates between 0
-        and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
-        top-left and xmax and ymax are the coordinates of the bottom-right of the
-        bounding box. The scores are always 1.0 and cannot be thresholded
-
-    Example
-    -------
-        >>> florencev2_fine_tuned_object_detection(
-            image,
-            'person looking at a coyote',
-            UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
-        )
-        [
-            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
-            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
-        ]
-    """
-    # check if job succeeded first
-    landing_api = LandingPublicAPI()
-    status = landing_api.check_fine_tuning_job(model_id)
-    if status is not JobStatus.SUCCEEDED:
-        raise FineTuneModelIsNotReady()
-
-    task = PromptTask[task]
-    if task is PromptTask.OBJECT_DETECTION:
-        prompt = ""
-
-    data_obj = Florencev2FtRequest(
-        image=convert_to_b64(image),
-        task=task,
-        tool="florencev2_fine_tuning",
-        prompt=prompt,
-        fine_tuning=FineTuning(job_id=model_id),
-    )
-    data = data_obj.model_dump(by_alias=True)
-    metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
-    detections = send_inference_request(
-        data, "tools", v2=False, metadata_payload=metadata_payload
-    )
-
-    detections = detections[task.value]
-    return_data = []
-    image_size = image.shape[:2]
-    for i in range(len(detections["bboxes"])):
-        return_data.append(
-            {
-                "score": 1.0,
-                "label": detections["labels"][i],
-                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
-            }
-        )
-    return return_data
-
-
 FUNCTION_TOOLS = [
     owl_v2,
     extract_frames,
diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py
index af1e8ee9..f61c2cf1 100644
--- a/vision_agent/tools/tools_types.py
+++ b/vision_agent/tools/tools_types.py
@@ -19,16 +19,9 @@ class BboxInputBase64(BaseModel):
 
 
 class PromptTask(str, Enum):
-    """
-    Valid task prompts options for the Florencev2 model.
-    """
+    """Valid task prompt options for the Florence2 model."""
 
-    CAPTION = "<CAPTION>"
-    """"""
-    CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
-    """"""
-    OBJECT_DETECTION = "<OD>"
-    """"""
+    PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
 
 
 class FineTuning(BaseModel):
@@ -41,7 +34,7 @@ def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
         return str(job_id)
 
 
-class Florencev2FtRequest(BaseModel):
+class Florence2FtRequest(BaseModel):
     model_config = ConfigDict(populate_by_name=True)
 
     image: str
diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py
index 37c8d260..0de10335 100644
--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -564,7 +564,13 @@ def __init__(
     ) -> None:
         super().__init__(timeout=timeout)
         self.nb = nbformat.v4.new_notebook()
-        self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+        # Set the notebook execution path to the remote path
+        self.resources = {"metadata": {"path": str(self.remote_path)}}
+        self.nb_client = NotebookClient(
+            self.nb,
+            timeout=self.timeout,
+            resources=self.resources,
+        )
         _LOGGER.info(
             f"""Local code interpreter initialized
 Python version: {sys.version}
@@ -606,7 +612,9 @@ def close(self) -> None:
 
     def restart_kernel(self) -> None:
         self.close()
         self.nb = nbformat.v4.new_notebook()
-        self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+        self.nb_client = NotebookClient(
+            self.nb, timeout=self.timeout, resources=self.resources
+        )
         sleep(1)
         self._new_kernel()
@@ -636,7 +644,7 @@ def upload_file(self, file_path: Union[str, Path]) -> Path:
             f.write(contents)
         _LOGGER.info(f"File ({file_path}) is uploaded to: {str(self.remote_path)}")
 
-        return Path(self.remote_path / file_path)
+        return Path(self.remote_path / Path(file_path).name)
 
     def download_file(
         self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
@@ -672,7 +680,8 @@ def get_default_instance() -> CodeInterpreter:
 
     @staticmethod
     def new_instance(
-        code_sandbox_runtime: Optional[str] = None, remote_path: Optional[str] = None
+        code_sandbox_runtime: Optional[str] = None,
+        remote_path: Optional[Union[str, Path]] = None,
     ) -> CodeInterpreter:
        if not code_sandbox_runtime:
            code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local")
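
Taken together, these changes replace the old florencev2 helpers in tools.py with a florence2_fine_tuning meta-tool, let florence2_phrase_grounding run against a fine-tuned model via the new fine_tune_id parameter, and add use_florence2_fine_tuning to rewrite code already stored in an artifact. A minimal usage sketch, in the doctest style the diff's own docstrings use — the image path, labels, boxes, and artifact names below are hypothetical, `image` is assumed to be an np.ndarray loaded elsewhere, and a fine_tune_id is only accepted once the job has reached JobStatus.SUCCEEDED:

    >>> from vision_agent.tools.meta_tools import Artifacts, florence2_fine_tuning, use_florence2_fine_tuning
    >>> from vision_agent.tools.tools import florence2_phrase_grounding
    >>> # Launch a phrase-grounding fine-tuning job; the job id comes back as a string.
    >>> job_id = florence2_fine_tuning(
    ...     [{'image_path': 'screws.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]}],
    ...     "phrase_grounding",
    ... )
    >>> # Option 1: call the detection tool directly with the fine-tuned model id.
    >>> dets = florence2_phrase_grounding("screw", image, fine_tune_id=job_id)
    >>> # Option 2: rewrite florence2_phrase_grounding calls inside an existing code artifact.
    >>> artifacts = Artifacts('artifacts.pkl')
    >>> diff = use_florence2_fine_tuning(artifacts, 'code.py', 'phrase_grounding', job_id)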