diff --git a/vision_agent/agent/vision_agent_v3.py b/vision_agent/agent/vision_agent_v3.py index f774a8f4..6d194d9c 100644 --- a/vision_agent/agent/vision_agent_v3.py +++ b/vision_agent/agent/vision_agent_v3.py @@ -3,7 +3,7 @@ import logging import sys from pathlib import Path -from typing import Any, Dict, List, Optional, Union, cast, Callable +from typing import Any, Dict, List, Optional, Union, cast, Callable, no_type_check from rich.console import Console from rich.syntax import Syntax @@ -117,6 +117,7 @@ def write_and_test_code( log_progress: Callable[[Dict[str, Any]], None], verbosity: int = 0, max_retries: int = 3, + input_media: Optional[Union[str, Path]] = None, ) -> Dict[str, Any]: code = extract_code( coder(CODE.format(docstring=tool_info, question=task, feedback=working_memory)) @@ -124,14 +125,18 @@ def write_and_test_code( test = extract_code( tester( SIMPLE_TEST.format( - docstring=tool_utils, question=task, code=code, feedback=working_memory + docstring=tool_utils, + question=task, + code=code, + feedback=working_memory, + media=input_media, ) ) ) success, result = _EXECUTE.run_isolation(f"{code}\n{test}") if verbosity == 2: - _LOGGER.info("First code and tests:") + _LOGGER.info("Initial code and tests:") log_progress( { "log": "Code:", @@ -153,7 +158,7 @@ def write_and_test_code( "result": result, } ) - _LOGGER.info(f"First result: {result}") + _LOGGER.info(f"Initial result: {result}") count = 0 new_working_memory = [] @@ -198,16 +203,18 @@ def write_and_test_code( _LOGGER.info(f"Debug result: {result}") count += 1 - if verbosity == 1: + if verbosity >= 1: + _LOGGER.info("Final code and tests:") _CONSOLE.print( Syntax(f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True) ) - _LOGGER.info(f"Result: {result}") + _LOGGER.info(f"Final Result: {result}") return { "code": code, "test": test, "success": success, + "test_result": result, "working_memory": new_working_memory, } @@ -263,23 +270,26 @@ def __init__( else tool_recommender ) 
self.verbosity = verbosity - self.max_retries = 3 + self.max_retries = 2 self.report_progress_callback = report_progress_callback + @no_type_check def __call__( self, input: Union[List[Dict[str, str]], str], image: Optional[Union[str, Path]] = None, - ) -> str: + ) -> Dict[str, Any]: if isinstance(input, str): input = [{"role": "user", "content": input}] results = self.chat_with_workflow(input, image) - return results["code"] # type: ignore + results.pop("working_memory") + return results def chat_with_workflow( self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None, + self_reflection: bool = False, ) -> Dict[str, Any]: if len(chat) == 0: raise ValueError("Chat cannot be empty.") @@ -302,13 +312,14 @@ def chat_with_workflow( chat, TOOL_DESCRIPTIONS, format_memory(working_memory), self.planner ) plan_i_str = "\n-".join([e["instructions"] for e in plan_i]) - if self.verbosity == 1 or self.verbosity == 2: + if self.verbosity >= 1: self.log_progress( { "log": "Going to run the following plan(s) in sequence:\n", "plan": plan_i, } ) + _LOGGER.info( f""" {tabulate(tabular_data=plan_i, headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}""" @@ -330,6 +341,7 @@ def chat_with_workflow( self.debugger, self.log_progress, verbosity=self.verbosity, + input_media=image, ) success = cast(bool, results["success"]) code = cast(str, results["code"]) @@ -337,18 +349,21 @@ def chat_with_workflow( working_memory.extend(results["working_memory"]) # type: ignore plan.append({"code": code, "test": test, "plan": plan_i}) - reflection = reflect(chat, plan_i_str, code, self.planner) - if self.verbosity > 0: - self.log_progress( - { - "log": "Reflection:", - "reflection": reflection, - } - ) - _LOGGER.info(f"Reflection: {reflection}") - feedback = cast(str, reflection["feedback"]) - success = cast(bool, reflection["success"]) - working_memory.append({"code": f"{code}\n{test}", "feedback": feedback}) + if self_reflection: + reflection = reflect(chat, 
plan_i_str, code, self.planner) + if self.verbosity > 0: + self.log_progress( + { + "log": "Reflection:", + "reflection": reflection, + } + ) + _LOGGER.info(f"Reflection: {reflection}") + feedback = cast(str, reflection["feedback"]) + success = cast(bool, reflection["success"]) + working_memory.append({"code": f"{code}\n{test}", "feedback": feedback}) + + retries += 1 self.log_progress( { @@ -360,6 +375,7 @@ def chat_with_workflow( return { "code": code, "test": test, + "test_result": results["test_result"], "plan": plan, "working_memory": working_memory, } diff --git a/vision_agent/agent/vision_agent_v3_prompts.py b/vision_agent/agent/vision_agent_v3_prompts.py index 3e0813af..769559a4 100644 --- a/vision_agent/agent/vision_agent_v3_prompts.py +++ b/vision_agent/agent/vision_agent_v3_prompts.py @@ -61,6 +61,7 @@ 2. **Algorithm/Method Selection**: Decide on the most efficient way. 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode. 4. **Code Generation**: Translate your pseudocode into executable Python code. +5. **Logging**: Log the output of the custom functions that were provided to you from `from vision_agent.tools.tools_v2 import *`. Use a debug flag in the function parameters to toggle logging on and off. """ TEST = """ @@ -149,7 +150,7 @@ def find_text(image_path: str, text: str) -> str: **Input Code Snippet**: ```python -### Please decided how would you want to generate test cases. Based on incomplete code or completed version. +### Please decide how would you want to generate test cases. Based on incomplete code or completed version. {code} ``` @@ -159,8 +160,12 @@ def find_text(image_path: str, text: str) -> str: **Instructions**: 1. Verify the fundamental functionality under normal conditions. 2. Ensure each test case is well-documented with comments explaining the scenario it covers. -3. 
DO NOT use any files that are not provided by the user's instructions, your test must be run and will crash if it tries to load a non-existent file. -4. DO NOT mock any functions, you must test their functionality as is. +3. Your test case MUST run only on the given image which is {media} +4. DO NOT use any non-existent or dummy image or video files that are not provided by the user's instructions. +5. DO NOT mock any functions, you must test their functionality as is. +6. DO NOT assert the output value, run the code and verify it runs without any errors and assert only the output format or data structure. +7. DO NOT import the testing function as it will be available in the testing environment. +8. Print the output of the function that is being tested. """ diff --git a/vision_agent/tools/tools_v2.py b/vision_agent/tools/tools_v2.py index 04f4dedf..8e202856 100644 --- a/vision_agent/tools/tools_v2.py +++ b/vision_agent/tools/tools_v2.py @@ -416,12 +416,15 @@ def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float: return cast(float, np.min(dist_matrix)) -def closest_box_distance(box1: List[float], box2: List[float]) -> float: +def closest_box_distance( + box1: List[float], box2: List[float], image_size: Tuple[int, int] +) -> float: """'closest_box_distance' calculates the closest distance between two bounding boxes. Parameters: box1 (List[float]): The first bounding box. box2 (List[float]): The second bounding box. + image_size (Tuple[int, int]): The size of the image given as (height, width). Returns: float: The closest distance between the two bounding boxes. @@ -432,8 +435,8 @@ def closest_box_distance(box1: List[float], box2: List[float]) -> float: 141.42 """ - x11, y11, x12, y12 = box1 - x21, y21, x22, y22 = box2 + x11, y11, x12, y12 = denormalize_bbox(box1, image_size) + x21, y21, x22, y22 = denormalize_bbox(box2, image_size) horizontal_distance = np.max([0, x21 - x12, x11 - x22]) vertical_distance = np.max([0, y21 - y12, y11 - y22])