diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index 08dfdbe7..8fbcfcc1 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -138,6 +138,7 @@ def pick_plan( tool_info: str, model: LMM, code_interpreter: CodeInterpreter, + media: List[str], log_progress: Callable[[Dict[str, Any]], None], verbosity: int = 0, max_retries: int = 3, @@ -155,7 +156,7 @@ def pick_plan( plan_str = format_plans(plans) prompt = TEST_PLANS.format( - docstring=tool_info, plans=plan_str, previous_attempts="" + docstring=tool_info, plans=plan_str, previous_attempts="", media=media ) code = extract_code(model(prompt)) @@ -186,6 +187,7 @@ def pick_plan( previous_attempts=PREVIOUS_FAILED.format( code=code, error=tool_output.text() ), + media=media, ) code = extract_code(model(prompt)) log_progress( @@ -718,6 +720,7 @@ def chat_with_workflow( tool_infos["all"], self.coder, code_interpreter, + media_list, self.log_progress, verbosity=self.verbosity, ) diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py index 8f5e689b..cb4c3eeb 100644 --- a/vision_agent/agent/vision_agent_coder_prompts.py +++ b/vision_agent/agent/vision_agent_coder_prompts.py @@ -66,8 +66,9 @@ **Instructions**: 1. Write a program to load the media and call each tool and save it's output. -2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove any array types from the printed dictionary. -3. Print this final dictionary. +2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary. +3. Your test case MUST run only on the given images which are {media} +4. Print this final dictionary. **Example**: plan1: