diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index bbd2c1a5..5e7c6228 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -414,7 +414,7 @@ def __init__( ): """VisionAgent constructor. - Parameters + Parameters: task_model: the model to use for task decomposition. answer_model: the model to use for reasoning and concluding the answer. reflect_model: the model to use for self reflection. @@ -479,6 +479,21 @@ def chat_with_workflow( reference_data: Optional[Dict[str, str]] = None, visualize_output: Optional[bool] = False, ) -> Tuple[str, List[Dict]]: + """Chat with the vision agent and return the final answer and all tool results. + + Parameters: + chat: a conversation in the format of + [{"role": "user", "content": "describe your task here..."}]. + image: the input image referenced in the chat parameter. + reference_data: a dictionary containing the reference image and mask, in the + format of {"image": "image.jpg", "mask": "mask.jpg"} + visualize_output: whether to visualize the output. + + Returns: + A tuple where the first item is the final answer and the second item is a + list of all the tool results. The last item in the tool results also + contains the visualized output. + """ question = chat[0]["content"] if image: question += f" Image name: {image}"