From ef65ff2fc2c67704363865c39200f6c29b8aee58 Mon Sep 17 00:00:00 2001 From: shankar_ws3 Date: Mon, 29 Apr 2024 22:57:04 -0700 Subject: [PATCH 1/3] adding reflect to be optional for cases where LMM might not be able to understand the image --- README.md | 3 ++- vision_agent/agent/vision_agent.py | 35 +++++++++++++++++++----------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 76bcc2e1..3eeb19c8 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,7 @@ you. For example: #### Custom Tools You can also add your own custom tools for your vision agent to use: - + ```python from vision_agent.tools import Tool, register_tool @register_tool @@ -160,6 +160,7 @@ find an example that creates a custom tool for template matching [here](examples | BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. | | SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. | | BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. | +| MaskDistance | MaskDistance returns the minimum distance between two segmentation masks in pixel units. | | BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is good for check if one box is contained within another box. | | ExtractFrames | ExtractFrames extracts frames with motion from a video. | | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image. 
| diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index c22c9983..b8a0b844 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -489,6 +489,7 @@ def __call__( image: Optional[Union[str, Path]] = None, reference_data: Optional[Dict[str, str]] = None, visualize_output: Optional[bool] = False, + reflect_output: Optional[bool] = True, ) -> str: """Invoke the vision agent. @@ -538,6 +539,7 @@ def chat_with_workflow( image: Optional[Union[str, Path]] = None, reference_data: Optional[Dict[str, str]] = None, visualize_output: Optional[bool] = False, + reflect_output: Optional[bool] = True, ) -> Tuple[str, List[Dict]]: """Chat with the vision agent and return the final answer and all tool results. @@ -625,20 +627,25 @@ def chat_with_workflow( reflection_images = [image] else: reflection_images = None - reflection = self_reflect( - self.reflect_model, - question, - self.tools, - all_tool_results, - final_answer, - reflection_images, - ) - self.log_progress(f"Reflection: {reflection}") - parsed_reflection = parse_reflect(reflection) - if parsed_reflection["Finish"]: - break + + if reflect_output: + reflection = self_reflect( + self.reflect_model, + question, + self.tools, + all_tool_results, + final_answer, + reflection_images, + ) + self.log_progress(f"Reflection: {reflection}") + parsed_reflection = parse_reflect(reflection) + if parsed_reflection["Finish"]: + break + else: + reflections += "\n" + parsed_reflection["Reflection"] else: - reflections += "\n" + parsed_reflection["Reflection"] + self.log_progress("Reflection skipped based on user request.") + break # '' is a symbol to indicate the end of the chat, which is useful for streaming logs. self.log_progress( f"The Vision Agent has concluded this chat. 
{final_answer}" @@ -660,12 +667,14 @@ def chat( image: Optional[Union[str, Path]] = None, reference_data: Optional[Dict[str, str]] = None, visualize_output: Optional[bool] = False, + reflect_output: Optional[bool] = True, ) -> str: answer, _ = self.chat_with_workflow( chat, image=image, visualize_output=visualize_output, reference_data=reference_data, + reflect_output=reflect_output, ) return answer From 5bb62832ccd2e15dd753285e5441961c8ef1c013 Mon Sep 17 00:00:00 2001 From: shankar_ws3 Date: Tue, 30 Apr 2024 12:18:21 -0700 Subject: [PATCH 2/3] changed the param name to self_reflect --- vision_agent/agent/vision_agent.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index b8a0b844..b4d08cd8 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -489,7 +489,7 @@ def __call__( image: Optional[Union[str, Path]] = None, reference_data: Optional[Dict[str, str]] = None, visualize_output: Optional[bool] = False, - reflect_output: Optional[bool] = True, + self_reflect: Optional[bool] = True, ) -> str: """Invoke the vision agent. @@ -502,6 +502,7 @@ def __call__( {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]} where the bounding box coordinates are normalized. visualize_output: Whether to visualize the output. + self_reflect: boolean to enable and disable self reflection. Returns: The result of the vision agent in text. 
@@ -513,6 +514,7 @@ def __call__( image=image, visualize_output=visualize_output, reference_data=reference_data, + self_reflect=self_reflect, ) def log_progress(self, description: str) -> None: @@ -539,7 +541,7 @@ def chat_with_workflow( image: Optional[Union[str, Path]] = None, reference_data: Optional[Dict[str, str]] = None, visualize_output: Optional[bool] = False, - reflect_output: Optional[bool] = True, + self_reflect: Optional[bool] = True, ) -> Tuple[str, List[Dict]]: """Chat with the vision agent and return the final answer and all tool results. @@ -552,6 +554,7 @@ def chat_with_workflow( {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]} where the bounding box coordinates are normalized. visualize_output: Whether to visualize the output. + self_reflect: boolean to enable and disable self reflection. Returns: A tuple where the first item is the final answer and the second item is a @@ -628,7 +631,7 @@ def chat_with_workflow( else: reflection_images = None - if reflect_output: + if self_reflect: reflection = self_reflect( self.reflect_model, question, @@ -644,7 +647,7 @@ def chat_with_workflow( else: reflections += "\n" + parsed_reflection["Reflection"] else: - self.log_progress("Reflection skipped based on user request.") + self.log_progress("Self Reflection skipped based on user request.") break # '' is a symbol to indicate the end of the chat, which is useful for streaming logs. 
self.log_progress( @@ -667,14 +670,14 @@ def chat( image: Optional[Union[str, Path]] = None, reference_data: Optional[Dict[str, str]] = None, visualize_output: Optional[bool] = False, - reflect_output: Optional[bool] = True, + self_reflect: Optional[bool] = True, ) -> str: answer, _ = self.chat_with_workflow( chat, image=image, visualize_output=visualize_output, reference_data=reference_data, - reflect_output=reflect_output, + self_reflect=self_reflect, ) return answer From cf4211acd959e3a3f7b4961bfdfae080ddbb78f4 Mon Sep 17 00:00:00 2001 From: shankar_ws3 Date: Tue, 30 Apr 2024 13:08:17 -0700 Subject: [PATCH 3/3] fixing param name as it overlaps with function call --- vision_agent/agent/vision_agent.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index b4d08cd8..9e5099de 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -489,7 +489,7 @@ def __call__( image: Optional[Union[str, Path]] = None, reference_data: Optional[Dict[str, str]] = None, visualize_output: Optional[bool] = False, - self_reflect: Optional[bool] = True, + self_reflection: Optional[bool] = True, ) -> str: """Invoke the vision agent. @@ -502,7 +502,7 @@ def __call__( {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]} where the bounding box coordinates are normalized. visualize_output: Whether to visualize the output. - self_reflect: boolean to enable and disable self reflection. + self_reflection: Whether to enable self reflection. Returns: The result of the vision agent in text. 
@@ -514,7 +514,7 @@ def __call__( image=image, visualize_output=visualize_output, reference_data=reference_data, - self_reflect=self_reflect, + self_reflection=self_reflection, ) def log_progress(self, description: str) -> None: @@ -541,7 +541,7 @@ def chat_with_workflow( image: Optional[Union[str, Path]] = None, reference_data: Optional[Dict[str, str]] = None, visualize_output: Optional[bool] = False, - self_reflect: Optional[bool] = True, + self_reflection: Optional[bool] = True, ) -> Tuple[str, List[Dict]]: """Chat with the vision agent and return the final answer and all tool results. @@ -554,7 +554,7 @@ def chat_with_workflow( {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]} where the bounding box coordinates are normalized. visualize_output: Whether to visualize the output. - self_reflect: boolean to enable and disable self reflection. + self_reflection: Whether to enable self reflection. Returns: A tuple where the first item is the final answer and the second item is a @@ -631,7 +631,7 @@ def chat_with_workflow( else: reflection_images = None - if self_reflect: + if self_reflection: reflection = self_reflect( self.reflect_model, question, @@ -670,14 +670,14 @@ def chat( image: Optional[Union[str, Path]] = None, reference_data: Optional[Dict[str, str]] = None, visualize_output: Optional[bool] = False, - self_reflect: Optional[bool] = True, + self_reflection: Optional[bool] = True, ) -> str: answer, _ = self.chat_with_workflow( chat, image=image, visualize_output=visualize_output, reference_data=reference_data, - self_reflect=self_reflect, + self_reflection=self_reflection, ) return answer