From ef65ff2fc2c67704363865c39200f6c29b8aee58 Mon Sep 17 00:00:00 2001
From: shankar_ws3 <shankar.anand@landing.ai>
Date: Mon, 29 Apr 2024 22:57:04 -0700
Subject: [PATCH 1/3] Make reflection optional for cases where the LMM might
 not be able to understand the image

---
 README.md                          |  3 ++-
 vision_agent/agent/vision_agent.py | 35 +++++++++++++++++++-----------
 2 files changed, 24 insertions(+), 14 deletions(-)
diff --git a/README.md b/README.md
index 76bcc2e1..3eeb19c8 100644
--- a/README.md
+++ b/README.md
@@ -122,7 +122,7 @@ you. For example:
 
 #### Custom Tools
 You can also add your own custom tools for your vision agent to use:
-    
+
 ```python
 from vision_agent.tools import Tool, register_tool
 @register_tool
@@ -160,6 +160,7 @@ find an example that creates a custom tool for template matching [here](examples
 | BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
 | SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
 | BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
+| MaskDistance | MaskDistance returns the minimum distance between two segmentation masks in pixel units. |
 | BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is good for check if one box is contained within another box. |
 | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image. |
diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index c22c9983..b8a0b844 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -489,6 +489,7 @@ def __call__(
         image: Optional[Union[str, Path]] = None,
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
+        reflect_output: Optional[bool] = True,
     ) -> str:
         """Invoke the vision agent.
 
@@ -538,6 +539,7 @@ def chat_with_workflow(
         image: Optional[Union[str, Path]] = None,
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
+        reflect_output: Optional[bool] = True,
     ) -> Tuple[str, List[Dict]]:
         """Chat with the vision agent and return the final answer and all tool results.
 
@@ -625,20 +627,25 @@ def chat_with_workflow(
                 reflection_images = [image]
             else:
                 reflection_images = None
-            reflection = self_reflect(
-                self.reflect_model,
-                question,
-                self.tools,
-                all_tool_results,
-                final_answer,
-                reflection_images,
-            )
-            self.log_progress(f"Reflection: {reflection}")
-            parsed_reflection = parse_reflect(reflection)
-            if parsed_reflection["Finish"]:
-                break
+
+            if reflect_output:
+                reflection = self_reflect(
+                    self.reflect_model,
+                    question,
+                    self.tools,
+                    all_tool_results,
+                    final_answer,
+                    reflection_images,
+                )
+                self.log_progress(f"Reflection: {reflection}")
+                parsed_reflection = parse_reflect(reflection)
+                if parsed_reflection["Finish"]:
+                    break
+                else:
+                    reflections += "\n" + parsed_reflection["Reflection"]
             else:
-                reflections += "\n" + parsed_reflection["Reflection"]
+                self.log_progress("Reflection skipped based on user request.")
+                break
         # '<ANSWER>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
         self.log_progress(
             f"The Vision Agent has concluded this chat. <ANSWER>{final_answer}</ANSWER>"
@@ -660,12 +667,14 @@ def chat(
         image: Optional[Union[str, Path]] = None,
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
+        reflect_output: Optional[bool] = True,
     ) -> str:
         answer, _ = self.chat_with_workflow(
             chat,
             image=image,
             visualize_output=visualize_output,
             reference_data=reference_data,
+            reflect_output=reflect_output,
         )
         return answer
 

From 5bb62832ccd2e15dd753285e5441961c8ef1c013 Mon Sep 17 00:00:00 2001
From: shankar_ws3 <shankar.anand@landing.ai>
Date: Tue, 30 Apr 2024 12:18:21 -0700
Subject: [PATCH 2/3] Rename the reflect_output parameter to self_reflect

---
 vision_agent/agent/vision_agent.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index b8a0b844..b4d08cd8 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -489,7 +489,7 @@ def __call__(
         image: Optional[Union[str, Path]] = None,
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
-        reflect_output: Optional[bool] = True,
+        self_reflect: Optional[bool] = True,
     ) -> str:
         """Invoke the vision agent.
 
@@ -502,6 +502,7 @@ def __call__(
                 {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
                 where the bounding box coordinates are normalized.
             visualize_output: Whether to visualize the output.
+            self_reflect: boolean to enable and disable self reflection.
 
         Returns:
             The result of the vision agent in text.
@@ -513,6 +514,7 @@ def __call__(
             image=image,
             visualize_output=visualize_output,
             reference_data=reference_data,
+            self_reflect=self_reflect,
         )
 
     def log_progress(self, description: str) -> None:
@@ -539,7 +541,7 @@ def chat_with_workflow(
         image: Optional[Union[str, Path]] = None,
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
-        reflect_output: Optional[bool] = True,
+        self_reflect: Optional[bool] = True,
     ) -> Tuple[str, List[Dict]]:
         """Chat with the vision agent and return the final answer and all tool results.
 
@@ -552,6 +554,7 @@ def chat_with_workflow(
                 {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
                 where the bounding box coordinates are normalized.
             visualize_output: Whether to visualize the output.
+            self_reflect: boolean to enable and disable self reflection.
 
         Returns:
             A tuple where the first item is the final answer and the second item is a
@@ -628,7 +631,7 @@ def chat_with_workflow(
             else:
                 reflection_images = None
 
-            if reflect_output:
+            if self_reflect:
                 reflection = self_reflect(
                     self.reflect_model,
                     question,
@@ -644,7 +647,7 @@ def chat_with_workflow(
                 else:
                     reflections += "\n" + parsed_reflection["Reflection"]
             else:
-                self.log_progress("Reflection skipped based on user request.")
+                self.log_progress("Self Reflection skipped based on user request.")
                 break
         # '<ANSWER>' is a symbol to indicate the end of the chat, which is useful for streaming logs.
         self.log_progress(
@@ -667,14 +670,14 @@ def chat(
         image: Optional[Union[str, Path]] = None,
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
-        reflect_output: Optional[bool] = True,
+        self_reflect: Optional[bool] = True,
     ) -> str:
         answer, _ = self.chat_with_workflow(
             chat,
             image=image,
             visualize_output=visualize_output,
             reference_data=reference_data,
-            reflect_output=reflect_output,
+            self_reflect=self_reflect,
         )
         return answer
 

From cf4211acd959e3a3f7b4961bfdfae080ddbb78f4 Mon Sep 17 00:00:00 2001
From: shankar_ws3 <shankar.anand@landing.ai>
Date: Tue, 30 Apr 2024 13:08:17 -0700
Subject: [PATCH 3/3] Rename param to self_reflection so it no longer shadows
 the self_reflect function

---
 vision_agent/agent/vision_agent.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index b4d08cd8..9e5099de 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -489,7 +489,7 @@ def __call__(
         image: Optional[Union[str, Path]] = None,
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
-        self_reflect: Optional[bool] = True,
+        self_reflection: Optional[bool] = True,
     ) -> str:
         """Invoke the vision agent.
 
@@ -502,7 +502,7 @@ def __call__(
                 {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
                 where the bounding box coordinates are normalized.
             visualize_output: Whether to visualize the output.
-            self_reflect: boolean to enable and disable self reflection.
+            self_reflection: Whether to enable self reflection.
 
         Returns:
             The result of the vision agent in text.
@@ -514,7 +514,7 @@ def __call__(
             image=image,
             visualize_output=visualize_output,
             reference_data=reference_data,
-            self_reflect=self_reflect,
+            self_reflection=self_reflection,
         )
 
     def log_progress(self, description: str) -> None:
@@ -541,7 +541,7 @@ def chat_with_workflow(
         image: Optional[Union[str, Path]] = None,
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
-        self_reflect: Optional[bool] = True,
+        self_reflection: Optional[bool] = True,
     ) -> Tuple[str, List[Dict]]:
         """Chat with the vision agent and return the final answer and all tool results.
 
@@ -554,7 +554,7 @@ def chat_with_workflow(
                 {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
                 where the bounding box coordinates are normalized.
             visualize_output: Whether to visualize the output.
-            self_reflect: boolean to enable and disable self reflection.
+            self_reflection: Whether to enable self reflection.
 
         Returns:
             A tuple where the first item is the final answer and the second item is a
@@ -631,7 +631,7 @@ def chat_with_workflow(
             else:
                 reflection_images = None
 
-            if self_reflect:
+            if self_reflection:
                 reflection = self_reflect(
                     self.reflect_model,
                     question,
@@ -670,14 +670,14 @@ def chat(
         image: Optional[Union[str, Path]] = None,
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
-        self_reflect: Optional[bool] = True,
+        self_reflection: Optional[bool] = True,
     ) -> str:
         answer, _ = self.chat_with_workflow(
             chat,
             image=image,
             visualize_output=visualize_output,
             reference_data=reference_data,
-            self_reflect=self_reflect,
+            self_reflection=self_reflection,
         )
         return answer