From cfa0ecddea82b0d24f86f8e96006cf8193b6a600 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Sun, 18 Aug 2024 11:26:48 -0700 Subject: [PATCH 01/21] corrected name florencev2 to florence2 --- tests/integ/test_tools.py | 18 ++++++++-------- vision_agent/tools/__init__.py | 8 +++---- vision_agent/tools/tools.py | 38 +++++++++++++++++----------------- 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py index 1d99ff69..d208d2bb 100644 --- a/tests/integ/test_tools.py +++ b/tests/integ/test_tools.py @@ -8,10 +8,10 @@ depth_anything_v2, detr_segmentation, dpt_hybrid_midas, - florencev2_image_caption, - florencev2_object_detection, - florencev2_roberta_vqa, - florencev2_ocr, + florence2_image_caption, + florence2_object_detection, + florence2_roberta_vqa, + florence2_ocr, generate_pose_image, generate_soft_edge_image, git_vqa_v2, @@ -60,7 +60,7 @@ def test_owl(): def test_object_detection(): img = ski.data.coins() - result = florencev2_object_detection( + result = florence2_object_detection( image=img, prompt="coin", ) @@ -133,7 +133,7 @@ def test_image_caption() -> None: def test_florence_image_caption() -> None: img = ski.data.rocket() - result = florencev2_image_caption( + result = florence2_image_caption( image=img, ) assert "The image shows a rocket on a launch pad at night" in result.strip() @@ -168,7 +168,7 @@ def test_git_vqa_v2() -> None: def test_image_qa_with_context() -> None: img = ski.data.rocket() - result = florencev2_roberta_vqa( + result = florence2_roberta_vqa( prompt="Is the scene captured during day or night ?", image=img, ) @@ -183,9 +183,9 @@ def test_ocr() -> None: assert any("Region-based segmentation" in res["label"] for res in result) -def test_florencev2_ocr() -> None: +def test_florence2_ocr() -> None: img = ski.data.page() - result = florencev2_ocr( + result = florence2_ocr( image=img, ) assert any("Region-based segmentation" in res["label"] for res in result) diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index f9879626..b8b41588 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -16,10 +16,10 @@ detr_segmentation, dpt_hybrid_midas, extract_frames, - florencev2_image_caption, - florencev2_object_detection, - florencev2_roberta_vqa, - florencev2_ocr, + florence2_image_caption, + florence2_object_detection, + florence2_roberta_vqa, + florence2_ocr, generate_pose_image, generate_soft_edge_image, get_tool_documentation, diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 0254a455..0e64fbed 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -408,8 +408,8 @@ def loca_visual_prompt_counting( return resp_data -def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str: - """'florencev2_roberta_vqa' is a tool that takes an image and analyzes +def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str: + """'florence2_roberta_vqa' is a tool that takes an image and analyzes its contents, generates detailed captions and then tries to answer the given question using the generated context. It returns text as an answer to the question. 
@@ -422,7 +422,7 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str: Example ------- - >>> florencev2_roberta_vqa('What is the top left animal in this image ?', image) + >>> florence2_roberta_vqa('What is the top left animal in this image ?', image) 'white tiger' """ @@ -430,7 +430,7 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str: data = { "image": image_b64, "question": prompt, - "function_name": "florencev2_roberta_vqa", + "function_name": "florence2_roberta_vqa", } answer = send_inference_request(data, "florence2-qa", v2=True) @@ -580,8 +580,8 @@ def blip_image_caption(image: np.ndarray) -> str: return answer["text"][0] # type: ignore -def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str: - """'florencev2_image_caption' is a tool that can caption or describe an image based +def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> str: + """'florence2_image_caption' is a tool that can caption or describe an image based on its contents. It returns a text describing the image. Parameters: @@ -594,7 +594,7 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) -> Example ------- - >>> florencev2_image_caption(image, False) + >>> florence2_image_caption(image, False) 'This image contains a cat sitting on a table with a bowl of milk.' """ image_b64 = convert_to_b64(image) @@ -602,15 +602,15 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) -> data = { "image": image_b64, "task": task, - "function_name": "florencev2_image_caption", + "function_name": "florence2_image_caption", } answer = send_inference_request(data, "florence2", v2=True) return answer[task] # type: ignore -def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]: - """'florencev2_object_detection' is a tool that can detect objects given a text +def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]: + """'florence2_object_detection' is a tool that can detect objects given a text prompt such as a phrase or class names separated by commas. It returns a list of detected objects as labels and their location as bounding boxes with score of 1.0. @@ -627,7 +627,7 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str Example ------- - >>> florencev2_object_detection('person looking at a coyote', image) + >>> florence2_object_detection('person looking at a coyote', image) [ {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]}, {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5}, @@ -639,7 +639,7 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str "image": image_b64, "task": "", "prompt": prompt, - "function_name": "florencev2_object_detection", + "function_name": "florence2_object_detection", } detections = send_inference_request(data, "florence2", v2=True) @@ -656,8 +656,8 @@ def florencev2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str return return_data -def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]: - """'florencev2_ocr' is a tool that can detect text and text regions in an image. +def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]: + """'florence2_ocr' is a tool that can detect text and text regions in an image. Each text region contains one line of text. It returns a list of detected text, the text region as a bounding box with normalized coordinates, and confidence scores. 
The results are sorted from top-left to bottom right. @@ -671,7 +671,7 @@ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]: Example ------- - >>> florencev2_ocr(image) + >>> florence2_ocr(image) [ {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99}, ] @@ -682,7 +682,7 @@ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]: data = { "image": image_b64, "task": "", - "function_name": "florencev2_ocr", + "function_name": "florence2_ocr", } detections = send_inference_request(data, "florence2", v2=True) @@ -1295,9 +1295,9 @@ def overlay_heat_map( vit_nsfw_classification, loca_zero_shot_counting, loca_visual_prompt_counting, - florencev2_roberta_vqa, - florencev2_image_caption, - florencev2_ocr, + florence2_roberta_vqa, + florence2_image_caption, + florence2_ocr, detr_segmentation, depth_anything_v2, generate_soft_edge_image, From 28ad471164449f94e39a4d268f6600f2a5a08f30 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Sun, 18 Aug 2024 14:09:25 -0700 Subject: [PATCH 02/21] added florence2+sam2 for images --- tests/integ/test_tools.py | 12 ++++++ vision_agent/tools/__init__.py | 3 +- vision_agent/tools/tool_utils.py | 15 +++++-- vision_agent/tools/tools.py | 65 ++++++++++++++++++++++++++++--- vision_agent/utils/image_utils.py | 32 +++++++++++++++ 5 files changed, 117 insertions(+), 10 deletions(-) diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py index d208d2bb..a93ca050 100644 --- a/tests/integ/test_tools.py +++ b/tests/integ/test_tools.py @@ -12,6 +12,7 @@ florence2_object_detection, florence2_roberta_vqa, florence2_ocr, + florence2_sam2_image, generate_pose_image, generate_soft_edge_image, git_vqa_v2, @@ -88,6 +89,17 @@ def test_grounding_sam(): assert len([res["mask"] for res in result]) == 24 +def test_florence2_sam2_image(): + img = ski.data.coins() + result = florence2_sam2_image( + prompt="coin", + image=img, + ) + assert len(result) == 25 + assert [res["label"] for res in result] == ["coin"] * 25 + assert len([res["mask"] for res in result]) == 25 + + def test_segmentation(): img = ski.data.coins() result = detr_segmentation( diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index b8b41588..5025d28d 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -18,8 +18,9 @@ extract_frames, florence2_image_caption, florence2_object_detection, - florence2_roberta_vqa, florence2_ocr, + florence2_roberta_vqa, + florence2_sam2_image, generate_pose_image, generate_soft_edge_image, get_tool_documentation, diff --git a/vision_agent/tools/tool_utils.py b/vision_agent/tools/tool_utils.py index 0ff56177..e82bd10e 100644 --- a/vision_agent/tools/tool_utils.py +++ b/vision_agent/tools/tool_utils.py @@ -1,7 +1,7 @@ import inspect import logging import os -from typing import Any, Callable, Dict, List, MutableMapping, Optional +from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple import pandas as pd from IPython.display import display @@ -28,7 +28,10 @@ class ToolCallTrace(BaseModel): def send_inference_request( - payload: Dict[str, Any], endpoint_name: str, v2: bool = False + payload: Dict[str, Any], + endpoint_name: str, + files: Optional[List[Tuple[Any, ...]]] = None, + v2: bool = False, ) -> Dict[str, Any]: try: if runtime_tag := os.environ.get("RUNTIME_TAG", ""): @@ -44,7 +47,7 @@ def send_inference_request( response={}, error=None, ) - headers = {"Content-Type": "application/json", "apikey": _LND_API_KEY} + headers = {"apikey": _LND_API_KEY} if 
"TOOL_ENDPOINT_AUTH" in os.environ: headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"] headers.pop("apikey") @@ -54,7 +57,11 @@ def send_inference_request( num_retry=3, headers=headers, ) - res = session.post(url, json=payload) + + if files is not None: + res = session.post(url, data=payload, files=files) + else: + res = session.post(url, json=payload) if res.status_code != 200: tool_call_trace.error = Error( name="RemoteToolCallFailed", diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 0e64fbed..b1f0af26 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -2,34 +2,36 @@ import json import logging import tempfile -from pathlib import Path from importlib import resources +from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union, cast import cv2 -import requests import numpy as np -from pytube import YouTube # type: ignore +import requests from moviepy.editor import ImageSequenceClip from PIL import Image, ImageDraw, ImageFont from pillow_heif import register_heif_opener # type: ignore +from pytube import YouTube # type: ignore from vision_agent.tools.tool_utils import ( - send_inference_request, get_tool_descriptions, get_tool_documentation, get_tools_df, + send_inference_request, ) from vision_agent.utils import extract_frames_from_video from vision_agent.utils.execute import FileSerializer, MimeType from vision_agent.utils.image_utils import ( b64_to_pil, + convert_quad_box_to_bbox, convert_to_b64, denormalize_bbox, get_image_size, normalize_bbox, - convert_quad_box_to_bbox, + numpy_to_bytes, rle_decode, + rle_decode_array, ) register_heif_opener() @@ -242,6 +244,59 @@ def grounding_sam( return return_data +def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]: + """'florence2_sam2_image' is a tool that can segment multiple objects given a + text prompt such as category names or referring expressions. The categories in text + prompt are separated by commas. It returns a list of bounding boxes, label names, + mask file names and associated probability scores. + + Parameters: + prompt (str): The prompt to ground to the image. + image (np.ndarray): The image to ground the prompt to. + + Returns: + List[Dict[str, Any]]: A list of dictionaries containing the score, label, + bounding box, and mask of the detected objects with normalized coordinates + (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left + and xmax and ymax are the coordinates of the bottom-right of the bounding box. + The mask is binary 2D numpy array where 1 indicates the object and 0 indicates + the background. 
+ + Example + ------- + >>> florence2_sam2_image("car, dinosaur", image) + [ + { + 'score': 0.99, + 'label': 'dinosaur', + 'bbox': [0.1, 0.11, 0.35, 0.4], + 'mask': array([[0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0], + ..., + [0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), + }, + ] + """ + buffer_bytes = numpy_to_bytes(image) + + files = [("image", buffer_bytes)] + payload = { + "prompts": prompt.split(","), + "function_name": "florence2_sam2_image", + } + data: Dict[str, Any] = send_inference_request( + payload, "florence2-sam2", files=files, v2=True + ) + return_data = [] + for _, data_i in data["0"].items(): + mask = rle_decode_array(data_i["mask"]) + label = data_i["label"] + bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"]) + return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0}) + return return_data + + def extract_frames( video_uri: Union[str, Path], fps: float = 0.5 ) -> List[Tuple[np.ndarray, float]]: diff --git a/vision_agent/utils/image_utils.py b/vision_agent/utils/image_utils.py index ddbd14b3..93956638 100644 --- a/vision_agent/utils/image_utils.py +++ b/vision_agent/utils/image_utils.py @@ -1,6 +1,7 @@ """Utility functions for image processing.""" import base64 +import io from importlib import resources from io import BytesIO from pathlib import Path @@ -63,6 +64,28 @@ def rle_decode(mask_rle: str, shape: Tuple[int, int]) -> np.ndarray: return img.reshape(shape) +def rle_decode_array(rle: Dict[str, List[int]]) -> np.ndarray: + r"""Decode a run-length encoded mask. Returns numpy array, 1 - mask, 0 - background. + + Parameters: + mask: The mask in run-length encoded as an array. + """ + size = rle["size"] + counts = rle["counts"] + + total_elements = size[0] * size[1] + flattened_mask = np.zeros(total_elements, dtype=np.uint8) + + current_pos = 0 + for i, count in enumerate(counts): + if i % 2 == 1: + flattened_mask[current_pos : current_pos + count] = 1 + current_pos += count + + binary_mask = flattened_mask.reshape(size, order="F") + return binary_mask + + def b64_to_pil(b64_str: str) -> ImageType: r"""Convert a base64 string to a PIL Image. @@ -78,6 +101,15 @@ def b64_to_pil(b64_str: str) -> ImageType: return Image.open(BytesIO(base64.b64decode(b64_str))) +def numpy_to_bytes(image: np.ndarray) -> bytes: + pil_image = Image.fromarray(image).convert("RGB") + image_buffer = io.BytesIO() + pil_image.save(image_buffer, format="PNG") + buffer_bytes = image_buffer.getvalue() + image_buffer.close() + return buffer_bytes + + def get_image_size(data: Union[str, Path, np.ndarray, ImageType]) -> Tuple[int, ...]: r"""Get the size of an image. 
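
A minimal usage sketch of the florence2_sam2_image tool added in this patch. The image path and the API key in the environment are assumptions for illustration; the call mirrors the integration test above, and overlay_segmentation_masks is imported from the tools submodule rather than assumed to be re-exported.

# Sketch only: exercise the new florence2_sam2_image tool end to end.
# "coins.jpg" and the configured API key are illustrative assumptions, not part of the patch.
import numpy as np
from PIL import Image

from vision_agent.tools import florence2_sam2_image
from vision_agent.tools.tools import overlay_segmentation_masks

image = np.array(Image.open("coins.jpg").convert("RGB"))

# One dict per segment: label, normalized bbox, binary uint8 mask, and a score of 1.0.
segments = florence2_sam2_image(prompt="coin", image=image)
print(f"{len(segments)} segments found")
for seg in segments:
    print(seg["label"], seg["bbox"], seg["mask"].shape, seg["score"])

# Overlay the masks for a quick visual check; the result is an RGBA array.
vis = overlay_segmentation_masks(image, segments)
Image.fromarray(vis).save("coins_masks.png")

The masks come back already decoded through the new rle_decode_array helper, so no extra post-processing is needed before visualization.
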
From 9ce9ec8e2b1c1325ab016279cd5eb4adc1f8ed8d Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Sun, 18 Aug 2024 16:54:39 -0700 Subject: [PATCH 03/21] added ixc 2.5 --- tests/integ/test_tools.py | 10 ++++++++++ vision_agent/tools/__init__.py | 1 + vision_agent/tools/tools.py | 32 +++++++++++++++++++++++++++++++- 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py index a93ca050..0a52a0b2 100644 --- a/tests/integ/test_tools.py +++ b/tests/integ/test_tools.py @@ -13,6 +13,7 @@ florence2_roberta_vqa, florence2_ocr, florence2_sam2_image, + ixc25_image_vqa, generate_pose_image, generate_soft_edge_image, git_vqa_v2, @@ -187,6 +188,15 @@ def test_image_qa_with_context() -> None: assert "night" in result.strip() +def test_ixc25_image_vqa() -> None: + img = ski.data.cat() + result = ixc25_image_vqa( + prompt="What animal is in this image?", + image=img, + ) + assert "cat" in result.strip() + + def test_ocr() -> None: img = ski.data.page() result = ocr( diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index 5025d28d..6179062f 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -27,6 +27,7 @@ git_vqa_v2, grounding_dino, grounding_sam, + ixc25_image_vqa, load_image, loca_visual_prompt_counting, loca_zero_shot_counting, diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index b1f0af26..5cc35311 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -477,7 +477,7 @@ def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str: Example ------- - >>> florence2_roberta_vqa('What is the top left animal in this image ?', image) + >>> florence2_roberta_vqa('What is the top left animal in this image?', image) 'white tiger' """ @@ -492,6 +492,36 @@ def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str: return answer # type: ignore +def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str: + """'ixc25_image_vqa' is a tool that can answer any questions about arbitrary images + including regular images or images of documents or presentations. It returns text + as an answer to the question. + + Parameters: + prompt (str): The question about the image + image (np.ndarray): The reference image used for the question + + Returns: + str: A string which is the answer to the given prompt. + + Example + ------- + >>> ixc25_image_vqa('What is the cat doing?', image) + 'drinking milk' + """ + + buffer_bytes = numpy_to_bytes(image) + files = [("image", buffer_bytes)] + payload = { + "prompt": prompt, + "function_name": "ixc25_image_vqa", + } + data: Dict[str, Any] = send_inference_request( + payload, "internlm-xcomposer2", files=files, v2=True + ) + return data["answer"] + + def git_vqa_v2(prompt: str, image: np.ndarray) -> str: """'git_vqa_v2' is a tool that can answer questions about the visual contents of an image given a question and an image. 
It returns an answer to the From e24762e303a15f73f6cca9fad4794f9d4f7f9e8a Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Sun, 18 Aug 2024 19:45:46 -0700 Subject: [PATCH 04/21] added florence2+sam2 for video --- vision_agent/tools/__init__.py | 1 + vision_agent/tools/tools.py | 165 +++++++++++++++++++++++++----- vision_agent/utils/image_utils.py | 20 ++++ 3 files changed, 158 insertions(+), 28 deletions(-) diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index 6179062f..3369499d 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -21,6 +21,7 @@ florence2_ocr, florence2_roberta_vqa, florence2_sam2_image, + florence2_sam2_video, generate_pose_image, generate_soft_edge_image, get_tool_documentation, diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 5cc35311..46951916 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -27,6 +27,7 @@ convert_quad_box_to_bbox, convert_to_b64, denormalize_bbox, + frames_to_bytes, get_image_size, normalize_bbox, numpy_to_bytes, @@ -184,10 +185,10 @@ def grounding_sam( box_threshold: float = 0.20, iou_threshold: float = 0.20, ) -> List[Dict[str, Any]]: - """'grounding_sam' is a tool that can segment multiple objects given a - text prompt such as category names or referring expressions. The categories in text - prompt are separated by commas or periods. It returns a list of bounding boxes, - label names, mask file names and associated probability scores. + """'grounding_sam' is a tool that can segment multiple objects given a text prompt + such as category names or referring expressions. The categories in text prompt are + separated by commas or periods. It returns a list of bounding boxes, label names, + mask file names and associated probability scores. Parameters: prompt (str): The prompt to ground to the image. @@ -245,8 +246,8 @@ def grounding_sam( def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]: - """'florence2_sam2_image' is a tool that can segment multiple objects given a - text prompt such as category names or referring expressions. The categories in text + """'florence2_sam2_image' is a tool that can segment multiple objects given a text + prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, mask file names and associated probability scores. @@ -297,6 +298,63 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]] return return_data +def florence2_sam2_video( + prompt: str, frames: List[np.ndarray] +) -> List[List[Dict[str, Any]]]: + """'florence2_sam2_video' is a tool that can segment and track multiple objects + in a video given a text prompt such as category names or referring expressions. The + categories in the text prompt are separated by commas. It returns tracked objects + as masks, labels, and scores for each frame. + + Parameters: + prompt (str): The prompt to ground to the video. + frames (List[np.ndarray]): The list of frames to ground the prompt to. + + Returns: + List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label, + score and mask of the detected objects. The outer list represents each frame + and the inner list is the objects per frame. The label contains the object ID + followed by the label name. The objects are only identified in the first framed + and tracked throughout the video. 
+ + Example + ------- + >>> florence2_sam2_video("car, dinosaur", frames) + [ + [ + { + 'label': '0: dinosaur', + 'score': 1.0, + 'mask': array([[0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0], + ..., + [0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), + }, + ], + ] + """ + + buffer_bytes = frames_to_bytes(frames) + files = [("video", buffer_bytes)] + payload = { + "prompts": prompt.split(","), + "function_name": "florence2_sam2_video", + } + data: Dict[str, Any] = send_inference_request( + payload, "florence2-sam2", files=files, v2=True + ) + return_data = [] + for frame_i in data.keys(): + return_frame_data = [] + for obj_id, data_j in data[frame_i].items(): + mask = rle_decode_array(data_j["mask"]) + label = obj_id + ": " + data_j["label"] + return_frame_data.append({"label": label, "mask": mask, "score": 1.0}) + return_data.append(return_frame_data) + return return_data + + def extract_frames( video_uri: Union[str, Path], fps: float = 0.5 ) -> List[Tuple[np.ndarray, float]]: @@ -1274,15 +1332,43 @@ def overlay_bounding_boxes( return np.array(pil_image) +def _get_text_coords_from_mask( + mask: np.ndarray, v_gap: int = 10, h_gap: int = 10 +) -> Tuple[int, int]: + mask = mask.astype(np.uint8) + if np.sum(mask) == 0: + return (0, 0) + + rows, cols = np.nonzero(mask) + top = rows.min() + bottom = rows.max() + left = cols.min() + right = cols.max() + + if top - v_gap < 0: + if bottom + v_gap > mask.shape[0]: + top = top + else: + top = bottom + v_gap + else: + top = top - v_gap + + return left + (right - left) // 2 - h_gap, top + + def overlay_segmentation_masks( - image: np.ndarray, masks: List[Dict[str, Any]] -) -> np.ndarray: + medias: Union[np.ndarray, List[np.ndarray]], + masks: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]], + draw_label: bool = True, +) -> Union[np.ndarray, List[np.ndarray]]: """'overlay_segmentation_masks' is a utility function that displays segmentation masks. Parameters: - image (np.ndarray): The image to display the masks on. - masks (List[Dict[str, Any]]): A list of dictionaries containing the masks. + medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display + the masks on. + masks (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of + dictionaries containing the masks. Returns: np.ndarray: The image with the masks displayed. @@ -1302,27 +1388,50 @@ def overlay_segmentation_masks( }], ) """ - pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA") + medias_int: List[np.ndarray] = ( + [medias] if isinstance(medias, np.ndarray) else medias + ) + masks_int = [masks] if isinstance(masks[0], dict) else masks + masks_int = cast(List[List[Dict[str, Any]]], masks_int) - if len(set([mask["label"] for mask in masks])) > len(COLORS): - _LOGGER.warning( - "Number of unique labels exceeds the number of available colors. Some labels may have the same color." 
- ) + labels = set() + for mask_i in masks_int: + for mask_j in mask_i: + labels.add(mask_j["label"]) + color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(labels)} - color = { - label: COLORS[i % len(COLORS)] - for i, label in enumerate(set([mask["label"] for mask in masks])) - } - masks = sorted(masks, key=lambda x: x["label"], reverse=True) + width, height = Image.fromarray(medias_int[0]).size + fontsize = max(12, int(min(width, height) / 40)) + font = ImageFont.truetype( + str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")), + fontsize, + ) - for elt in masks: - mask = elt["mask"] - label = elt["label"] - np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4)) - np_mask[mask > 0, :] = color[label] + (255 * 0.5,) - mask_img = Image.fromarray(np_mask.astype(np.uint8)) - pil_image = Image.alpha_composite(pil_image, mask_img) - return np.array(pil_image) + frame_out = [] + for i, frame in enumerate(medias_int): + pil_image = Image.fromarray(frame.astype(np.uint8)).convert("RGBA") + for elt in masks_int[i]: + mask = elt["mask"] + label = elt["label"] + np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4)) + np_mask[mask > 0, :] = color[label] + (255 * 0.5,) + mask_img = Image.fromarray(np_mask.astype(np.uint8)) + pil_image = Image.alpha_composite(pil_image, mask_img) + + if draw_label: + draw = ImageDraw.Draw(pil_image) + text_box = draw.textbbox((0, 0), text=label, font=font) + x, y = _get_text_coords_from_mask( + mask, + v_gap=(text_box[3] - text_box[1]) + 10, + h_gap=(text_box[2] - text_box[0]) // 2, + ) + if x != 0 and y != 0: + text_box = draw.textbbox((x, y), text=label, font=font) + draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label]) + draw.text((x, y), label, fill="black", font=font) + frame_out.append(np.array(pil_image)) # type: ignore + return frame_out[0] if len(frame_out) == 1 else frame_out def overlay_heat_map( diff --git a/vision_agent/utils/image_utils.py b/vision_agent/utils/image_utils.py index 93956638..d2bc8a6d 100644 --- a/vision_agent/utils/image_utils.py +++ b/vision_agent/utils/image_utils.py @@ -2,12 +2,14 @@ import base64 import io +import tempfile from importlib import resources from io import BytesIO from pathlib import Path from typing import Dict, List, Tuple, Union import numpy as np +from moviepy.editor import ImageSequenceClip from PIL import Image, ImageDraw, ImageFont from PIL.Image import Image as ImageType @@ -86,6 +88,24 @@ def rle_decode_array(rle: Dict[str, List[int]]) -> np.ndarray: return binary_mask +def frames_to_bytes( + frames: List[np.ndarray], fps: float = 10, file_ext: str = "mp4" +) -> bytes: + r"""Convert a list of frames to a video file encoded into a byte string. + + Parameters: + frames: the list of frames + fps: the frames per second of the video + file_ext: the file extension of the video file + """ + with tempfile.NamedTemporaryFile(delete=True) as temp_file: + clip = ImageSequenceClip(frames, fps=fps) + clip.write_videofile(temp_file.name + f".{file_ext}", fps=fps) + with open(temp_file.name + f".{file_ext}", "rb") as f: + buffer_bytes = f.read() + return buffer_bytes + + def b64_to_pil(b64_str: str) -> ImageType: r"""Convert a base64 string to a PIL Image. 
From 07832e3c1dd15e91e41fffd897563ca5e44ffeb9 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Sun, 18 Aug 2024 20:04:11 -0700 Subject: [PATCH 05/21] added ixc 2.5 for video --- tests/integ/test_tools.py | 31 ++++++++- vision_agent/tools/__init__.py | 1 + vision_agent/tools/tools.py | 115 +++++++++++++++++++-------------- 3 files changed, 97 insertions(+), 50 deletions(-) diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py index 0a52a0b2..afa9dcb4 100644 --- a/tests/integ/test_tools.py +++ b/tests/integ/test_tools.py @@ -1,5 +1,6 @@ import numpy as np import skimage as ski +from PIL import Image from vision_agent.tools import ( blip_image_caption, @@ -10,15 +11,17 @@ dpt_hybrid_midas, florence2_image_caption, florence2_object_detection, - florence2_roberta_vqa, florence2_ocr, + florence2_roberta_vqa, florence2_sam2_image, - ixc25_image_vqa, + florence2_sam2_video, generate_pose_image, generate_soft_edge_image, git_vqa_v2, grounding_dino, grounding_sam, + ixc25_image_vqa, + ixc25_video_vqa, loca_visual_prompt_counting, loca_zero_shot_counting, ocr, @@ -101,6 +104,19 @@ def test_florence2_sam2_image(): assert len([res["mask"] for res in result]) == 25 +def test_florence2_sam2_video(): + frames = [ + np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10) + ] + result = florence2_sam2_video( + prompt="coin", + frames=frames, + ) + assert len(result) == 10 + assert len([res["label"] for res in result[0]]) == 25 + assert len([res["mask"] for res in result[0]]) == 25 + + def test_segmentation(): img = ski.data.coins() result = detr_segmentation( @@ -197,6 +213,17 @@ def test_ixc25_image_vqa() -> None: assert "cat" in result.strip() +def test_ixc25_video_vqa() -> None: + frames = [ + np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10) + ] + result = ixc25_video_vqa( + prompt="What animal is in this video?", + frames=frames, + ) + assert "cat" in result.strip() + + def test_ocr() -> None: img = ski.data.page() result = ocr( diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index 3369499d..d5da4ad8 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -29,6 +29,7 @@ grounding_dino, grounding_sam, ixc25_image_vqa, + ixc25_video_vqa, load_image, loca_visual_prompt_counting, loca_zero_shot_counting, diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 46951916..f27cd225 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -355,54 +355,6 @@ def florence2_sam2_video( return return_data -def extract_frames( - video_uri: Union[str, Path], fps: float = 0.5 -) -> List[Tuple[np.ndarray, float]]: - """'extract_frames' extracts frames from a video which can be a file path or youtube - link, returns a list of tuples (frame, timestamp), where timestamp is the relative - time in seconds where the frame was captured. The frame is a numpy array. - - Parameters: - video_uri (Union[str, Path]): The path to the video file or youtube link - fps (float, optional): The frame rate per second to extract the frames. Defaults - to 0.5. - - Returns: - List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame - as a numpy array and the timestamp in seconds. - - Example - ------- - >>> extract_frames("path/to/video.mp4") - [(frame1, 0.0), (frame2, 0.5), ...] 
- """ - - if str(video_uri).startswith( - ( - "http://www.youtube.com/", - "https://www.youtube.com/", - "http://youtu.be/", - "https://youtu.be/", - ) - ): - with tempfile.TemporaryDirectory() as temp_dir: - yt = YouTube(str(video_uri)) - # Download the highest resolution video - video = ( - yt.streams.filter(progressive=True, file_extension="mp4") - .order_by("resolution") - .desc() - .first() - ) - if not video: - raise Exception("No suitable video stream found") - video_file_path = video.download(output_path=temp_dir) - - return extract_frames_from_video(video_file_path, fps) - - return extract_frames_from_video(str(video_uri), fps) - - def ocr(image: np.ndarray) -> List[Dict[str, Any]]: """'ocr' extracts text from an image. It returns a list of detected text, bounding boxes with normalized coordinates, and confidence scores. The results are sorted @@ -580,6 +532,25 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str: return data["answer"] +def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str: + """'ixc25_video_vqa' is a tool that can answer any questions about arbitrary videos + including regular videos or videos of documents or presentations. It returns text + as an answer to the question. + + + """ + buffer_bytes = frames_to_bytes(frames) + files = [("video", buffer_bytes)] + payload = { + "prompt": prompt, + "function_name": "ixc25_video_vqa", + } + data: Dict[str, Any] = send_inference_request( + payload, "internlm-xcomposer2", files=files, v2=True + ) + return data["answer"] + + def git_vqa_v2(prompt: str, image: np.ndarray) -> str: """'git_vqa_v2' is a tool that can answer questions about the visual contents of an image given a question and an image. It returns an answer to the @@ -1166,6 +1137,54 @@ def closest_box_distance( # Utility and visualization functions +def extract_frames( + video_uri: Union[str, Path], fps: float = 0.5 +) -> List[Tuple[np.ndarray, float]]: + """'extract_frames' extracts frames from a video which can be a file path or youtube + link, returns a list of tuples (frame, timestamp), where timestamp is the relative + time in seconds where the frame was captured. The frame is a numpy array. + + Parameters: + video_uri (Union[str, Path]): The path to the video file or youtube link + fps (float, optional): The frame rate per second to extract the frames. Defaults + to 0.5. + + Returns: + List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame + as a numpy array and the timestamp in seconds. + + Example + ------- + >>> extract_frames("path/to/video.mp4") + [(frame1, 0.0), (frame2, 0.5), ...] + """ + + if str(video_uri).startswith( + ( + "http://www.youtube.com/", + "https://www.youtube.com/", + "http://youtu.be/", + "https://youtu.be/", + ) + ): + with tempfile.TemporaryDirectory() as temp_dir: + yt = YouTube(str(video_uri)) + # Download the highest resolution video + video = ( + yt.streams.filter(progressive=True, file_extension="mp4") + .order_by("resolution") + .desc() + .first() + ) + if not video: + raise Exception("No suitable video stream found") + video_file_path = video.download(output_path=temp_dir) + + return extract_frames_from_video(video_file_path, fps) + + return extract_frames_from_video(str(video_uri), fps) + + def save_json(data: Any, file_path: str) -> None: """'save_json' is a utility function that saves data as a JSON file. It is helpful for saving data that contains NumPy arrays which are not JSON serializable. 
From 968b3dfd8ae7fd064aaa30dfceb1b63f3d208a21 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Sun, 18 Aug 2024 20:06:45 -0700 Subject: [PATCH 06/21] fixed type errors --- vision_agent/tools/tools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index f27cd225..5aaef147 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -529,7 +529,7 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str: data: Dict[str, Any] = send_inference_request( payload, "internlm-xcomposer2", files=files, v2=True ) - return data["answer"] + return cast(str, data["answer"]) def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str: @@ -548,7 +548,7 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str: data: Dict[str, Any] = send_inference_request( payload, "internlm-xcomposer2", files=files, v2=True ) - return data["answer"] + return cast(str, data["answer"]) def git_vqa_v2(prompt: str, image: np.ndarray) -> str: @@ -1449,7 +1449,7 @@ def overlay_segmentation_masks( text_box = draw.textbbox((x, y), text=label, font=font) draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label]) draw.text((x, y), label, fill="black", font=font) - frame_out.append(np.array(pil_image)) # type: ignore + frame_out.append(np.array(pil_image)) return frame_out[0] if len(frame_out) == 1 else frame_out From c4e50c852bc6d97799ad43e0c398d9d97849ab6a Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Mon, 19 Aug 2024 09:33:25 -0700 Subject: [PATCH 07/21] updated prompts --- .../agent/vision_agent_coder_prompts.py | 4 +- vision_agent/tools/tools.py | 40 +++++++++++++------ 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py index cb4c3eeb..9f4020f8 100644 --- a/vision_agent/agent/vision_agent_coder_prompts.py +++ b/vision_agent/agent/vision_agent_coder_prompts.py @@ -39,6 +39,7 @@ "plan1": [ {{ + "thoughts": str # your thought process for this plan "instructions": str # what you should do in this task associated with a tool }} ], @@ -127,7 +128,8 @@ **Instructions**: 1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request. -2. Output a JSON object with the following format: +2. Try solving the problem yourself given the image and pick the plan which matches your solution the best. +3. Output a JSON object with the following format: {{ "thoughts": str # your thought process for choosing the best plan "best_plan": str # the best plan you have chosen diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 5aaef147..7025a823 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -303,8 +303,8 @@ def florence2_sam2_video( ) -> List[List[Dict[str, Any]]]: """'florence2_sam2_video' is a tool that can segment and track multiple objects in a video given a text prompt such as category names or referring expressions. The - categories in the text prompt are separated by commas. It returns tracked objects - as masks, labels, and scores for each frame. + categories in the text prompt are separated by commas. It is useful for tracking + and counting across frames without counting duplicates. Parameters: prompt (str): The prompt to ground to the video. 
@@ -421,12 +421,19 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]: Returns: Dict[str, Any]: A dictionary containing the key 'count' and the count as a - value. E.g. {count: 12}. + value, e.g. {count: 12} and a heat map for visaulization purposes. Example ------- >>> loca_zero_shot_counting(image) - {'count': 45}, + {'count': 83, + 'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0], + [ 0, 0, 0, ..., 0, 0, 0], + [ 0, 0, 0, ..., 0, 0, 1], + ..., + [ 0, 0, 0, ..., 30, 35, 41], + [ 0, 0, 0, ..., 41, 47, 53], + [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)} """ image_b64 = convert_to_b64(image) @@ -451,12 +458,19 @@ def loca_visual_prompt_counting( Returns: Dict[str, Any]: A dictionary containing the key 'count' and the count as a - value. E.g. {count: 12}. + value, e.g. {count: 12} and a heat map for visaulization purposes. Example ------- >>> loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]}) - {'count': 45}, + {'count': 83, + 'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0], + [ 0, 0, 0, ..., 0, 0, 0], + [ 0, 0, 0, ..., 0, 0, 1], + ..., + [ 0, 0, 0, ..., 30, 35, 41], + [ 0, 0, 0, ..., 41, 47, 53], + [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)} """ image_size = get_image_size(image) @@ -1138,7 +1152,7 @@ def closest_box_distance( def extract_frames( - video_uri: Union[str, Path], fps: float = 0.5 + video_uri: Union[str, Path], fps: float = 1 ) -> List[Tuple[np.ndarray, float]]: """'extract_frames' extracts frames from a video which can be a file path or youtube link, returns a list of tuples (frame, timestamp), where timestamp is the relative @@ -1147,7 +1161,7 @@ def extract_frames( Parameters: video_uri (Union[str, Path]): The path to the video file or youtube link fps (float, optional): The frame rate per second to extract the frames. Defaults - to 0.5. + to 10. Returns: List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame @@ -1249,7 +1263,7 @@ def save_image(image: np.ndarray, file_path: str) -> None: def save_video( - frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 4 + frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 1 ) -> str: """'save_video' is a utility function that saves a list of frames as a mp4 video file on disk. 
@@ -1500,7 +1514,6 @@ def overlay_heat_map( TOOLS = [ owl_v2, - grounding_sam, extract_frames, ocr, clip, @@ -1508,13 +1521,14 @@ def overlay_heat_map( vit_nsfw_classification, loca_zero_shot_counting, loca_visual_prompt_counting, - florence2_roberta_vqa, florence2_image_caption, florence2_ocr, + florence2_sam2_image, + florence2_sam2_video, + ixc25_image_vqa, + ixc25_video_vqa, detr_segmentation, depth_anything_v2, - generate_soft_edge_image, - dpt_hybrid_midas, generate_pose_image, closest_mask_distance, closest_box_distance, From 78671695ad2161e55c520999dd53cdbd9fb1c421 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 20 Aug 2024 16:34:35 -0700 Subject: [PATCH 08/21] added florence2 od --- vision_agent/tools/tools.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 7025a823..a894800c 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -738,9 +738,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]: - """'florence2_object_detection' is a tool that can detect objects given a text - prompt such as a phrase or class names separated by commas. It returns a list of - detected objects as labels and their location as bounding boxes with score of 1.0. + """'florence2_object_detection' that can detect and count multiple objects given a + text prompt such as category names or referring expressions. The categories in text + prompt are separated by commas. It returns a list of bounding boxes with normalized + coordinates, label names and associated probability scores set to 1.0 Parameters: prompt (str): The prompt to ground to the image. @@ -1525,6 +1526,7 @@ def overlay_heat_map( florence2_ocr, florence2_sam2_image, florence2_sam2_video, + florence2_object_detection, ixc25_image_vqa, ixc25_video_vqa, detr_segmentation, From d4b5b269fd2b8bba288d1bb67d6228b817a9dfec Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 20 Aug 2024 17:16:57 -0700 Subject: [PATCH 09/21] revert back to original doc --- vision_agent/tools/tools.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index a894800c..7868cb75 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -738,10 +738,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]: - """'florence2_object_detection' that can detect and count multiple objects given a - text prompt such as category names or referring expressions. The categories in text - prompt are separated by commas. It returns a list of bounding boxes with normalized - coordinates, label names and associated probability scores set to 1.0 + """'florencev2_object_detection' is a tool that can detect objects given a text + prompt such as a phrase or class names separated by commas. It returns a list of + detected objects as labels and their location as bounding boxes with score of 1.0. Parameters: prompt (str): The prompt to ground to the image. 
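
To make the practical difference between the two detectors touched in these patches concrete, a hedged comparison sketch follows. The image path is a placeholder and both tools are assumed to be reachable through the hosted endpoints.

# Sketch only: owl_v2 returns real confidence scores, while the Florence-2 based
# florence2_object_detection reports a fixed score of 1.0 for every detection.
# "street.jpg" is an illustrative image path.
import numpy as np
from PIL import Image

from vision_agent.tools import florence2_object_detection, owl_v2

image = np.array(Image.open("street.jpg").convert("RGB"))

owl_dets = owl_v2("person, car", image, box_threshold=0.10)
flo_dets = florence2_object_detection("person, car", image)

# Score-based filtering only makes sense for owl_v2; Florence-2 scores are always 1.0.
confident = [d for d in owl_dets if d["score"] >= 0.3]
print(len(owl_dets), len(confident), len(flo_dets))

Both tools return the same dictionary keys, so downstream helpers such as overlay_bounding_boxes can consume either output unchanged.
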
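
The ixc25 tools registered in the TOOLS list above can be exercised the same way as in the integration tests. A short sketch, with placeholder media paths and an assumed API key:

# Sketch only: free-form visual question answering with the ixc25 tools.
# "cat.jpg" and "lecture.mp4" are placeholder paths; the internlm-xcomposer2 endpoint
# and API key are assumed to be configured.
import numpy as np
from PIL import Image

from vision_agent.tools import extract_frames, ixc25_image_vqa, ixc25_video_vqa

image = np.array(Image.open("cat.jpg").convert("RGB"))
print(ixc25_image_vqa(prompt="What animal is in this image?", image=image))

frames = [frame for frame, _ in extract_frames("lecture.mp4", fps=1)]
print(ixc25_video_vqa(prompt="What topic does the presenter introduce first?", frames=frames))
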
From c8a36fcdee07c43c427d9e8e4d38a3aee1711dc3 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 20 Aug 2024 18:48:18 -0700 Subject: [PATCH 10/21] fixed workspace prompt --- vision_agent/agent/vision_agent_prompts.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 6f0fdf74..4774d84d 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -15,7 +15,7 @@ **Examples**: Here is an example of how you can interact with a user and Actions to complete a task: --- START EXAMPLES --- -[Current directory: /workspace/test] +[Current directory: /example/workspace] {examples} --- END EXAMPLES --- @@ -27,16 +27,17 @@ Here is the current conversation so far: --- START CONVERSATION --- [Current directory: {dir}] + {conversation} """ EXAMPLES_CODE1 = """ USER: Can you detect the dogs in this image? Media name dog.jpg -AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "generate_vision_code('/workspace/test/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/workspace/test/dog.jpg'])", "let_user_respond": false} +AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])", "let_user_respond": false} OBSERVATION: -[File /workspace/test/dog_detector.py] +[File /example/workspace/dog_detector.py] 0|from vision_agent.tools import load_image, owl_v2 1|def detect_dogs(image_path: str): 2| image = load_image(image_path) @@ -44,7 +45,7 @@ 4| return dogs [End of file] -AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))", "let_user_respond": false} +AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false} OBSERVATION: ----- stdout ----- @@ -55,10 +56,10 @@ USER: The the image only has one dog, can you fix this? -AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "edit_vision_code('/workspace/test/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/workspace/test/dog.jpg'])", "let_user_respond": false} +AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. 
The lower score is 0.23 so I will set the threshold above that.", "response": "edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])", "let_user_respond": false} OBSERVATION: -[File /workspace/test/dog_detector.py] +[File /example/workspace/dog_detector.py] 0|from vision_agent.tools import load_image, owl_v2 1|def detect_dogs(image_path: str): 2| image = load_image(image_path) @@ -66,7 +67,7 @@ 4| return dogs [End of file] -AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))", "let_user_respond": false} +AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false} OBSERVATION: ----- stdout ----- @@ -82,10 +83,10 @@ USER: Yes you can use workers.png -AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "generate_vision_code('/workspace/test/code.py', 'Can you write code to count workers with helmets in this image?', media=['/workspace/test/workers.png'])", "let_user_respond": false} +AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])", "let_user_respond": false} OBSERVATION: -[File /workspace/test/code.py] +[File /example/workspace/code.py] 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance 1|def count_workers_with_helmets(image_path: str): 2| image = load_image(image_path) @@ -104,7 +105,7 @@ 15| return count [End of file] -AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/test/workers.png'))", "let_user_respond": false} +AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))", "let_user_respond": false} OBSERVATION: ----- stdout ----- From 8a363087c0da32a46adc01f410c49977bacbe763 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 21 Aug 2024 09:13:53 -0700 Subject: [PATCH 11/21] fixed extra space --- vision_agent/tools/tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 7868cb75..441201a7 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -283,7 +283,7 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]] files = [("image", buffer_bytes)] payload = { - "prompts": prompt.split(","), + "prompts": [s.strip() for s in prompt.split(",")], "function_name": "florence2_sam2_image", } data: Dict[str, Any] = send_inference_request( From 55d093aaf86ce8b5a7373ff24fb200c351f89ca1 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 23 Aug 2024 09:51:13 -0700 Subject: [PATCH 
12/21] updated docs --- vision_agent/tools/tools.py | 38 +++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 441201a7..fd37ac53 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -132,9 +132,9 @@ def owl_v2( box_threshold: float = 0.10, ) -> List[Dict[str, Any]]: """'owl_v2' is a tool that can detect and count multiple objects given a text - prompt such as category names or referring expressions. The categories in text prompt - are separated by commas. It returns a list of bounding boxes with - normalized coordinates, label names and associated probability scores. + prompt such as category names or referring expressions. The categories in text + prompt are separated by commas. It returns a list of bounding boxes with normalized + coordinates, label names and associated probability scores. Parameters: prompt (str): The prompt to ground to the image. @@ -249,7 +249,7 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]] """'florence2_sam2_image' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, - mask file names and associated probability scores. + mask file names and associated probability scores of 1.0. Parameters: prompt (str): The prompt to ground to the image. @@ -268,7 +268,7 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]] >>> florence2_sam2_image("car, dinosaur", image) [ { - 'score': 0.99, + 'score': 1.0, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4], 'mask': array([[0, 0, 0, ..., 0, 0, 0], @@ -301,21 +301,22 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]] def florence2_sam2_video( prompt: str, frames: List[np.ndarray] ) -> List[List[Dict[str, Any]]]: - """'florence2_sam2_video' is a tool that can segment and track multiple objects - in a video given a text prompt such as category names or referring expressions. The - categories in the text prompt are separated by commas. It is useful for tracking - and counting across frames without counting duplicates. + """'florence2_sam2_video' is a tool that can segment and track multiple entities + in a video given a text prompt such as category names or referring expressions. You + can optionally separate the categories in the text with commas. It only tracks + entities present in the first frame and only returns segmentation masks. It is + useful for tracking and counting without duplicating counts. Parameters: prompt (str): The prompt to ground to the video. frames (List[np.ndarray]): The list of frames to ground the prompt to. Returns: - List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label, - score and mask of the detected objects. The outer list represents each frame - and the inner list is the objects per frame. The label contains the object ID - followed by the label name. The objects are only identified in the first framed - and tracked throughout the video. + List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label + and segment mask. The outer list represents each frame and the inner list is + the entities per frame. The label contains the object ID followed by the label + name. The objects are only identified in the first framed and tracked + throughout the video. 
Example ------- @@ -324,7 +325,6 @@ def florence2_sam2_video( [ { 'label': '0: dinosaur', - 'score': 1.0, 'mask': array([[0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], ..., @@ -738,9 +738,11 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]: - """'florencev2_object_detection' is a tool that can detect objects given a text - prompt such as a phrase or class names separated by commas. It returns a list of - detected objects as labels and their location as bounding boxes with score of 1.0. + """'florencev2_object_detection' is a tool that can detect and count multiple + objects given a text prompt such as category names or referring expressions. You + can optionally separate the categories in the text with commas. It returns a list + of bounding boxes with normalized coordinates, label names and associated + probability scores of 1.0. Parameters: prompt (str): The prompt to ground to the image. From 13501c1256ff6b003b5f814cd95e628068ae3fd0 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 23 Aug 2024 15:21:04 -0700 Subject: [PATCH 13/21] retry on judge --- vision_agent/agent/agent_utils.py | 10 ++-------- vision_agent/agent/vision_agent_coder.py | 14 +++++++++++++- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/vision_agent/agent/agent_utils.py b/vision_agent/agent/agent_utils.py index e4e678d7..6e08ad88 100644 --- a/vision_agent/agent/agent_utils.py +++ b/vision_agent/agent/agent_utils.py @@ -4,14 +4,12 @@ from typing import Any, Dict logging.basicConfig(stream=sys.stdout) -_LOGGER = logging.getLogger(__name__) def extract_json(json_str: str) -> Dict[str, Any]: try: json_dict = json.loads(json_str) except json.JSONDecodeError: - input_json_str = json_str if "```json" in json_str: json_str = json_str[json_str.find("```json") + len("```json") :] json_str = json_str[: json_str.find("```")] @@ -19,12 +17,8 @@ def extract_json(json_str: str) -> Dict[str, Any]: json_str = json_str[json_str.find("```") + len("```") :] # get the last ``` not one from an intermediate string json_str = json_str[: json_str.find("}```")] - try: - json_dict = json.loads(json_str) - except json.JSONDecodeError as e: - error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}" - _LOGGER.exception(error_msg) - raise ValueError(error_msg) from e + + json_dict = json.loads(json_str) return json_dict # type: ignore diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index 3a370c5e..22574ba4 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -4,6 +4,7 @@ import os import sys import tempfile +from json import JSONDecodeError from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast @@ -251,7 +252,18 @@ def pick_plan( tool_output=tool_output_str[:20_000], ) chat[-1]["content"] = prompt - best_plan = extract_json(model(chat, stream=False)) # type: ignore + + count = 0 + best_plan = None + while best_plan is None or count < max_retries: + try: + best_plan = extract_json(model(chat, stream=False)) # type: ignore + except JSONDecodeError as _: + pass + count += 1 + + if count == max_retries: + best_plan = {"best_plan": list(plans.keys())[0]} if verbosity >= 1: _LOGGER.info(f"Best plan:\n{best_plan}") From f30969a3ccf8e2a72bd4defe10a0460872f67054 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 
23 Aug 2024 16:11:26 -0700 Subject: [PATCH 14/21] spelling mistakes --- vision_agent/agent/vision_agent_coder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index 22574ba4..40b69257 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -235,7 +235,7 @@ def pick_plan( if verbosity == 2: _print_code("Code and test after attempted fix:", code) - _LOGGER.info(f"Code execution result after attempte {count}") + _LOGGER.info(f"Code execution result after attempt {count}") count += 1 From cea438bee6bc9890cf7a413154570c85b172bafe Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 23 Aug 2024 18:20:55 -0700 Subject: [PATCH 15/21] fixed json decode error --- vision_agent/agent/agent_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vision_agent/agent/agent_utils.py b/vision_agent/agent/agent_utils.py index 6e08ad88..5d55e963 100644 --- a/vision_agent/agent/agent_utils.py +++ b/vision_agent/agent/agent_utils.py @@ -8,6 +8,7 @@ def extract_json(json_str: str) -> Dict[str, Any]: try: + json_str = json_str.replace("\n", " ") json_dict = json.loads(json_str) except json.JSONDecodeError: if "```json" in json_str: From 5e2689c7eca15ac65f54d27ea8c250f7ca306d53 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 23 Aug 2024 18:21:28 -0700 Subject: [PATCH 16/21] updated plan structure, fixed bug with testing plan tool output --- vision_agent/agent/vision_agent_coder.py | 33 ++++++++++-------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index 40b69257..65a46a6f 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -87,8 +87,8 @@ def format_memory(memory: List[Dict[str, str]]) -> str: def format_plans(plans: Dict[str, Any]) -> str: plan_str = "" for k, v in plans.items(): - plan_str += f"{k}:\n" - plan_str += "-" + "\n-".join([e["instructions"] for e in v]) + plan_str += "\n" + f"{k}: {v['thoughts']}\n" + plan_str += " -" + "\n -".join([e for e in v["instructions"]]) return plan_str @@ -229,9 +229,7 @@ def pick_plan( "status": "completed" if tool_output.success else "failed", } ) - tool_output_str = "" - if len(tool_output.logs.stdout) > 0: - tool_output_str = tool_output.logs.stdout[0] + tool_output_str = tool_output.text().strip() if verbosity == 2: _print_code("Code and test after attempted fix:", code) @@ -255,14 +253,15 @@ def pick_plan( count = 0 best_plan = None - while best_plan is None or count < max_retries: + while best_plan is None and count < max_retries: try: best_plan = extract_json(model(chat, stream=False)) # type: ignore except JSONDecodeError as _: + _LOGGER.exception("Error while extracting JSON during picking best plan") pass count += 1 - if count == max_retries: + if best_plan is None: best_plan = {"best_plan": list(plans.keys())[0]} if verbosity >= 1: @@ -537,7 +536,7 @@ def _print_code(title: str, code: str, test: Optional[str] = None) -> None: def retrieve_tools( - plans: Dict[str, List[Dict[str, str]]], + plans: Dict[str, Dict[str, Any]], tool_recommender: Sim, log_progress: Callable[[Dict[str, Any]], None], verbosity: int = 0, @@ -554,8 +553,8 @@ def retrieve_tools( tool_lists: Dict[str, List[Dict[str, str]]] = {} for k, plan in plans.items(): tool_lists[k] = [] - for task in plan: - tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3) + for 
task in plan["instructions"]: + tools = tool_recommender.top_k(task, k=2, thresh=0.3) tool_info.extend([e["doc"] for e in tools]) tool_desc.extend([e["desc"] for e in tools]) tool_lists[k].extend( @@ -749,14 +748,7 @@ def chat_with_workflow( if self.verbosity >= 1: for p in plans: # tabulate will fail if the keys are not the same for all elements - p_fixed = [ - { - "instructions": ( - e["instructions"] if "instructions" in e else "" - ) - } - for e in plans[p] - ] + p_fixed = [{"instructions": e} for e in plans[p]["instructions"]] _LOGGER.info( f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}" ) @@ -805,13 +797,14 @@ def chat_with_workflow( ) if self.verbosity >= 1: + plan_i_fixed = [{"instructions": e} for e in plan_i["instructions"]] _LOGGER.info( - f"Picked best plan:\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}" + f"Picked best plan:\n{tabulate(tabular_data=plan_i_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}" ) results = write_and_test_code( chat=[{"role": c["role"], "content": c["content"]} for c in int_chat], - plan="\n-" + "\n-".join([e["instructions"] for e in plan_i]), + plan=f"\n{plan_i['thoughts']}\n-" + "\n-".join([e for e in plan_i["instructions"]]), tool_info=tool_info, tool_output=tool_output_str, tool_utils=T.UTILITIES_DOCSTRING, From 416ae380d7c69ede89061992543465e3598872c4 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 23 Aug 2024 18:21:44 -0700 Subject: [PATCH 17/21] fixed plan format --- vision_agent/agent/vision_agent_coder_prompts.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py index 9f4020f8..c68f73fe 100644 --- a/vision_agent/agent/vision_agent_coder_prompts.py +++ b/vision_agent/agent/vision_agent_coder_prompts.py @@ -30,19 +30,19 @@ **Instructions**: 1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request. -2. Output three different plans each utilize a different strategy or tool. +2. Output three different plans each utilize a different strategy or set of tools. Output a list of jsons in the following format ```json {{ "plan1": - [ - {{ - "thoughts": str # your thought process for this plan - "instructions": str # what you should do in this task associated with a tool - }} - ], + {{ + "thoughts": str # your thought process for choosing this plan + "instructions": [ + str # what you should do in this task associated with a tool + ] + }}, "plan2": ..., "plan3": ... }} @@ -128,7 +128,7 @@ **Instructions**: 1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request. -2. Try solving the problem yourself given the image and pick the plan which matches your solution the best. +2. Try solving the problem yourself given the image and pick the plan that matches your solution the best. 3. 
Output a JSON object with the following format: {{ "thoughts": str # your thought process for choosing the best plan From 6129d199b0e05692cab2a2430161504b657cc250 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 23 Aug 2024 18:22:06 -0700 Subject: [PATCH 18/21] remove template match, fix ixc25 video doc --- vision_agent/tools/tools.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index fd37ac53..6334d4bc 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -551,8 +551,19 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str: including regular videos or videos of documents or presentations. It returns text as an answer to the question. + Parameters: + prompt (str): The question about the video + frames (List[np.ndarray]): The reference frames used for the question + + Returns: + str: A string which is the answer to the given prompt. + Example + ------- + >>> ixc25_video_vqa('Which football player made the goal?', frames) + 'Lionel Messi' """ + buffer_bytes = frames_to_bytes(frames) files = [("video", buffer_bytes)] payload = { @@ -1542,7 +1553,6 @@ def overlay_heat_map( overlay_bounding_boxes, overlay_segmentation_masks, overlay_heat_map, - template_match, ] TOOLS_DF = get_tools_df(TOOLS) # type: ignore TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS) # type: ignore From a63bcd7d26eb1103aa5fb383f4a8158133629162 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 23 Aug 2024 19:41:09 -0700 Subject: [PATCH 19/21] fixed flake8 --- vision_agent/agent/vision_agent_coder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index 65a46a6f..8134aba3 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -256,8 +256,8 @@ def pick_plan( while best_plan is None and count < max_retries: try: best_plan = extract_json(model(chat, stream=False)) # type: ignore - except JSONDecodeError as _: - _LOGGER.exception("Error while extracting JSON during picking best plan") + except JSONDecodeError as e: + _LOGGER.exception(f"Error while extracting JSON during picking best plan {str(e)}") pass count += 1 From a35f1d75019fdc6123ff87cb843e3adb3e183f24 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 23 Aug 2024 19:41:32 -0700 Subject: [PATCH 20/21] flake8 black --- vision_agent/agent/vision_agent_coder.py | 7 +++++-- vision_agent/clients/landing_public_api.py | 4 ++-- vision_agent/tools/meta_tools.py | 6 +++--- vision_agent/utils/execute.py | 1 - 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index 8134aba3..3f445d80 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -257,7 +257,9 @@ def pick_plan( try: best_plan = extract_json(model(chat, stream=False)) # type: ignore except JSONDecodeError as e: - _LOGGER.exception(f"Error while extracting JSON during picking best plan {str(e)}") + _LOGGER.exception( + f"Error while extracting JSON during picking best plan {str(e)}" + ) pass count += 1 @@ -804,7 +806,8 @@ def chat_with_workflow( results = write_and_test_code( chat=[{"role": c["role"], "content": c["content"]} for c in int_chat], - plan=f"\n{plan_i['thoughts']}\n-" + "\n-".join([e for e in plan_i["instructions"]]), + plan=f"\n{plan_i['thoughts']}\n-" + + "\n-".join([e for 
e in plan_i["instructions"]]), tool_info=tool_info, tool_output=tool_output_str, tool_utils=T.UTILITIES_DOCSTRING, diff --git a/vision_agent/clients/landing_public_api.py b/vision_agent/clients/landing_public_api.py index 4c50c388..8e1158d4 100644 --- a/vision_agent/clients/landing_public_api.py +++ b/vision_agent/clients/landing_public_api.py @@ -1,10 +1,10 @@ import os -from uuid import UUID from typing import List +from uuid import UUID from vision_agent.clients.http import BaseHTTP -from vision_agent.utils.type_defs import LandingaiAPIKey from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask +from vision_agent.utils.type_defs import LandingaiAPIKey class LandingPublicAPI(BaseHTTP): diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 851aab18..c38aa925 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -1,16 +1,16 @@ import os import subprocess -from uuid import UUID from pathlib import Path from typing import Any, Dict, List, Union +from uuid import UUID import vision_agent as va +from vision_agent.clients.landing_public_api import LandingPublicAPI from vision_agent.lmm.types import Message +from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask from vision_agent.tools.tool_utils import get_tool_documentation from vision_agent.tools.tools import TOOL_DESCRIPTIONS from vision_agent.utils.image_utils import convert_to_b64 -from vision_agent.clients.landing_public_api import LandingPublicAPI -from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index b157b1df..b62308ff 100644 --- a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -416,7 +416,6 @@ def download_file(self, file_path: str) -> Path: class E2BCodeInterpreter(CodeInterpreter): - def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) assert os.getenv("E2B_API_KEY"), "E2B_API_KEY environment variable must be set" From 25154c2ee19885af5d58cd3ac923c06ebcf36c3e Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Mon, 26 Aug 2024 12:50:28 -0700 Subject: [PATCH 21/21] fix linting error --- vision_agent/tools/meta_tools.py | 5 ----- vision_agent/tools/tools.py | 25 ++++++++++++------------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 73d7a020..4a82436d 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -2,16 +2,11 @@ import subprocess from pathlib import Path from typing import Any, Dict, List, Union -from uuid import UUID import vision_agent as va -from vision_agent.clients.landing_public_api import LandingPublicAPI from vision_agent.lmm.types import Message -from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask from vision_agent.tools.tool_utils import get_tool_documentation from vision_agent.tools.tools import TOOL_DESCRIPTIONS -from vision_agent.utils.image_utils import convert_to_b64 - # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index e4e8262c..2dade7f7 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -2,11 +2,10 @@ import json import logging import tempfile -from uuid 
import UUID -from pathlib import Path from importlib import resources from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union, cast +from uuid import UUID import cv2 import numpy as np @@ -16,15 +15,24 @@ from pillow_heif import register_heif_opener # type: ignore from pytube import YouTube # type: ignore +from vision_agent.clients.landing_public_api import LandingPublicAPI from vision_agent.tools.tool_utils import ( get_tool_descriptions, get_tool_documentation, get_tools_df, - send_inference_request, get_tools_info, + send_inference_request, +) +from vision_agent.tools.tools_types import ( + BboxInput, + BboxInputBase64, + FineTuning, + Florencev2FtRequest, + JobStatus, + PromptTask, ) -from vision_agent.utils.exceptions import FineTuneModelIsNotReady from vision_agent.utils import extract_frames_from_video +from vision_agent.utils.exceptions import FineTuneModelIsNotReady from vision_agent.utils.execute import FileSerializer, MimeType from vision_agent.utils.image_utils import ( b64_to_pil, @@ -38,15 +46,6 @@ rle_decode, rle_decode_array, ) -from vision_agent.tools.tools_types import ( - BboxInput, - BboxInputBase64, - PromptTask, - Florencev2FtRequest, - FineTuning, - JobStatus, -) -from vision_agent.clients.landing_public_api import LandingPublicAPI register_heif_opener()
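Editor's note: the patches above change two things that are easy to miss when read as raw diffs — the planner output now uses one `thoughts` string plus a list of `instructions` per plan, and plan selection is retried on `JSONDecodeError` with a fallback to the first plan. The sketch below is a minimal, self-contained illustration of those two behaviors only. The sample plans and the `judge` stand-in are hypothetical; only the plan shape (`{"thoughts": str, "instructions": [str, ...]}`), the `format_plans` layout, and the retry/fallback flow are taken from the diffs, and the real code calls `extract_json(model(chat, stream=False))` where the sketch calls `judge()`.

```python
import json
from json import JSONDecodeError
from typing import Any, Callable, Dict

# Hypothetical plans in the structure introduced by PATCH 16/17:
# one "thoughts" string per plan plus a list of instruction strings.
plans: Dict[str, Dict[str, Any]] = {
    "plan1": {
        "thoughts": "Detect the objects directly and count the boxes.",
        "instructions": [
            "Load the image",
            "Run an object detector with the user's prompt",
            "Count the returned bounding boxes",
        ],
    },
    "plan2": {
        "thoughts": "Use a counting-specific model instead.",
        "instructions": ["Load the image", "Run a zero-shot counting model"],
    },
}


def format_plans(plans: Dict[str, Any]) -> str:
    # Mirrors the reworked format_plans: thoughts on the header line,
    # instructions rendered as an indented dash list.
    plan_str = ""
    for k, v in plans.items():
        plan_str += "\n" + f"{k}: {v['thoughts']}\n"
        plan_str += "  -" + "\n  -".join([e for e in v["instructions"]])
    return plan_str


def pick_best_plan(
    judge: Callable[[], Dict[str, Any]],
    plans: Dict[str, Any],
    max_retries: int = 3,
) -> Dict[str, Any]:
    # Retry-with-fallback pattern from the "retry on judge" patch: keep calling
    # the judge while JSON parsing fails, then fall back to the first plan.
    best_plan = None
    count = 0
    while best_plan is None and count < max_retries:
        try:
            best_plan = judge()
        except JSONDecodeError:
            pass
        count += 1
    if best_plan is None:
        best_plan = {"best_plan": list(plans.keys())[0]}
    return best_plan


if __name__ == "__main__":
    # Simulated judge: the first response is malformed, the second parses,
    # so the loop demonstrates one failed attempt followed by success.
    responses = iter(
        ["not json", '{"thoughts": "plan1 is simplest", "best_plan": "plan1"}']
    )

    def judge() -> Dict[str, Any]:
        return json.loads(next(responses))

    print(format_plans(plans))
    print(pick_best_plan(judge, plans))
```

Under these assumptions, a judge that never returns valid JSON exhausts `max_retries` and the coder falls back to `plan1`, which matches the fallback branch added in the diff.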