From 1dade3041f113651e30cb5393f595427d250d3f3 Mon Sep 17 00:00:00 2001
From: Dayanne Fernandes <dayannefernandesc@gmail.com>
Date: Mon, 30 Sep 2024 17:37:32 -0300
Subject: [PATCH 01/11] get first frame

---
 vision_agent/tools/tools.py | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 95fdd56c..93ee9207 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -200,14 +200,15 @@ def owl_v2_image(
         )
         data = data_obj.model_dump(by_alias=True)
         detections = send_inference_request(data, "tools", v2=False)
-        detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
+        # get the first frame detections
+        detection = detections[0]
         bboxes_formatted = [
             ODResponseData(
-                label=detections["labels"][i],
-                bbox=normalize_bbox(detections["bboxes"][i], image_size),
+                label=detection["labels"][i],
+                bbox=normalize_bbox(detection["bboxes"][i], image_size),
                 score=1.0,
             )
-            for i in range(len(detections["bboxes"]))
+            for i in range(len(detection["bboxes"]))
         ]
         return [bbox.model_dump() for bbox in bboxes_formatted]
 
@@ -428,15 +429,16 @@ def florence2_sam2_image(
         )
         req_data = req_data_obj.model_dump(by_alias=True)
         detections_ft = send_inference_request(req_data, "tools", v2=False)
-        detections_ft = detections_ft["<CAPTION_TO_PHRASE_GROUNDING>"]
+        # get the first frame detections
+        detection = detections_ft[0]
         return_data = []
-        all_masks = np.array(detections_ft["masks"])
-        for i in range(len(detections_ft["bboxes"])):
+        all_masks = np.array(detection["masks"])
+        for i in range(len(detection["bboxes"])):
             return_data.append(
                 {
                     "score": 1.0,
-                    "label": detections_ft["labels"][i],
-                    "bbox": detections_ft["bboxes"][i],
+                    "label": detection["labels"][i],
+                    "bbox": detection["bboxes"][i],
                     "mask": all_masks[i, :, :].astype(np.uint8),
                 }
             )
@@ -1187,6 +1189,8 @@ def florence2_phrase_grounding(
             v2=False,
             metadata_payload={"function_name": "florence2_phrase_grounding"},
         )
+        # get the first frame detections
+        detection = detections[0]
     else:
         data = {
             "image": image_b64,
@@ -1195,14 +1199,14 @@ def florence2_phrase_grounding(
             "function_name": "florence2_phrase_grounding",
         }
         detections = send_inference_request(data, "florence2", v2=True)
+        detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
 
-    detections = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
     return_data = []
-    for i in range(len(detections["bboxes"])):
+    for i in range(len(detection["bboxes"])):
         return_data.append(
             ODResponseData(
-                label=detections["labels"][i],
-                bbox=normalize_bbox(detections["bboxes"][i], image_size),
+                label=detection["labels"][i],
+                bbox=normalize_bbox(detection["bboxes"][i], image_size),
                 score=1.0,
             )
         )

From 13e415da6fa7a89508001301318cfc53b607c61d Mon Sep 17 00:00:00 2001
From: Dayanne Fernandes <dayannefernandesc@gmail.com>
Date: Mon, 30 Sep 2024 17:40:00 -0300
Subject: [PATCH 02/11] adjust postprocessing

---
 vision_agent/tools/tools.py       | 2 +-
 vision_agent/tools/tools_types.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 93ee9207..f0537254 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -422,9 +422,9 @@ def florence2_sam2_image(
             task=PromptTask.PHRASE_GROUNDING,
             tool="florencev2_fine_tuning",
             prompt=prompt,
+            postprocessing="sam2",
             fine_tuning=FineTuning(
                 job_id=UUID(fine_tune_id),
-                postprocessing="sam2",
             ),
         )
         req_data = req_data_obj.model_dump(by_alias=True)
diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py
index aa0e430f..25c2ec93 100644
--- a/vision_agent/tools/tools_types.py
+++ b/vision_agent/tools/tools_types.py
@@ -28,7 +28,6 @@ class FineTuning(BaseModel):
     model_config = ConfigDict(populate_by_name=True)
 
     job_id: UUID = Field(alias="jobId")
-    postprocessing: Optional[str] = None
 
     @field_serializer("job_id")
     def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
@@ -42,6 +41,7 @@ class Florence2FtRequest(BaseModel):
     task: PromptTask
     tool: str
     prompt: Optional[str] = ""
+    postprocessing: Optional[str] = None
     fine_tuning: Optional[FineTuning] = Field(None, alias="fineTuning")
 
 

From d661cd3e298e093fbc1b6c9251265553ba4ae348 Mon Sep 17 00:00:00 2001
From: Dayanne Fernandes <dayannefernandesc@gmail.com>
Date: Mon, 30 Sep 2024 17:45:48 -0300
Subject: [PATCH 03/11] rle enconding

---
 vision_agent/tools/tools.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index f0537254..70101692 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -200,7 +200,7 @@ def owl_v2_image(
         )
         data = data_obj.model_dump(by_alias=True)
         detections = send_inference_request(data, "tools", v2=False)
-        # get the first frame detections
+        # get the first frame
         detection = detections[0]
         bboxes_formatted = [
             ODResponseData(
@@ -429,17 +429,18 @@ def florence2_sam2_image(
         )
         req_data = req_data_obj.model_dump(by_alias=True)
         detections_ft = send_inference_request(req_data, "tools", v2=False)
-        # get the first frame detections
+        # get the first frame
         detection = detections_ft[0]
         return_data = []
-        all_masks = np.array(detection["masks"])
         for i in range(len(detection["bboxes"])):
             return_data.append(
                 {
                     "score": 1.0,
                     "label": detection["labels"][i],
-                    "bbox": detection["bboxes"][i],
-                    "mask": all_masks[i, :, :].astype(np.uint8),
+                    "bbox": normalize_bbox(
+                        detection["bboxes"][i], detection["masks"][i]["size"]
+                    ),
+                    "mask": rle_decode_array(detection["masks"][i]),
                 }
             )
         return return_data
@@ -453,6 +454,7 @@ def florence2_sam2_image(
     detections: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
     )
+
     return_data = []
     for _, data_i in detections["0"].items():
         mask = rle_decode_array(data_i["mask"])
@@ -1189,7 +1191,7 @@ def florence2_phrase_grounding(
             v2=False,
             metadata_payload={"function_name": "florence2_phrase_grounding"},
         )
-        # get the first frame detections
+        # get the first frame
         detection = detections[0]
     else:
         data = {
@@ -1607,7 +1609,7 @@ def extract_frames_and_timestamps(
     """
 
     def reformat(
-        frames_and_timestamps: List[Tuple[np.ndarray, float]]
+        frames_and_timestamps: List[Tuple[np.ndarray, float]],
     ) -> List[Dict[str, Union[np.ndarray, float]]]:
         return [
             {"frame": frame, "timestamp": timestamp}

From 075b897227d86652f72f00b32c14b2bcdec56e87 Mon Sep 17 00:00:00 2001
From: Dayanne Fernandes <dayannefernandesc@gmail.com>
Date: Mon, 30 Sep 2024 22:36:03 -0300
Subject: [PATCH 04/11] adjust payload

---
 vision_agent/tools/tool_utils.py  |  4 +--
 vision_agent/tools/tools.py       | 51 ++++++++++++++++++-------------
 vision_agent/tools/tools_types.py | 23 ++++++--------
 3 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/vision_agent/tools/tool_utils.py b/vision_agent/tools/tool_utils.py
index 605f5511..772d6bc3 100644
--- a/vision_agent/tools/tool_utils.py
+++ b/vision_agent/tools/tool_utils.py
@@ -1,6 +1,6 @@
+import os
 import inspect
 import logging
-import os
 from base64 import b64encode
 from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
 
@@ -38,7 +38,7 @@ def send_inference_request(
     v2: bool = False,
     metadata_payload: Optional[Dict[str, Any]] = None,
 ) -> Any:
-    # TODO: runtime_tag and function_name should be metadata_payload and now included
+    # TODO: runtime_tag and function_name should be metadata_payload and not included
     # in the service payload
     if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
         payload["runtime_tag"] = runtime_tag
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 70101692..344726db 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -1,20 +1,20 @@
+import os
 import io
 import json
 import logging
-import os
 import tempfile
 import urllib.request
-from importlib import resources
+from uuid import UUID
 from pathlib import Path
+from importlib import resources
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
-from uuid import UUID
 
 import cv2
-import numpy as np
 import requests
-from PIL import Image, ImageDraw, ImageEnhance, ImageFont
-from pillow_heif import register_heif_opener  # type: ignore
+import numpy as np
 from pytube import YouTube  # type: ignore
+from pillow_heif import register_heif_opener  # type: ignore
+from PIL import Image, ImageDraw, ImageEnhance, ImageFont
 
 from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.lmm.lmm import OpenAILMM
@@ -28,7 +28,6 @@
     send_task_inference_request,
 )
 from vision_agent.tools.tools_types import (
-    FineTuning,
     Florence2FtRequest,
     JobStatus,
     ODResponseData,
@@ -194,12 +193,16 @@ def owl_v2_image(
         data_obj = Florence2FtRequest(
             image=image_b64,
             task=PromptTask.PHRASE_GROUNDING,
-            tool="florencev2_fine_tuning",
             prompt=prompt,
-            fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+            job_id=UUID(fine_tune_id),
+        )
+        data = data_obj.model_dump(by_alias=True, exclude_none=True)
+        detections = send_inference_request(
+            data,
+            "florence2-ft",
+            v2=True,
+            metadata_payload={"function_name": "owl_v2_image"},
         )
-        data = data_obj.model_dump(by_alias=True)
-        detections = send_inference_request(data, "tools", v2=False)
         # get the first frame
         detection = detections[0]
         bboxes_formatted = [
@@ -420,15 +423,17 @@ def florence2_sam2_image(
         req_data_obj = Florence2FtRequest(
             image=image_b64,
             task=PromptTask.PHRASE_GROUNDING,
-            tool="florencev2_fine_tuning",
             prompt=prompt,
             postprocessing="sam2",
-            fine_tuning=FineTuning(
-                job_id=UUID(fine_tune_id),
-            ),
+            job_id=UUID(fine_tune_id),
+        )
+        req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
+        detections_ft = send_inference_request(
+            req_data,
+            "florence2-ft",
+            v2=True,
+            metadata_payload={"function_name": "florence2_sam2_image"},
         )
-        req_data = req_data_obj.model_dump(by_alias=True)
-        detections_ft = send_inference_request(req_data, "tools", v2=False)
         # get the first frame
         detection = detections_ft[0]
         return_data = []
@@ -1136,6 +1141,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore
 
 
+# TODO: add video
+
+
 def florence2_phrase_grounding(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
@@ -1180,15 +1188,14 @@ def florence2_phrase_grounding(
         data_obj = Florence2FtRequest(
             image=image_b64,
             task=PromptTask.PHRASE_GROUNDING,
-            tool="florencev2_fine_tuning",
             prompt=prompt,
-            fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+            job_id=UUID(fine_tune_id),
         )
-        data = data_obj.model_dump(by_alias=True)
+        data = data_obj.model_dump(by_alias=True, exclude_none=True)
         detections = send_inference_request(
             data,
-            "tools",
-            v2=False,
+            "florence2-ft",
+            v2=True,
             metadata_payload={"function_name": "florence2_phrase_grounding"},
         )
         # get the first frame
diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py
index 25c2ec93..4b24aabb 100644
--- a/vision_agent/tools/tools_types.py
+++ b/vision_agent/tools/tools_types.py
@@ -1,6 +1,6 @@
 from enum import Enum
-from typing import List, Optional, Tuple, Union
 from uuid import UUID
+from typing import List, Optional, Tuple, Union
 
 from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer
 
@@ -24,25 +24,20 @@ class PromptTask(str, Enum):
     PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
 
 
-class FineTuning(BaseModel):
-    model_config = ConfigDict(populate_by_name=True)
-
-    job_id: UUID = Field(alias="jobId")
-
-    @field_serializer("job_id")
-    def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
-        return str(job_id)
-
-
 class Florence2FtRequest(BaseModel):
     model_config = ConfigDict(populate_by_name=True)
 
-    image: str
+    image: str | None
+    video: bytes | None
     task: PromptTask
-    tool: str
     prompt: Optional[str] = ""
+    chunk_length_frames: Optional[int] = None
     postprocessing: Optional[str] = None
-    fine_tuning: Optional[FineTuning] = Field(None, alias="fineTuning")
+    job_id: Optional[UUID] = Field(None, alias="jobId")
+
+    @field_serializer("job_id")
+    def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
+        return str(job_id)
 
 
 class JobStatus(str, Enum):

From 294fecd2b5ebf7cd198cbcabea8ebe45fc9acdc2 Mon Sep 17 00:00:00 2001
From: Dayanne Fernandes <dayannefernandesc@gmail.com>
Date: Tue, 1 Oct 2024 18:13:33 -0300
Subject: [PATCH 05/11] add video support

---
 tests/integ/test_tools.py                     |  37 +++++-
 tests/unit/test_meta_tools.py                 |  16 +--
 .../agent/vision_agent_coder_prompts.py       |  10 +-
 vision_agent/agent/vision_agent_prompts.py    |  14 +--
 vision_agent/tools/__init__.py                |   3 +-
 vision_agent/tools/meta_tools.py              |   8 +-
 vision_agent/tools/tools.py                   | 108 +++++++++++++++---
 7 files changed, 155 insertions(+), 41 deletions(-)

diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
index 4954738c..9bd195eb 100644
--- a/tests/integ/test_tools.py
+++ b/tests/integ/test_tools.py
@@ -11,7 +11,8 @@
     dpt_hybrid_midas,
     florence2_image_caption,
     florence2_ocr,
-    florence2_phrase_grounding,
+    florence2_phrase_grounding_image,
+    florence2_phrase_grounding_video,
     florence2_roberta_vqa,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
@@ -92,9 +93,9 @@ def test_owl_v2_video():
     assert 24 <= len([res["label"] for res in result[0]]) <= 26
 
 
-def test_florence2_phrase_grounding():
+def test_florence2_phrase_grounding_image():
     img = ski.data.coins()
-    result = florence2_phrase_grounding(
+    result = florence2_phrase_grounding_image(
         image=img,
         prompt="coin",
     )
@@ -102,9 +103,9 @@ def test_florence2_phrase_grounding():
     assert [res["label"] for res in result] == ["coin"] * 25
 
 
-def test_florence2_phrase_grounding_fine_tune_id():
+def test_florence2_phrase_grounding_image_fine_tune_id():
     img = ski.data.coins()
-    result = florence2_phrase_grounding(
+    result = florence2_phrase_grounding_image(
         prompt="coin",
         image=img,
         fine_tune_id=FINE_TUNE_ID,
@@ -114,6 +115,32 @@ def test_florence2_phrase_grounding_fine_tune_id():
     assert [res["label"] for res in result] == ["coin"] * len(result)
 
 
+def test_florence2_phrase_grounding_video():
+    frames = [
+        np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
+    ]
+    result = florence2_phrase_grounding_video(
+        prompt="coin",
+        frames=frames,
+    )
+    assert len(result) == 10
+    assert 24 <= len([res["label"] for res in result[0]]) <= 26
+
+
+def test_florence2_phrase_grounding_video_fine_tune_id():
+    frames = [
+        np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
+    ]
+    # this calls a fine-tuned florence2 model which is going to be worse at this task
+    result = florence2_phrase_grounding_video(
+        prompt="coin",
+        frames=frames,
+        fine_tune_id=FINE_TUNE_ID,
+    )
+    assert len(result) == 10
+    assert 24 <= len([res["label"] for res in result[0]]) <= 26
+
+
 def test_template_match():
     img = ski.data.coins()
     result = template_match(
diff --git a/tests/unit/test_meta_tools.py b/tests/unit/test_meta_tools.py
index fced644b..ef07bb9e 100644
--- a/tests/unit/test_meta_tools.py
+++ b/tests/unit/test_meta_tools.py
@@ -33,16 +33,16 @@ def test_use_object_detection_fine_tuning_none():
 
 def test_use_object_detection_fine_tuning():
     artifacts = Artifacts("test")
-    code = """florence2_phrase_grounding('one', image1)
+    code = """florence2_phrase_grounding_image('one', image1)
 owl_v2_image('two', image2)
 florence2_sam2_image('three', image3)"""
-    expected_code = """florence2_phrase_grounding("one", image1, "123")
+    expected_code = """florence2_phrase_grounding_image("one", image1, "123")
 owl_v2_image("two", image2, "123")
 florence2_sam2_image("three", image3, "123")"""
     artifacts["code"] = code
 
     output = use_object_detection_fine_tuning(artifacts, "code", "123")
-    assert 'florence2_phrase_grounding("one", image1, "123")' in output
+    assert 'florence2_phrase_grounding_image("one", image1, "123")' in output
     assert 'owl_v2_image("two", image2, "123")' in output
     assert 'florence2_sam2_image("three", image3, "123")' in output
     assert artifacts["code"] == expected_code
@@ -50,24 +50,24 @@ def test_use_object_detection_fine_tuning():
 
 def test_use_object_detection_fine_tuning_twice():
     artifacts = Artifacts("test")
-    code = """florence2_phrase_grounding('one', image1)
+    code = """florence2_phrase_grounding_image('one', image1)
 owl_v2_image('two', image2)
 florence2_sam2_image('three', image3)"""
-    expected_code1 = """florence2_phrase_grounding("one", image1, "123")
+    expected_code1 = """florence2_phrase_grounding_image("one", image1, "123")
 owl_v2_image("two", image2, "123")
 florence2_sam2_image("three", image3, "123")"""
-    expected_code2 = """florence2_phrase_grounding("one", image1, "456")
+    expected_code2 = """florence2_phrase_grounding_image("one", image1, "456")
 owl_v2_image("two", image2, "456")
 florence2_sam2_image("three", image3, "456")"""
     artifacts["code"] = code
     output = use_object_detection_fine_tuning(artifacts, "code", "123")
-    assert 'florence2_phrase_grounding("one", image1, "123")' in output
+    assert 'florence2_phrase_grounding_image("one", image1, "123")' in output
     assert 'owl_v2_image("two", image2, "123")' in output
     assert 'florence2_sam2_image("three", image3, "123")' in output
     assert artifacts["code"] == expected_code1
 
     output = use_object_detection_fine_tuning(artifacts, "code", "456")
-    assert 'florence2_phrase_grounding("one", image1, "456")' in output
+    assert 'florence2_phrase_grounding_image("one", image1, "456")' in output
     assert 'owl_v2_image("two", image2, "456")' in output
     assert 'florence2_sam2_image("three", image3, "456")' in output
     assert artifacts["code"] == expected_code2
diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py
index 07f2c6e2..45fc02ed 100644
--- a/vision_agent/agent/vision_agent_coder_prompts.py
+++ b/vision_agent/agent/vision_agent_coder_prompts.py
@@ -101,7 +101,7 @@
 - Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
 plan2:
 - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
-- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
+- Use the 'florence2_phrase_grounding_image' tool with the prompt 'person' to detect where the people are in the video.
 plan3:
 - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
 - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
@@ -109,7 +109,7 @@
 
 ```python
 import numpy as np
-from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
+from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding_image, florence2_sam2_video_tracking
 
 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -143,7 +143,7 @@ def get_counts(preds):
 owl_v2_counts = get_counts(owl_v2_out)
 
 # plan2
-florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
+florence2_out = [florence2_phrase_grounding_image("person", f) for f in frames]
 florence2_counts = get_counts(florence2_out)
 
 # plan3
@@ -153,13 +153,13 @@ def get_counts(preds):
 
 final_out = {{
     "owl_v2_video": owl_v2_out,
-    "florence2_phrase_grounding": florence2_out,
+    "florence2_phrase_grounding_image": florence2_out,
     "florence2_sam2_video_tracking": f2s2_out,
 }}
 
 counts = {{
     "owl_v2_video": owl_v2_counts,
-    "florence2_phrase_grounding": florence2_counts,
+    "florence2_phrase_grounding_image": florence2_counts,
     "florence2_sam2_video_tracking": f2s2_counts,
 }}
 
diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index bc3295ef..4a668bda 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -131,10 +131,10 @@
 
 OBSERVATION:
 [Artifact code.py]
-0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
+0|from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
 1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3|    detections = florence2_phrase_grounding("worker, helmet", image)
+3|    detections = florence2_phrase_grounding_image("worker, helmet", image)
 4|    workers = [d for d in detections if d['label'] == 'worker']
 5|    helmets = [d for d in detections if d['label'] == 'helmet']
 6|    count = 0
@@ -166,18 +166,18 @@
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
 
-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding_image call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 [Artifact code.py edits]
 ---
 +++
 @@ -1,7 +1,7 @@
- from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
+ from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
  def count_workers_with_helmets(image_path: str, output_path: str):
      image = load_image(image_path)
--    detections = florence2_phrase_grounding("worker, helmet", image)
-+    detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
+-    detections = florence2_phrase_grounding_image("worker, helmet", image)
++    detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
      workers = [d for d in detections if d['label'] == 'worker']
      helmets = [d for d in detections if d['label'] == 'helmet']
      count = 0
@@ -189,5 +189,5 @@ def count_workers_with_helmets(image_path: str, output_path: str):
 ----- stdout -----
 3
 
-AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding_image model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index 22453224..ebf98c08 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -24,7 +24,8 @@
     extract_frames_and_timestamps,
     florence2_image_caption,
     florence2_ocr,
-    florence2_phrase_grounding,
+    florence2_phrase_grounding_image,
+    florence2_phrase_grounding_video,
     florence2_roberta_vqa,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 7d70e031..597bf5cc 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -665,8 +665,12 @@ def use_object_detection_fine_tuning(
 
     patterns_with_fine_tune_id = [
         (
-            r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
-            lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+            r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+            lambda match: f'florence2_phrase_grounding_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+        ),
+        (
+            r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+            lambda match: f'florence2_phrase_grounding_video("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
         ),
         (
             r'owl_v2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 344726db..ff360d87 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -1141,16 +1141,13 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore
 
 
-# TODO: add video
-
-
-def florence2_phrase_grounding(
+def florence2_phrase_grounding_image(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
-    """'florence2_phrase_grounding' is a tool that can detect multiple
-    objects given a text prompt which can be object names or caption. You
-    can optionally separate the object names in the text with commas. It returns a list
-    of bounding boxes with normalized coordinates, label names and associated
+    """'florence2_phrase_grounding_image' will run florence2 on a image. It can
+    detect multiple objects given a text prompt which can be object names or caption.
+    You can optionally separate the object names in the text with commas. It returns
+    a list of bounding boxes with normalized coordinates, label names and associated
     probability scores of 1.0.
 
     Parameters:
@@ -1168,7 +1165,7 @@ def florence2_phrase_grounding(
 
     Example
     -------
-        >>> florence2_phrase_grounding('person looking at a coyote', image)
+        >>> florence2_phrase_grounding_image('person looking at a coyote', image)
         [
             {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -1196,7 +1193,7 @@ def florence2_phrase_grounding(
             data,
             "florence2-ft",
             v2=True,
-            metadata_payload={"function_name": "florence2_phrase_grounding"},
+            metadata_payload={"function_name": "florence2_phrase_grounding_image"},
         )
         # get the first frame
         detection = detections[0]
@@ -1205,7 +1202,7 @@ def florence2_phrase_grounding(
             "image": image_b64,
             "task": "<CAPTION_TO_PHRASE_GROUNDING>",
             "prompt": prompt,
-            "function_name": "florence2_phrase_grounding",
+            "function_name": "florence2_phrase_grounding_image",
         }
         detections = send_inference_request(data, "florence2", v2=True)
         detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
@@ -1222,6 +1219,90 @@ def florence2_phrase_grounding(
     return [bbox.model_dump() for bbox in return_data]
 
 
+def florence2_phrase_grounding_video(
+    prompt: str, frames: List[np.ndarray], fine_tune_id: Optional[str] = None
+) -> List[Dict[str, Any]]:
+    """'florence2_phrase_grounding_video' will run florence2 on each frame of a video.
+    It can detect multiple objects given a text prompt which can be object names or
+    caption. You can optionally separate the object names in the text with commas.
+    It returns a list of lists where each inner list contains bounding boxes with
+    normalized coordinates, label names and associated probability scores of 1.0.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to detect objects.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the score,
+            label, and bounding box of the detected objects with normalized coordinates
+            between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
+            of the top-left and xmax and ymax are the coordinates of the bottom-right of
+            the bounding box. The scores are always 1.0 and cannot be thresholded.
+
+    Example
+    -------
+        >>> florence2_phrase_grounding_video('person looking at a coyote', frames)
+        [
+            [
+                {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+                {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
+            ],
+            ...
+        ]
+    """
+    if len(frames) == 0:
+        raise ValueError("No frames provided")
+
+    image_size = frames[0].shape[:2]
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+
+    if fine_tune_id is not None:
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        data_obj = Florence2FtRequest(
+            video=buffer_bytes,
+            task=PromptTask.PHRASE_GROUNDING,
+            prompt=prompt,
+            job_id=UUID(fine_tune_id),
+        )
+        data = data_obj.model_dump(by_alias=True, exclude_none=True)
+    else:
+        data_obj = Florence2FtRequest(
+            video=buffer_bytes, task=PromptTask.PHRASE_GROUNDING, prompt=prompt
+        )
+        data = data_obj.model_dump(by_alias=True, exclude_none=True)
+
+    detections = send_inference_request(
+        data,
+        "florence2-ft",
+        v2=True,
+        files=files,
+        metadata_payload={"function_name": "florence2_phrase_grounding_video"},
+    )
+
+    bboxes_formatted = []
+    for frame_data in detections:
+        bboxes_formatted_per_frame = []
+        for idx in range(len(frame_data["bboxes"])):
+            bboxes_formatted_per_frame.append(
+                ODResponseData(
+                    label=frame_data["labels"][idx],
+                    bbox=normalize_bbox(frame_data["bboxes"][idx], image_size),
+                    score=1.0,
+                )
+            )
+        bboxes_formatted.append(bboxes_formatted_per_frame)
+    return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
+
+
 def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
     """'florence2_ocr' is a tool that can detect text and text regions in an image.
     Each text region contains one line of text. It returns a list of detected text,
@@ -1233,7 +1314,7 @@ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
-            with nornmalized coordinates, and confidence score.
+            with normalized coordinates, and confidence score.
 
     Example
     -------
@@ -2077,7 +2158,8 @@ def overlay_counting_results(
     florence2_ocr,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
-    florence2_phrase_grounding,
+    florence2_phrase_grounding_image,
+    florence2_phrase_grounding_video,
     ixc25_image_vqa,
     ixc25_video_vqa,
     detr_segmentation,

From 0b1c88665bb3787d973946e96a26b366e22632ea Mon Sep 17 00:00:00 2001
From: Dayanne Fernandes <dayannefernandesc@gmail.com>
Date: Tue, 1 Oct 2024 18:20:52 -0300
Subject: [PATCH 06/11] linter

---
 vision_agent/tools/tools.py       | 2 +-
 vision_agent/tools/tools_types.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index ff360d87..7faa123a 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -1221,7 +1221,7 @@ def florence2_phrase_grounding_image(
 
 def florence2_phrase_grounding_video(
     prompt: str, frames: List[np.ndarray], fine_tune_id: Optional[str] = None
-) -> List[Dict[str, Any]]:
+) -> List[List[Dict[str, Any]]]:
     """'florence2_phrase_grounding_video' will run florence2 on each frame of a video.
     It can detect multiple objects given a text prompt which can be object names or
     caption. You can optionally separate the object names in the text with commas.
diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py
index 4b24aabb..1cc765b6 100644
--- a/vision_agent/tools/tools_types.py
+++ b/vision_agent/tools/tools_types.py
@@ -27,8 +27,8 @@ class PromptTask(str, Enum):
 class Florence2FtRequest(BaseModel):
     model_config = ConfigDict(populate_by_name=True)
 
-    image: str | None
-    video: bytes | None
+    image: Optional[str] = None
+    video: Optional[bytes] = None
     task: PromptTask
     prompt: Optional[str] = ""
     chunk_length_frames: Optional[int] = None

From 6aa3a2477a797acb026f17c907f04772023bb907 Mon Sep 17 00:00:00 2001
From: Dayanne Fernandes <dayannefernandesc@gmail.com>
Date: Tue, 1 Oct 2024 23:42:10 -0300
Subject: [PATCH 07/11] fix video

---
 tests/integ/test_tools.py        |  4 ++--
 vision_agent/tools/tool_utils.py |  9 +++++++--
 vision_agent/tools/tools.py      | 11 +++++------
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
index 9bd195eb..8c01f78d 100644
--- a/tests/integ/test_tools.py
+++ b/tests/integ/test_tools.py
@@ -124,7 +124,7 @@ def test_florence2_phrase_grounding_video():
         frames=frames,
     )
     assert len(result) == 10
-    assert 24 <= len([res["label"] for res in result[0]]) <= 26
+    assert 2 <= len([res["label"] for res in result[0]]) <= 26
 
 
 def test_florence2_phrase_grounding_video_fine_tune_id():
@@ -138,7 +138,7 @@ def test_florence2_phrase_grounding_video_fine_tune_id():
         fine_tune_id=FINE_TUNE_ID,
     )
     assert len(result) == 10
-    assert 24 <= len([res["label"] for res in result[0]]) <= 26
+    assert 16 <= len([res["label"] for res in result[0]]) <= 26
 
 
 def test_template_match():
diff --git a/vision_agent/tools/tool_utils.py b/vision_agent/tools/tool_utils.py
index 772d6bc3..924b96e6 100644
--- a/vision_agent/tools/tool_utils.py
+++ b/vision_agent/tools/tool_utils.py
@@ -37,6 +37,7 @@ def send_inference_request(
     files: Optional[List[Tuple[Any, ...]]] = None,
     v2: bool = False,
     metadata_payload: Optional[Dict[str, Any]] = None,
+    is_form: bool = False,
 ) -> Any:
     # TODO: runtime_tag and function_name should be metadata_payload and not included
     # in the service payload
@@ -64,7 +65,7 @@ def send_inference_request(
     elif metadata_payload is not None and "function_name" in metadata_payload:
         function_name = metadata_payload["function_name"]
 
-    response = _call_post(url, payload, session, files, function_name)
+    response = _call_post(url, payload, session, files, function_name, is_form)
 
     # TODO: consider making the response schema the same between below two sources
     return response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
@@ -75,6 +76,7 @@ def send_task_inference_request(
     task_name: str,
     files: Optional[List[Tuple[Any, ...]]] = None,
     metadata: Optional[Dict[str, Any]] = None,
+    is_form: bool = False,
 ) -> Any:
     url = f"{_LND_API_URL_v2}/{task_name}"
     headers = {"apikey": _LND_API_KEY}
@@ -87,7 +89,7 @@ def send_task_inference_request(
     function_name = "unknown"
     if metadata is not None and "function_name" in metadata:
         function_name = metadata["function_name"]
-    response = _call_post(url, payload, session, files, function_name)
+    response = _call_post(url, payload, session, files, function_name, is_form)
     return response["data"]
 
 
@@ -203,6 +205,7 @@ def _call_post(
     session: Session,
     files: Optional[List[Tuple[Any, ...]]] = None,
     function_name: str = "unknown",
+    is_form: bool = False,
 ) -> Any:
     files_in_b64 = None
     if files:
@@ -210,6 +213,8 @@ def _call_post(
     try:
         if files is not None:
             response = session.post(url, data=payload, files=files)
+        elif is_form:
+            response = session.post(url, data=payload)
         else:
             response = session.post(url, json=payload)
 
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 7faa123a..d99a38fc 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -201,6 +201,7 @@ def owl_v2_image(
             data,
             "florence2-ft",
             v2=True,
+            is_form=True,
             metadata_payload={"function_name": "owl_v2_image"},
         )
         # get the first frame
@@ -432,6 +433,7 @@ def florence2_sam2_image(
             req_data,
             "florence2-ft",
             v2=True,
+            is_form=True,
             metadata_payload={"function_name": "florence2_sam2_image"},
         )
         # get the first frame
@@ -1193,6 +1195,7 @@ def florence2_phrase_grounding_image(
             data,
             "florence2-ft",
             v2=True,
+            is_form=True,
             metadata_payload={"function_name": "florence2_phrase_grounding_image"},
         )
         # get the first frame
@@ -1268,18 +1271,14 @@ def florence2_phrase_grounding_video(
             )
 
         data_obj = Florence2FtRequest(
-            video=buffer_bytes,
             task=PromptTask.PHRASE_GROUNDING,
             prompt=prompt,
             job_id=UUID(fine_tune_id),
         )
-        data = data_obj.model_dump(by_alias=True, exclude_none=True)
     else:
-        data_obj = Florence2FtRequest(
-            video=buffer_bytes, task=PromptTask.PHRASE_GROUNDING, prompt=prompt
-        )
-        data = data_obj.model_dump(by_alias=True, exclude_none=True)
+        data_obj = Florence2FtRequest(task=PromptTask.PHRASE_GROUNDING, prompt=prompt)
 
+    data = data_obj.model_dump(by_alias=True, exclude_none=True, mode="json")
     detections = send_inference_request(
         data,
         "florence2-ft",

From 3b039979b0b6e7dd9e914e321d3c605c472215a9 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 3 Oct 2024 09:09:53 -0700
Subject: [PATCH 08/11] use video endpoint for florence2 instead of ft endpoint

---
 vision_agent/tools/tools.py | 40 ++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index d99a38fc..35d8487f 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -1,20 +1,21 @@
-import os
+import base64
 import io
 import json
 import logging
+import os
 import tempfile
 import urllib.request
-from uuid import UUID
-from pathlib import Path
 from importlib import resources
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from uuid import UUID
 
 import cv2
-import requests
 import numpy as np
-from pytube import YouTube  # type: ignore
-from pillow_heif import register_heif_opener  # type: ignore
+import requests
 from PIL import Image, ImageDraw, ImageEnhance, ImageFont
+from pillow_heif import register_heif_opener  # type: ignore
+from pytube import YouTube  # type: ignore
 
 from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.lmm.lmm import OpenAILMM
@@ -1275,17 +1276,24 @@ def florence2_phrase_grounding_video(
             prompt=prompt,
             job_id=UUID(fine_tune_id),
         )
+
+        data = data_obj.model_dump(by_alias=True, exclude_none=True, mode="json")
+        detections = send_inference_request(
+            data,
+            "florence2-ft",
+            v2=True,
+            files=files,
+            metadata_payload={"function_name": "florence2_phrase_grounding_video"},
+        )
     else:
-        data_obj = Florence2FtRequest(task=PromptTask.PHRASE_GROUNDING, prompt=prompt)
-
-    data = data_obj.model_dump(by_alias=True, exclude_none=True, mode="json")
-    detections = send_inference_request(
-        data,
-        "florence2-ft",
-        v2=True,
-        files=files,
-        metadata_payload={"function_name": "florence2_phrase_grounding_video"},
-    )
+        data = {
+            "prompt": prompt,
+            "task": "<CAPTION_TO_PHRASE_GROUNDING>",
+            "function_name": "florence2_phrase_grounding_video",
+            "video": base64.b64encode(buffer_bytes).decode("utf-8"),
+        }
+        detections = send_inference_request(data, "florence2", v2=True)
+        detections = [d["<CAPTION_TO_PHRASE_GROUNDING>"] for d in detections]
 
     bboxes_formatted = []
     for frame_data in detections:

From 8a8c5d1dacb159418ae4b72877624532a9ee20ee Mon Sep 17 00:00:00 2001
From: Dayanne Fernandes <dayannefernandesc@gmail.com>
Date: Thu, 3 Oct 2024 22:49:10 -0300
Subject: [PATCH 09/11] fix video-temporal-localization

---
 vision_agent/tools/tools.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 35d8487f..6943a0ff 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -899,7 +899,10 @@ def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[b
         "function_name": "ixc25_temporal_localization",
     }
     data: List[int] = send_inference_request(
-        payload, "video-temporal-localization", files=files, v2=True
+        payload,
+        "video-temporal-localization?model=internlm-xcomposer",
+        files=files,
+        v2=True,
     )
     chunk_size = round(len(frames) / len(data))
     data_explode = [[elt] * chunk_size for elt in data]

From 3febe3a5195baaf26811ea660b5b93e15c68fcc1 Mon Sep 17 00:00:00 2001
From: Dayanne Fernandes <dayannefernandesc@gmail.com>
Date: Thu, 3 Oct 2024 23:11:58 -0300
Subject: [PATCH 10/11] fix countgd

---
 .github/workflows/ci_cd.yml         |  3 ---
 tests/integ/test_tools.py           | 17 +++++++++++++++++
 tests/integration_dev/__init__.py   |  0
 tests/integration_dev/test_tools.py | 18 ------------------
 vision_agent/tools/tools.py         | 14 +++++---------
 5 files changed, 22 insertions(+), 30 deletions(-)
 delete mode 100644 tests/integration_dev/__init__.py
 delete mode 100644 tests/integration_dev/test_tools.py

diff --git a/.github/workflows/ci_cd.yml b/.github/workflows/ci_cd.yml
index 3576e10c..ce25f286 100644
--- a/.github/workflows/ci_cd.yml
+++ b/.github/workflows/ci_cd.yml
@@ -83,9 +83,6 @@ jobs:
     - name: Test with pytest
       run: |
         poetry run pytest -v tests/integ
-    - name: Test with pytest, dev env
-      run: |
-        LANDINGAI_API_KEY=$LANDINGAI_DEV_API_KEY LANDINGAI_URL=https://api.dev.landing.ai poetry run pytest -v tests/integration_dev
 
   release:
     name: Release
diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
index 8c01f78d..9958894d 100644
--- a/tests/integ/test_tools.py
+++ b/tests/integ/test_tools.py
@@ -32,6 +32,8 @@
     template_match,
     vit_image_classification,
     vit_nsfw_classification,
+    countgd_counting,
+    countgd_example_based_counting,
 )
 
 FINE_TUNE_ID = "65ebba4a-88b7-419f-9046-0750e30250da"
@@ -387,3 +389,18 @@ def test_generate_hed():
     )
 
     assert result.shape == img.shape
+
+
+def test_countgd_counting() -> None:
+    img = ski.data.coins()
+    result = countgd_counting(image=img, prompt="coin")
+    assert len(result) == 24
+
+
+def test_countgd_example_based_counting() -> None:
+    img = ski.data.coins()
+    result = countgd_example_based_counting(
+        visual_prompts=[[85, 106, 122, 145]],
+        image=img,
+    )
+    assert len(result) == 24
diff --git a/tests/integration_dev/__init__.py b/tests/integration_dev/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/integration_dev/test_tools.py b/tests/integration_dev/test_tools.py
deleted file mode 100644
index 246c5642..00000000
--- a/tests/integration_dev/test_tools.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import skimage as ski
-
-from vision_agent.tools import countgd_counting, countgd_example_based_counting
-
-
-def test_countgd_counting() -> None:
-    img = ski.data.coins()
-    result = countgd_counting(image=img, prompt="coin")
-    assert len(result) == 24
-
-
-def test_countgd_example_based_counting() -> None:
-    img = ski.data.coins()
-    result = countgd_example_based_counting(
-        visual_prompts=[[85, 106, 122, 145]],
-        image=img,
-    )
-    assert len(result) == 24
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 6943a0ff..67f78307 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -700,22 +700,18 @@ def countgd_counting(
             {'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58},
         ]
     """
-    buffer_bytes = numpy_to_bytes(image)
-    files = [("image", buffer_bytes)]
+    image_b64 = convert_to_b64(image)
     prompt = prompt.replace(", ", " .")
-    payload = {"prompts": [prompt], "model": "countgd"}
+    payload = {"prompt": prompt, "image": image_b64}
     metadata = {"function_name": "countgd_counting"}
-    resp_data = send_task_inference_request(
-        payload, "text-to-object-detection", files=files, metadata=metadata
-    )
-    bboxes_per_frame = resp_data[0]
+    resp_data = send_task_inference_request(payload, "countgd", metadata=metadata)
     bboxes_formatted = [
         ODResponseData(
             label=bbox["label"],
-            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
+            bbox=list(map(lambda x: round(x, 2), bbox["bbox"])),
             score=round(bbox["score"], 2),
         )
-        for bbox in bboxes_per_frame
+        for bbox in resp_data
     ]
     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
     return [bbox.model_dump() for bbox in filtered_bboxes]

From 6d6895c87ba37994874d8a04d9407c5961938db6 Mon Sep 17 00:00:00 2001
From: Dayanne Fernandes <dayannefernandesc@gmail.com>
Date: Thu, 3 Oct 2024 23:29:03 -0300
Subject: [PATCH 11/11] hide florence2_phrase_grounding_video

---
 tests/integ/test_tools.py      | 50 +++++++++++++++++-----------------
 vision_agent/tools/__init__.py |  1 -
 2 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
index 9958894d..796fcdce 100644
--- a/tests/integ/test_tools.py
+++ b/tests/integ/test_tools.py
@@ -12,7 +12,7 @@
     florence2_image_caption,
     florence2_ocr,
     florence2_phrase_grounding_image,
-    florence2_phrase_grounding_video,
+    # florence2_phrase_grounding_video,
     florence2_roberta_vqa,
     florence2_sam2_image,
     florence2_sam2_video_tracking,
@@ -117,30 +117,30 @@ def test_florence2_phrase_grounding_image_fine_tune_id():
     assert [res["label"] for res in result] == ["coin"] * len(result)
 
 
-def test_florence2_phrase_grounding_video():
-    frames = [
-        np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
-    ]
-    result = florence2_phrase_grounding_video(
-        prompt="coin",
-        frames=frames,
-    )
-    assert len(result) == 10
-    assert 2 <= len([res["label"] for res in result[0]]) <= 26
-
-
-def test_florence2_phrase_grounding_video_fine_tune_id():
-    frames = [
-        np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
-    ]
-    # this calls a fine-tuned florence2 model which is going to be worse at this task
-    result = florence2_phrase_grounding_video(
-        prompt="coin",
-        frames=frames,
-        fine_tune_id=FINE_TUNE_ID,
-    )
-    assert len(result) == 10
-    assert 16 <= len([res["label"] for res in result[0]]) <= 26
+# def test_florence2_phrase_grounding_video():
+#     frames = [
+#         np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
+#     ]
+#     result = florence2_phrase_grounding_video(
+#         prompt="coin",
+#         frames=frames,
+#     )
+#     assert len(result) == 10
+#     assert 2 <= len([res["label"] for res in result[0]]) <= 26
+
+
+# def test_florence2_phrase_grounding_video_fine_tune_id():
+#     frames = [
+#         np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
+#     ]
+#     # this calls a fine-tuned florence2 model which is going to be worse at this task
+#     result = florence2_phrase_grounding_video(
+#         prompt="coin",
+#         frames=frames,
+#         fine_tune_id=FINE_TUNE_ID,
+#     )
+#     assert len(result) == 10
+#     assert 16 <= len([res["label"] for res in result[0]]) <= 26
 
 
 def test_template_match():
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index ebf98c08..2ed88789 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -25,7 +25,6 @@
     florence2_image_caption,
     florence2_ocr,
     florence2_phrase_grounding_image,
-    florence2_phrase_grounding_video,
     florence2_roberta_vqa,
     florence2_sam2_image,
     florence2_sam2_video_tracking,