added florence2+sam2 for images

dillonalaird · dillonalaird · commit 28ad47116444 · 2024-08-20T17:15:11.000-07:00
diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
@@ -12,6 +12,7 @@
     florence2_object_detection,
     florence2_roberta_vqa,
     florence2_ocr,
+    florence2_sam2_image,
     generate_pose_image,
     generate_soft_edge_image,
     git_vqa_v2,
@@ -88,6 +89,17 @@ def test_grounding_sam():
     assert len([res["mask"] for res in result]) == 24
 
 
+def test_florence2_sam2_image():
+    # Prompting "coin" on the ski.data.coins() image is expected to give 25 detections.
+    coins = ski.data.coins()
+    detections = florence2_sam2_image(prompt="coin", image=coins)
+    assert len(detections) == 25
+    labels = [det["label"] for det in detections]
+    assert labels == ["coin"] * 25
+    masks = [det["mask"] for det in detections]  # fails if any entry lacks a mask
+    assert len(masks) == 25
+
+
 def test_segmentation():
     img = ski.data.coins()
     result = detr_segmentation(
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
@@ -18,8 +18,9 @@
     extract_frames,
     florence2_image_caption,
     florence2_object_detection,
-    florence2_roberta_vqa,
     florence2_ocr,
+    florence2_roberta_vqa,
+    florence2_sam2_image,
     generate_pose_image,
     generate_soft_edge_image,
     get_tool_documentation,
diff --git a/vision_agent/tools/tool_utils.py b/vision_agent/tools/tool_utils.py
@@ -1,7 +1,7 @@
 import inspect
 import logging
 import os
-from typing import Any, Callable, Dict, List, MutableMapping, Optional
+from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
 
 import pandas as pd
 from IPython.display import display
@@ -28,7 +28,10 @@ class ToolCallTrace(BaseModel):
 
 
 def send_inference_request(
-    payload: Dict[str, Any], endpoint_name: str, v2: bool = False
+    payload: Dict[str, Any],
+    endpoint_name: str,
+    files: Optional[List[Tuple[Any, ...]]] = None,
+    v2: bool = False,
 ) -> Dict[str, Any]:
     try:
         if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
@@ -44,7 +47,7 @@ def send_inference_request(
             response={},
             error=None,
         )
-        headers = {"Content-Type": "application/json", "apikey": _LND_API_KEY}
+        headers = {"apikey": _LND_API_KEY}
         if "TOOL_ENDPOINT_AUTH" in os.environ:
             headers["Authorization"] = os.environ["TOOL_ENDPOINT_AUTH"]
             headers.pop("apikey")
@@ -54,7 +57,11 @@ def send_inference_request(
             num_retry=3,
             headers=headers,
         )
-        res = session.post(url, json=payload)
+
+        if files is not None:
+            res = session.post(url, data=payload, files=files)
+        else:
+            res = session.post(url, json=payload)
         if res.status_code != 200:
             tool_call_trace.error = Error(
                 name="RemoteToolCallFailed",
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
@@ -2,34 +2,36 @@
 import json
 import logging
 import tempfile
-from pathlib import Path
 from importlib import resources
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 import cv2
-import requests
 import numpy as np
-from pytube import YouTube  # type: ignore
+import requests
 from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
+from pytube import YouTube  # type: ignore
 
 from vision_agent.tools.tool_utils import (
-    send_inference_request,
     get_tool_descriptions,
     get_tool_documentation,
     get_tools_df,
+    send_inference_request,
 )
 from vision_agent.utils import extract_frames_from_video
 from vision_agent.utils.execute import FileSerializer, MimeType
 from vision_agent.utils.image_utils import (
     b64_to_pil,
+    convert_quad_box_to_bbox,
     convert_to_b64,
     denormalize_bbox,
     get_image_size,
     normalize_bbox,
-    convert_quad_box_to_bbox,
+    numpy_to_bytes,
     rle_decode,
+    rle_decode_array,
 )
 
 register_heif_opener()
@@ -242,6 +244,59 @@ def grounding_sam(
     return return_data
 
 
+def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florence2_sam2_image' is a tool that can segment multiple objects given a
+    text prompt such as category names or referring expressions. The categories in text
+    prompt are separated by commas. It returns a list of bounding boxes, label names,
+    binary masks and scores (the score is currently always reported as 1.0).
+
+    Parameters:
+        prompt (str): The prompt to ground to the image.
+        image (np.ndarray): The image to ground the prompt to.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label,
+            bounding box, and mask of the detected objects with normalized coordinates
+            (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left
+            and xmax and ymax are the coordinates of the bottom-right of the bounding box.
+            The mask is binary 2D numpy array where 1 indicates the object and 0 indicates
+            the background.
+
+    Example
+    -------
+        >>> florence2_sam2_image("car, dinosaur", image)
+        [
+            {
+                'score': 0.99,
+                'label': 'dinosaur',
+                'bbox': [0.1, 0.11, 0.35, 0.4],
+                'mask': array([[0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0],
+                    ...,
+                    [0, 0, 0, ..., 0, 0, 0],
+                    [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+            },
+        ]
+    """
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
+        # Strip whitespace so "car, dinosaur" is sent as "car" and "dinosaur".
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "function_name": "florence2_sam2_image",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "florence2-sam2", files=files, v2=True
+    )
+    return_data = []
+    for data_i in data["0"].values():  # only the "0" entry of the response is read
+        mask = rle_decode_array(data_i["mask"])
+        label = data_i["label"]
+        bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"])
+        return_data.append({"label": label, "bbox": bbox, "mask": mask, "score": 1.0})
+    return return_data
+
+
 def extract_frames(
     video_uri: Union[str, Path], fps: float = 0.5
 ) -> List[Tuple[np.ndarray, float]]:
diff --git a/vision_agent/utils/image_utils.py b/vision_agent/utils/image_utils.py
@@ -1,6 +1,7 @@
 """Utility functions for image processing."""
 
 import base64
+import io
 from importlib import resources
 from io import BytesIO
 from pathlib import Path
@@ -63,6 +64,28 @@ def rle_decode(mask_rle: str, shape: Tuple[int, int]) -> np.ndarray:
     return img.reshape(shape)
 
 
+def rle_decode_array(rle: Dict[str, List[int]]) -> np.ndarray:
+    r"""Decode an array-style run-length encoding. Returns a uint8 numpy array in
+    which 1 marks the mask and 0 the background.
+
+    Parameters:
+        rle: Dict with the mask shape under "size" and the run lengths under
+            "counts"; runs alternate background/foreground in column-major order.
+    """
+    shape = rle["size"]
+    runs = rle["counts"]
+    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
+
+    start = 0
+    for idx, run in enumerate(runs):
+        stop = start + run
+        if idx % 2:  # odd-indexed runs are foreground
+            flat[start:stop] = 1
+        start = stop
+
+    return flat.reshape(shape, order="F")
+
+
 def b64_to_pil(b64_str: str) -> ImageType:
     r"""Convert a base64 string to a PIL Image.
 
@@ -78,6 +101,15 @@ def b64_to_pil(b64_str: str) -> ImageType:
     return Image.open(BytesIO(base64.b64decode(b64_str)))
 
 
+def numpy_to_bytes(image: np.ndarray) -> bytes:
+    """Convert ``image`` to RGB, encode it as PNG and return the encoded bytes."""
+    rgb_image = Image.fromarray(image).convert("RGB")
+    with io.BytesIO() as png_stream:
+        rgb_image.save(png_stream, format="PNG")
+        encoded = png_stream.getvalue()  # copy out before the stream is closed
+    return encoded
+
+
 def get_image_size(data: Union[str, Path, np.ndarray, ImageType]) -> Tuple[int, ...]:
     r"""Get the size of an image.