Added ixc 2.5 (InternLM-XComposer) image VQA tool: ixc25_image_vqa

dillonalaird · dillonalaird · commit 9ce9ec8e2b1c · 2024-08-20T17:15:11.000-07:00
diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
@@ -13,6 +13,7 @@
     florence2_roberta_vqa,
     florence2_ocr,
     florence2_sam2_image,
+    ixc25_image_vqa,
     generate_pose_image,
     generate_soft_edge_image,
     git_vqa_v2,
@@ -187,6 +188,15 @@ def test_image_qa_with_context() -> None:
     assert "night" in result.strip()
 
 
+def test_ixc25_image_vqa() -> None:
+    """ixc25_image_vqa should name the animal shown in the `ski.data.cat()` image."""
+    img = ski.data.cat()
+    result = ixc25_image_vqa(prompt="What animal is in this image?", image=img)
+    # The answer is free-form model text, so match case-insensitively rather
+    # than relying on how the model happens to capitalise it.
+    assert "cat" in result.lower()
+
+
 def test_ocr() -> None:
     img = ski.data.page()
     result = ocr(
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
@@ -27,6 +27,7 @@
     git_vqa_v2,
     grounding_dino,
     grounding_sam,
+    ixc25_image_vqa,
     load_image,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
@@ -477,7 +477,7 @@ def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
 
     Example
     -------
-        >>> florence2_roberta_vqa('What is the top left animal in this image ?', image)
+        >>> florence2_roberta_vqa('What is the top left animal in this image?', image)
         'white tiger'
     """
 
@@ -492,6 +492,36 @@ def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
     return answer  # type: ignore
 
 
+def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
+    """'ixc25_image_vqa' is a tool that can answer any questions about arbitrary images
+    including regular images or images of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the image
+        image (np.ndarray): The reference image used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> ixc25_image_vqa('What is the cat doing?', image)
+        'drinking milk'
+    """
+
+    # The image bytes are passed via `files`; the prompt and the name of the
+    # function to run are passed in the payload.
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {"prompt": prompt, "function_name": "ixc25_image_vqa"}
+    data: Dict[str, Any] = send_inference_request(
+        payload, "internlm-xcomposer2", files=files, v2=True
+    )
+    # data["answer"] is typed Any; suppress as florence2_roberta_vqa does.
+    return data["answer"]  # type: ignore
+
+
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
     """'git_vqa_v2' is a tool that can answer questions about the visual
     contents of an image given a question and an image. It returns an answer to the