Skip to content

Commit 9ce9ec8

Browse files
committed
added ixc 2.5
1 parent 28ad471 commit 9ce9ec8

File tree

3 files changed

+42
-1
lines changed

3 files changed

+42
-1
lines changed

tests/integ/test_tools.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
florence2_roberta_vqa,
1414
florence2_ocr,
1515
florence2_sam2_image,
16+
ixc25_image_vqa,
1617
generate_pose_image,
1718
generate_soft_edge_image,
1819
git_vqa_v2,
@@ -187,6 +188,15 @@ def test_image_qa_with_context() -> None:
187188
assert "night" in result.strip()
188189

189190

191+
def test_ixc25_image_vqa() -> None:
    """Ask ixc25_image_vqa which animal is shown in the `ski.data.cat()` image.

    The test passes when the stripped answer contains the substring "cat".
    """
    cat_image = ski.data.cat()
    answer = ixc25_image_vqa(prompt="What animal is in this image?", image=cat_image)
    assert "cat" in answer.strip()
198+
199+
190200
def test_ocr() -> None:
191201
img = ski.data.page()
192202
result = ocr(

vision_agent/tools/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
git_vqa_v2,
2828
grounding_dino,
2929
grounding_sam,
30+
ixc25_image_vqa,
3031
load_image,
3132
loca_visual_prompt_counting,
3233
loca_zero_shot_counting,

vision_agent/tools/tools.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -477,7 +477,7 @@ def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
477477
478478
Example
479479
-------
480-
>>> florence2_roberta_vqa('What is the top left animal in this image ?', image)
480+
>>> florence2_roberta_vqa('What is the top left animal in this image?', image)
481481
'white tiger'
482482
"""
483483

@@ -492,6 +492,36 @@ def florence2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
492492
return answer # type: ignore
493493

494494

495+
def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
    """'ixc25_image_vqa' is a tool that can answer any questions about arbitrary images
    including regular images or images of documents or presentations. It returns text
    as an answer to the question.

    Parameters:
        prompt (str): The question about the image
        image (np.ndarray): The reference image used for the question

    Returns:
        str: A string which is the answer to the given prompt.

    Example
    -------
        >>> ixc25_image_vqa('What is the cat doing?', image)
        'drinking milk'
    """

    # The image is converted with numpy_to_bytes and handed to the request via
    # `files` under the "image" field; the prompt travels in the payload.
    buffer_bytes = numpy_to_bytes(image)
    files = [("image", buffer_bytes)]
    payload = {
        "prompt": prompt,
        "function_name": "ixc25_image_vqa",
    }
    data: Dict[str, Any] = send_inference_request(
        payload, "internlm-xcomposer2", files=files, v2=True
    )
    # `data` is Dict[str, Any]; bind the field to a `str`-annotated local so the
    # function does not return `Any` from a `-> str` signature.
    answer: str = data["answer"]
    return answer
523+
524+
495525
def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
496526
"""'git_vqa_v2' is a tool that can answer questions about the visual
497527
contents of an image given a question and an image. It returns an answer to the

0 commit comments

Comments
 (0)