Commit 995c169

Add new tools (#202)
* corrected name florencev2 to florence2
* added florence2+sam2 for images
* added ixc 2.5
* added florence2+sam2 for video
* added ixc 2.5 for video
* fixed type errors
* updated prompts
* added florence2 od
* revert back to original doc
* fixed workspace prompt
* fixed extra space
* updated docs
* retry on judge
* spelling mistakes
* fixed json decode error
* updated plan structure, fixed bug with testing plan tool output
* fixed plan format
* remove template match, fix ixc25 video doc
* fixed flake8
* flake8 black
* fix linting error
1 parent 7875feb commit 995c169

File tree

12 files changed (+528, -174 lines)


tests/integ/test_tools.py

Lines changed: 58 additions & 9 deletions
@@ -1,5 +1,6 @@
 import numpy as np
 import skimage as ski
+from PIL import Image

 from vision_agent.tools import (
     blip_image_caption,
@@ -8,15 +9,19 @@
     depth_anything_v2,
     detr_segmentation,
     dpt_hybrid_midas,
-    florencev2_image_caption,
-    florencev2_object_detection,
-    florencev2_roberta_vqa,
-    florencev2_ocr,
+    florence2_image_caption,
+    florence2_object_detection,
+    florence2_ocr,
+    florence2_roberta_vqa,
+    florence2_sam2_image,
+    florence2_sam2_video,
     generate_pose_image,
     generate_soft_edge_image,
     git_vqa_v2,
     grounding_dino,
     grounding_sam,
+    ixc25_image_vqa,
+    ixc25_video_vqa,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
     ocr,
@@ -60,7 +65,7 @@ def test_owl():

 def test_object_detection():
     img = ski.data.coins()
-    result = florencev2_object_detection(
+    result = florence2_object_detection(
         image=img,
         prompt="coin",
     )
@@ -88,6 +93,30 @@ def test_grounding_sam():
     assert len([res["mask"] for res in result]) == 24


+def test_florence2_sam2_image():
+    img = ski.data.coins()
+    result = florence2_sam2_image(
+        prompt="coin",
+        image=img,
+    )
+    assert len(result) == 25
+    assert [res["label"] for res in result] == ["coin"] * 25
+    assert len([res["mask"] for res in result]) == 25
+
+
+def test_florence2_sam2_video():
+    frames = [
+        np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
+    ]
+    result = florence2_sam2_video(
+        prompt="coin",
+        frames=frames,
+    )
+    assert len(result) == 10
+    assert len([res["label"] for res in result[0]]) == 25
+    assert len([res["mask"] for res in result[0]]) == 25
+
+
 def test_segmentation():
     img = ski.data.coins()
     result = detr_segmentation(
@@ -133,7 +162,7 @@ def test_image_caption() -> None:

 def test_florence_image_caption() -> None:
     img = ski.data.rocket()
-    result = florencev2_image_caption(
+    result = florence2_image_caption(
         image=img,
     )
     assert "The image shows a rocket on a launch pad at night" in result.strip()
@@ -168,13 +197,33 @@ def test_git_vqa_v2() -> None:

 def test_image_qa_with_context() -> None:
     img = ski.data.rocket()
-    result = florencev2_roberta_vqa(
+    result = florence2_roberta_vqa(
         prompt="Is the scene captured during day or night ?",
         image=img,
     )
     assert "night" in result.strip()


+def test_ixc25_image_vqa() -> None:
+    img = ski.data.cat()
+    result = ixc25_image_vqa(
+        prompt="What animal is in this image?",
+        image=img,
+    )
+    assert "cat" in result.strip()
+
+
+def test_ixc25_video_vqa() -> None:
+    frames = [
+        np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10)
+    ]
+    result = ixc25_video_vqa(
+        prompt="What animal is in this video?",
+        frames=frames,
+    )
+    assert "cat" in result.strip()
+
+
 def test_ocr() -> None:
     img = ski.data.page()
     result = ocr(
@@ -183,9 +232,9 @@ def test_ocr() -> None:
     assert any("Region-based segmentation" in res["label"] for res in result)


-def test_florencev2_ocr() -> None:
+def test_florence2_ocr() -> None:
     img = ski.data.page()
-    result = florencev2_ocr(
+    result = florence2_ocr(
         image=img,
     )
     assert any("Region-based segmentation" in res["label"] for res in result)
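
Outside of pytest, the new tools exercised above can be called directly. A minimal sketch, assuming only the vision_agent.tools API surface the tests show (calls go through vision_agent's hosted backend, so API access must be configured; the VQA question string is invented for illustration):

```python
import numpy as np
import skimage as ski
from PIL import Image

from vision_agent.tools import (
    florence2_sam2_image,
    florence2_sam2_video,
    ixc25_image_vqa,
)

image = ski.data.coins()

# Grounded segmentation: every returned entry carries a "label" and a "mask".
segments = florence2_sam2_image(prompt="coin", image=image)
print(len(segments), segments[0]["label"])

# Video variant: takes a list of RGB frames, returns per-frame result lists.
frames = [np.array(Image.fromarray(image).convert("RGB")) for _ in range(10)]
tracks = florence2_sam2_video(prompt="coin", frames=frames)
print(len(tracks), len(tracks[0]))

# InternLM-XComposer 2.5 VQA over a single image (illustrative question).
print(ixc25_image_vqa(prompt="How many coins are visible?", image=image))
```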

vision_agent/agent/agent_utils.py

Lines changed: 3 additions & 8 deletions
@@ -4,27 +4,22 @@
 from typing import Any, Dict

 logging.basicConfig(stream=sys.stdout)
-_LOGGER = logging.getLogger(__name__)


 def extract_json(json_str: str) -> Dict[str, Any]:
     try:
+        json_str = json_str.replace("\n", " ")
         json_dict = json.loads(json_str)
     except json.JSONDecodeError:
-        input_json_str = json_str
         if "```json" in json_str:
             json_str = json_str[json_str.find("```json") + len("```json") :]
             json_str = json_str[: json_str.find("```")]
         elif "```" in json_str:
             json_str = json_str[json_str.find("```") + len("```") :]
             # get the last ``` not one from an intermediate string
             json_str = json_str[: json_str.find("}```")]
-        try:
-            json_dict = json.loads(json_str)
-        except json.JSONDecodeError as e:
-            error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}"
-            _LOGGER.exception(error_msg)
-            raise ValueError(error_msg) from e
+
+        json_dict = json.loads(json_str)
     return json_dict  # type: ignore
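
Taken together, the function now reads roughly as below (reconstructed from the diff, with comments added; note the second json.loads is deliberately left unguarded so a JSONDecodeError propagates to callers, which now retry instead of receiving a ValueError):

```python
import json
from typing import Any, Dict


def extract_json(json_str: str) -> Dict[str, Any]:
    try:
        # Newlines inside the model reply often break naive parsing.
        json_str = json_str.replace("\n", " ")
        json_dict = json.loads(json_str)
    except json.JSONDecodeError:
        # Strip a ```json ... ``` (or bare ``` ... ```) fence and re-parse.
        if "```json" in json_str:
            json_str = json_str[json_str.find("```json") + len("```json") :]
            json_str = json_str[: json_str.find("```")]
        elif "```" in json_str:
            json_str = json_str[json_str.find("```") + len("```") :]
            # get the last ``` not one from an intermediate string
            json_str = json_str[: json_str.find("}```")]

        # Unguarded on purpose: the JSONDecodeError propagates to the
        # caller (e.g. pick_plan), which retries the model call.
        json_dict = json.loads(json_str)
    return json_dict
```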

vision_agent/agent/vision_agent_coder.py

Lines changed: 28 additions & 20 deletions
@@ -4,6 +4,7 @@
 import os
 import sys
 import tempfile
+from json import JSONDecodeError
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast

@@ -86,8 +87,8 @@ def format_memory(memory: List[Dict[str, str]]) -> str:
 def format_plans(plans: Dict[str, Any]) -> str:
     plan_str = ""
     for k, v in plans.items():
-        plan_str += f"{k}:\n"
-        plan_str += "-" + "\n-".join([e["instructions"] for e in v])
+        plan_str += "\n" + f"{k}: {v['thoughts']}\n"
+        plan_str += " -" + "\n -".join([e for e in v["instructions"]])

     return plan_str

@@ -228,13 +229,11 @@ def pick_plan(
                 "status": "completed" if tool_output.success else "failed",
             }
         )
-        tool_output_str = ""
-        if len(tool_output.logs.stdout) > 0:
-            tool_output_str = tool_output.logs.stdout[0]
+        tool_output_str = tool_output.text().strip()

         if verbosity == 2:
             _print_code("Code and test after attempted fix:", code)
-            _LOGGER.info(f"Code execution result after attempte {count}")
+            _LOGGER.info(f"Code execution result after attempt {count}")

         count += 1

@@ -251,7 +250,21 @@ def pick_plan(
         tool_output=tool_output_str[:20_000],
     )
     chat[-1]["content"] = prompt
-    best_plan = extract_json(model(chat, stream=False))  # type: ignore
+
+    count = 0
+    best_plan = None
+    while best_plan is None and count < max_retries:
+        try:
+            best_plan = extract_json(model(chat, stream=False))  # type: ignore
+        except JSONDecodeError as e:
+            _LOGGER.exception(
+                f"Error while extracting JSON during picking best plan {str(e)}"
+            )
+            pass
+        count += 1
+
+    if best_plan is None:
+        best_plan = {"best_plan": list(plans.keys())[0]}

     if verbosity >= 1:
         _LOGGER.info(f"Best plan:\n{best_plan}")
@@ -525,7 +538,7 @@ def _print_code(title: str, code: str, test: Optional[str] = None) -> None:


 def retrieve_tools(
-    plans: Dict[str, List[Dict[str, str]]],
+    plans: Dict[str, Dict[str, Any]],
     tool_recommender: Sim,
     log_progress: Callable[[Dict[str, Any]], None],
     verbosity: int = 0,
@@ -542,8 +555,8 @@ def retrieve_tools(
     tool_lists: Dict[str, List[Dict[str, str]]] = {}
     for k, plan in plans.items():
         tool_lists[k] = []
-        for task in plan:
-            tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
+        for task in plan["instructions"]:
+            tools = tool_recommender.top_k(task, k=2, thresh=0.3)
             tool_info.extend([e["doc"] for e in tools])
             tool_desc.extend([e["desc"] for e in tools])
             tool_lists[k].extend(
@@ -737,14 +750,7 @@ def chat_with_workflow(
         if self.verbosity >= 1:
             for p in plans:
                 # tabulate will fail if the keys are not the same for all elements
-                p_fixed = [
-                    {
-                        "instructions": (
-                            e["instructions"] if "instructions" in e else ""
-                        )
-                    }
-                    for e in plans[p]
-                ]
+                p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
                 _LOGGER.info(
                     f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                 )
@@ -793,13 +799,15 @@ def chat_with_workflow(
         )

         if self.verbosity >= 1:
+            plan_i_fixed = [{"instructions": e} for e in plan_i["instructions"]]
             _LOGGER.info(
-                f"Picked best plan:\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                f"Picked best plan:\n{tabulate(tabular_data=plan_i_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
             )

         results = write_and_test_code(
             chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
-            plan="\n-" + "\n-".join([e["instructions"] for e in plan_i]),
+            plan=f"\n{plan_i['thoughts']}\n-"
+            + "\n-".join([e for e in plan_i["instructions"]]),
             tool_info=tool_info,
             tool_output=tool_output_str,
             tool_utils=T.UTILITIES_DOCSTRING,
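
The "retry on judge" bullet from the commit message corresponds to the loop added in pick_plan above: the plan-judging model call is retried while extract_json raises, then falls back to the first plan. A minimal sketch of the pattern in isolation (call_model is a hypothetical stand-in for model(chat, stream=False), and MAX_RETRIES mirrors pick_plan's max_retries argument):

```python
import json
from typing import Any, Callable, Dict, List

MAX_RETRIES = 3  # assumed; stands in for pick_plan's max_retries


def pick_plan_with_retries(
    call_model: Callable[[], str],  # hypothetical model(chat, stream=False)
    plan_names: List[str],
) -> Dict[str, Any]:
    best_plan = None
    count = 0
    while best_plan is None and count < MAX_RETRIES:
        try:
            best_plan = json.loads(call_model())
        except json.JSONDecodeError:
            # Malformed JSON from the judge; the real code logs and retries.
            pass
        count += 1
    if best_plan is None:
        # All retries failed: fall back to the first plan deterministically.
        best_plan = {"best_plan": plan_names[0]}
    return best_plan
```

The fallback is the key design choice: chat_with_workflow always receives a usable "best_plan" key even when the judge never produces parseable JSON.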

vision_agent/agent/vision_agent_coder_prompts.py

Lines changed: 9 additions & 7 deletions
@@ -30,18 +30,19 @@

 **Instructions**:
 1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
-2. Output three different plans each utilize a different strategy or tool.
+2. Output three different plans each utilize a different strategy or set of tools.

 Output a list of jsons in the following format

 ```json
 {{
     "plan1":
-        [
-            {{
-                "instructions": str # what you should do in this task associated with a tool
-            }}
-        ],
+        {{
+            "thoughts": str # your thought process for choosing this plan
+            "instructions": [
+                str # what you should do in this task associated with a tool
+            ]
+        }},
     "plan2": ...,
     "plan3": ...
 }}
@@ -127,7 +128,8 @@

 **Instructions**:
 1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
-2. Output a JSON object with the following format:
+2. Try solving the problem yourself given the image and pick the plan that matches your solution the best.
+3. Output a JSON object with the following format:
 {{
     "thoughts": str # your thought process for choosing the best plan
     "best_plan": str # the best plan you have chosen

vision_agent/agent/vision_agent_prompts.py

Lines changed: 11 additions & 10 deletions
@@ -15,7 +15,7 @@
 **Examples**:
 Here is an example of how you can interact with a user and Actions to complete a task:
 --- START EXAMPLES ---
-[Current directory: /workspace/test]
+[Current directory: /example/workspace]
 {examples}
 --- END EXAMPLES ---

@@ -27,24 +27,25 @@
 Here is the current conversation so far:
 --- START CONVERSATION ---
 [Current directory: {dir}]
+
 {conversation}
 """

 EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg

-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /workspace/test/dog_detector.py]
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image)
 4|    return dogs
 [End of file]

-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
@@ -55,18 +56,18 @@

 USER: The the image only has one dog, can you fix this?

-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/workspace/test/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /workspace/test/dog_detector.py]
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image, threshold=0.24)
 4|    return dogs
 [End of file]

-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
@@ -82,10 +83,10 @@

 USER: Yes you can use workers.png

-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/code.py', 'Can you write code to count workers with helmets in this image?', media=['/workspace/test/workers.png'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /workspace/test/code.py]
+[File /example/workspace/code.py]
 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
 1|def count_workers_with_helmets(image_path: str):
 2|    image = load_image(image_path)
@@ -104,7 +105,7 @@
 15|    return count
 [End of file]

-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/test/workers.png'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----

vision_agent/clients/landing_public_api.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 import os
-from uuid import UUID
 from typing import List
+from uuid import UUID

 from requests.exceptions import HTTPError
