Add new tools #202

Merged
merged 22 commits into from Aug 26, 2024
67 changes: 58 additions & 9 deletions tests/integ/test_tools.py
@@ -1,5 +1,6 @@
 import numpy as np
 import skimage as ski
+from PIL import Image

 from vision_agent.tools import (
     blip_image_caption,
@@ -8,15 +9,19 @@
     depth_anything_v2,
     detr_segmentation,
     dpt_hybrid_midas,
-    florencev2_image_caption,
-    florencev2_object_detection,
-    florencev2_roberta_vqa,
-    florencev2_ocr,
+    florence2_image_caption,
+    florence2_object_detection,
+    florence2_ocr,
+    florence2_roberta_vqa,
+    florence2_sam2_image,
+    florence2_sam2_video,
     generate_pose_image,
     generate_soft_edge_image,
     git_vqa_v2,
     grounding_dino,
     grounding_sam,
+    ixc25_image_vqa,
+    ixc25_video_vqa,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
     ocr,
@@ -60,7 +65,7 @@ def test_owl():

 def test_object_detection():
     img = ski.data.coins()
-    result = florencev2_object_detection(
+    result = florence2_object_detection(
         image=img,
         prompt="coin",
     )
@@ -88,6 +93,30 @@ def test_grounding_sam():
assert len([res["mask"] for res in result]) == 24


def test_florence2_sam2_image():
img = ski.data.coins()
result = florence2_sam2_image(
prompt="coin",
image=img,
)
assert len(result) == 25
assert [res["label"] for res in result] == ["coin"] * 25
assert len([res["mask"] for res in result]) == 25


def test_florence2_sam2_video():
frames = [
np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
]
result = florence2_sam2_video(
prompt="coin",
frames=frames,
)
assert len(result) == 10
assert len([res["label"] for res in result[0]]) == 25
assert len([res["mask"] for res in result[0]]) == 25


def test_segmentation():
img = ski.data.coins()
result = detr_segmentation(
@@ -133,7 +162,7 @@ def test_image_caption() -> None:

 def test_florence_image_caption() -> None:
     img = ski.data.rocket()
-    result = florencev2_image_caption(
+    result = florence2_image_caption(
         image=img,
     )
     assert "The image shows a rocket on a launch pad at night" in result.strip()
@@ -168,13 +197,33 @@ def test_git_vqa_v2() -> None:

 def test_image_qa_with_context() -> None:
     img = ski.data.rocket()
-    result = florencev2_roberta_vqa(
+    result = florence2_roberta_vqa(
         prompt="Is the scene captured during day or night ?",
         image=img,
     )
     assert "night" in result.strip()


+def test_ixc25_image_vqa() -> None:
+    img = ski.data.cat()
+    result = ixc25_image_vqa(
+        prompt="What animal is in this image?",
+        image=img,
+    )
+    assert "cat" in result.strip()
+
+
+def test_ixc25_video_vqa() -> None:
+    frames = [
+        np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10)
+    ]
+    result = ixc25_video_vqa(
+        prompt="What animal is in this video?",
+        frames=frames,
+    )
+    assert "cat" in result.strip()
+
+
 def test_ocr() -> None:
     img = ski.data.page()
     result = ocr(
@@ -183,9 +232,9 @@ def test_ocr() -> None:
     assert any("Region-based segmentation" in res["label"] for res in result)


-def test_florencev2_ocr() -> None:
+def test_florence2_ocr() -> None:
     img = ski.data.page()
-    result = florencev2_ocr(
+    result = florence2_ocr(
         image=img,
     )
     assert any("Region-based segmentation" in res["label"] for res in result)
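
The new tests pin down the output contract for the SAM2-based tools: florence2_sam2_image returns one dict per detected instance with "label" and "mask" keys, and florence2_sam2_video returns one such list per input frame. A minimal consumer sketch based only on the fields the assertions above exercise (any other return fields are not shown in this diff):

```python
import numpy as np
import skimage as ski
from PIL import Image

from vision_agent.tools import florence2_sam2_image, florence2_sam2_video

# Single image: one dict per segmented instance.
detections = florence2_sam2_image(prompt="coin", image=ski.data.coins())
for det in detections:
    print(det["label"])             # e.g. "coin"
    mask = np.asarray(det["mask"])  # instance mask, assumed array-like

# Video: results are indexed per frame first, then per instance.
frames = [np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(4)]
per_frame = florence2_sam2_video(prompt="coin", frames=frames)
assert len(per_frame) == len(frames)
first_frame_detections = per_frame[0]  # same [{"label": ..., "mask": ...}, ...] shape
```
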
11 changes: 3 additions & 8 deletions vision_agent/agent/agent_utils.py
@@ -4,27 +4,22 @@
 from typing import Any, Dict

 logging.basicConfig(stream=sys.stdout)
-_LOGGER = logging.getLogger(__name__)


 def extract_json(json_str: str) -> Dict[str, Any]:
     try:
         json_str = json_str.replace("\n", " ")
         json_dict = json.loads(json_str)
     except json.JSONDecodeError:
-        input_json_str = json_str
         if "```json" in json_str:
             json_str = json_str[json_str.find("```json") + len("```json") :]
             json_str = json_str[: json_str.find("```")]
         elif "```" in json_str:
             json_str = json_str[json_str.find("```") + len("```") :]
+            # get the last ``` not one from an intermediate string
+            json_str = json_str[: json_str.find("}```")]
-        try:
-            json_dict = json.loads(json_str)
-        except json.JSONDecodeError as e:
-            error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}"
-            _LOGGER.exception(error_msg)
-            raise ValueError(error_msg) from e

+        json_dict = json.loads(json_str)
     return json_dict  # type: ignore

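With the inner try/except removed, extract_json now lets json.JSONDecodeError propagate instead of re-raising it as a ValueError, which is exactly what the new retry loop in vision_agent_coder.py catches. A quick behavioral sketch (the input strings are invented for illustration):

```python
from vision_agent.agent.agent_utils import extract_json

# Plain JSON parses directly.
print(extract_json('{"best_plan": "plan1"}'))  # {'best_plan': 'plan1'}

# JSON wrapped in a markdown fence is unwrapped before parsing.
fenced = 'Here is my answer:\n```json\n{"best_plan": "plan2"}\n```'
print(extract_json(fenced))  # {'best_plan': 'plan2'}

# Unparseable input now raises json.JSONDecodeError (not ValueError),
# so callers such as pick_plan can catch it and retry.
try:
    extract_json("not json at all")
except Exception as e:
    print(type(e).__name__)  # JSONDecodeError
```
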
48 changes: 28 additions & 20 deletions vision_agent/agent/vision_agent_coder.py
@@ -4,6 +4,7 @@
 import os
 import sys
 import tempfile
+from json import JSONDecodeError
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast

@@ -86,8 +87,8 @@ def format_memory(memory: List[Dict[str, str]]) -> str:
 def format_plans(plans: Dict[str, Any]) -> str:
     plan_str = ""
     for k, v in plans.items():
-        plan_str += f"{k}:\n"
-        plan_str += "-" + "\n-".join([e["instructions"] for e in v])
+        plan_str += "\n" + f"{k}: {v['thoughts']}\n"
+        plan_str += " -" + "\n -".join([e for e in v["instructions"]])

     return plan_str
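
The plan structure changes here from a list of {"instructions": ...} dicts to one dict per plan holding a "thoughts" string plus an "instructions" list of plain strings. A small sketch of the updated function applied to the new shape (plan text invented):

```python
from typing import Any, Dict

def format_plans(plans: Dict[str, Any]) -> str:
    plan_str = ""
    for k, v in plans.items():
        plan_str += "\n" + f"{k}: {v['thoughts']}\n"
        plan_str += " -" + "\n -".join([e for e in v["instructions"]])
    return plan_str

plans = {
    "plan1": {
        "thoughts": "Detect coins with a prompt-based detector.",
        "instructions": [
            "Load the image with load_image",
            "Run florence2_object_detection with the prompt 'coin'",
        ],
    }
}
print(format_plans(plans))
# plan1: Detect coins with a prompt-based detector.
#  -Load the image with load_image
#  -Run florence2_object_detection with the prompt 'coin'
```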

@@ -228,13 +229,11 @@ def pick_plan(
"status": "completed" if tool_output.success else "failed",
}
)
tool_output_str = ""
if len(tool_output.logs.stdout) > 0:
tool_output_str = tool_output.logs.stdout[0]
tool_output_str = tool_output.text().strip()

if verbosity == 2:
_print_code("Code and test after attempted fix:", code)
_LOGGER.info(f"Code execution result after attempte {count}")
_LOGGER.info(f"Code execution result after attempt {count}")

count += 1

@@ -251,7 +250,21 @@
         tool_output=tool_output_str[:20_000],
     )
     chat[-1]["content"] = prompt
-    best_plan = extract_json(model(chat, stream=False))  # type: ignore
+
+    count = 0
+    best_plan = None
+    while best_plan is None and count < max_retries:
+        try:
+            best_plan = extract_json(model(chat, stream=False))  # type: ignore
+        except JSONDecodeError as e:
+            _LOGGER.exception(
+                f"Error while extracting JSON during picking best plan {str(e)}"
+            )
+            pass
+        count += 1
+
+    if best_plan is None:
+        best_plan = {"best_plan": list(plans.keys())[0]}

     if verbosity >= 1:
         _LOGGER.info(f"Best plan:\n{best_plan}")
@@ -525,7 +538,7 @@ def _print_code(title: str, code: str, test: Optional[str] = None) -> None:


 def retrieve_tools(
-    plans: Dict[str, List[Dict[str, str]]],
+    plans: Dict[str, Dict[str, Any]],
     tool_recommender: Sim,
     log_progress: Callable[[Dict[str, Any]], None],
     verbosity: int = 0,
@@ -542,8 +555,8 @@
     tool_lists: Dict[str, List[Dict[str, str]]] = {}
     for k, plan in plans.items():
         tool_lists[k] = []
-        for task in plan:
-            tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
+        for task in plan["instructions"]:
+            tools = tool_recommender.top_k(task, k=2, thresh=0.3)
             tool_info.extend([e["doc"] for e in tools])
             tool_desc.extend([e["desc"] for e in tools])
             tool_lists[k].extend(
@@ -737,14 +750,7 @@ def chat_with_workflow(
             if self.verbosity >= 1:
                 for p in plans:
                     # tabulate will fail if the keys are not the same for all elements
-                    p_fixed = [
-                        {
-                            "instructions": (
-                                e["instructions"] if "instructions" in e else ""
-                            )
-                        }
-                        for e in plans[p]
-                    ]
+                    p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
                     _LOGGER.info(
                         f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                     )
@@ -793,13 +799,15 @@
             )

             if self.verbosity >= 1:
+                plan_i_fixed = [{"instructions": e} for e in plan_i["instructions"]]
                 _LOGGER.info(
-                    f"Picked best plan:\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                    f"Picked best plan:\n{tabulate(tabular_data=plan_i_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                 )

             results = write_and_test_code(
                 chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
-                plan="\n-" + "\n-".join([e["instructions"] for e in plan_i]),
+                plan=f"\n{plan_i['thoughts']}\n-"
+                + "\n-".join([e for e in plan_i["instructions"]]),
                 tool_info=tool_info,
                 tool_output=tool_output_str,
                 tool_utils=T.UTILITIES_DOCSTRING,
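
pick_plan now wraps the extract_json call in a bounded retry loop and, if no attempt yields valid JSON, falls back deterministically to the first plan. The same pattern in isolation, with the model call stubbed out (pick_with_retries and get_response are our illustrative names, not the PR's):

```python
import json
from json import JSONDecodeError
from typing import Any, Callable, Dict

def pick_with_retries(
    get_response: Callable[[], str], plans: Dict[str, Any], max_retries: int = 3
) -> Dict[str, Any]:
    best_plan = None
    count = 0
    while best_plan is None and count < max_retries:
        try:
            best_plan = json.loads(get_response())
        except JSONDecodeError:
            pass  # try again with a fresh model response
        count += 1
    if best_plan is None:
        # Deterministic fallback: default to the first plan.
        best_plan = {"best_plan": list(plans.keys())[0]}
    return best_plan

# Usage with a stubbed "model" that fails once, then succeeds:
responses = iter(["not json", '{"best_plan": "plan2"}'])
print(pick_with_retries(lambda: next(responses), {"plan1": {}, "plan2": {}}))
```
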
16 changes: 9 additions & 7 deletions vision_agent/agent/vision_agent_coder_prompts.py
@@ -30,18 +30,19 @@
 **Instructions**:
 1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
-2. Output three different plans each utilize a different strategy or tool.
+2. Output three different plans, each utilizing a different strategy or set of tools.
 Output a list of jsons in the following format
 ```json
 {{
     "plan1":
-        [
-            {{
-                "instructions": str # what you should do in this task associated with a tool
-            }}
-        ],
+        {{
+            "thoughts": str # your thought process for choosing this plan
+            "instructions": [
+                str # what you should do in this task associated with a tool
+            ]
+        }},
     "plan2": ...,
     "plan3": ...
 }}
@@ -127,7 +128,8 @@
 **Instructions**:
 1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
-2. Output a JSON object with the following format:
+2. Try solving the problem yourself given the image and pick the plan that matches your solution the best.
+3. Output a JSON object with the following format:
 {{
     "thoughts": str # your thought process for choosing the best plan
     "best_plan": str # the best plan you have chosen
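
Put together, a model reply that satisfies the updated planning schema would look like the following (plan contents invented for illustration; the doubled braces in the prompt above are just str.format escapes, so a real reply uses single braces). extract_json can then parse it even if the model wraps it in a ```json fence:

```json
{
    "plan1": {
        "thoughts": "Coins are distinct round objects, so a prompt-based detector should work.",
        "instructions": [
            "Load the image with load_image",
            "Detect coins with florence2_object_detection using the prompt 'coin'"
        ]
    },
    "plan2": {
        "thoughts": "A counting-specific tool may be more robust than detection.",
        "instructions": [
            "Load the image with load_image",
            "Count the coins with loca_zero_shot_counting"
        ]
    },
    "plan3": {
        "thoughts": "Segmentation gives masks that can be filtered and counted.",
        "instructions": [
            "Load the image with load_image",
            "Segment the coins with florence2_sam2_image and count the masks"
        ]
    }
}
```
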
21 changes: 11 additions & 10 deletions vision_agent/agent/vision_agent_prompts.py
@@ -15,7 +15,7 @@
 **Examples**:
 Here is an example of how you can interact with a user and Actions to complete a task:
 --- START EXAMPLES ---
-[Current directory: /workspace/test]
+[Current directory: /example/workspace]
 {examples}
 --- END EXAMPLES ---

@@ -27,24 +27,25 @@
 Here is the current conversation so far:
 --- START CONVERSATION ---
 [Current directory: {dir}]
+
 {conversation}
 """

 EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg

-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /workspace/test/dog_detector.py]
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image)
 4|    return dogs
 [End of file]

-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
@@ -55,18 +56,18 @@

 USER: The image only has one dog, can you fix this?

-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/workspace/test/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /workspace/test/dog_detector.py]
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image, threshold=0.24)
 4|    return dogs
 [End of file]

+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
----- stdout -----
Expand All @@ -82,10 +83,10 @@

 USER: Yes you can use workers.png

-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/code.py', 'Can you write code to count workers with helmets in this image?', media=['/workspace/test/workers.png'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /workspace/test/code.py]
+[File /example/workspace/code.py]
 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
 1|def count_workers_with_helmets(image_path: str):
 2|    image = load_image(image_path)
@@ -104,7 +105,7 @@
 15|    return count
 [End of file]

-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/test/workers.png'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
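
These examples fix the agent's reply protocol: one JSON object with "thoughts", "response", and "let_user_respond" keys, where the action to run is wrapped in <execute_python> tags inside "response". A minimal sketch of recovering the action from such a reply, assuming exactly this format (parse_agent_reply is our illustrative helper, not part of the PR):

```python
import json
import re
from typing import Optional, Tuple

def parse_agent_reply(reply: str) -> Tuple[dict, Optional[str]]:
    """Parse one AGENT line into its JSON fields and the embedded action."""
    obj = json.loads(reply)
    match = re.search(r"<execute_python>(.*?)</execute_python>", obj["response"], re.DOTALL)
    action = match.group(1) if match else None
    return obj, action

reply = '{"thoughts": "Run the detector.", "response": "<execute_python>print(1 + 1)</execute_python>", "let_user_respond": false}'
obj, action = parse_agent_reply(reply)
print(obj["let_user_respond"])  # False
print(action)                   # print(1 + 1)
```
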
2 changes: 1 addition & 1 deletion vision_agent/clients/landing_public_api.py
@@ -1,6 +1,6 @@
 import os
-from uuid import UUID
 from typing import List
+from uuid import UUID

 from requests.exceptions import HTTPError
