Commit 995c169

Add new tools (#202)
* corrected name florencev2 to florence2
* added florence2+sam2 for images
* added ixc 2.5
* added florence2+sam2 for video
* added ixc 2.5 for video
* fixed type errors
* updated prompts
* added florence2 od
* revert back to original doc
* fixed workspace prompt
* fixed extra space
* updated docs
* retry on judge
* spelling mistakes
* fixed json decode error
* updated plan structure, fixed bug with testing plan tool output
* fixed plan format
* remove template match, fix ixc25 video doc
* fixed flake8
* flake8 black
* fix linting error
1 parent 7875feb commit 995c169

File tree

12 files changed (+528, -174 lines)


tests/integ/test_tools.py

Lines changed: 58 additions & 9 deletions
@@ -1,5 +1,6 @@
 import numpy as np
 import skimage as ski
+from PIL import Image

 from vision_agent.tools import (
     blip_image_caption,
@@ -8,15 +9,19 @@
     depth_anything_v2,
     detr_segmentation,
     dpt_hybrid_midas,
-    florencev2_image_caption,
-    florencev2_object_detection,
-    florencev2_roberta_vqa,
-    florencev2_ocr,
+    florence2_image_caption,
+    florence2_object_detection,
+    florence2_ocr,
+    florence2_roberta_vqa,
+    florence2_sam2_image,
+    florence2_sam2_video,
     generate_pose_image,
     generate_soft_edge_image,
     git_vqa_v2,
     grounding_dino,
     grounding_sam,
+    ixc25_image_vqa,
+    ixc25_video_vqa,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
     ocr,
@@ -60,7 +65,7 @@ def test_owl():

 def test_object_detection():
     img = ski.data.coins()
-    result = florencev2_object_detection(
+    result = florence2_object_detection(
         image=img,
         prompt="coin",
     )
@@ -88,6 +93,30 @@ def test_grounding_sam():
     assert len([res["mask"] for res in result]) == 24


+def test_florence2_sam2_image():
+    img = ski.data.coins()
+    result = florence2_sam2_image(
+        prompt="coin",
+        image=img,
+    )
+    assert len(result) == 25
+    assert [res["label"] for res in result] == ["coin"] * 25
+    assert len([res["mask"] for res in result]) == 25
+
+
+def test_florence2_sam2_video():
+    frames = [
+        np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
+    ]
+    result = florence2_sam2_video(
+        prompt="coin",
+        frames=frames,
+    )
+    assert len(result) == 10
+    assert len([res["label"] for res in result[0]]) == 25
+    assert len([res["mask"] for res in result[0]]) == 25
+
+
 def test_segmentation():
     img = ski.data.coins()
     result = detr_segmentation(
@@ -133,7 +162,7 @@ def test_image_caption() -> None:

 def test_florence_image_caption() -> None:
     img = ski.data.rocket()
-    result = florencev2_image_caption(
+    result = florence2_image_caption(
         image=img,
     )
     assert "The image shows a rocket on a launch pad at night" in result.strip()
@@ -168,13 +197,33 @@ def test_git_vqa_v2() -> None:

 def test_image_qa_with_context() -> None:
     img = ski.data.rocket()
-    result = florencev2_roberta_vqa(
+    result = florence2_roberta_vqa(
         prompt="Is the scene captured during day or night ?",
         image=img,
     )
     assert "night" in result.strip()


+def test_ixc25_image_vqa() -> None:
+    img = ski.data.cat()
+    result = ixc25_image_vqa(
+        prompt="What animal is in this image?",
+        image=img,
+    )
+    assert "cat" in result.strip()
+
+
+def test_ixc25_video_vqa() -> None:
+    frames = [
+        np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10)
+    ]
+    result = ixc25_video_vqa(
+        prompt="What animal is in this video?",
+        frames=frames,
+    )
+    assert "cat" in result.strip()
+
+
 def test_ocr() -> None:
     img = ski.data.page()
     result = ocr(
@@ -183,9 +232,9 @@ def test_ocr() -> None:
     assert any("Region-based segmentation" in res["label"] for res in result)


-def test_florencev2_ocr() -> None:
+def test_florence2_ocr() -> None:
     img = ski.data.page()
-    result = florencev2_ocr(
+    result = florence2_ocr(
         image=img,
     )
     assert any("Region-based segmentation" in res["label"] for res in result)
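
Outside of pytest, the new tools exercised above can be called directly. A minimal sketch, assuming only the vision_agent.tools API surface the tests show (calls go through vision_agent's hosted backend, so API access must be configured; the VQA question string is invented for illustration):

```python
import numpy as np
import skimage as ski
from PIL import Image

from vision_agent.tools import (
    florence2_sam2_image,
    florence2_sam2_video,
    ixc25_image_vqa,
)

image = ski.data.coins()

# Grounded segmentation: every returned entry carries a "label" and a "mask".
segments = florence2_sam2_image(prompt="coin", image=image)
print(len(segments), segments[0]["label"])

# Video variant: takes a list of RGB frames, returns per-frame result lists.
frames = [np.array(Image.fromarray(image).convert("RGB")) for _ in range(10)]
tracks = florence2_sam2_video(prompt="coin", frames=frames)
print(len(tracks), len(tracks[0]))

# InternLM-XComposer 2.5 VQA over a single image (illustrative question).
print(ixc25_image_vqa(prompt="How many coins are visible?", image=image))
```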

vision_agent/agent/agent_utils.py

Lines changed: 3 additions & 8 deletions
@@ -4,27 +4,22 @@
 from typing import Any, Dict

 logging.basicConfig(stream=sys.stdout)
-_LOGGER = logging.getLogger(__name__)


 def extract_json(json_str: str) -> Dict[str, Any]:
     try:
+        json_str = json_str.replace("\n", " ")
         json_dict = json.loads(json_str)
     except json.JSONDecodeError:
-        input_json_str = json_str
         if "```json" in json_str:
             json_str = json_str[json_str.find("```json") + len("```json") :]
             json_str = json_str[: json_str.find("```")]
         elif "```" in json_str:
             json_str = json_str[json_str.find("```") + len("```") :]
             # get the last ``` not one from an intermediate string
             json_str = json_str[: json_str.find("}```")]
-        try:
-            json_dict = json.loads(json_str)
-        except json.JSONDecodeError as e:
-            error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}"
-            _LOGGER.exception(error_msg)
-            raise ValueError(error_msg) from e
+
+        json_dict = json.loads(json_str)
     return json_dict  # type: ignore
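
Taken together, the function now reads roughly as below (reconstructed from the diff, with comments added; note the second json.loads is deliberately left unguarded so a JSONDecodeError propagates to callers, which now retry instead of receiving a ValueError):

```python
import json
from typing import Any, Dict


def extract_json(json_str: str) -> Dict[str, Any]:
    try:
        # Newlines inside the model reply often break naive parsing.
        json_str = json_str.replace("\n", " ")
        json_dict = json.loads(json_str)
    except json.JSONDecodeError:
        # Strip a ```json ... ``` (or bare ``` ... ```) fence and re-parse.
        if "```json" in json_str:
            json_str = json_str[json_str.find("```json") + len("```json") :]
            json_str = json_str[: json_str.find("```")]
        elif "```" in json_str:
            json_str = json_str[json_str.find("```") + len("```") :]
            # get the last ``` not one from an intermediate string
            json_str = json_str[: json_str.find("}```")]

        # Unguarded on purpose: the JSONDecodeError propagates to the
        # caller (e.g. pick_plan), which retries the model call.
        json_dict = json.loads(json_str)
    return json_dict
```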

vision_agent/agent/vision_agent_coder.py

Lines changed: 28 additions & 20 deletions
@@ -4,6 +4,7 @@
 import os
 import sys
 import tempfile
+from json import JSONDecodeError
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast

@@ -86,8 +87,8 @@ def format_memory(memory: List[Dict[str, str]]) -> str:
 def format_plans(plans: Dict[str, Any]) -> str:
     plan_str = ""
     for k, v in plans.items():
-        plan_str += f"{k}:\n"
-        plan_str += "-" + "\n-".join([e["instructions"] for e in v])
+        plan_str += "\n" + f"{k}: {v['thoughts']}\n"
+        plan_str += " -" + "\n -".join([e for e in v["instructions"]])

     return plan_str

@@ -228,13 +229,11 @@ def pick_plan(
                 "status": "completed" if tool_output.success else "failed",
             }
         )
-        tool_output_str = ""
-        if len(tool_output.logs.stdout) > 0:
-            tool_output_str = tool_output.logs.stdout[0]
+        tool_output_str = tool_output.text().strip()

         if verbosity == 2:
             _print_code("Code and test after attempted fix:", code)
-            _LOGGER.info(f"Code execution result after attempte {count}")
+            _LOGGER.info(f"Code execution result after attempt {count}")

         count += 1

@@ -251,7 +250,21 @@ def pick_plan(
         tool_output=tool_output_str[:20_000],
     )
     chat[-1]["content"] = prompt
-    best_plan = extract_json(model(chat, stream=False))  # type: ignore
+
+    count = 0
+    best_plan = None
+    while best_plan is None and count < max_retries:
+        try:
+            best_plan = extract_json(model(chat, stream=False))  # type: ignore
+        except JSONDecodeError as e:
+            _LOGGER.exception(
+                f"Error while extracting JSON during picking best plan {str(e)}"
+            )
+            pass
+        count += 1
+
+    if best_plan is None:
+        best_plan = {"best_plan": list(plans.keys())[0]}

     if verbosity >= 1:
         _LOGGER.info(f"Best plan:\n{best_plan}")
@@ -525,7 +538,7 @@ def _print_code(title: str, code: str, test: Optional[str] = None) -> None:


 def retrieve_tools(
-    plans: Dict[str, List[Dict[str, str]]],
+    plans: Dict[str, Dict[str, Any]],
     tool_recommender: Sim,
     log_progress: Callable[[Dict[str, Any]], None],
     verbosity: int = 0,
@@ -542,8 +555,8 @@ def retrieve_tools(
     tool_lists: Dict[str, List[Dict[str, str]]] = {}
     for k, plan in plans.items():
         tool_lists[k] = []
-        for task in plan:
-            tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
+        for task in plan["instructions"]:
+            tools = tool_recommender.top_k(task, k=2, thresh=0.3)
             tool_info.extend([e["doc"] for e in tools])
             tool_desc.extend([e["desc"] for e in tools])
             tool_lists[k].extend(
@@ -737,14 +750,7 @@ def chat_with_workflow(
         if self.verbosity >= 1:
             for p in plans:
                 # tabulate will fail if the keys are not the same for all elements
-                p_fixed = [
-                    {
-                        "instructions": (
-                            e["instructions"] if "instructions" in e else ""
-                        )
-                    }
-                    for e in plans[p]
-                ]
+                p_fixed = [{"instructions": e} for e in plans[p]["instructions"]]
                 _LOGGER.info(
                     f"\n{tabulate(tabular_data=p_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
                 )
@@ -793,13 +799,15 @@ def chat_with_workflow(
         )

         if self.verbosity >= 1:
+            plan_i_fixed = [{"instructions": e} for e in plan_i["instructions"]]
             _LOGGER.info(
-                f"Picked best plan:\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
+                f"Picked best plan:\n{tabulate(tabular_data=plan_i_fixed, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"
             )

         results = write_and_test_code(
             chat=[{"role": c["role"], "content": c["content"]} for c in int_chat],
-            plan="\n-" + "\n-".join([e["instructions"] for e in plan_i]),
+            plan=f"\n{plan_i['thoughts']}\n-"
+            + "\n-".join([e for e in plan_i["instructions"]]),
             tool_info=tool_info,
             tool_output=tool_output_str,
             tool_utils=T.UTILITIES_DOCSTRING,
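
The "retry on judge" bullet from the commit message corresponds to the loop added in pick_plan above: the plan-judging model call is retried while extract_json raises, then falls back to the first plan. A minimal sketch of the pattern in isolation (call_model is a hypothetical stand-in for model(chat, stream=False), and MAX_RETRIES mirrors pick_plan's max_retries argument):

```python
import json
from typing import Any, Callable, Dict, List

MAX_RETRIES = 3  # assumed; stands in for pick_plan's max_retries


def pick_plan_with_retries(
    call_model: Callable[[], str],  # hypothetical model(chat, stream=False)
    plan_names: List[str],
) -> Dict[str, Any]:
    best_plan = None
    count = 0
    while best_plan is None and count < MAX_RETRIES:
        try:
            best_plan = json.loads(call_model())
        except json.JSONDecodeError:
            # Malformed JSON from the judge; the real code logs and retries.
            pass
        count += 1
    if best_plan is None:
        # All retries failed: fall back to the first plan deterministically.
        best_plan = {"best_plan": plan_names[0]}
    return best_plan
```

The fallback is the key design choice: chat_with_workflow always receives a usable "best_plan" key even when the judge never produces parseable JSON.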

vision_agent/agent/vision_agent_coder_prompts.py

Lines changed: 9 additions & 7 deletions
@@ -30,18 +30,19 @@

 **Instructions**:
 1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
-2. Output three different plans each utilize a different strategy or tool.
+2. Output three different plans each utilize a different strategy or set of tools.

 Output a list of jsons in the following format

 ```json
 {{
     "plan1":
-        [
-            {{
-                "instructions": str # what you should do in this task associated with a tool
-            }}
-        ],
+        {{
+            "thoughts": str # your thought process for choosing this plan
+            "instructions": [
+                str # what you should do in this task associated with a tool
+            ]
+        }},
     "plan2": ...,
     "plan3": ...
 }}
@@ -127,7 +128,8 @@

 **Instructions**:
 1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
-2. Output a JSON object with the following format:
+2. Try solving the problem yourself given the image and pick the plan that matches your solution the best.
+3. Output a JSON object with the following format:
 {{
     "thoughts": str # your thought process for choosing the best plan
     "best_plan": str # the best plan you have chosen

vision_agent/agent/vision_agent_prompts.py

Lines changed: 11 additions & 10 deletions
@@ -15,7 +15,7 @@
 **Examples**:
 Here is an example of how you can interact with a user and Actions to complete a task:
 --- START EXAMPLES ---
-[Current directory: /workspace/test]
+[Current directory: /example/workspace]
 {examples}
 --- END EXAMPLES ---

@@ -27,24 +27,25 @@
 Here is the current conversation so far:
 --- START CONVERSATION ---
 [Current directory: {dir}]
+
 {conversation}
 """

 EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg

-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /workspace/test/dog_detector.py]
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image)
 4|    return dogs
 [End of file]

-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
@@ -55,18 +56,18 @@

 USER: The the image only has one dog, can you fix this?

-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/workspace/test/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/workspace/test/dog.jpg'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /workspace/test/dog_detector.py]
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image, threshold=0.24)
 4|    return dogs
 [End of file]

-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
@@ -82,10 +83,10 @@

 USER: Yes you can use workers.png

-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/workspace/test/code.py', 'Can you write code to count workers with helmets in this image?', media=['/workspace/test/workers.png'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}

 OBSERVATION:
-[File /workspace/test/code.py]
+[File /example/workspace/code.py]
 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
 1|def count_workers_with_helmets(image_path: str):
 2|    image = load_image(image_path)
@@ -104,7 +105,7 @@
 15|    return count
 [End of file]

-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/test/workers.png'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----

vision_agent/clients/landing_public_api.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 import os
-from uuid import UUID
 from typing import List
+from uuid import UUID

 from requests.exceptions import HTTPError
