landing-ai · dillonalaird · Oct 4, 2024 · Oct 2, 2024 · Oct 2, 2024 · Oct 3, 2024
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
@@ -1,4 +1,8 @@
-from vision_agent.agent.agent_utils import extract_code, extract_json
+from vision_agent.agent.agent_utils import (
+ extract_code,
+ extract_json,
+ remove_installs_from_code,
+)
 
 
 def test_basic_json_extract():
@@ -43,3 +47,19 @@ def test_basic_json_extract():
  a_code = extract_code(a)
  assert "def test_basic_json_extract():" in a_code
  assert "assert extract_json(a) == {" in a_code
+
+
+def test_remove_installs_from_code():
+ a = """import os
+imoprt sys
+
+!pip install pandas
+
+
+def test():
+ print("!pip install dummy")
+"""
+ out = remove_installs_from_code(a)
+ assert "import os" in out
+ assert "!pip install pandas" not in out
+ assert "!pip install dummy" in out
diff --git a/tests/unit/tools/test_tools.py b/tests/unit/tools/test_tools.py
@@ -1,25 +1,69 @@
-# Generated by CodiumAI
+import os
+import tempfile
 from pathlib import Path
 
 import numpy as np
 
-from vision_agent.tools.tools import save_video
+from vision_agent.tools.tools import save_image, save_video
 
 
-class TestSaveVideo:
- def test_saves_frames_without_output_path(self):
- frames = [
- np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(10)
- ]
- output_path = save_video(frames)
- assert Path(output_path).exists()
+def test_saves_frames_without_output_path():
+ frames = [
+ np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(10)
+ ]
+ output_path = save_video(frames)
+ assert Path(output_path).exists()
+ os.remove(output_path)
+
 
- def test_saves_frames_with_output_path(self, tmp_path):
- frames = [
- np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(10)
- ]
- video_output_path = str(tmp_path / "output.mp4")
- output_path = save_video(frames, video_output_path)
+def test_saves_frames_with_output_path():
+ frames = [
+ np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(10)
+ ]
 
- assert output_path == video_output_path
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ video_output_path = Path(tmp_dir) / "output.mp4"
+ output_path = save_video(frames, str(video_output_path))
+
+ assert output_path == str(video_output_path)
  assert Path(output_path).exists()
+
+
+def test_save_null_image():
+ image = None
+ try:
+ save_image(image, "tmp.jpg")
+ except ValueError as e:
+ assert str(e) == "The image is not a valid NumPy array with shape (H, W, C)"
+
+
+def test_save_empty_image():
+ image = np.zeros((0, 0, 3), dtype=np.uint8)
+ try:
+ save_image(image, "tmp.jpg")
+ except ValueError as e:
+ assert str(e) == "The image is not a valid NumPy array with shape (H, W, C)"
+
+
+def test_save_null_video():
+ frames = None
+ try:
+ save_video(frames, "tmp.mp4")
+ except ValueError as e:
+ assert str(e) == "Frames must be a list of NumPy arrays"
+
+
+def test_save_empty_list():
+ frames = []
+ try:
+ save_video(frames, "tmp.mp4")
+ except ValueError as e:
+ assert str(e) == "Frames must be a list of NumPy arrays"
+
+
+def test_save_invalid_frame():
+ frames = [np.zeros((0, 0, 3), dtype=np.uint8)]
+ try:
+ save_video(frames, "tmp.mp4")
+ except ValueError as e:
+ assert str(e) == "A frame is not a valid NumPy array with shape (H, W, C)"
diff --git a/vision_agent/agent/agent_utils.py b/vision_agent/agent/agent_utils.py
@@ -77,3 +77,9 @@ def extract_code(code: str) -> str:
  if code.startswith("python\n"):
  code = code[len("python\n") :]
  return code
+
+
+def remove_installs_from_code(code: str) -> str:
+ pattern = r"\n!pip install.*?(\n|\Z)\n"
+ code = re.sub(pattern, "", code, flags=re.DOTALL)
+ return code
diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
@@ -407,8 +407,6 @@ def chat_with_code(
  code_interpreter.download_file(
  str(remote_artifacts_path.name), str(self.local_artifacts_path)
  )
- artifacts.load(self.local_artifacts_path)
- artifacts.save()
  return orig_chat, artifacts
 
  def streaming_message(self, message: Dict[str, Any]) -> None:

diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py
@@ -13,7 +13,11 @@
 
 import vision_agent.tools as T
 from vision_agent.agent import Agent
-from vision_agent.agent.agent_utils import extract_code, extract_json
+from vision_agent.agent.agent_utils import (
+ extract_code,
+ extract_json,
+ remove_installs_from_code,
+)
 from vision_agent.agent.vision_agent_coder_prompts import (
  CODE,
  FIX_BUG,
@@ -836,8 +840,8 @@ def chat_with_workflow(
  media=media_list,
  )
  success = cast(bool, results["success"])
- code = cast(str, results["code"])
- test = cast(str, results["test"])
+ code = remove_installs_from_code(cast(str, results["code"]))
+ test = remove_installs_from_code(cast(str, results["test"]))
  working_memory.extend(results["working_memory"]) # type: ignore
  plan.append({"code": code, "test": test, "plan": plan_i})
 

diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
@@ -28,7 +28,8 @@
 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
 2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
 3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`.
-4. **Output in JSON**: Respond in the following format in JSON:
+4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
+5. **Output in JSON**: Respond in the following format in JSON:
 
 ```json
 {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
@@ -62,7 +63,7 @@
 [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
 
 
-AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true}
+AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected two dogs, do the results look good to you?", "let_user_respond": true}
 """
 
 EXAMPLES_CODE1_EXTRA = """
@@ -91,7 +92,7 @@
 ----- stdout -----
 [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
 
-AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
+AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true}
 """
 
 EXAMPLES_CODE2 = """
@@ -157,16 +158,16 @@
 ----- stdout -----
 2
 
-AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true}
 
 USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
 
-AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided <execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "<execute_python>object_detection_fine_tuning([{'image_path': 'image1.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
 
-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "<execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 [Artifact code.py edits]

diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
@@ -45,7 +45,6 @@
  loca_zero_shot_counting,
  ocr,
  overlay_bounding_boxes,
- overlay_counting_results,
  overlay_heat_map,
  overlay_segmentation_masks,
  owl_v2_image,

diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
@@ -116,7 +116,9 @@ def show(self, uploaded_file_path: Optional[Union[str, Path]] = None) -> str:
  )
  output_str = "[Artifacts loaded]\n"
  for k in self.artifacts.keys():
- output_str += f"Artifact {k} loaded to {str(loaded_path / k)}\n"
+ output_str += (
+ f"Artifact name: {k}, loaded to path: {str(loaded_path / k)}\n"
+ )
  output_str += "[End of artifacts]\n"
  print(output_str)
  return output_str

diff --git a/vision_agent/tools/tool_utils.py b/vision_agent/tools/tool_utils.py
@@ -1,6 +1,6 @@
-import os
 import inspect
 import logging
+import os
 from base64 import b64encode
 from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple