Skip to content

Commit

Permalink
add ability to view images
Browse files Browse the repository at this point in the history
  • Loading branch information
dillonalaird committed Sep 11, 2024
1 parent 341924d commit 2472112
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 16 deletions.
37 changes: 29 additions & 8 deletions vision_agent/agent/vision_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,16 @@
from vision_agent.agent.vision_agent_prompts import (
EXAMPLES_CODE1,
EXAMPLES_CODE2,
EXAMPLES_CODE3,
VA_CODE,
)
from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
from vision_agent.tools import META_TOOL_DOCSTRING, load_image, save_image
from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args
from vision_agent.tools.meta_tools import (
Artifacts,
check_and_load_image,
use_extra_vision_agent_args,
)
from vision_agent.utils import CodeInterpreterFactory
from vision_agent.utils.execute import CodeInterpreter, Execution

Expand All @@ -30,7 +35,7 @@ class BoilerplateCode:
pre_code = [
"from typing import *",
"from vision_agent.utils.execute import CodeInterpreter",
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning",
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning",
"artifacts = Artifacts('{remote_path}')",
"artifacts.load('{remote_path}')",
]
Expand Down Expand Up @@ -68,10 +73,17 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:

prompt = VA_CODE.format(
documentation=META_TOOL_DOCSTRING,
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}",
conversation=conversation,
)
return extract_json(orch([{"role": "user", "content": prompt}], stream=False)) # type: ignore
message: Message = {"role": "user", "content": prompt}
if (
chat[-1]["role"] == "observation"
and "media" in chat[-1]
and len(chat[-1]["media"]) > 0 # type: ignore
):
message["media"] = chat[-1]["media"]
return extract_json(orch([message], stream=False)) # type: ignore


def run_code_action(
Expand Down Expand Up @@ -296,13 +308,22 @@ def chat_with_code(
code_action, code_interpreter, str(remote_artifacts_path)
)

media_obs = check_and_load_image(code_action)

if self.verbosity >= 1:
_LOGGER.info(obs)

chat_elt: Message = {"role": "observation", "content": obs}
if media_obs and result.success:
chat_elt["media"] = [
Path(code_interpreter.remote_path) / media_ob
for media_ob in media_obs
]

# don't add execution results to internal chat
int_chat.append({"role": "observation", "content": obs})
orig_chat.append(
{"role": "observation", "content": obs, "execution": result}
)
int_chat.append(chat_elt)
chat_elt["execution"] = result
orig_chat.append(chat_elt)
self.streaming_message(
{
"role": "observation",
Expand Down
34 changes: 28 additions & 6 deletions vision_agent/agent/vision_agent_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,24 @@
{examples}
--- END EXAMPLES ---
**Instructions**:
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
2. **Output in JSON**: Respond in JSON format, {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
**Conversation**:
Here is the current conversation so far:
--- START CONVERSATION ---
{conversation}
--- END CONVERSATION ---
**Instructions**:
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
2. **Output in JSON**: Respond in the following format in JSON:
```json
{{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
```
"""


EXAMPLES_CODE1 = """
USER: Can you detect the dogs in this image? Media name dog.jpg
USER: Can you write code to detect the dogs in this image? Media name dog.jpg
OBSERVATION:
[Artifacts loaded]
Expand Down Expand Up @@ -61,6 +66,7 @@
EXAMPLES_CODE1_EXTRA = """
USER: The the image only has one dog, can you fix this?
OBSERVATION:
[Artifacts loaded]
Artifact dog.jpg loaded to /path/to/images/dog.jpg
Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
Expand All @@ -86,8 +92,24 @@
AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
"""


EXAMPLES_CODE2 = """
USER: Can you describe this image?
OBSERVATION:
[Artifacts loaded]
Artifact image.jpg loaded to /path/to/images/image.jpg
[End of artifacts]
AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "<execute_python>view_media_artifacts('image.jpg')</execute_python>", "let_user_respond": false}
OBSERVATION:
[Image image.jpg displayed]
AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true}
"""


EXAMPLES_CODE3 = """
USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?
OBSERVATION:
Expand Down
32 changes: 30 additions & 2 deletions vision_agent/tools/meta_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,33 @@ def list_artifacts(artifacts: Artifacts) -> str:
return output_str


def check_and_load_image(code: str) -> List[str]:
    """Scan executed code for a ``view_media_artifact`` call and return the
    names of the image artifacts it references.

    Parameters:
        code (str): The code string that the agent executed.

    Returns:
        List[str]: The referenced artifact names, or an empty list if the
            code contains no ``view_media_artifact`` call.
    """
    if not code.strip():
        return []

    # The meta tool is named ``view_media_artifact`` (see its definition and
    # the exported tool list); the previous pattern searched for the
    # non-existent ``show_media_artifact``, so viewed images were never
    # detected or loaded. The second group is restricted to non-quote
    # characters so the artifact name cannot swallow the closing quote.
    pattern = r"view_media_artifact\(\s*([^\)]+),\s*['\"]([^'\"]+)['\"]\s*\)"
    match = re.search(pattern, code)
    if match:
        # group(2) is the artifact name; group(1) is the artifacts object arg.
        return [match.group(2)]
    return []


def view_media_artifact(artifacts: Artifacts, name: str) -> str:
    """Views the image artifact with the given name.

    Parameters:
        artifacts (Artifacts): The artifacts object to show the image from.
        name (str): The name of the image artifact to show.
    """
    # Report a display message when the artifact exists, otherwise a
    # missing-artifact message; the same text is both printed (so it shows
    # up in the execution observation) and returned to the caller.
    exists = name in artifacts
    output_str = (
        f"[Image {name} displayed]"
        if exists
        else f"[Artifact {name} does not exist]"
    )
    print(output_str)
    return output_str


def get_tool_descriptions() -> str:
"""Returns a description of all the tools that `generate_vision_code` has access to.
Helpful for answering questions about what types of vision tasks you can do with
Expand Down Expand Up @@ -515,7 +542,7 @@ def use_extra_vision_agent_args(
Returns:
str: The edited code.
"""
generate_pattern = r"generate_vision_code\(\s*([^\)]+)\)"
generate_pattern = r"generate_vision_code\(\s*([^\)]+)\s*\)"

def generate_replacer(match: re.Match) -> str:
arg = match.group(1)
Expand All @@ -526,7 +553,7 @@ def generate_replacer(match: re.Match) -> str:
out_str += ")"
return out_str

edit_pattern = r"edit_vision_code\(\s*([^\)]+)\)"
edit_pattern = r"edit_vision_code\(\s*([^\)]+)\s*\)"

def edit_replacer(match: re.Match) -> str:
arg = match.group(1)
Expand Down Expand Up @@ -604,6 +631,7 @@ def replacer(match: re.Match) -> str:
generate_vision_code,
edit_vision_code,
write_media_artifact,
view_media_artifact,
florence2_fine_tuning,
use_florence2_fine_tuning,
list_artifacts,
Expand Down

0 comments on commit 2472112

Please sign in to comment.