Fix Issues for App #275

Open · wants to merge 24 commits into base: main
Commits (24)
cfe4201
add more checks around changing new/old format
dillonalaird Oct 15, 2024
79151bb
fix issue with raising json error type
dillonalaird Oct 15, 2024
9b28706
only add artifacts to media list if they exist
dillonalaird Oct 15, 2024
9405de0
only add artifacts to media list if they exist
dillonalaird Oct 15, 2024
398ff23
only add artifacts to media list if they exist
dillonalaird Oct 15, 2024
d011617
extract media from ipython display
dillonalaird Oct 15, 2024
d92e192
add better prompts for dealing with artifacts
dillonalaird Oct 15, 2024
250bbaa
add another prompt example, reformat to reduce complex
dillonalaird Oct 15, 2024
b2bcc9f
add another prompt for editing code
dillonalaird Oct 15, 2024
9ab9e1d
fix issues with edit code
dillonalaird Oct 15, 2024
0247de1
remove verbosity arg
dillonalaird Oct 16, 2024
c514da3
limit context length and amount of debugging VA does
dillonalaird Oct 16, 2024
3f86993
fix side case with edit_artifact
dillonalaird Oct 16, 2024
04c57f9
remove generate vision plan
dillonalaird Oct 16, 2024
5fa377e
better parsing for json
dillonalaird Oct 16, 2024
c8eea97
loop over formats
dillonalaird Oct 16, 2024
666ab3c
updated prompt to tell it save_image/video will save to artifacts
dillonalaird Oct 16, 2024
906ee66
ensure artifact is saved
dillonalaird Oct 16, 2024
802c7e2
remove break point
dillonalaird Oct 16, 2024
b58e48d
fixed type errors
dillonalaird Oct 16, 2024
55fc598
upload and download artifacts per turn
dillonalaird Oct 16, 2024
8161c48
fixed streamlit app for new updates
dillonalaird Oct 16, 2024
dea8756
reduced code complexity
dillonalaird Oct 16, 2024
38f23d3
fix test case
dillonalaird Oct 16, 2024
4 changes: 3 additions & 1 deletion vision_agent/agent/agent_utils.py
@@ -72,7 +72,9 @@ def extract_json(json_str: str) -> Dict[str, Any]:
if json_dict is None:
error_msg = f"Could not extract JSON from the given str: {json_orig}"
_LOGGER.exception(error_msg)
raise ValueError(error_msg)
raise json.JSONDecodeError(
msg="Could not extract JSON", doc=json_orig, pos=0
)

return json_dict
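
Since extract_json now raises json.JSONDecodeError (a subclass of ValueError) rather than a bare ValueError, callers can distinguish parse failures from other errors while any existing `except ValueError` handlers keep working. A minimal sketch of the new behavior, assuming extract_json is imported from vision_agent.agent.agent_utils:

import json

from vision_agent.agent.agent_utils import extract_json

try:
    parsed = extract_json("this is not valid JSON")
except json.JSONDecodeError as e:
    # JSONDecodeError subclasses ValueError, so pre-existing ValueError
    # handlers continue to catch this error after the change.
    print(f"Could not parse model output: {e.msg} (pos={e.pos})")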

91 changes: 64 additions & 27 deletions vision_agent/agent/vision_agent.py
@@ -12,6 +12,7 @@
EXAMPLES_CODE1,
EXAMPLES_CODE2,
EXAMPLES_CODE3,
EXAMPLES_CODE3_EXTRA2,
VA_CODE,
)
from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
@@ -97,8 +98,9 @@ def _clean_response(response: str) -> str:
def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
chat = copy.deepcopy(chat)

# only include the 10 most recent messages in the chat to avoid exceeding the token limit
conversation = ""
for chat_i in chat:
for chat_i in chat[-10:]:
if chat_i["role"] == "user":
conversation += f"USER: {chat_i['content']}\n\n"
elif chat_i["role"] == "observation":
@@ -110,7 +112,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:

prompt = VA_CODE.format(
documentation=META_TOOL_DOCSTRING,
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}",
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}\n{EXAMPLES_CODE3_EXTRA2}",
conversation=conversation,
)
message: Message = {"role": "user", "content": prompt}
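
The chat[-10:] slice above means only the ten most recent messages are folded into the conversation prompt, which keeps long sessions from growing past the orchestrator's context window. A small, self-contained illustration of the truncation (the message contents here are made up):

# hypothetical chat history; only the slicing behavior is being shown
chat = [{"role": "user", "content": f"message {i}"} for i in range(25)]

recent = chat[-10:]  # only the 10 most recent turns are kept
assert len(recent) == 10
assert recent[0]["content"] == "message 15"  # older turns are dropped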
@@ -153,7 +155,7 @@ def execute_code_action(
obs = str(result.logs)
if result.error:
obs += f"\n{result.error}"
extract_and_save_files_to_artifacts(artifacts, code, obs)
extract_and_save_files_to_artifacts(artifacts, code, obs, result)
return result, obs
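
execute_code_action (and execute_user_code_action below) now forward the full execution result to extract_and_save_files_to_artifacts, presumably so media emitted through IPython display can be captured as artifacts in addition to files referenced in the code or logs (see the "extract media from ipython display" commit). The snippet below is only a self-contained illustration of that idea using stand-in types; the ExecutionResult/DisplayItem names and the file-naming scheme are hypothetical, not the library's actual implementation:

import base64
from dataclasses import dataclass, field
from typing import Dict, List, Optional


@dataclass
class DisplayItem:
    # stand-in for one IPython display output captured by the interpreter
    png: Optional[str] = None  # base64-encoded image data, if the output was an image


@dataclass
class ExecutionResult:
    # stand-in for the interpreter's execution result object
    results: List[DisplayItem] = field(default_factory=list)


def save_display_media(artifacts: Dict[str, bytes], result: ExecutionResult) -> None:
    """Copy any images emitted via IPython display into the artifacts mapping."""
    for i, item in enumerate(result.results):
        if item.png:
            artifacts[f"display_output_{i}.png"] = base64.b64decode(item.png)


# usage sketch
artifacts: Dict[str, bytes] = {}
result = ExecutionResult(results=[DisplayItem(png=base64.b64encode(b"fake-png-bytes").decode())])
save_display_media(artifacts, result)
assert "display_output_0.png" in artifacts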


@@ -182,10 +184,46 @@ def execute_user_code_action(
)
if user_result.error:
user_obs += f"\n{user_result.error}"
extract_and_save_files_to_artifacts(artifacts, user_code_action, user_obs)
extract_and_save_files_to_artifacts(
artifacts, user_code_action, user_obs, user_result
)
return user_result, user_obs


def _add_media_obs(
code_action: str,
artifacts: Artifacts,
result: Execution,
obs: str,
code_interpreter: CodeInterpreter,
remote_artifacts_path: Path,
local_artifacts_path: Path,
) -> Dict[str, Any]:
obs_chat_elt: Message = {"role": "observation", "content": obs}
media_obs = check_and_load_image(code_action)
if media_obs and result.success:
# for view_media_artifact, we need to ensure the media is loaded
# locally so the conversation agent can actually see it
code_interpreter.download_file(
str(remote_artifacts_path.name),
str(local_artifacts_path),
)
artifacts.load(
local_artifacts_path,
local_artifacts_path.parent,
)

# check if the media is actually in the artifacts
media_obs_chat = []
for media_ob in media_obs:
if media_ob in artifacts.artifacts:
media_obs_chat.append(local_artifacts_path.parent / media_ob)
if len(media_obs_chat) > 0:
obs_chat_elt["media"] = media_obs_chat

return obs_chat_elt


def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
response = copy.deepcopy(response)

@@ -231,9 +269,18 @@ def old_format_to_new_format(old_format_str: str) -> str:
except json.JSONDecodeError:
return old_format_str

thinking = old_format["thoughts"] if old_format["thoughts"].strip() != "" else None
let_user_respond = old_format["let_user_respond"]
if "<execute_python>" in old_format["response"]:
if "thoughts" in old_format:
thinking = (
old_format["thoughts"] if old_format["thoughts"].strip() != "" else None
)
else:
thinking = None

let_user_respond = (
old_format["let_user_respond"] if "let_user_respond" in old_format else True
)

if "response" in old_format and "<execute_python>" in old_format["response"]:
execute_python = extract_tag(old_format["response"], "execute_python")
response = (
old_format["response"]
@@ -244,7 +291,7 @@
)
else:
execute_python = None
response = old_format["response"]
response = old_format["response"] if "response" in old_format else None

return json.dumps(
{
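
For context, old_format_to_new_format rewrites the legacy {"thoughts", "response", "let_user_respond"} JSON shape into the current thinking/response/execute_python structure, and with the added guards it now tolerates payloads missing any of those keys instead of raising a KeyError. A rough before/after illustration (the values are made up, and the exact output keys are inferred from the variables visible in this diff rather than shown in full above):

import json

old = json.dumps({
    "thoughts": "I will detect the dogs first.",
    "response": "Detecting dogs now. <execute_python>detect_dogs('dog.jpg')</execute_python>",
    "let_user_respond": False,
})
# expected to convert to roughly:
# {"thinking": "I will detect the dogs first.",
#  "response": "Detecting dogs now.",
#  "execute_python": "detect_dogs('dog.jpg')",
#  "let_user_respond": False}

partial = json.dumps({"response": "Done."})
# with the new guards this no longer raises a KeyError:
# thinking=None, execute_python=None, response="Done.", let_user_respond=True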
@@ -535,29 +582,19 @@ def chat_with_artifacts(
code_interpreter,
str(remote_artifacts_path),
)

media_obs = check_and_load_image(code_action)
obs_chat_elt = _add_media_obs(
code_action,
artifacts,
result,
obs,
code_interpreter,
Path(remote_artifacts_path),
Path(self.local_artifacts_path),
)

if self.verbosity >= 1:
_LOGGER.info(obs)

obs_chat_elt: Message = {"role": "observation", "content": obs}
if media_obs and result.success:
# for view_media_artifact, we need to ensure the media is loaded
# locally so the conversation agent can actually see it
code_interpreter.download_file(
str(remote_artifacts_path.name),
str(self.local_artifacts_path),
)
artifacts.load(
self.local_artifacts_path,
Path(self.local_artifacts_path).parent,
)
obs_chat_elt["media"] = [
Path(self.local_artifacts_path).parent / media_ob
for media_ob in media_obs
]

# don't add execution results to internal chat
int_chat.append(obs_chat_elt)
obs_chat_elt["execution"] = result
74 changes: 67 additions & 7 deletions vision_agent/agent/vision_agent_prompts.py
@@ -1,7 +1,7 @@
VA_CODE = """
**Role**: You are a helpful agent that assists users with writing code.

**Task**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved every time you execute python code.
**Task**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved only AFTER you execute python code. The user can see all `artifacts`.

<execute_python>
print("Hello World!")
@@ -26,10 +26,11 @@

**Instructions**:
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. DO NOT run `edit_vision_code` or `edit_code_artifact` more than 2 times in a row; instead, ask the user for help.
3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set <let_user_respond> to `true`.
4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
5. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
4. **Artifacts**: Files are only saved in `artifacts` after <execute_python>; do not try to access artifacts until you observe that they have been loaded.
5. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
6. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.

<thinking>Your thoughts here...</thinking>
<response>Your response to the user here...</response>
@@ -53,7 +54,7 @@
<let_user_respond>false</let_user_respond>

OBSERVATION:
[Artifact dog_detector.py]
[Artifact dog_detector.py (5 lines total)]
0|from vision_agent.tools import load_image, owl_v2
1|def detect_dogs(image_path: str):
2| image = load_image(image_path)
@@ -94,7 +95,7 @@
<let_user_respond>false</let_user_respond>

OBSERVATION:
[Artifact dog_detector.py]
[Artifact dog_detector.py (5 lines total)]
0|from vision_agent.tools import load_image, owl_v2
1|def detect_dogs(image_path: str):
2| image = load_image(image_path)
@@ -166,7 +167,7 @@
<let_user_respond>false</let_user_respond>

OBSERVATION:
[Artifact code.py]
[Artifact code.py (17 lines total)]
0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
1|def count_workers_with_helmets(image_path: str, output_path: str):
2| image = load_image(image_path)
@@ -201,7 +202,9 @@
AGENT: <thinking>Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
<response>The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.</response>
<let_user_respond>true</let_user_respond>
"""

EXAMPLES_CODE3_EXTRA1 = """
USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"

AGENT: <thinking>Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model</thinking>
Expand Down Expand Up @@ -249,3 +252,60 @@ def count_workers_with_helmets(image_path: str, output_path: str):
<response>I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
<let_user_respond>true</let_user_respond>
"""

EXAMPLES_CODE3_EXTRA2 = """
USER: Can you write a streamlit app that allows me to upload an image and count the workers with helmets?

AGENT: <thinking>I will use edit_code_artifact to generate a streamlit app that allows the user to upload an image and count the workers with helmets.</thinking>
<execute_python>
CODE = '''
import streamlit as st
from code import count_workers_with_helmets
import os

def main():
st.title("Worker with Helmet Counter")
uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
if uploaded_file is not None:
with open("temp_image.jpg", "wb") as f:
f.write(uploaded_file.read())

count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
st.image("temp_image_viz.jpg")
st.write(f"Number of workers with helmets: {count}")
os.remove("temp_image.jpg")
os.remove("temp_image_viz.jpg")

if __name__ == "__main__":
main()
'''
edit_code_artifact(artifacts, 'streamlit_app.py', CODE, 0, 0)
</execute_python>

OBSERVATION:
[Artifact streamlit_app.py (19 lines total)]
0|import streamlit as st
1|from code import count_workers_with_helmets
2|import os
3|
4|def main():
5| st.title("Worker with Helmet Counter")
6| uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
7| if uploaded_file is not None:
8| with open("temp_image.jpg", "wb") as f:
9| f.write(uploaded_file.read())
10|
11| count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
12| st.image("temp_image_viz.jpg")
13| st.write(f"Number of workers with helmets: {count}")
14| os.remove("temp_image.jpg")
15| os.remove("temp_image_viz.jpg")
16|
17|if __name__ == "__main__":
18| main()
[End of artifact]

AGENT: <thinking>I have generated the streamlit app that allows the user to upload an image and count the workers</thinking>
<response>The streamlit app is saved in streamlit_app.py, you can run the app by running `streamlit run streamlit_app.py`.</response>
<let_user_respond>true</let_user_respond>
"""