Fix Issues for App #275

Open · wants to merge 24 commits into base: main
Commits (24)
cfe4201
add more checks around changing new/old format
dillonalaird Oct 15, 2024
79151bb
fix issue with raising json error type
dillonalaird Oct 15, 2024
9b28706
only add artifacts to media list if they exist
dillonalaird Oct 15, 2024
9405de0
only add artifacts to media list if they exist
dillonalaird Oct 15, 2024
398ff23
only add artifacts to media list if they exist
dillonalaird Oct 15, 2024
d011617
extract media from ipython display
dillonalaird Oct 15, 2024
d92e192
add better prompts for dealing with artifacts
dillonalaird Oct 15, 2024
250bbaa
add another prompt example, reformat to reduce complex
dillonalaird Oct 15, 2024
b2bcc9f
add another prompt for editing code
dillonalaird Oct 15, 2024
9ab9e1d
fix issues with edit code
dillonalaird Oct 15, 2024
0247de1
remove verbosity arg
dillonalaird Oct 16, 2024
c514da3
limit context length and amount of debugging VA does
dillonalaird Oct 16, 2024
3f86993
fix side case with edit_artifact
dillonalaird Oct 16, 2024
04c57f9
remove generate vision plan
dillonalaird Oct 16, 2024
5fa377e
better parsing for json
dillonalaird Oct 16, 2024
c8eea97
loop over formats
dillonalaird Oct 16, 2024
666ab3c
updated prompt to tell it save_image/video will save to artifacts
dillonalaird Oct 16, 2024
906ee66
ensure artifact is saved
dillonalaird Oct 16, 2024
802c7e2
remove break point
dillonalaird Oct 16, 2024
b58e48d
fixed type errors
dillonalaird Oct 16, 2024
55fc598
upload and download artifacts per turn
dillonalaird Oct 16, 2024
8161c48
fixed streamlit app for new updates
dillonalaird Oct 16, 2024
dea8756
reduced code complexity
dillonalaird Oct 16, 2024
38f23d3
fix test case
dillonalaird Oct 16, 2024
4 changes: 3 additions & 1 deletion vision_agent/agent/agent_utils.py
@@ -72,7 +72,9 @@ def extract_json(json_str: str) -> Dict[str, Any]:
if json_dict is None:
error_msg = f"Could not extract JSON from the given str: {json_orig}"
_LOGGER.exception(error_msg)
raise ValueError(error_msg)
raise json.JSONDecodeError(
msg="Could not extract JSON", doc=json_orig, pos=0
)

return json_dict
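
Since extract_json now raises json.JSONDecodeError (a subclass of ValueError) rather than a bare ValueError, callers can distinguish parse failures from other errors while any existing `except ValueError` handlers keep working. A minimal sketch of the new behavior, assuming extract_json is imported from vision_agent.agent.agent_utils:

import json

from vision_agent.agent.agent_utils import extract_json

try:
    parsed = extract_json("this is not valid JSON")
except json.JSONDecodeError as e:
    # JSONDecodeError subclasses ValueError, so pre-existing ValueError
    # handlers continue to catch this error after the change.
    print(f"Could not parse model output: {e.msg} (pos={e.pos})")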

91 changes: 64 additions & 27 deletions vision_agent/agent/vision_agent.py
@@ -12,6 +12,7 @@
EXAMPLES_CODE1,
EXAMPLES_CODE2,
EXAMPLES_CODE3,
EXAMPLES_CODE3_EXTRA2,
VA_CODE,
)
from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
@@ -97,8 +98,9 @@ def _clean_response(response: str) -> str:
def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
chat = copy.deepcopy(chat)

# only include the 10 most recent messages in the chat to avoid exceeding the token limit
conversation = ""
for chat_i in chat:
for chat_i in chat[-10:]:
if chat_i["role"] == "user":
conversation += f"USER: {chat_i['content']}\n\n"
elif chat_i["role"] == "observation":
@@ -110,7 +112,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:

prompt = VA_CODE.format(
documentation=META_TOOL_DOCSTRING,
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}",
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}\n{EXAMPLES_CODE3_EXTRA2}",
conversation=conversation,
)
message: Message = {"role": "user", "content": prompt}
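
The chat[-10:] slice above means only the ten most recent messages are folded into the conversation prompt, which keeps long sessions from growing past the orchestrator's context window. A small, self-contained illustration of the truncation (the message contents here are made up):

# hypothetical chat history; only the slicing behavior is being shown
chat = [{"role": "user", "content": f"message {i}"} for i in range(25)]

recent = chat[-10:]  # only the 10 most recent turns are kept
assert len(recent) == 10
assert recent[0]["content"] == "message 15"  # older turns are dropped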
@@ -153,7 +155,7 @@ def execute_code_action(
obs = str(result.logs)
if result.error:
obs += f"\n{result.error}"
extract_and_save_files_to_artifacts(artifacts, code, obs)
extract_and_save_files_to_artifacts(artifacts, code, obs, result)
return result, obs
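
execute_code_action (and execute_user_code_action below) now forward the full execution result to extract_and_save_files_to_artifacts, presumably so media emitted through IPython display can be captured as artifacts in addition to files referenced in the code or logs (see the "extract media from ipython display" commit). The snippet below is only a self-contained illustration of that idea using stand-in types; the ExecutionResult/DisplayItem names and the file-naming scheme are hypothetical, not the library's actual implementation:

import base64
from dataclasses import dataclass, field
from typing import Dict, List, Optional


@dataclass
class DisplayItem:
    # stand-in for one IPython display output captured by the interpreter
    png: Optional[str] = None  # base64-encoded image data, if the output was an image


@dataclass
class ExecutionResult:
    # stand-in for the interpreter's execution result object
    results: List[DisplayItem] = field(default_factory=list)


def save_display_media(artifacts: Dict[str, bytes], result: ExecutionResult) -> None:
    """Copy any images emitted via IPython display into the artifacts mapping."""
    for i, item in enumerate(result.results):
        if item.png:
            artifacts[f"display_output_{i}.png"] = base64.b64decode(item.png)


# usage sketch
artifacts: Dict[str, bytes] = {}
result = ExecutionResult(results=[DisplayItem(png=base64.b64encode(b"fake-png-bytes").decode())])
save_display_media(artifacts, result)
assert "display_output_0.png" in artifacts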


@@ -182,10 +184,46 @@ def execute_user_code_action(
)
if user_result.error:
user_obs += f"\n{user_result.error}"
extract_and_save_files_to_artifacts(artifacts, user_code_action, user_obs)
extract_and_save_files_to_artifacts(
artifacts, user_code_action, user_obs, user_result
)
return user_result, user_obs


def _add_media_obs(
code_action: str,
artifacts: Artifacts,
result: Execution,
obs: str,
code_interpreter: CodeInterpreter,
remote_artifacts_path: Path,
local_artifacts_path: Path,
) -> Dict[str, Any]:
obs_chat_elt: Message = {"role": "observation", "content": obs}
media_obs = check_and_load_image(code_action)
if media_obs and result.success:
# for view_media_artifact, we need to ensure the media is loaded
# locally so the conversation agent can actually see it
code_interpreter.download_file(
str(remote_artifacts_path.name),
str(local_artifacts_path),
)
artifacts.load(
local_artifacts_path,
local_artifacts_path.parent,
)

# check if the media is actually in the artifacts
media_obs_chat = []
for media_ob in media_obs:
if media_ob in artifacts.artifacts:
media_obs_chat.append(local_artifacts_path.parent / media_ob)
if len(media_obs_chat) > 0:
obs_chat_elt["media"] = media_obs_chat

return obs_chat_elt


def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
response = copy.deepcopy(response)

@@ -231,9 +269,18 @@ def old_format_to_new_format(old_format_str: str) -> str:
except json.JSONDecodeError:
return old_format_str

thinking = old_format["thoughts"] if old_format["thoughts"].strip() != "" else None
let_user_respond = old_format["let_user_respond"]
if "<execute_python>" in old_format["response"]:
if "thoughts" in old_format:
thinking = (
old_format["thoughts"] if old_format["thoughts"].strip() != "" else None
)
else:
thinking = None

let_user_respond = (
old_format["let_user_respond"] if "let_user_respond" in old_format else True
)

if "response" in old_format and "<execute_python>" in old_format["response"]:
execute_python = extract_tag(old_format["response"], "execute_python")
response = (
old_format["response"]
@@ -244,7 +291,7 @@
)
else:
execute_python = None
response = old_format["response"]
response = old_format["response"] if "response" in old_format else None

return json.dumps(
{
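
For context, old_format_to_new_format rewrites the legacy {"thoughts", "response", "let_user_respond"} JSON shape into the current thinking/response/execute_python structure, and with the added guards it now tolerates payloads missing any of those keys instead of raising a KeyError. A rough before/after illustration (the values are made up, and the exact output keys are inferred from the variables visible in this diff rather than shown in full above):

import json

old = json.dumps({
    "thoughts": "I will detect the dogs first.",
    "response": "Detecting dogs now. <execute_python>detect_dogs('dog.jpg')</execute_python>",
    "let_user_respond": False,
})
# expected to convert to roughly:
# {"thinking": "I will detect the dogs first.",
#  "response": "Detecting dogs now.",
#  "execute_python": "detect_dogs('dog.jpg')",
#  "let_user_respond": False}

partial = json.dumps({"response": "Done."})
# with the new guards this no longer raises a KeyError:
# thinking=None, execute_python=None, response="Done.", let_user_respond=True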
@@ -535,29 +582,19 @@ def chat_with_artifacts(
code_interpreter,
str(remote_artifacts_path),
)

media_obs = check_and_load_image(code_action)
obs_chat_elt = _add_media_obs(
code_action,
artifacts,
result,
obs,
code_interpreter,
Path(remote_artifacts_path),
Path(self.local_artifacts_path),
)

if self.verbosity >= 1:
_LOGGER.info(obs)

obs_chat_elt: Message = {"role": "observation", "content": obs}
if media_obs and result.success:
# for view_media_artifact, we need to ensure the media is loaded
# locally so the conversation agent can actually see it
code_interpreter.download_file(
str(remote_artifacts_path.name),
str(self.local_artifacts_path),
)
artifacts.load(
self.local_artifacts_path,
Path(self.local_artifacts_path).parent,
)
obs_chat_elt["media"] = [
Path(self.local_artifacts_path).parent / media_ob
for media_ob in media_obs
]

# don't add execution results to internal chat
int_chat.append(obs_chat_elt)
obs_chat_elt["execution"] = result
74 changes: 67 additions & 7 deletions vision_agent/agent/vision_agent_prompts.py
@@ -1,7 +1,7 @@
VA_CODE = """
**Role**: You are a helpful agent that assists users with writing code.

**Task**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved every time you execute python code.
**Task**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved only AFTER you execute python code. The user can see all `artifacts`.

<execute_python>
print("Hello World!")
@@ -26,10 +26,11 @@

**Instructions**:
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. DO NOT run `edit_vision_code` or `edit_code_artifact` more than 2 times in a row; instead, ask the user for help.
3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set <let_user_respond> to `true`.
4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
5. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
4. **Artifacts**: Files are only saved in `artifacts` after <execute_python>; do not try to access artifacts until you observe that they have been loaded.
5. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
6. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.

<thinking>Your thoughts here...</thinking>
<response>Your response to the user here...</response>
@@ -53,7 +54,7 @@
<let_user_respond>false</let_user_respond>

OBSERVATION:
[Artifact dog_detector.py]
[Artifact dog_detector.py (5 lines total)]
0|from vision_agent.tools import load_image, owl_v2
1|def detect_dogs(image_path: str):
2| image = load_image(image_path)
@@ -94,7 +95,7 @@
<let_user_respond>false</let_user_respond>

OBSERVATION:
[Artifact dog_detector.py]
[Artifact dog_detector.py (5 lines total)]
0|from vision_agent.tools import load_image, owl_v2
1|def detect_dogs(image_path: str):
2| image = load_image(image_path)
@@ -166,7 +167,7 @@
<let_user_respond>false</let_user_respond>

OBSERVATION:
[Artifact code.py]
[Artifact code.py (17 lines total)]
0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
1|def count_workers_with_helmets(image_path: str, output_path: str):
2| image = load_image(image_path)
@@ -201,7 +202,9 @@
AGENT: <thinking>Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
<response>The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.</response>
<let_user_respond>true</let_user_respond>
"""

EXAMPLES_CODE3_EXTRA1 = """
USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"

AGENT: <thinking>Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model</thinking>
Expand Down Expand Up @@ -249,3 +252,60 @@ def count_workers_with_helmets(image_path: str, output_path: str):
<response>I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
<let_user_respond>true</let_user_respond>
"""

EXAMPLES_CODE3_EXTRA2 = """
USER: Can you write a streamlit app that allows me to upload an image and count the workers with helmets?

AGENT: <thinking>I will use edit_code_artifact to generate a streamlit app that allows the user to upload an image and count the workers with helmets.</thinking>
<execute_python>
CODE = '''
import streamlit as st
from code import count_workers_with_helmets
import os

def main():
st.title("Worker with Helmet Counter")
uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
if uploaded_file is not None:
with open("temp_image.jpg", "wb") as f:
f.write(uploaded_file.read())

count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
st.image("temp_image_viz.jpg")
st.write(f"Number of workers with helmets: {count}")
os.remove("temp_image.jpg")
os.remove("temp_image_viz.jpg")

if __name__ == "__main__":
main()
'''
edit_code_artifact(artifacts, 'streamlit_app.py', CODE, 0, 0)
</execute_python>

OBSERVATION:
[Artifact streamlit_app.py (19 lines total)]
0|import streamlit as st
1|from code import count_workers_with_helmets
2|import os
3|
4|def main():
5| st.title("Worker with Helmet Counter")
6| uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"])
7| if uploaded_file is not None:
8| with open("temp_image.jpg", "wb") as f:
9| f.write(uploaded_file.read())
10|
11| count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg")
12| st.image("temp_image_viz.jpg")
13| st.write(f"Number of workers with helmets: {count}")
14| os.remove("temp_image.jpg")
15| os.remove("temp_image_viz.jpg")
16|
17|if __name__ == "__main__":
18| main()
[End of artifact]

AGENT: <thinking>I have generated the streamlit app that allows the user to upload an image and count the workers</thinking>
<response>The streamlit app is saved in streamlit_app.py, you can run the app by running `streamlit run streamlit_app.py`.</response>
<let_user_respond>true</let_user_respond>
"""