Fix for several issues with VisionAgent #251

Merged · 9 commits · Sep 26, 2024
35 changes: 20 additions & 15 deletions examples/chat/app.py
@@ -51,13 +51,12 @@


 def update_messages(messages, lock):
-    if Path("artifacts.pkl").exists():
-        artifacts.load("artifacts.pkl")
-    new_chat, _ = agent.chat_with_code(messages, artifacts=artifacts)
     with lock:
-        for new_message in new_chat:
-            if new_message not in messages:
-                messages.append(new_message)
+        if Path("artifacts.pkl").exists():
+            artifacts.load("artifacts.pkl")
+        new_chat, _ = agent.chat_with_code(messages, artifacts=artifacts)
+        for new_message in new_chat[len(messages) :]:
+            messages.append(new_message)


 def get_updates(updates, lock):
@@ -106,15 +105,21 @@ def main():
         prompt = st.session_state.input_text

         if prompt:
-            st.session_state.messages.append({"role": "user", "content": prompt})
-            messages.chat_message("user").write(prompt)
-            message_thread = threading.Thread(
-                target=update_messages,
-                args=(st.session_state.messages, message_lock),
-            )
-            message_thread.daemon = True
-            message_thread.start()
-            st.session_state.input_text = ""
+            if (
+                len(st.session_state.messages) == 0
+                or prompt != st.session_state.messages[-1]["content"]
+            ):
+                st.session_state.messages.append(
+                    {"role": "user", "content": prompt}
+                )
+                messages.chat_message("user").write(prompt)
+                message_thread = threading.Thread(
+                    target=update_messages,
+                    args=(st.session_state.messages, message_lock),
+                )
+                message_thread.daemon = True
+                message_thread.start()
+                st.session_state.input_text = ""

     with tabs[1]:
         updates = st.container(height=400)
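Worth noting about the two changes above: `update_messages` now performs the artifact load, the agent call, and the append all under the lock, and appends only the slice `new_chat[len(messages):]`, so messages the UI already holds are never re-added; the new guard in `main()` keeps Streamlit reruns from submitting the same prompt twice. A minimal sketch of the slice-under-lock merge outside Streamlit (`merge_chat` is a made-up name for illustration, not part of the example app):

```python
import threading

def merge_chat(messages, new_chat, lock):
    # Append only the tail of new_chat that messages lacks; slicing by
    # length assumes new_chat is messages plus newly generated replies.
    with lock:
        for new_message in new_chat[len(messages):]:
            messages.append(new_message)

messages = [{"role": "user", "content": "hi"}]
new_chat = messages + [{"role": "assistant", "content": "hello!"}]
merge_chat(messages, new_chat, threading.Lock())
print(len(messages))  # 2 -- the assistant reply was added exactly once
```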
85 changes: 47 additions & 38 deletions vision_agent/agent/vision_agent.py
@@ -87,7 +87,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
     return extract_json(orch([message], stream=False))  # type: ignore


-def run_code_action(
+def execute_code_action(
     code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
 ) -> Tuple[Execution, str]:
     result = code_interpreter.exec_isolation(
@@ -115,10 +115,33 @@ def parse_execution(
     return code


+def execute_user_code_action(
+    last_user_message: Message,
+    code_interpreter: CodeInterpreter,
+    artifact_remote_path: str,
+) -> Tuple[Optional[Execution], Optional[str]]:
+    user_result = None
+    user_obs = None
+
+    if last_user_message["role"] != "user":
+        return user_result, user_obs
+
+    last_user_content = cast(str, last_user_message.get("content", ""))
+
+    user_code_action = parse_execution(last_user_content, False)
+    if user_code_action is not None:
+        user_result, user_obs = execute_code_action(
+            user_code_action, code_interpreter, artifact_remote_path
+        )
+        if user_result.error:
+            user_obs += f"\n{user_result.error}"
+    return user_result, user_obs
+
+
 class VisionAgent(Agent):
     """Vision Agent is an agent that can chat with the user and call tools or other
     agents to generate code for it. Vision Agent uses python code to execute actions
-    for the user. Vision Agent is inspired by by OpenDev
+    for the user. Vision Agent is inspired by by OpenDevin
     https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030

     Example
@@ -278,9 +301,23 @@ def chat_with_code(
         orig_chat.append({"role": "observation", "content": artifacts_loaded})
         self.streaming_message({"role": "observation", "content": artifacts_loaded})

-        finished = self.execute_user_code_action(
-            last_user_message, code_interpreter, remote_artifacts_path
+        user_result, user_obs = execute_user_code_action(
+            last_user_message, code_interpreter, str(remote_artifacts_path)
         )
+        finished = user_result is not None and user_obs is not None
+        if user_result is not None and user_obs is not None:
+            chat_elt: Message = {"role": "observation", "content": user_obs}
+            int_chat.append(chat_elt)
+            chat_elt["execution"] = user_result
+            orig_chat.append(chat_elt)
+            self.streaming_message(
+                {
+                    "role": "observation",
+                    "content": user_obs,
+                    "execution": user_result,
+                    "finished": finished,
+                }
+            )

         while not finished and iterations < self.max_iterations:
             response = run_conversation(self.agent, int_chat)
@@ -322,7 +359,7 @@ def chat_with_code(
                 )

             if code_action is not None:
-                result, obs = run_code_action(
+                result, obs = execute_code_action(
                     code_action, code_interpreter, str(remote_artifacts_path)
                 )

@@ -331,17 +368,17 @@ def chat_with_code(
                 if self.verbosity >= 1:
                     _LOGGER.info(obs)

-                chat_elt: Message = {"role": "observation", "content": obs}
+                obs_chat_elt: Message = {"role": "observation", "content": obs}
                 if media_obs and result.success:
-                    chat_elt["media"] = [
+                    obs_chat_elt["media"] = [
                         Path(code_interpreter.remote_path) / media_ob
                         for media_ob in media_obs
                     ]

                 # don't add execution results to internal chat
-                int_chat.append(chat_elt)
-                chat_elt["execution"] = result
-                orig_chat.append(chat_elt)
+                int_chat.append(obs_chat_elt)
+                obs_chat_elt["execution"] = result
+                orig_chat.append(obs_chat_elt)
                 self.streaming_message(
                     {
                         "role": "observation",
@@ -362,34 +399,6 @@ def chat_with_code(
             artifacts.save()
         return orig_chat, artifacts

-    def execute_user_code_action(
-        self,
-        last_user_message: Message,
-        code_interpreter: CodeInterpreter,
-        remote_artifacts_path: Path,
-    ) -> bool:
-        if last_user_message["role"] != "user":
-            return False
-        user_code_action = parse_execution(
-            cast(str, last_user_message.get("content", "")), False
-        )
-        if user_code_action is not None:
-            user_result, user_obs = run_code_action(
-                user_code_action, code_interpreter, str(remote_artifacts_path)
-            )
-            if self.verbosity >= 1:
-                _LOGGER.info(user_obs)
-            self.streaming_message(
-                {
-                    "role": "observation",
-                    "content": user_obs,
-                    "execution": user_result,
-                    "finished": True,
-                }
-            )
-            return True
-        return False
-
     def streaming_message(self, message: Dict[str, Any]) -> None:
         if self.callback_message:
             self.callback_message(message)
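The refactor above promotes `execute_user_code_action` to a module-level helper that returns the execution result and observation instead of a bare `True`/`False`, so `chat_with_code` can build, record, and stream the observation message itself. Since this path hinges on `parse_execution`, whose body is not shown in this diff, here is a guessed sketch of what it likely does; the regex and the second parameter's name are assumptions, not the library's actual code:

```python
import re
from typing import Optional

def parse_execution(response: str, verbose: bool = False) -> Optional[str]:
    # Guess: extract the code between the <execute_python> tags used
    # throughout the prompts, or return None when there is nothing to run.
    match = re.search(r"<execute_python>(.*?)</execute_python>", response, re.DOTALL)
    if match is None:
        return None
    code = match.group(1).strip()
    if verbose:
        print(f"parsed code action:\n{code}")
    return code

# A plain chat message yields None, so execute_user_code_action returns
# (None, None) and the agent loop proceeds; a tagged message yields code.
print(parse_execution("count the workers please"))                      # None
print(parse_execution("<execute_python>print(1 + 1)</execute_python>")) # print(1 + 1)
```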
104 changes: 90 additions & 14 deletions vision_agent/agent/vision_agent_coder_prompts.py
@@ -67,14 +67,7 @@
 **Previous Attempts**:
 {previous_attempts}

-**Instructions**:
-1. Write a program to load the media and call each tool and print it's output along with other relevant information.
-2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
-3. Your test case MUST run only on the given images which are {media}
-4. Print this final dictionary.
-5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
-
-**Example**:
+**Examples**:
 --- EXAMPLE1 ---
 plan1:
 - Load the image from the provided file path 'image.jpg'.
@@ -100,6 +93,7 @@

 final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_counting": cgd_out}}
 print(final_out)
+--- END EXAMPLE1 ---

 --- EXAMPLE2 ---
 plan1:
@@ -173,6 +167,14 @@ def get_counts(preds):
 print(labels_and_scores)
 print(counts)
 ```
+--- END EXAMPLE2 ---
+
+**Instructions**:
+1. Write a program to load the media and call each tool and print it's output along with other relevant information.
+2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
+3. Your test case MUST run only on the given images which are {media}
+4. Print this final dictionary.
+5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
"""


@@ -224,11 +226,6 @@ def get_counts(preds):

 {docstring}

-**Input Code Snippet**:
-```python
-# Your code here
-```
-
 **User Instructions**:
 {question}

@@ -241,11 +238,90 @@
 **Previous Feedback**:
 {feedback}

+**Examples**:
+--- EXAMPLE1 ---
+**User Instructions**:
+
+## User Request
+Can you write a program to check if each person is wearing a helmet? First detect all the people in the image, then detect the helmets, check whether or not a person is wearing a helmet if the helmet is on the worker. Return a dictionary with the count of people with helments and people without helmets. Media name worker_helmets.webp
+
+## Subtasks
+
+This plan uses the owl_v2_image tool to detect both people and helmets in a single pass, which should be efficient and accurate. We can then compare the detections to determine if each person is wearing a helmet.
+-Use owl_v2_image with prompt 'person, helmet' to detect both people and helmets in the image
+-Process the detections to match helmets with people based on bounding box proximity
+-Count people with and without helmets based on the matching results
+-Return a dictionary with the counts
+
+
+**Tool Tests and Outputs**:
+After examining the image, I can see 4 workers in total, with 3 wearing yellow safety helmets and 1 not wearing a helmet. Plan 1 using owl_v2_image seems to be the most accurate in detecting both people and helmets. However, it needs some modifications to improve accuracy. We should increase the confidence threshold to 0.15 to filter out the lowest confidence box, and implement logic to associate helmets with people based on their bounding box positions. Plan 2 and Plan 3 seem less reliable given the tool outputs, as they either failed to distinguish between people with and without helmets or misclassified all workers as not wearing helmets.
+
+**Tool Output Thoughts**:
+```python
+...
+```
+----- stdout -----
+Plan 1 - owl_v2_image:
+
+[{{'label': 'helmet', 'score': 0.15, 'bbox': [0.85, 0.41, 0.87, 0.45]}}, {{'label': 'helmet', 'score': 0.3, 'bbox': [0.8, 0.43, 0.81, 0.46]}}, {{'label': 'helmet', 'score': 0.31, 'bbox': [0.85, 0.45, 0.86, 0.46]}}, {{'label': 'person', 'score': 0.31, 'bbox': [0.84, 0.45, 0.88, 0.58]}}, {{'label': 'person', 'score': 0.31, 'bbox': [0.78, 0.43, 0.82, 0.57]}}, {{'label': 'helmet', 'score': 0.33, 'bbox': [0.3, 0.65, 0.32, 0.67]}}, {{'label': 'person', 'score': 0.29, 'bbox': [0.28, 0.65, 0.36, 0.84]}}, {{'label': 'helmet', 'score': 0.29, 'bbox': [0.13, 0.82, 0.15, 0.85]}}, {{'label': 'person', 'score': 0.3, 'bbox': [0.1, 0.82, 0.24, 1.0]}}]
+
+...
+
+**Input Code Snippet**:
+```python
+from vision_agent.tools import load_image, owl_v2_image
+
+def check_helmets(image_path):
+    image = load_image(image_path)
+    # Detect people and helmets, filter out the lowest confidence helmet score of 0.15
+    detections = owl_v2_image("person, helmet", image, box_threshold=0.15)
+    height, width = image.shape[:2]
+
+    # Separate people and helmets
+    people = [d for d in detections if d['label'] == 'person']
+    helmets = [d for d in detections if d['label'] == 'helmet']
+
+    people_with_helmets = 0
+    people_without_helmets = 0
+
+    for person in people:
+        person_x = (person['bbox'][0] + person['bbox'][2]) / 2
+        person_y = person['bbox'][1]  # Top of the bounding box
+
+        helmet_found = False
+        for helmet in helmets:
+            helmet_x = (helmet['bbox'][0] + helmet['bbox'][2]) / 2
+            helmet_y = (helmet['bbox'][1] + helmet['bbox'][3]) / 2
+
+            # Check if the helmet is within 20 pixels of the person's head. Unnormalize
+            # the coordinates so we can better compare them.
+            if (abs((helmet_x - person_x) * width) < 20 and
+                -5 < ((helmet_y - person_y) * height) < 20):
+                helmet_found = True
+                break
+
+        if helmet_found:
+            people_with_helmets += 1
+        else:
+            people_without_helmets += 1
+
+    return {{
+        "people_with_helmets": people_with_helmets,
+        "people_without_helmets": people_without_helmets
+    }}
+```
+--- END EXAMPLE1 ---

 **Instructions**:
 1. **Understand and Clarify**: Make sure you understand the task.
 2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool outputs and tool thoughts to guide you.
 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
-4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use correct arguments, remember coordinates are always returned normalized from `vision_agent.tools`. All images from `vision_agent.tools` are in RGB format, red is (255, 0, 0) and blue is (0, 0, 255).
+4. **Code Generation**: Translate your pseudocode into executable Python code.
+4.1. Take in the media path as an argument and load with either `load_image` or `extract_frames_and_timestamps`.
+4.2. Coordinates are always returned normalized from `vision_agent.tools`.
+4.3. Do not create dummy input or functions, the code must be usable if the user provides new media.
+4.4. Use unnormalized coordinates when comparing bounding boxes.
 """

TEST = """
10 changes: 5 additions & 5 deletions vision_agent/agent/vision_agent_prompts.py
@@ -26,7 +26,9 @@

 **Instructions**:
 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
-2. **Output in JSON**: Respond in the following format in JSON:
+2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
+3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`.
+4. **Output in JSON**: Respond in the following format in JSON:

 ```json
 {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
@@ -149,7 +151,7 @@
 16| return count
 [End of artifact]

-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')</execute_python>", "let_user_respond": false}

 OBSERVATION:
 ----- stdout -----
Expand All @@ -164,7 +166,6 @@
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]

-
 AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}

 OBSERVATION:
@@ -182,8 +183,7 @@ def count_workers_with_helmets(image_path: str, output_path: str):
     count = 0
 [End of edits]

-
-AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')</execute_python>", "let_user_respond": false}

OBSERVATION:
----- stdout -----
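Finally, a sketch (not the library's actual parser) of consuming the JSON reply format these prompts mandate: `thoughts`, `response`, and the `let_user_respond` flag that decides whether the loop pauses for user input:

```python
import json

raw = ('{"thoughts": "done", "response": "Saved workers_viz.png", '
       '"let_user_respond": true}')

reply = json.loads(raw)
if reply["let_user_respond"]:
    # Hand control back to the user, mirroring the prompt's contract.
    print(f"AGENT (awaiting user): {reply['response']}")
else:
    print("Agent keeps executing without user input.")
```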