Skip to content

Commit

Permalink
add ability to view images
Browse files Browse the repository at this point in the history
  • Loading branch information
dillonalaird committed Sep 11, 2024
1 parent 341924d commit 2472112
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 16 deletions.
37 changes: 29 additions & 8 deletions vision_agent/agent/vision_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,16 @@
from vision_agent.agent.vision_agent_prompts import (
EXAMPLES_CODE1,
EXAMPLES_CODE2,
EXAMPLES_CODE3,
VA_CODE,
)
from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
from vision_agent.tools import META_TOOL_DOCSTRING, load_image, save_image
from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args
from vision_agent.tools.meta_tools import (
Artifacts,
check_and_load_image,
use_extra_vision_agent_args,
)
from vision_agent.utils import CodeInterpreterFactory
from vision_agent.utils.execute import CodeInterpreter, Execution

Expand All @@ -30,7 +35,7 @@ class BoilerplateCode:
pre_code = [
"from typing import *",
"from vision_agent.utils.execute import CodeInterpreter",
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning",
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning",
"artifacts = Artifacts('{remote_path}')",
"artifacts.load('{remote_path}')",
]
Expand Down Expand Up @@ -68,10 +73,17 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:

prompt = VA_CODE.format(
documentation=META_TOOL_DOCSTRING,
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}",
conversation=conversation,
)
return extract_json(orch([{"role": "user", "content": prompt}], stream=False)) # type: ignore
message: Message = {"role": "user", "content": prompt}
if (
chat[-1]["role"] == "observation"
and "media" in chat[-1]
and len(chat[-1]["media"]) > 0 # type: ignore
):
message["media"] = chat[-1]["media"]
return extract_json(orch([message], stream=False)) # type: ignore


def run_code_action(
Expand Down Expand Up @@ -296,13 +308,22 @@ def chat_with_code(
code_action, code_interpreter, str(remote_artifacts_path)
)

media_obs = check_and_load_image(code_action)

if self.verbosity >= 1:
_LOGGER.info(obs)

chat_elt: Message = {"role": "observation", "content": obs}
if media_obs and result.success:
chat_elt["media"] = [
Path(code_interpreter.remote_path) / media_ob
for media_ob in media_obs
]

# don't add execution results to internal chat
int_chat.append({"role": "observation", "content": obs})
orig_chat.append(
{"role": "observation", "content": obs, "execution": result}
)
int_chat.append(chat_elt)
chat_elt["execution"] = result
orig_chat.append(chat_elt)
self.streaming_message(
{
"role": "observation",
Expand Down
34 changes: 28 additions & 6 deletions vision_agent/agent/vision_agent_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,24 @@
{examples}
--- END EXAMPLES ---
**Instructions**:
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
2. **Output in JSON**: Respond in JSON format, {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
**Conversation**:
Here is the current conversation so far:
--- START CONVERSATION ---
{conversation}
--- END CONVERSATION ---
**Instructions**:
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
2. **Output in JSON**: Respond in the following format in JSON:
```json
{{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
```
"""


EXAMPLES_CODE1 = """
USER: Can you detect the dogs in this image? Media name dog.jpg
USER: Can you write code to detect the dogs in this image? Media name dog.jpg
OBSERVATION:
[Artifacts loaded]
Expand Down Expand Up @@ -61,6 +66,7 @@
EXAMPLES_CODE1_EXTRA = """
USER: The the image only has one dog, can you fix this?
OBSERVATION:
[Artifacts loaded]
Artifact dog.jpg loaded to /path/to/images/dog.jpg
Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
Expand All @@ -86,8 +92,24 @@
AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
"""


EXAMPLES_CODE2 = """
USER: Can you describe this image?
OBSERVATION:
[Artifacts loaded]
Artifact image.jpg loaded to /path/to/images/image.jpg
[End of artifacts]
AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "<execute_python>view_media_artifacts('image.jpg')</execute_python>", "let_user_respond": false}
OBSERVATION:
[Image image.jpg displayed]
AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true}
"""


EXAMPLES_CODE3 = """
USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?
OBSERVATION:
Expand Down
32 changes: 30 additions & 2 deletions vision_agent/tools/meta_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,33 @@ def list_artifacts(artifacts: Artifacts) -> str:
return output_str


def check_and_load_image(code: str) -> List[str]:
    """Scan executed code for a ``view_media_artifact`` call and return the
    names of the image artifacts it references.

    Parameters:
        code (str): The code string that the agent executed.

    Returns:
        List[str]: The referenced artifact names, or an empty list if the
            code contains no ``view_media_artifact`` call.
    """
    if not code.strip():
        return []

    # The meta tool is named ``view_media_artifact`` (see its definition and
    # the exported tool list); the previous pattern searched for the
    # non-existent ``show_media_artifact``, so viewed images were never
    # detected or loaded. The second group is restricted to non-quote
    # characters so the artifact name cannot swallow the closing quote.
    pattern = r"view_media_artifact\(\s*([^\)]+),\s*['\"]([^'\"]+)['\"]\s*\)"
    match = re.search(pattern, code)
    if match:
        # group(2) is the artifact name; group(1) is the artifacts object arg.
        return [match.group(2)]
    return []


def view_media_artifact(artifacts: Artifacts, name: str) -> str:
    """Views the image artifact with the given name.

    Parameters:
        artifacts (Artifacts): The artifacts object to show the image from.
        name (str): The name of the image artifact to show.
    """
    # Report a display message when the artifact exists, otherwise a
    # missing-artifact message; the same text is both printed (so it shows
    # up in the execution observation) and returned to the caller.
    exists = name in artifacts
    output_str = (
        f"[Image {name} displayed]"
        if exists
        else f"[Artifact {name} does not exist]"
    )
    print(output_str)
    return output_str


def get_tool_descriptions() -> str:
"""Returns a description of all the tools that `generate_vision_code` has access to.
Helpful for answering questions about what types of vision tasks you can do with
Expand Down Expand Up @@ -515,7 +542,7 @@ def use_extra_vision_agent_args(
Returns:
str: The edited code.
"""
generate_pattern = r"generate_vision_code\(\s*([^\)]+)\)"
generate_pattern = r"generate_vision_code\(\s*([^\)]+)\s*\)"

def generate_replacer(match: re.Match) -> str:
arg = match.group(1)
Expand All @@ -526,7 +553,7 @@ def generate_replacer(match: re.Match) -> str:
out_str += ")"
return out_str

edit_pattern = r"edit_vision_code\(\s*([^\)]+)\)"
edit_pattern = r"edit_vision_code\(\s*([^\)]+)\s*\)"

def edit_replacer(match: re.Match) -> str:
arg = match.group(1)
Expand Down Expand Up @@ -604,6 +631,7 @@ def replacer(match: re.Match) -> str:
generate_vision_code,
edit_vision_code,
write_media_artifact,
view_media_artifact,
florence2_fine_tuning,
use_florence2_fine_tuning,
list_artifacts,
Expand Down

0 comments on commit 2472112

Please sign in to comment.