diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index 6f0fdf74..4774d84d 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -15,7 +15,7 @@
 **Examples**:
 Here is an example of how you can interact with a user and Actions to complete a task:
 --- START EXAMPLES ---
-[Current directory: /workspace/test]
+[Current directory: /example/workspace]
 {examples}
 --- END EXAMPLES ---
 
@@ -27,16 +27,17 @@
 Here is the current conversation so far:
 --- START CONVERSATION ---
 [Current directory: {dir}]
+
 {conversation}
 """
 
 EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg
 
-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "generate_vision_code('/workspace/test/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/workspace/test/dog.jpg'])", "let_user_respond": false}
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])", "let_user_respond": false}
 
 OBSERVATION:
-[File /workspace/test/dog_detector.py]
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
@@ -44,7 +45,7 @@
 4|    return dogs
 [End of file]
 
-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -55,10 +56,10 @@
 
 USER: The the image only has one dog, can you fix this?
 
-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "edit_vision_code('/workspace/test/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/workspace/test/dog.jpg'])", "let_user_respond": false}
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])", "let_user_respond": false}
 
 OBSERVATION:
-[File /workspace/test/dog_detector.py]
+[File /example/workspace/dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
@@ -66,7 +67,7 @@
 4|    return dogs
 [End of file]
 
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))", "let_user_respond": false}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -82,10 +83,10 @@
 
 USER: Yes you can use workers.png
 
-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "generate_vision_code('/workspace/test/code.py', 'Can you write code to count workers with helmets in this image?', media=['/workspace/test/workers.png'])", "let_user_respond": false}
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])", "let_user_respond": false}
 
 OBSERVATION:
-[File /workspace/test/code.py]
+[File /example/workspace/code.py]
 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
 1|def count_workers_with_helmets(image_path: str):
 2|    image = load_image(image_path)
@@ -104,7 +105,7 @@
 15|    return count
 [End of file]
 
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/test/workers.png'))", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
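
For reference, the dog-detection snippet that the prompt example embeds (shown above with N| line-number prefixes) corresponds to a standalone script roughly like the following. This is a minimal sketch based only on the calls visible in the diff (load_image, owl_v2); the /example/workspace/dog.jpg path and the __main__ guard are illustrative, not part of the change.

# Standalone version of the detection example embedded in the prompt above.
# load_image and owl_v2 are the vision_agent.tools calls shown in the diff;
# the image path is the illustrative one used in the prompt text.
from vision_agent.tools import load_image, owl_v2


def detect_dogs(image_path: str):
    # Load the image and run open-vocabulary detection for the "dog" label;
    # owl_v2 returns a list of scored bounding boxes.
    image = load_image(image_path)
    dogs = owl_v2("dog", image)
    return dogs


if __name__ == "__main__":
    print(detect_dogs("/example/workspace/dog.jpg"))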