From 600667deea941bc391b25f155fd7e472b5e9cde3 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 26 Jun 2024 18:48:04 -0700
Subject: [PATCH] Fix generate tools (#156)

* fix docs

* fixed generate det

* fixed test case
---
 README.md               |  4 ++--
 docs/index.md           | 12 ++++++++----
 tests/unit/test_lmm.py  | 10 +++++-----
 vision_agent/lmm/lmm.py |  7 +++++--
 4 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 4508e5c0..87b39d22 100644
--- a/README.md
+++ b/README.md
@@ -145,8 +145,8 @@ you. For example:
 
 ```python
 >>> import vision_agent as va
->>> llm = va.llm.OpenAILMM()
->>> detector = llm.generate_detector("Can you build a jar detector for me?")
+>>> lmm = va.lmm.OpenAILMM()
+>>> detector = lmm.generate_detector("Can you build a jar detector for me?")
 >>> detector(va.tools.load_image("jar.jpg"))
 [{"labels": ["jar",],
   "scores": [0.99],
diff --git a/docs/index.md b/docs/index.md
index 364b5182..68670be1 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -33,7 +33,7 @@ export OPENAI_API_KEY="your-api-key"
 ```
 
 ### Important Note on API Usage
-Please be aware that using the API in this project requires you to have API credits (minimum of five US dollars). This is different from the OpenAI subscription used in this chatbot. If you don't have credit, further information can be found [here](https://github.com/moutasemalakkad/vision-agent/blob/f491252f3477103b7f517c45e6dea2f9d9f7abc4/docs/index.md#L207)
+Please be aware that using the API in this project requires you to have API credits (minimum of five US dollars). This is different from the OpenAI subscription used in this chatbot. If you don't have credit, further information can be found [here](https://github.com/landing-ai/vision-agent?tab=readme-ov-file#how-to-get-started-with-openai-api-credits)
 
 ### Vision Agent
 #### Basic Usage
@@ -137,8 +137,8 @@ you. For example:
 
 ```python
 >>> import vision_agent as va
->>> llm = va.llm.OpenAILMM()
->>> detector = llm.generate_detector("Can you build a jar detector for me?")
+>>> lmm = va.lmm.OpenAILMM()
+>>> detector = lmm.generate_detector("Can you build a jar detector for me?")
 >>> detector(va.tools.load_image("jar.jpg"))
 [{"labels": ["jar",],
   "scores": [0.99],
@@ -203,8 +203,12 @@ You can then run Vision Agent using the Azure OpenAI models:
 import vision_agent as va
 agent = va.agent.AzureVisionAgent()
 ```
+
 ******************************************************************************************************************************
-#### To get started with API credits:
+
+### Q&A
+
+#### How to get started with OpenAI API credits
 
 1. Visit the[OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key.
 2. Follow the instructions to purchase and manage your API credits.
diff --git a/tests/unit/test_lmm.py b/tests/unit/test_lmm.py
index 7e6aea71..82871fce 100644
--- a/tests/unit/test_lmm.py
+++ b/tests/unit/test_lmm.py
@@ -98,17 +98,17 @@ def test_generate_classifier(openai_lmm_mock):  # noqa: F811
     indirect=["openai_lmm_mock"],
 )
 def test_generate_detector(openai_lmm_mock):  # noqa: F811
-    with patch("vision_agent.tools.grounding_dino") as grounding_dino_mock:
-        grounding_dino_mock.return_value = "test"
-        grounding_dino_mock.__name__ = "grounding_dino"
-        grounding_dino_mock.__doc__ = "grounding_dino"
+    with patch("vision_agent.tools.owl_v2") as owl_v2_mock:
+        owl_v2_mock.return_value = "test"
+        owl_v2_mock.__name__ = "owl_v2"
+        owl_v2_mock.__doc__ = "owl_v2"
 
         lmm = OpenAILMM()
         prompt = "Can you generate a cat classifier?"
         detector = lmm.generate_detector(prompt)
         dummy_image = np.zeros((10, 10, 3)).astype(np.uint8)
         detector(dummy_image)
-        assert grounding_dino_mock.call_args[0][0] == "cat"
+        assert owl_v2_mock.call_args[0][0] == "cat"
 
 
 @pytest.mark.parametrize(
diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py
index 71c632dd..e6c86844 100644
--- a/vision_agent/lmm/lmm.py
+++ b/vision_agent/lmm/lmm.py
@@ -164,6 +164,7 @@ def generate_classifier(self, question: str) -> Callable:
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": prompt},
             ],
+            response_format={"type": "json_object"},
         )
 
         try:
@@ -179,7 +180,7 @@ def generate_classifier(self, question: str) -> Callable:
         return lambda x: T.clip(x, params["prompt"])
 
     def generate_detector(self, question: str) -> Callable:
-        api_doc = T.get_tool_documentation([T.grounding_dino])
+        api_doc = T.get_tool_documentation([T.owl_v2])
         prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
         response = self.client.chat.completions.create(
             model=self.model_name,
@@ -187,6 +188,7 @@ def generate_detector(self, question: str) -> Callable:
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": prompt},
             ],
+            response_format={"type": "json_object"},
         )
 
         try:
@@ -199,7 +201,7 @@ def generate_detector(self, question: str) -> Callable:
             )
             raise ValueError("Failed to decode response")
 
-        return lambda x: T.grounding_dino(params["prompt"], x)
+        return lambda x: T.owl_v2(params["prompt"], x)
 
     def generate_segmentor(self, question: str) -> Callable:
         api_doc = T.get_tool_documentation([T.grounding_sam])
@@ -210,6 +212,7 @@ def generate_segmentor(self, question: str) -> Callable:
                 {"role": "system", "content": SYSTEM_PROMPT},
                 {"role": "user", "content": prompt},
             ],
+            response_format={"type": "json_object"},
         )
 
         try: