landing-ai · MingruiZhang · Sep 3, 2024 · Sep 3, 2024 · Sep 3, 2024
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
@@ -23,6 +23,7 @@
  florence2_image_caption,
  florence2_ocr,
  florence2_phrase_grounding,
+ florence2_phrase_grounding_fine_tune,
  florence2_roberta_vqa,
  florence2_sam2_image,
  florence2_sam2_video,

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
@@ -1601,20 +1601,18 @@ def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
 
 
 # TODO: add this function to the imports so that is picked in the agent
-def florencev2_fine_tuned_object_detection(
- image: np.ndarray, prompt: str, model_id: UUID, task: str
+def florence2_phrase_grounding_fine_tune(
+ prompt: str, image: np.ndarray, model_id: UUID
 ) -> List[Dict[str, Any]]:
- """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
+ """'florence2_phrase_grounding_fine_tune' is a tool that uses a fine tuned model
  to detect objects given a text prompt such as a phrase or class names separated by
  commas. It returns a list of detected objects as labels and their location as
  bounding boxes with score of 1.0.
 
  Parameters:
- image (np.ndarray): The image to used to detect objects.
  prompt (str): The prompt to help find objects in the image.
+ image (np.ndarray): The image used to detect objects.
  model_id (UUID): The fine-tuned model id.
- task (PromptTask): The florencev2 fine-tuning task. The options are
- CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
 
  Returns:
  List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -1626,8 +1624,8 @@ def florencev2_fine_tuned_object_detection(
  Example
  -------
  >>> florencev2_fine_tuned_object_detection(
- image,
  'person looking at a coyote',
+ image,
  UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
  )
  [
@@ -1641,7 +1639,10 @@ def florencev2_fine_tuned_object_detection(
  if status is not JobStatus.SUCCEEDED:
  raise FineTuneModelIsNotReady()
 
- task = PromptTask[task]
+ task = PromptTask[
+ "CAPTION_TO_PHRASE_GROUNDING"
+ ] # hardcode to <CAPTION_TO_PHRASE_GROUNDING> for now
+
  if task is PromptTask.OBJECT_DETECTION:
  prompt = ""
 
@@ -1705,12 +1706,17 @@ def florencev2_fine_tuned_object_detection(
  overlay_heat_map,
 ]
 
+# non-implemented tools (kept out of TOOLS; only included in TOOLS_INFO)
+OTHER_TOOLS = [
+ florence2_phrase_grounding_fine_tune,
+]
+
 TOOLS = FUNCTION_TOOLS + UTIL_TOOLS
 
 TOOLS_DF = get_tools_df(TOOLS) # type: ignore
 TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS) # type: ignore
 TOOL_DOCSTRING = get_tool_documentation(TOOLS) # type: ignore
-TOOLS_INFO = get_tools_info(TOOLS) # type: ignore
+TOOLS_INFO = get_tools_info(TOOLS + OTHER_TOOLS) # type: ignore
 UTILITIES_DOCSTRING = get_tool_documentation(
  [
  save_json,