Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: expose florence2_phrase_grounding_fine_tune #218

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions vision_agent/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
florence2_image_caption,
florence2_ocr,
florence2_phrase_grounding,
florence2_phrase_grounding_fine_tune,
florence2_roberta_vqa,
florence2_sam2_image,
florence2_sam2_video,
Expand Down
24 changes: 15 additions & 9 deletions vision_agent/tools/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1601,20 +1601,18 @@ def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:


# TODO: add this function to the imports so that it is picked up by the agent
def florencev2_fine_tuned_object_detection(
image: np.ndarray, prompt: str, model_id: UUID, task: str
def florence2_phrase_grounding_fine_tune(
prompt: str, image: np.ndarray, model_id: UUID
) -> List[Dict[str, Any]]:
"""'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
"""'florence2_phrase_grounding_fine_tune' is a tool that uses a fine tuned model
to detect objects given a text prompt such as a phrase or class names separated by
commas. It returns a list of detected objects as labels and their location as
bounding boxes with score of 1.0.

Parameters:
image (np.ndarray): The image used to detect objects.
prompt (str): The prompt to help find objects in the image.
image (np.ndarray): The image used to detect objects.
model_id (UUID): The fine-tuned model id.
task (PromptTask): The florencev2 fine-tuning task. The options are
CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.

Returns:
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
Expand All @@ -1626,8 +1624,8 @@ def florencev2_fine_tuned_object_detection(
Example
-------
>>> florence2_phrase_grounding_fine_tune(
image,
'person looking at a coyote',
image,
UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
)
[
Expand All @@ -1641,7 +1639,10 @@ def florencev2_fine_tuned_object_detection(
if status is not JobStatus.SUCCEEDED:
raise FineTuneModelIsNotReady()

task = PromptTask[task]
task = PromptTask[
"CAPTION_TO_PHRASE_GROUNDING"
] # hardcode to <CAPTION_TO_PHRASE_GROUNDING> for now

if task is PromptTask.OBJECT_DETECTION:
prompt = ""

Expand Down Expand Up @@ -1705,12 +1706,17 @@ def florencev2_fine_tuned_object_detection(
overlay_heat_map,
]

# non-implemented tools
OTHER_TOOLS = [
florence2_phrase_grounding_fine_tune,
]

TOOLS = FUNCTION_TOOLS + UTIL_TOOLS

TOOLS_DF = get_tools_df(TOOLS) # type: ignore
TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS) # type: ignore
TOOL_DOCSTRING = get_tool_documentation(TOOLS) # type: ignore
TOOLS_INFO = get_tools_info(TOOLS) # type: ignore
TOOLS_INFO = get_tools_info(TOOLS + OTHER_TOOLS) # type: ignore
UTILITIES_DOCSTRING = get_tool_documentation(
[
save_json,
Expand Down
Loading