|
| 1 | +import os |
1 | 2 | import io |
2 | 3 | import json |
3 | 4 | import logging |
4 | | -import os |
5 | 5 | import tempfile |
6 | 6 | import urllib.request |
7 | | -from importlib import resources |
| 7 | +from uuid import UUID |
8 | 8 | from pathlib import Path |
| 9 | +from importlib import resources |
9 | 10 | from typing import Any, Dict, List, Optional, Tuple, Union, cast |
10 | | -from uuid import UUID |
11 | 11 |
|
12 | 12 | import cv2 |
13 | | -import numpy as np |
14 | 13 | import requests |
15 | | -from PIL import Image, ImageDraw, ImageEnhance, ImageFont |
16 | | -from pillow_heif import register_heif_opener # type: ignore |
| 14 | +import numpy as np |
17 | 15 | from pytube import YouTube # type: ignore |
| 16 | +from pillow_heif import register_heif_opener # type: ignore |
| 17 | +from PIL import Image, ImageDraw, ImageEnhance, ImageFont |
18 | 18 |
|
19 | 19 | from vision_agent.clients.landing_public_api import LandingPublicAPI |
20 | 20 | from vision_agent.lmm.lmm import OpenAILMM |
|
28 | 28 | send_task_inference_request, |
29 | 29 | ) |
30 | 30 | from vision_agent.tools.tools_types import ( |
31 | | - FineTuning, |
32 | 31 | Florence2FtRequest, |
33 | 32 | JobStatus, |
34 | 33 | ODResponseData, |
@@ -194,12 +193,16 @@ def owl_v2_image( |
194 | 193 | data_obj = Florence2FtRequest( |
195 | 194 | image=image_b64, |
196 | 195 | task=PromptTask.PHRASE_GROUNDING, |
197 | | - tool="florencev2_fine_tuning", |
198 | 196 | prompt=prompt, |
199 | | - fine_tuning=FineTuning(job_id=UUID(fine_tune_id)), |
| 197 | + job_id=UUID(fine_tune_id), |
| 198 | + ) |
| 199 | + data = data_obj.model_dump(by_alias=True, exclude_none=True) |
| 200 | + detections = send_inference_request( |
| 201 | + data, |
| 202 | + "florence2-ft", |
| 203 | + v2=True, |
| 204 | + metadata_payload={"function_name": "owl_v2_image"}, |
200 | 205 | ) |
201 | | - data = data_obj.model_dump(by_alias=True) |
202 | | - detections = send_inference_request(data, "tools", v2=False) |
203 | 206 | # get the first frame |
204 | 207 | detection = detections[0] |
205 | 208 | bboxes_formatted = [ |
@@ -420,15 +423,17 @@ def florence2_sam2_image( |
420 | 423 | req_data_obj = Florence2FtRequest( |
421 | 424 | image=image_b64, |
422 | 425 | task=PromptTask.PHRASE_GROUNDING, |
423 | | - tool="florencev2_fine_tuning", |
424 | 426 | prompt=prompt, |
425 | 427 | postprocessing="sam2", |
426 | | - fine_tuning=FineTuning( |
427 | | - job_id=UUID(fine_tune_id), |
428 | | - ), |
| 428 | + job_id=UUID(fine_tune_id), |
| 429 | + ) |
| 430 | + req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True) |
| 431 | + detections_ft = send_inference_request( |
| 432 | + req_data, |
| 433 | + "florence2-ft", |
| 434 | + v2=True, |
| 435 | + metadata_payload={"function_name": "florence2_sam2_image"}, |
429 | 436 | ) |
430 | | - req_data = req_data_obj.model_dump(by_alias=True) |
431 | | - detections_ft = send_inference_request(req_data, "tools", v2=False) |
432 | 437 | # get the first frame |
433 | 438 | detection = detections_ft[0] |
434 | 439 | return_data = [] |
@@ -1136,6 +1141,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s |
1136 | 1141 | return answer[task] # type: ignore |
1137 | 1142 |
|
1138 | 1143 |
|
| 1144 | +# TODO: add video |
| 1145 | + |
| 1146 | + |
1139 | 1147 | def florence2_phrase_grounding( |
1140 | 1148 | prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None |
1141 | 1149 | ) -> List[Dict[str, Any]]: |
@@ -1180,15 +1188,14 @@ def florence2_phrase_grounding( |
1180 | 1188 | data_obj = Florence2FtRequest( |
1181 | 1189 | image=image_b64, |
1182 | 1190 | task=PromptTask.PHRASE_GROUNDING, |
1183 | | - tool="florencev2_fine_tuning", |
1184 | 1191 | prompt=prompt, |
1185 | | - fine_tuning=FineTuning(job_id=UUID(fine_tune_id)), |
| 1192 | + job_id=UUID(fine_tune_id), |
1186 | 1193 | ) |
1187 | | - data = data_obj.model_dump(by_alias=True) |
| 1194 | + data = data_obj.model_dump(by_alias=True, exclude_none=True) |
1188 | 1195 | detections = send_inference_request( |
1189 | 1196 | data, |
1190 | | - "tools", |
1191 | | - v2=False, |
| 1197 | + "florence2-ft", |
| 1198 | + v2=True, |
1192 | 1199 | metadata_payload={"function_name": "florence2_phrase_grounding"}, |
1193 | 1200 | ) |
1194 | 1201 | # get the first frame |
|
0 commit comments