|  | 
|  | 1 | +import os | 
| 1 | 2 | import io | 
| 2 | 3 | import json | 
| 3 | 4 | import logging | 
| 4 |  | -import os | 
| 5 | 5 | import tempfile | 
| 6 | 6 | import urllib.request | 
| 7 |  | -from importlib import resources | 
|  | 7 | +from uuid import UUID | 
| 8 | 8 | from pathlib import Path | 
|  | 9 | +from importlib import resources | 
| 9 | 10 | from typing import Any, Dict, List, Optional, Tuple, Union, cast | 
| 10 |  | -from uuid import UUID | 
| 11 | 11 | 
 | 
| 12 | 12 | import cv2 | 
| 13 |  | -import numpy as np | 
| 14 | 13 | import requests | 
| 15 |  | -from PIL import Image, ImageDraw, ImageEnhance, ImageFont | 
| 16 |  | -from pillow_heif import register_heif_opener  # type: ignore | 
|  | 14 | +import numpy as np | 
| 17 | 15 | from pytube import YouTube  # type: ignore | 
|  | 16 | +from pillow_heif import register_heif_opener  # type: ignore | 
|  | 17 | +from PIL import Image, ImageDraw, ImageEnhance, ImageFont | 
| 18 | 18 | 
 | 
| 19 | 19 | from vision_agent.clients.landing_public_api import LandingPublicAPI | 
| 20 | 20 | from vision_agent.lmm.lmm import OpenAILMM | 
|  | 
| 28 | 28 |     send_task_inference_request, | 
| 29 | 29 | ) | 
| 30 | 30 | from vision_agent.tools.tools_types import ( | 
| 31 |  | -    FineTuning, | 
| 32 | 31 |     Florence2FtRequest, | 
| 33 | 32 |     JobStatus, | 
| 34 | 33 |     ODResponseData, | 
| @@ -194,12 +193,16 @@ def owl_v2_image( | 
| 194 | 193 |         data_obj = Florence2FtRequest( | 
| 195 | 194 |             image=image_b64, | 
| 196 | 195 |             task=PromptTask.PHRASE_GROUNDING, | 
| 197 |  | -            tool="florencev2_fine_tuning", | 
| 198 | 196 |             prompt=prompt, | 
| 199 |  | -            fine_tuning=FineTuning(job_id=UUID(fine_tune_id)), | 
|  | 197 | +            job_id=UUID(fine_tune_id), | 
|  | 198 | +        ) | 
|  | 199 | +        data = data_obj.model_dump(by_alias=True, exclude_none=True) | 
|  | 200 | +        detections = send_inference_request( | 
|  | 201 | +            data, | 
|  | 202 | +            "florence2-ft", | 
|  | 203 | +            v2=True, | 
|  | 204 | +            metadata_payload={"function_name": "owl_v2_image"}, | 
| 200 | 205 |         ) | 
| 201 |  | -        data = data_obj.model_dump(by_alias=True) | 
| 202 |  | -        detections = send_inference_request(data, "tools", v2=False) | 
| 203 | 206 |         # get the first frame | 
| 204 | 207 |         detection = detections[0] | 
| 205 | 208 |         bboxes_formatted = [ | 
| @@ -420,15 +423,17 @@ def florence2_sam2_image( | 
| 420 | 423 |         req_data_obj = Florence2FtRequest( | 
| 421 | 424 |             image=image_b64, | 
| 422 | 425 |             task=PromptTask.PHRASE_GROUNDING, | 
| 423 |  | -            tool="florencev2_fine_tuning", | 
| 424 | 426 |             prompt=prompt, | 
| 425 | 427 |             postprocessing="sam2", | 
| 426 |  | -            fine_tuning=FineTuning( | 
| 427 |  | -                job_id=UUID(fine_tune_id), | 
| 428 |  | -            ), | 
|  | 428 | +            job_id=UUID(fine_tune_id), | 
|  | 429 | +        ) | 
|  | 430 | +        req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True) | 
|  | 431 | +        detections_ft = send_inference_request( | 
|  | 432 | +            req_data, | 
|  | 433 | +            "florence2-ft", | 
|  | 434 | +            v2=True, | 
|  | 435 | +            metadata_payload={"function_name": "florence2_sam2_image"}, | 
| 429 | 436 |         ) | 
| 430 |  | -        req_data = req_data_obj.model_dump(by_alias=True) | 
| 431 |  | -        detections_ft = send_inference_request(req_data, "tools", v2=False) | 
| 432 | 437 |         # get the first frame | 
| 433 | 438 |         detection = detections_ft[0] | 
| 434 | 439 |         return_data = [] | 
| @@ -1136,6 +1141,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s | 
| 1136 | 1141 |     return answer[task]  # type: ignore | 
| 1137 | 1142 | 
 | 
| 1138 | 1143 | 
 | 
|  | 1144 | +# TODO: add video | 
|  | 1145 | + | 
|  | 1146 | + | 
| 1139 | 1147 | def florence2_phrase_grounding( | 
| 1140 | 1148 |     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None | 
| 1141 | 1149 | ) -> List[Dict[str, Any]]: | 
| @@ -1180,15 +1188,14 @@ def florence2_phrase_grounding( | 
| 1180 | 1188 |         data_obj = Florence2FtRequest( | 
| 1181 | 1189 |             image=image_b64, | 
| 1182 | 1190 |             task=PromptTask.PHRASE_GROUNDING, | 
| 1183 |  | -            tool="florencev2_fine_tuning", | 
| 1184 | 1191 |             prompt=prompt, | 
| 1185 |  | -            fine_tuning=FineTuning(job_id=UUID(fine_tune_id)), | 
|  | 1192 | +            job_id=UUID(fine_tune_id), | 
| 1186 | 1193 |         ) | 
| 1187 |  | -        data = data_obj.model_dump(by_alias=True) | 
|  | 1194 | +        data = data_obj.model_dump(by_alias=True, exclude_none=True) | 
| 1188 | 1195 |         detections = send_inference_request( | 
| 1189 | 1196 |             data, | 
| 1190 |  | -            "tools", | 
| 1191 |  | -            v2=False, | 
|  | 1197 | +            "florence2-ft", | 
|  | 1198 | +            v2=True, | 
| 1192 | 1199 |             metadata_payload={"function_name": "florence2_phrase_grounding"}, | 
| 1193 | 1200 |         ) | 
| 1194 | 1201 |         # get the first frame | 
|  | 
0 commit comments