|
| 1 | +import os |
1 | 2 | import io
|
2 | 3 | import json
|
3 | 4 | import logging
|
4 |
| -import os |
5 | 5 | import tempfile
|
6 | 6 | import urllib.request
|
7 |
| -from importlib import resources |
| 7 | +from uuid import UUID |
8 | 8 | from pathlib import Path
|
| 9 | +from importlib import resources |
9 | 10 | from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
10 |
| -from uuid import UUID |
11 | 11 |
|
12 | 12 | import cv2
|
13 |
| -import numpy as np |
14 | 13 | import requests
|
15 |
| -from PIL import Image, ImageDraw, ImageEnhance, ImageFont |
16 |
| -from pillow_heif import register_heif_opener # type: ignore |
| 14 | +import numpy as np |
17 | 15 | from pytube import YouTube # type: ignore
|
| 16 | +from pillow_heif import register_heif_opener # type: ignore |
| 17 | +from PIL import Image, ImageDraw, ImageEnhance, ImageFont |
18 | 18 |
|
19 | 19 | from vision_agent.clients.landing_public_api import LandingPublicAPI
|
20 | 20 | from vision_agent.lmm.lmm import OpenAILMM
|
|
28 | 28 | send_task_inference_request,
|
29 | 29 | )
|
30 | 30 | from vision_agent.tools.tools_types import (
|
31 |
| - FineTuning, |
32 | 31 | Florence2FtRequest,
|
33 | 32 | JobStatus,
|
34 | 33 | ODResponseData,
|
@@ -194,12 +193,16 @@ def owl_v2_image(
|
194 | 193 | data_obj = Florence2FtRequest(
|
195 | 194 | image=image_b64,
|
196 | 195 | task=PromptTask.PHRASE_GROUNDING,
|
197 |
| - tool="florencev2_fine_tuning", |
198 | 196 | prompt=prompt,
|
199 |
| - fine_tuning=FineTuning(job_id=UUID(fine_tune_id)), |
| 197 | + job_id=UUID(fine_tune_id), |
| 198 | + ) |
| 199 | + data = data_obj.model_dump(by_alias=True, exclude_none=True) |
| 200 | + detections = send_inference_request( |
| 201 | + data, |
| 202 | + "florence2-ft", |
| 203 | + v2=True, |
| 204 | + metadata_payload={"function_name": "owl_v2_image"}, |
200 | 205 | )
|
201 |
| - data = data_obj.model_dump(by_alias=True) |
202 |
| - detections = send_inference_request(data, "tools", v2=False) |
203 | 206 | # get the first frame
|
204 | 207 | detection = detections[0]
|
205 | 208 | bboxes_formatted = [
|
@@ -420,15 +423,17 @@ def florence2_sam2_image(
|
420 | 423 | req_data_obj = Florence2FtRequest(
|
421 | 424 | image=image_b64,
|
422 | 425 | task=PromptTask.PHRASE_GROUNDING,
|
423 |
| - tool="florencev2_fine_tuning", |
424 | 426 | prompt=prompt,
|
425 | 427 | postprocessing="sam2",
|
426 |
| - fine_tuning=FineTuning( |
427 |
| - job_id=UUID(fine_tune_id), |
428 |
| - ), |
| 428 | + job_id=UUID(fine_tune_id), |
| 429 | + ) |
| 430 | + req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True) |
| 431 | + detections_ft = send_inference_request( |
| 432 | + req_data, |
| 433 | + "florence2-ft", |
| 434 | + v2=True, |
| 435 | + metadata_payload={"function_name": "florence2_sam2_image"}, |
429 | 436 | )
|
430 |
| - req_data = req_data_obj.model_dump(by_alias=True) |
431 |
| - detections_ft = send_inference_request(req_data, "tools", v2=False) |
432 | 437 | # get the first frame
|
433 | 438 | detection = detections_ft[0]
|
434 | 439 | return_data = []
|
@@ -1136,6 +1141,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
|
1136 | 1141 | return answer[task] # type: ignore
|
1137 | 1142 |
|
1138 | 1143 |
|
| 1144 | +# TODO: add video |
| 1145 | + |
| 1146 | + |
1139 | 1147 | def florence2_phrase_grounding(
|
1140 | 1148 | prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
|
1141 | 1149 | ) -> List[Dict[str, Any]]:
|
@@ -1180,15 +1188,14 @@ def florence2_phrase_grounding(
|
1180 | 1188 | data_obj = Florence2FtRequest(
|
1181 | 1189 | image=image_b64,
|
1182 | 1190 | task=PromptTask.PHRASE_GROUNDING,
|
1183 |
| - tool="florencev2_fine_tuning", |
1184 | 1191 | prompt=prompt,
|
1185 |
| - fine_tuning=FineTuning(job_id=UUID(fine_tune_id)), |
| 1192 | + job_id=UUID(fine_tune_id), |
1186 | 1193 | )
|
1187 |
| - data = data_obj.model_dump(by_alias=True) |
| 1194 | + data = data_obj.model_dump(by_alias=True, exclude_none=True) |
1188 | 1195 | detections = send_inference_request(
|
1189 | 1196 | data,
|
1190 |
| - "tools", |
1191 |
| - v2=False, |
| 1197 | + "florence2-ft", |
| 1198 | + v2=True, |
1192 | 1199 | metadata_payload={"function_name": "florence2_phrase_grounding"},
|
1193 | 1200 | )
|
1194 | 1201 | # get the first frame
|
|
0 commit comments