From 075b897227d86652f72f00b32c14b2bcdec56e87 Mon Sep 17 00:00:00 2001 From: Dayanne Fernandes Date: Mon, 30 Sep 2024 22:36:03 -0300 Subject: [PATCH] adjust payload --- vision_agent/tools/tool_utils.py | 4 +-- vision_agent/tools/tools.py | 51 ++++++++++++++++++------------- vision_agent/tools/tools_types.py | 23 ++++++-------- 3 files changed, 40 insertions(+), 38 deletions(-) diff --git a/vision_agent/tools/tool_utils.py b/vision_agent/tools/tool_utils.py index 605f5511..772d6bc3 100644 --- a/vision_agent/tools/tool_utils.py +++ b/vision_agent/tools/tool_utils.py @@ -1,6 +1,6 @@ +import os import inspect import logging -import os from base64 import b64encode from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple @@ -38,7 +38,7 @@ def send_inference_request( v2: bool = False, metadata_payload: Optional[Dict[str, Any]] = None, ) -> Any: - # TODO: runtime_tag and function_name should be metadata_payload and now included + # TODO: runtime_tag and function_name should be metadata_payload and not included # in the service payload if runtime_tag := os.environ.get("RUNTIME_TAG", ""): payload["runtime_tag"] = runtime_tag diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 70101692..344726db 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1,20 +1,20 @@ +import os import io import json import logging -import os import tempfile import urllib.request -from importlib import resources +from uuid import UUID from pathlib import Path +from importlib import resources from typing import Any, Dict, List, Optional, Tuple, Union, cast -from uuid import UUID import cv2 -import numpy as np import requests -from PIL import Image, ImageDraw, ImageEnhance, ImageFont -from pillow_heif import register_heif_opener # type: ignore +import numpy as np from pytube import YouTube # type: ignore +from pillow_heif import register_heif_opener # type: ignore +from PIL import Image, ImageDraw, ImageEnhance, ImageFont from 
vision_agent.clients.landing_public_api import LandingPublicAPI from vision_agent.lmm.lmm import OpenAILMM @@ -28,7 +28,6 @@ send_task_inference_request, ) from vision_agent.tools.tools_types import ( - FineTuning, Florence2FtRequest, JobStatus, ODResponseData, @@ -194,12 +193,16 @@ def owl_v2_image( data_obj = Florence2FtRequest( image=image_b64, task=PromptTask.PHRASE_GROUNDING, - tool="florencev2_fine_tuning", prompt=prompt, - fine_tuning=FineTuning(job_id=UUID(fine_tune_id)), + job_id=UUID(fine_tune_id), + ) + data = data_obj.model_dump(by_alias=True, exclude_none=True) + detections = send_inference_request( + data, + "florence2-ft", + v2=True, + metadata_payload={"function_name": "owl_v2_image"}, ) - data = data_obj.model_dump(by_alias=True) - detections = send_inference_request(data, "tools", v2=False) # get the first frame detection = detections[0] bboxes_formatted = [ @@ -420,15 +423,17 @@ def florence2_sam2_image( req_data_obj = Florence2FtRequest( image=image_b64, task=PromptTask.PHRASE_GROUNDING, - tool="florencev2_fine_tuning", prompt=prompt, postprocessing="sam2", - fine_tuning=FineTuning( - job_id=UUID(fine_tune_id), - ), + job_id=UUID(fine_tune_id), + ) + req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True) + detections_ft = send_inference_request( + req_data, + "florence2-ft", + v2=True, + metadata_payload={"function_name": "florence2_sam2_image"}, ) - req_data = req_data_obj.model_dump(by_alias=True) - detections_ft = send_inference_request(req_data, "tools", v2=False) # get the first frame detection = detections_ft[0] return_data = [] @@ -1136,6 +1141,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s return answer[task] # type: ignore +# TODO: add video + + def florence2_phrase_grounding( prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None ) -> List[Dict[str, Any]]: @@ -1180,15 +1188,14 @@ def florence2_phrase_grounding( data_obj = Florence2FtRequest( image=image_b64, 
task=PromptTask.PHRASE_GROUNDING, - tool="florencev2_fine_tuning", prompt=prompt, - fine_tuning=FineTuning(job_id=UUID(fine_tune_id)), + job_id=UUID(fine_tune_id), ) - data = data_obj.model_dump(by_alias=True) + data = data_obj.model_dump(by_alias=True, exclude_none=True) detections = send_inference_request( data, - "tools", - v2=False, + "florence2-ft", + v2=True, metadata_payload={"function_name": "florence2_phrase_grounding"}, ) # get the first frame diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py index 25c2ec93..4b24aabb 100644 --- a/vision_agent/tools/tools_types.py +++ b/vision_agent/tools/tools_types.py @@ -1,6 +1,6 @@ from enum import Enum -from typing import List, Optional, Tuple, Union from uuid import UUID +from typing import List, Optional, Tuple, Union from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer @@ -24,25 +24,20 @@ class PromptTask(str, Enum): PHRASE_GROUNDING = "" -class FineTuning(BaseModel): - model_config = ConfigDict(populate_by_name=True) - - job_id: UUID = Field(alias="jobId") - - @field_serializer("job_id") - def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str: - return str(job_id) - - class Florence2FtRequest(BaseModel): model_config = ConfigDict(populate_by_name=True) - image: str + image: Optional[str] = None # pydantic v2: no default means required. + video: Optional[bytes] = None # tools.py call sites omit video. task: PromptTask - tool: str prompt: Optional[str] = "" + chunk_length_frames: Optional[int] = None postprocessing: Optional[str] = None - fine_tuning: Optional[FineTuning] = Field(None, alias="fineTuning") + job_id: Optional[UUID] = Field(None, alias="jobId") + + @field_serializer("job_id") + def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str: + return str(job_id) class JobStatus(str, Enum):