Commit

adjust payload
Dayof committed Oct 1, 2024
1 parent d661cd3 commit 075b897
Showing 3 changed files with 40 additions and 38 deletions.
4 changes: 2 additions & 2 deletions vision_agent/tools/tool_utils.py
@@ -1,6 +1,6 @@
-import os
 import inspect
 import logging
+import os
 from base64 import b64encode
 from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
 
@@ -38,7 +38,7 @@ def send_inference_request(
     v2: bool = False,
     metadata_payload: Optional[Dict[str, Any]] = None,
 ) -> Any:
-    # TODO: runtime_tag and function_name should be metadata_payload and now included
+    # TODO: runtime_tag and function_name should be metadata_payload and not included
     # in the service payload
     if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
         payload["runtime_tag"] = runtime_tag
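For orientation, here is a small usage sketch of send_inference_request after this hunk. It is only an illustration assembled from the signature and call sites visible in this commit; the payload values, RUNTIME_TAG value, and function_name are placeholders.

    # Illustrative only: calling the helper with the keyword arguments shown in
    # this commit. RUNTIME_TAG, when set, is copied into the service payload by
    # the hunk above; function_name travels in metadata_payload.
    import os

    from vision_agent.tools.tool_utils import send_inference_request

    os.environ.setdefault("RUNTIME_TAG", "local-dev")  # placeholder tag

    payload = {"image": "<base64-encoded image>", "prompt": "car"}  # placeholder payload
    detections = send_inference_request(
        payload,
        "florence2-ft",  # endpoint used by the updated tools later in this commit
        v2=True,
        metadata_payload={"function_name": "owl_v2_image"},
    )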
51 changes: 29 additions & 22 deletions vision_agent/tools/tools.py
@@ -1,20 +1,20 @@
-import os
 import io
 import json
 import logging
+import os
 import tempfile
 import urllib.request
-from uuid import UUID
+from importlib import resources
 from pathlib import Path
-from importlib import resources
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from uuid import UUID
 
 import cv2
+import numpy as np
 import requests
-import numpy as np
+from PIL import Image, ImageDraw, ImageEnhance, ImageFont
+from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
-from pillow_heif import register_heif_opener  # type: ignore
-from PIL import Image, ImageDraw, ImageEnhance, ImageFont
 
 from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.lmm.lmm import OpenAILMM
@@ -28,7 +28,6 @@
     send_task_inference_request,
 )
 from vision_agent.tools.tools_types import (
-    FineTuning,
     Florence2FtRequest,
     JobStatus,
     ODResponseData,
@@ -194,12 +193,16 @@ def owl_v2_image(
         data_obj = Florence2FtRequest(
             image=image_b64,
             task=PromptTask.PHRASE_GROUNDING,
-            tool="florencev2_fine_tuning",
             prompt=prompt,
-            fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+            job_id=UUID(fine_tune_id),
         )
-        data = data_obj.model_dump(by_alias=True)
-        detections = send_inference_request(data, "tools", v2=False)
+        data = data_obj.model_dump(by_alias=True, exclude_none=True)
+        detections = send_inference_request(
+            data,
+            "florence2-ft",
+            v2=True,
+            metadata_payload={"function_name": "owl_v2_image"},
+        )
         # get the first frame
         detection = detections[0]
         bboxes_formatted = [
@@ -420,15 +423,17 @@ def florence2_sam2_image(
         req_data_obj = Florence2FtRequest(
             image=image_b64,
             task=PromptTask.PHRASE_GROUNDING,
-            tool="florencev2_fine_tuning",
             prompt=prompt,
             postprocessing="sam2",
-            fine_tuning=FineTuning(
-                job_id=UUID(fine_tune_id),
-            ),
+            job_id=UUID(fine_tune_id),
         )
-        req_data = req_data_obj.model_dump(by_alias=True)
-        detections_ft = send_inference_request(req_data, "tools", v2=False)
+        req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
+        detections_ft = send_inference_request(
+            req_data,
+            "florence2-ft",
+            v2=True,
+            metadata_payload={"function_name": "florence2_sam2_image"},
+        )
         # get the first frame
         detection = detections_ft[0]
         return_data = []
@@ -1136,6 +1141,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore
 
 
+# TODO: add video
+
+
 def florence2_phrase_grounding(
     prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
 ) -> List[Dict[str, Any]]:
@@ -1180,15 +1188,14 @@ def florence2_phrase_grounding(
         data_obj = Florence2FtRequest(
             image=image_b64,
             task=PromptTask.PHRASE_GROUNDING,
-            tool="florencev2_fine_tuning",
             prompt=prompt,
-            fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+            job_id=UUID(fine_tune_id),
         )
-        data = data_obj.model_dump(by_alias=True)
+        data = data_obj.model_dump(by_alias=True, exclude_none=True)
         detections = send_inference_request(
             data,
-            "tools",
-            v2=False,
+            "florence2-ft",
+            v2=True,
             metadata_payload={"function_name": "florence2_phrase_grounding"},
         )
         # get the first frame
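Taken together, the three updated call sites above now share one request path. The following is a minimal sketch of that path, assembled only from the lines added in this diff; the prompt, base64 image, and fine-tune id are placeholder values.

    # Sketch of the new fine-tuned-model path shared by owl_v2_image,
    # florence2_sam2_image, and florence2_phrase_grounding (per the added lines).
    # Literal values below are placeholders.
    from uuid import UUID

    from vision_agent.tools.tool_utils import send_inference_request
    from vision_agent.tools.tools_types import Florence2FtRequest, PromptTask

    image_b64 = "<base64-encoded image>"  # placeholder
    fine_tune_id = "00000000-0000-0000-0000-000000000000"  # placeholder job id

    data_obj = Florence2FtRequest(
        image=image_b64,
        task=PromptTask.PHRASE_GROUNDING,
        prompt="car",  # placeholder prompt
        job_id=UUID(fine_tune_id),
    )
    # by_alias renders job_id as "jobId"; exclude_none keeps unset optional
    # fields (video, chunk_length_frames, postprocessing) out of the payload.
    data = data_obj.model_dump(by_alias=True, exclude_none=True)
    detections = send_inference_request(
        data,
        "florence2-ft",
        v2=True,
        metadata_payload={"function_name": "florence2_phrase_grounding"},
    )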
23 changes: 9 additions & 14 deletions vision_agent/tools/tools_types.py
@@ -1,6 +1,6 @@
 from enum import Enum
+from typing import List, Optional, Tuple, Union
 from uuid import UUID
-from typing import List, Optional, Tuple, Union
 
 from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer
 
@@ -24,25 +24,20 @@ class PromptTask(str, Enum):
     PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
 
 
-class FineTuning(BaseModel):
-    model_config = ConfigDict(populate_by_name=True)
-
-    job_id: UUID = Field(alias="jobId")
-
-    @field_serializer("job_id")
-    def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
-        return str(job_id)
-
-
 class Florence2FtRequest(BaseModel):
     model_config = ConfigDict(populate_by_name=True)
 
-    image: str
+    image: str | None
+    video: bytes | None
     task: PromptTask
-    tool: str
     prompt: Optional[str] = ""
+    chunk_length_frames: Optional[int] = None
     postprocessing: Optional[str] = None
-    fine_tuning: Optional[FineTuning] = Field(None, alias="fineTuning")
+    job_id: Optional[UUID] = Field(None, alias="jobId")
+
+    @field_serializer("job_id")
+    def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
+        return str(job_id)
 
 
 class JobStatus(str, Enum):
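As a reading aid, the sketch below shows what the reworked request model serializes to, using only the field names and aliases visible in this diff; every literal value is a placeholder. The old model's nested "fineTuning" block becomes a flat "jobId" key, and None-valued fields are dropped by exclude_none.

    # Payload shape before vs. after this commit (illustrative, values are placeholders):
    #   before: {..., "tool": "florencev2_fine_tuning", "fineTuning": {"jobId": "<uuid>"}}
    #   after:  {..., "prompt": "car", "jobId": "<uuid>"}
    from uuid import UUID

    from vision_agent.tools.tools_types import Florence2FtRequest, PromptTask

    req = Florence2FtRequest(
        image="<base64-encoded image>",  # placeholder
        video=None,
        task=PromptTask.PHRASE_GROUNDING,
        prompt="car",  # placeholder
        job_id=UUID("00000000-0000-0000-0000-000000000000"),
    )
    # job_id serializes to a string under its "jobId" alias; video,
    # chunk_length_frames, and postprocessing are omitted because they are None.
    print(req.model_dump(by_alias=True, exclude_none=True))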
