diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index ff360d87..7faa123a 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -1221,7 +1221,7 @@ def florence2_phrase_grounding_image(
 def florence2_phrase_grounding_video(
     prompt: str, frames: List[np.ndarray], fine_tune_id: Optional[str] = None
-) -> List[Dict[str, Any]]:
+) -> List[List[Dict[str, Any]]]:
     """'florence2_phrase_grounding_video' will run florence2 on each frame of a
     video. It can detect multiple objects given a text prompt which can be object
     names or caption. You can optionally separate the object names in the text with
     commas.
diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py
index 4b24aabb..1cc765b6 100644
--- a/vision_agent/tools/tools_types.py
+++ b/vision_agent/tools/tools_types.py
@@ -27,8 +27,8 @@ class PromptTask(str, Enum):

 class Florence2FtRequest(BaseModel):
     model_config = ConfigDict(populate_by_name=True)
-    image: str | None
-    video: bytes | None
+    image: Optional[str] = None
+    video: Optional[bytes] = None
     task: PromptTask
     prompt: Optional[str] = ""
     chunk_length_frames: Optional[int] = None