diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index ff360d87..7faa123a 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -1221,7 +1221,7 @@ def florence2_phrase_grounding_image(
 def florence2_phrase_grounding_video(
     prompt: str, frames: List[np.ndarray], fine_tune_id: Optional[str] = None
-) -> List[Dict[str, Any]]:
+) -> List[List[Dict[str, Any]]]:
     """'florence2_phrase_grounding_video' will run florence2 on each frame of a
     video. It can detect multiple objects given a text prompt which can be object
     names or caption. You can optionally separate the object names in the text with
     commas.
diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py
index 4b24aabb..1cc765b6 100644
--- a/vision_agent/tools/tools_types.py
+++ b/vision_agent/tools/tools_types.py
@@ -27,8 +27,8 @@ class PromptTask(str, Enum):

 class Florence2FtRequest(BaseModel):
     model_config = ConfigDict(populate_by_name=True)
-    image: str | None
-    video: bytes | None
+    image: Optional[str] = None
+    video: Optional[bytes] = None
     task: PromptTask
     prompt: Optional[str] = ""
     chunk_length_frames: Optional[int] = None