diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py index 9bd195eb..8c01f78d 100644 --- a/tests/integ/test_tools.py +++ b/tests/integ/test_tools.py @@ -124,7 +124,7 @@ def test_florence2_phrase_grounding_video(): frames=frames, ) assert len(result) == 10 - assert 24 <= len([res["label"] for res in result[0]]) <= 26 + assert 2 <= len([res["label"] for res in result[0]]) <= 26 def test_florence2_phrase_grounding_video_fine_tune_id(): @@ -138,7 +138,7 @@ def test_florence2_phrase_grounding_video_fine_tune_id(): fine_tune_id=FINE_TUNE_ID, ) assert len(result) == 10 - assert 24 <= len([res["label"] for res in result[0]]) <= 26 + assert 16 <= len([res["label"] for res in result[0]]) <= 26 def test_template_match(): diff --git a/vision_agent/tools/tool_utils.py b/vision_agent/tools/tool_utils.py index 772d6bc3..924b96e6 100644 --- a/vision_agent/tools/tool_utils.py +++ b/vision_agent/tools/tool_utils.py @@ -37,6 +37,7 @@ def send_inference_request( files: Optional[List[Tuple[Any, ...]]] = None, v2: bool = False, metadata_payload: Optional[Dict[str, Any]] = None, + is_form: bool = False, ) -> Any: # TODO: runtime_tag and function_name should be metadata_payload and not included # in the service payload @@ -64,7 +65,7 @@ def send_inference_request( elif metadata_payload is not None and "function_name" in metadata_payload: function_name = metadata_payload["function_name"] - response = _call_post(url, payload, session, files, function_name) + response = _call_post(url, payload, session, files, function_name, is_form) # TODO: consider making the response schema the same between below two sources return response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"] @@ -75,6 +76,7 @@ def send_task_inference_request( task_name: str, files: Optional[List[Tuple[Any, ...]]] = None, metadata: Optional[Dict[str, Any]] = None, + is_form: bool = False, ) -> Any: url = f"{_LND_API_URL_v2}/{task_name}" headers = {"apikey": _LND_API_KEY} @@ -87,7 +89,7 @@ def send_task_inference_request( function_name = "unknown" if metadata is not None and "function_name" in metadata: function_name = metadata["function_name"] - response = _call_post(url, payload, session, files, function_name) + response = _call_post(url, payload, session, files, function_name, is_form) return response["data"] @@ -203,6 +205,7 @@ def _call_post( session: Session, files: Optional[List[Tuple[Any, ...]]] = None, function_name: str = "unknown", + is_form: bool = False, ) -> Any: files_in_b64 = None if files: @@ -210,6 +213,8 @@ def _call_post( try: if files is not None: response = session.post(url, data=payload, files=files) + elif is_form: + response = session.post(url, data=payload) else: response = session.post(url, json=payload) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 7faa123a..d99a38fc 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -201,6 +201,7 @@ def owl_v2_image( data, "florence2-ft", v2=True, + is_form=True, metadata_payload={"function_name": "owl_v2_image"}, ) # get the first frame @@ -432,6 +433,7 @@ def florence2_sam2_image( req_data, "florence2-ft", v2=True, + is_form=True, metadata_payload={"function_name": "florence2_sam2_image"}, ) # get the first frame @@ -1193,6 +1195,7 @@ def florence2_phrase_grounding_image( data, "florence2-ft", v2=True, + is_form=True, metadata_payload={"function_name": "florence2_phrase_grounding_image"}, ) # get the first frame @@ -1268,18 +1271,14 @@ def florence2_phrase_grounding_video( ) data_obj = Florence2FtRequest( - video=buffer_bytes, task=PromptTask.PHRASE_GROUNDING, prompt=prompt, job_id=UUID(fine_tune_id), ) - data = data_obj.model_dump(by_alias=True, exclude_none=True) else: - data_obj = Florence2FtRequest( - video=buffer_bytes, task=PromptTask.PHRASE_GROUNDING, prompt=prompt - ) - data = data_obj.model_dump(by_alias=True, exclude_none=True) + data_obj = Florence2FtRequest(task=PromptTask.PHRASE_GROUNDING, prompt=prompt) + data = data_obj.model_dump(by_alias=True, exclude_none=True, mode="json") detections = send_inference_request( data, "florence2-ft",