From f1d5f1f859499ed530cd8594ef90152983688cec Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 6 Sep 2024 12:51:40 -0700
Subject: [PATCH] testing owlv2_video

---
 vision_agent/tools/__init__.py |  1 +
 vision_agent/tools/tools.py    | 41 +++++++++++++++++++++++++++++-----
 2 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index 3f966d65..d730f3a8 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -47,6 +47,7 @@
     overlay_heat_map,
     overlay_segmentation_masks,
     owl_v2_image,
+    owl_v2_image2,
     owl_v2_video,
     save_image,
     save_json,
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index cc2a877d..555c58dc 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -185,7 +185,6 @@ def owl_v2_image(
         "function_name": "owl_v2",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "owlv2", v2=True)
-    print(data)
     return_data = []
     if data is not None:
         for elt in data:
@@ -199,6 +198,35 @@ def owl_v2_image(
     return return_data
 
 
+def owl_v2_image2(
+    prompt: str,
+    image: np.ndarray,
+    box_threshold: float = 0.30,
+) -> List[Dict[str, Any]]:
+    image_size = image.shape[:2]
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "model": "owlv2",
+        "function_name": "owl_v2_image",
+    }
+    resp_data = send_inference_request(
+        payload, "text-to-object-detection", files=files, v2=True
+    )
+    bboxes = resp_data[0]
+    bboxes_formatted = [
+        ODResponseData(
+            label=bbox["label"],  # type: ignore
+            bbox=normalize_bbox(bbox["bounding_box"], image_size),  # type: ignore
+            score=round(bbox["score"], 2),  # type: ignore
+        )
+        for bbox in bboxes
+    ]
+    filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
+    return [bbox.model_dump() for bbox in filtered_bboxes]
+
+
 def owl_v2_video(
     prompt: str,
     frames: List[np.ndarray],
@@ -247,7 +275,6 @@ def owl_v2_video(
     data: Dict[str, Any] = send_inference_request(
         payload, "text-to-object-detection", files=files, v2=True
     )
-    print(data)
    bboxes_formatted = []
     if data is not None:
         for frame_data in data:
@@ -255,14 +282,16 @@ def owl_v2_video(
             for elt in frame_data:
                 bboxes_formated_frame.append(
                     ODResponseData(
-                        label=elt["label"], # type: ignore
-                        bbox=normalize_bbox(elt["bounding_box"], image_size), # type: ignore
-                        score=round(elt["score"], 2), # type: ignore
+                        label=elt["label"],  # type: ignore
+                        bbox=normalize_bbox(elt["bounding_box"], image_size),  # type: ignore
+                        score=round(elt["score"], 2),  # type: ignore
                     )
                 )
             bboxes_formatted.append(bboxes_formated_frame)
 
-    filtered_bboxes = [filter_bboxes_by_threshold(elt, box_threshold) for elt in bboxes_formatted]
+    filtered_bboxes = [
+        filter_bboxes_by_threshold(elt, box_threshold) for elt in bboxes_formatted
+    ]
 
     return [[bbox.model_dump() for bbox in frame] for frame in filtered_bboxes]
 
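
A minimal usage sketch of the new owl_v2_image2 helper (commentary, not part
of the diff above). It assumes vision_agent is installed and that
owl_v2_image2 is exported from vision_agent.tools, as the __init__.py change
suggests; the image path and prompt strings are hypothetical, and
box_threshold=0.30 mirrors the function's default.

    import numpy as np
    from PIL import Image

    from vision_agent.tools import owl_v2_image2

    # Load an image as an RGB numpy array (HxWxC), the format the helper expects.
    image = np.array(Image.open("example.jpg").convert("RGB"))

    # Comma-separated prompts are split and stripped by the helper before being
    # sent to the text-to-object-detection endpoint.
    detections = owl_v2_image2("person, car", image, box_threshold=0.30)

    # Each detection is a dict with "label", "bbox" (normalized to the image
    # size via normalize_bbox), and "score" rounded to two decimals; detections
    # scoring below box_threshold are filtered out.
    for det in detections:
        print(det["label"], det["score"], det["bbox"])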