From d661cd3e298e093fbc1b6c9251265553ba4ae348 Mon Sep 17 00:00:00 2001
From: Dayanne Fernandes <dayannefernandesc@gmail.com>
Date: Mon, 30 Sep 2024 17:45:48 -0300
Subject: [PATCH] rle encoding

---
 vision_agent/tools/tools.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index f0537254..70101692 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -200,7 +200,7 @@ def owl_v2_image(
         )
         data = data_obj.model_dump(by_alias=True)
         detections = send_inference_request(data, "tools", v2=False)
-        # get the first frame detections
+        # get the first frame
         detection = detections[0]
         bboxes_formatted = [
             ODResponseData(
@@ -429,17 +429,18 @@ def florence2_sam2_image(
         )
         req_data = req_data_obj.model_dump(by_alias=True)
         detections_ft = send_inference_request(req_data, "tools", v2=False)
-        # get the first frame detections
+        # get the first frame
         detection = detections_ft[0]
         return_data = []
-        all_masks = np.array(detection["masks"])
         for i in range(len(detection["bboxes"])):
             return_data.append(
                 {
                     "score": 1.0,
                     "label": detection["labels"][i],
-                    "bbox": detection["bboxes"][i],
-                    "mask": all_masks[i, :, :].astype(np.uint8),
+                    "bbox": normalize_bbox(
+                        detection["bboxes"][i], detection["masks"][i]["size"]
+                    ),
+                    "mask": rle_decode_array(detection["masks"][i]),
                 }
             )
         return return_data
@@ -453,6 +454,7 @@ def florence2_sam2_image(
     detections: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
     )
+
     return_data = []
     for _, data_i in detections["0"].items():
         mask = rle_decode_array(data_i["mask"])
@@ -1189,7 +1191,7 @@ def florence2_phrase_grounding(
             v2=False,
             metadata_payload={"function_name": "florence2_phrase_grounding"},
         )
-        # get the first frame detections
+        # get the first frame
         detection = detections[0]
     else:
         data = {
@@ -1607,7 +1609,7 @@ def extract_frames_and_timestamps(
     """
 
     def reformat(
-        frames_and_timestamps: List[Tuple[np.ndarray, float]]
+        frames_and_timestamps: List[Tuple[np.ndarray, float]],
     ) -> List[Dict[str, Union[np.ndarray, float]]]:
         return [
             {"frame": frame, "timestamp": timestamp}