From d69528dd29715a3823c9fe524b72c8415868a969 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 14 May 2024 14:21:41 -0700 Subject: [PATCH 1/7] added different verbosity levels, better json parsing --- vision_agent/agent/vision_agent_v2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/vision_agent_v2.py index 04b90b19..336a5ef5 100644 --- a/vision_agent/agent/vision_agent_v2.py +++ b/vision_agent/agent/vision_agent_v2.py @@ -63,7 +63,7 @@ def extract_json(json_str: str) -> Dict[str, Any]: # get the last ``` not one from an intermediate string json_str = json_str[: json_str.find("}```")] json_dict = json.loads(json_str) - return json_dict # type: ignore + return json_dict def write_plan( @@ -80,8 +80,8 @@ def write_plan( context = USER_REQ_CONTEXT.format(user_requirement=user_requirements) prompt = PLAN.format(context=context, plan=str(plan), tool_desc=tool_desc) chat[-1]["content"] = prompt - new_plan = extract_json(model.chat(chat)) - return new_plan["user_req"], new_plan["plan"] + plan = extract_json(model.chat(chat)) + return plan["user_req"], plan["plan"] # type: ignore def write_code( From c8d3d9d0ed1bccf7b5c4aba1c6f16686af110d32 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 14 May 2024 14:25:26 -0700 Subject: [PATCH 2/7] fix typing error --- vision_agent/agent/vision_agent_v2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/vision_agent_v2.py index 336a5ef5..04b90b19 100644 --- a/vision_agent/agent/vision_agent_v2.py +++ b/vision_agent/agent/vision_agent_v2.py @@ -63,7 +63,7 @@ def extract_json(json_str: str) -> Dict[str, Any]: # get the last ``` not one from an intermediate string json_str = json_str[: json_str.find("}```")] json_dict = json.loads(json_str) - return json_dict + return json_dict # type: ignore def write_plan( @@ -80,8 +80,8 @@ def write_plan( context = USER_REQ_CONTEXT.format(user_requirement=user_requirements) prompt = PLAN.format(context=context, plan=str(plan), tool_desc=tool_desc) chat[-1]["content"] = prompt - plan = extract_json(model.chat(chat)) - return plan["user_req"], plan["plan"] # type: ignore + new_plan = extract_json(model.chat(chat)) + return new_plan["user_req"], new_plan["plan"] def write_code( From 16a9285d803e82e72409460b96853ac5fc6e7c8e Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 14 May 2024 18:20:20 -0700 Subject: [PATCH 3/7] log retrieved functions --- vision_agent/agent/vision_agent_v2.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/vision_agent_v2.py index 04b90b19..1f93be11 100644 --- a/vision_agent/agent/vision_agent_v2.py +++ b/vision_agent/agent/vision_agent_v2.py @@ -235,9 +235,11 @@ def run_plan( f""" {tabulate(tabular_data=[task], headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}""" ) - tool_info = "\n".join( - [e["doc"] for e in tool_recommender.top_k(task["instruction"])] - ) + tools = tool_recommender.top_k(task["instruction"]) + tool_info = "\n".join([e["doc"] for e in tools]) + + if verbosity == 2: + _LOGGER.info(f"Tools retrieved: {[e['desc'] for e in tools]}") if long_term_memory is not None: retrieved_ltm = "\n".join( From 4f764adf3d3e802516e273a6191b32a16dec757a Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 14 May 2024 18:20:33 -0700 Subject: [PATCH 4/7] add distance functions --- vision_agent/tools/tools_v2.py | 51 ++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/vision_agent/tools/tools_v2.py b/vision_agent/tools/tools_v2.py index 8d95897c..d830f486 100644 --- a/vision_agent/tools/tools_v2.py +++ b/vision_agent/tools/tools_v2.py @@ -10,6 +10,7 @@ import pandas as pd import requests from PIL import Image, ImageDraw, ImageFont +from scipy.spatial import distance from vision_agent.tools.tool_utils import _send_inference_request from vision_agent.utils import extract_frames_from_video @@ -233,6 +234,54 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]: return output +def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float: + """'closest_mask_distance' calculates the closest distance between two masks. + + Parameters: + mask1 (np.ndarray): The first mask. + mask2 (np.ndarray): The second mask. + + Returns: + float: The closest distance between the two masks. + + Example + ------- + >>> closest_mask_distance(mask1, mask2) + 0.5 + """ + + mask1 = np.clip(mask1, 0, 1) + mask2 = np.clip(mask2, 0, 1) + mask1_points = np.transpose(np.nonzero(mask1)) + mask2_points = np.transpose(np.nonzero(mask2)) + dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean") + return np.min(dist_matrix) + + +def closest_box_distance(box1: List[float], box2: List[float]) -> float: + """'closest_box_distance' calculates the closest distance between two bounding boxes. + + Parameters: + box1 (List[float]): The first bounding box. + box2 (List[float]): The second bounding box. + + Returns: + float: The closest distance between the two bounding boxes. + + Example + ------- + >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400]) + 141.42 + """ + + x11, y11, x12, y12 = box1 + x21, y21, x22, y22 = box2 + + horizontal_distance = np.max([0, x21 - x12, x11 - x22]) + vertical_distance = np.max([0, y21 - y12, y11 - y22]) + return np.sqrt(horizontal_distance ** 2 + vertical_distance ** 2) + + # Utility and visualization functions @@ -429,6 +478,8 @@ def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame: grounding_sam, extract_frames, ocr, + closest_mask_distance, + closest_box_distance, load_image, save_image, overlay_bounding_boxes, From 057824bafeeb152e8a46aff287a3e5c68b012842 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 14 May 2024 20:02:04 -0700 Subject: [PATCH 5/7] fix types --- vision_agent/tools/__init__.py | 2 +- vision_agent/tools/tools_v2.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index 75b9830e..08a96d81 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -3,8 +3,8 @@ CLIP, OCR, TOOLS, - BboxStats, BboxIoU, + BboxStats, BoxDistance, Crop, DINOv, diff --git a/vision_agent/tools/tools_v2.py b/vision_agent/tools/tools_v2.py index d830f486..4d225860 100644 --- a/vision_agent/tools/tools_v2.py +++ b/vision_agent/tools/tools_v2.py @@ -4,7 +4,7 @@ import tempfile from importlib import resources from pathlib import Path -from typing import Any, Callable, Dict, List, Tuple, Union +from typing import Any, Callable, Dict, List, Tuple, Union, cast import numpy as np import pandas as pd @@ -255,7 +255,7 @@ def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float: mask1_points = np.transpose(np.nonzero(mask1)) mask2_points = np.transpose(np.nonzero(mask2)) dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean") - return np.min(dist_matrix) + return cast(float, np.min(dist_matrix)) def closest_box_distance(box1: List[float], box2: List[float]) -> float: @@ -276,10 +276,10 @@ def closest_box_distance(box1: List[float], box2: List[float]) -> float: x11, y11, x12, y12 = box1 x21, y21, x22, y22 = box2 - + horizontal_distance = np.max([0, x21 - x12, x11 - x22]) vertical_distance = np.max([0, y21 - y12, y11 - y22]) - return np.sqrt(horizontal_distance ** 2 + vertical_distance ** 2) + return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2)) # Utility and visualization functions From daadbacd437b26ed8803737509d6a866b38b36f3 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 14 May 2024 20:04:16 -0700 Subject: [PATCH 6/7] fix types --- vision_agent/tools/tools_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/tools/tools_v2.py b/vision_agent/tools/tools_v2.py index 4d225860..d1891f0f 100644 --- a/vision_agent/tools/tools_v2.py +++ b/vision_agent/tools/tools_v2.py @@ -10,7 +10,7 @@ import pandas as pd import requests from PIL import Image, ImageDraw, ImageFont -from scipy.spatial import distance +from scipy.spatial import distance # type: ignore from vision_agent.tools.tool_utils import _send_inference_request from vision_agent.utils import extract_frames_from_video From 3c6d52432852714db5caebfb08d175b0dcffd27c Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 14 May 2024 20:05:31 -0700 Subject: [PATCH 7/7] fix formatting --- vision_agent/tools/tools_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/tools/tools_v2.py b/vision_agent/tools/tools_v2.py index d1891f0f..ab36869f 100644 --- a/vision_agent/tools/tools_v2.py +++ b/vision_agent/tools/tools_v2.py @@ -10,7 +10,7 @@ import pandas as pd import requests from PIL import Image, ImageDraw, ImageFont -from scipy.spatial import distance # type: ignore +from scipy.spatial import distance # type: ignore from vision_agent.tools.tool_utils import _send_inference_request from vision_agent.utils import extract_frames_from_video