diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 6399016e..04cafd5e 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -15,7 +15,7 @@ from vision_agent.tools import META_TOOL_DOCSTRING from vision_agent.tools.meta_tools import Artifacts from vision_agent.utils import CodeInterpreterFactory -from vision_agent.utils.execute import CodeInterpreter +from vision_agent.utils.execute import CodeInterpreter, Execution logging.basicConfig(level=logging.INFO) _LOGGER = logging.getLogger(__name__) @@ -75,11 +75,10 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: def run_code_action( code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str -) -> str: - result = code_interpreter.exec_cell( +) -> Execution: + return code_interpreter.exec_isolation( BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path) ) - return result.text() def parse_execution(response: str) -> Optional[str]: @@ -258,14 +257,15 @@ def chat_with_code( code_action = parse_execution(response["response"]) if code_action is not None: - obs = run_code_action( + result = run_code_action( code_action, code_interpreter, str(remote_artifacts_path) ) + obs = result.text() if self.verbosity >= 1: _LOGGER.info(obs) - int_chat.append({"role": "observation", "content": obs}) - orig_chat.append({"role": "observation", "content": obs}) + int_chat.append({"role": "observation", "content": obs, "execution": result}) + orig_chat.append({"role": "observation", "content": obs, "execution": result}) iterations += 1 last_response = response diff --git a/vision_agent/lmm/types.py b/vision_agent/lmm/types.py index ded6a42b..ba2b3189 100644 --- a/vision_agent/lmm/types.py +++ b/vision_agent/lmm/types.py @@ -1,5 +1,6 @@ from pathlib import Path from typing import Dict, Sequence, Union +from vision_agent.utils.execute import Execution TextOrImage = Union[str, Sequence[Union[str, Path]]] -Message = Dict[str, TextOrImage] +Message = Dict[str, Union[TextOrImage, Execution]]