@@ -255,7 +255,7 @@ def self_reflect(
255255) -> str :
256256 prompt = VISION_AGENT_REFLECTION .format (
257257 question = question ,
258- tools = format_tools (tools ),
258+ tools = format_tools ({ k : v [ "description" ] for k , v in tools . items ()} ),
259259 tool_results = str (tool_result ),
260260 final_answer = final_answer ,
261261 )
@@ -268,11 +268,16 @@ def self_reflect(
268268 return reflect_model (prompt )
269269
270270
def parse_reflect(reflect: str) -> Dict[str, Any]:
    """Turn the reflection model's raw reply into a ``Finish``/``Reflection`` dict.

    The reply is first handed to ``parse_json``. If that yields a dict that
    contains a ``"Finish"`` key, it is returned (with ``"Reflection"``
    defaulted to the raw reply when the model omitted it). In every other
    case -- ``parse_json`` raised, or it produced something the caller cannot
    index by ``"Finish"`` -- a lenient text heuristic decides whether the
    model meant to finish.

    Args:
        reflect: Raw text returned by the reflection model.

    Returns:
        A dict that always has a ``"Finish"`` key and a ``"Reflection"`` key,
        so callers can index both without guarding.
    """
    try:
        parsed = parse_json(reflect)
    except Exception:
        # Best-effort by design: any parsing failure falls back to the
        # heuristic below rather than aborting the caller.
        parsed = None

    # Valid JSON is not necessarily the expected object; only trust it when
    # it has the key the caller branches on.
    if isinstance(parsed, dict) and "Finish" in parsed:
        parsed.setdefault("Reflection", reflect)
        return parsed

    _LOGGER.error(f"Failed parse json reflection: {reflect}")
    # LMMs have a hard time following directions, so make the criteria less strict
    lowered = reflect.lower()
    finish = ("finish" in lowered and len(reflect) < 100) or "finish" in lowered[-10:]
    return {"Finish": finish, "Reflection": reflect}
276281
277282
278283def visualize_result (all_tool_results : List [Dict ]) -> List [str ]:
@@ -389,7 +394,7 @@ def __init__(
389394 OpenAILLM (temperature = 0.1 ) if answer_model is None else answer_model
390395 )
391396 self .reflect_model = (
392- OpenAILMM (temperature = 0.1 ) if reflect_model is None else reflect_model
397+ OpenAILMM (json_mode = True , temperature = 0.1 ) if reflect_model is None else reflect_model
393398 )
394399 self .max_retries = max_retries
395400 self .tools = TOOLS
@@ -485,13 +490,14 @@ def chat_with_workflow(
485490 visualized_output [0 ] if len (visualized_output ) > 0 else image ,
486491 )
487492 self .log_progress (f"Reflection: { reflection } " )
488- if parse_reflect (reflection ):
493+ parsed_reflection = parse_reflect (reflection )
494+ if parsed_reflection ["Finish" ]:
489495 break
490496 else :
491- reflections += "\n " + reflection
492- # '<END >' is a symbol to indicate the end of the chat, which is useful for streaming logs.
497+ reflections += "\n " + parsed_reflection [ "Reflection" ]
498+ # '<ANSWER >' is a symbol to indicate the end of the chat, which is useful for streaming logs.
493499 self .log_progress (
494- f"The Vision Agent has concluded this chat. <ANSWER>{ final_answer } </< ANSWER>"
500+ f"The Vision Agent has concluded this chat. <ANSWER>{ final_answer } </ANSWER>"
495501 )
496502
497503 if visualize_output :
0 commit comments