Refactor

huggingface · Apr 24, 2024 · 8585c60 · 8585c60
1 parent f802052
commit 8585c60
Show file tree

Hide file tree

Showing 43 changed files with 1,658 additions and 1,641 deletions.
diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md
@@ -154,14 +154,26 @@ You could use any `llm_engine` method as long as:
 
 You also need a `tools` argument which accepts a list of `Tools`. You can provide an empty list for `tools`, but use the default toolbox with the optional argument `add_base_tools=True`.
 
-Now you can create an agent, like `CodeAgent`, and run it.
+Now you can create an agent, like `CodeAgent`, and run it. For convenience, we also provide the `HfEngine` class that uses `huggingface_hub.InferenceClient` under the hood.
 
 ```python
-from transformers import CodeAgent
+from transformers import CodeAgent, HfEngine
+
+llm_engine = HfEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")
 
 agent = CodeAgent(llm_engine=llm_engine, tools=[], add_base_tools=True)
 
-agent.run("Draw me a picture of rivers and lakes")
+agent.run("Please draw me a picture of rivers and lakes")
+```
+
+You can even leave the `llm_engine` argument undefined, and an `HfEngine` will be created by default.
+
+```python
+from transformers import CodeAgent
+
+agent = CodeAgent(tools=[], add_base_tools=True)
+
+agent.run("Please draw me a picture of rivers and lakes")
 ```
 
 The prompt and output parser were automatically defined, but you can easily inspect them by calling the `system_prompt_template` on your agent.
@@ -190,7 +202,7 @@ The execution will stop at any code trying to perform an illegal operation or if
 
 An agent, or rather the LLM that drives the agent, generates an output based on the system prompt. The system prompt can be customized and tailored to the intended task. For example, check out the system prompt of the ReAct agent.
 
-```python
+```text
 Solve the following task as best you can. You have access to the following tools:
 
 <<tool_descriptions>>
@@ -244,9 +256,9 @@ For maximum flexibility, you can overwrite the whole system prompt template by p
 
 ```python
 from transformers import ReactJSONAgent
-from transformers.tools import CalculatorTool
+from transformers.agents import CalculatorTool
 
-agent = ReactJSONAgent(llm_engine, tools = [CalculatorTool()], system_prompt="{your_custom_prompt}")
+agent = ReactJSONAgent(tools = [CalculatorTool()], system_prompt="{your_custom_prompt}")
 ```
 
 > [!WARNING]
@@ -342,7 +354,7 @@ from model_downloads import HFModelDownloadsTool
 tool = HFModelDownloadsTool()
 ```
 
-You can also share your custom tool to the Hub by calling [`~Tool.push_to_hub`] on the tool. Make sure you've created a repository for it on the Hub and have a token with read access.
+You can also share your custom tool to the Hub by calling [`~Tool.push_to_hub`] on the tool. Make sure you've created a repository for it on the Hub and are using a token with write access.
 
 ```python
 tool.push_to_hub("{your_username}/hf-model-downloads")
@@ -354,7 +366,7 @@ Load the tool with the [`~Tool.load_tool`] function and pass it to the `tools` p
 from transformers import load_tool, CodeAgent
 
 model_download_tool = load_tool("m-ric/hf-model-downloads")
-agent = CodeAgent(llm_engine, tools=[model_download_tool])
+agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine)
 agent.run(
  "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
 )
@@ -381,7 +393,7 @@ Let's add the `model_download_tool` to an existing agent initialized with only t
 ```python
 from transformers import CodeAgent
 
-agent = CodeAgent(llm_engine, tools=[], add_base_tools=True)
+agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
 agent.toolbox.add_tool(model_download_tool)
 ```
 Now we can leverage both the new tool and the previous text-to-speech tool:
@@ -441,11 +453,10 @@ from transformers import CodeAgent
 agent = CodeAgent(llm_engine, tools=[tool], add_base_tools=True)
 
 agent.run(
- "Improve this prompt, then generate an image of it.", prompt="A cat in the forest."
+ "Improve this prompt: 'A rabbit wearing a space suit', then generate an image of it.",
 )
 ```
 
-
 The model adequately leverages the tool:
 ```text
 ==Explanation from the agent==
@@ -461,28 +472,6 @@ Before finally generating the image:
 
 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png">
 
-This also works with its sibling `ReactAgent`:
-<<<<<<< HEAD
-=======
-
-```python
-from transformers import ReactAgent
-
-agent = ReactAgent(llm_engine, tools=[tool], add_base_tools=True)
-
-agent.run("Improve this prompt, then generate an image of it.", prompt="A rabbit wearing a space suit")
-```
-
-<Tip warning={true}>
->>>>>>> 2364c3bd3 (Support variable usage in ReactAgent)
-
-```python
-from transformers import ReactAgent
-
-agent = ReactAgent(llm_engine, tools=[tool], add_base_tools=True)
-
-agent.run("Improve this prompt, then generate an image of it.", prompt="A rabbit wearing a space suit")
-```
 
 > [!WARNING]
 > gradio-tools require *textual* inputs and outputs even when working with different modalities like image and audio objects. Image and audio inputs and outputs are currently incompatible.
@@ -500,7 +489,7 @@ api_wrapper = WikipediaAPIWrapper(top_k_results=1, doc_content_chars_max=100)
 langchain_tool = WikipediaQueryRun(api_wrapper=api_wrapper)
 
 # Initialize transformers tool from langchain tool
-from transformers.tools.base import Tool
+from transformers import Tool
 
 tool = Tool.from_langchain(langchain_tool)
 ```
diff --git a/docs/source/en/main_classes/agent.md b/docs/source/en/main_classes/agent.md
@@ -30,17 +30,23 @@ contains the API docs for the underlying classes.
 
 We provide two types of agents, based on the main [`Agent`] class:
 - [`CodeAgent`] acts in one shot, generating code to solve the task, then executes it at once.
-- [`ReactAgent`] acts step by step, each step consisting of one thought, then one function execution.
+- [`ReactAgent`] acts step by step, each step consisting of one thought, then one tool call and execution. It comes in two variants:
+ - [`ReactJSONAgent`] writes its tool calls in JSON.
+ - [`ReactCodeAgent`] writes its tool calls in Python code.
 
 
 ### CodeAgent
 
 [[autodoc]] CodeAgent
 
-### ReactAgent
+### React agents
 
 [[autodoc]] ReactAgent
 
+[[autodoc]] ReactJSONAgent
+
+[[autodoc]] ReactCodeAgent
+
 ### Agent
 
 [[autodoc]] Agent
@@ -86,12 +92,12 @@ These types have three specific purposes:
 
 ### AgentText
 
-[[autodoc]] transformers.tools.agent_types.AgentText
+[[autodoc]] transformers.agents.agent_types.AgentText
 
 ### AgentImage
 
-[[autodoc]] transformers.tools.agent_types.AgentImage
+[[autodoc]] transformers.agents.agent_types.AgentImage
 
 ### AgentAudio
 
-[[autodoc]] transformers.tools.agent_types.AgentAudio
+[[autodoc]] transformers.agents.agent_types.AgentAudio
diff --git a/docs/source/ja/main_classes/agent.md b/docs/source/ja/main_classes/agent.md
@@ -94,12 +94,12 @@ API または基礎となるモデルは変更される傾向があるため、
 
 ### AgentText
 
-[[autodoc]] transformers.tools.agent_types.AgentText
+[[autodoc]] transformers.agents.agent_types.AgentText
 
 ### AgentImage
 
-[[autodoc]] transformers.tools.agent_types.AgentImage
+[[autodoc]] transformers.agents.agent_types.AgentImage
 
 ### AgentAudio
 
-[[autodoc]] transformers.tools.agent_types.AgentAudio
+[[autodoc]] transformers.agents.agent_types.AgentAudio
diff --git a/docs/source/zh/main_classes/agent.md b/docs/source/zh/main_classes/agent.md
@@ -90,12 +90,12 @@ Agents可以处理工具之间任何类型的对象；工具是多模态的，
 
 ### AgentText
 
-[[autodoc]] transformers.tools.agent_types.AgentText
+[[autodoc]] transformers.agents.agent_types.AgentText
 
 ### AgentImage
 
-[[autodoc]] transformers.tools.agent_types.AgentImage
+[[autodoc]] transformers.agents.agent_types.AgentImage
 
 ### AgentAudio
 
-[[autodoc]] transformers.tools.agent_types.AgentAudio
+[[autodoc]] transformers.agents.agent_types.AgentAudio
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
@@ -1042,10 +1042,11 @@
  "SpecialTokensMixin",
  "TokenSpan",
  ],
- "tools": [
+ "agents": [
  "Agent",
  "ReactAgent",
  "CodeAgent",
+ "HfEngine",
  "PipelineTool",
  "RemoteTool",
  "Tool",
@@ -5914,12 +5915,15 @@
  TokenSpan,
  )
 
- # Tools
- from .tools import (
+ # Agents
+ from .agents import (
  Agent,
  CodeAgent,
+ HfEngine,
  PipelineTool,
  ReactAgent,
+ ReactCodeAgent,
+ ReactJSONAgent,
  RemoteTool,
  Tool,
  launch_gradio_demo,

diff --git a/src/transformers/agent/base.py b/src/transformers/agent/base.py
diff --git a/src/transformers/tools/__init__.py → src/transformers/agents/__init__.py b/src/transformers/tools/__init__.py → src/transformers/agents/__init__.py
@@ -24,8 +24,9 @@
 
 
 _import_structure = {
- "agents": ["Agent", "ReactAgent", "CodeAgent"],
- "base": ["PipelineTool", "RemoteTool", "Tool", "launch_gradio_demo", "load_tool"],
+ "agents": ["Agent", "CodeAgent", "ReactAgent", "ReactCodeAgent", "ReactJSONAgent"],
+ "tools": ["PipelineTool", "RemoteTool", "Tool", "launch_gradio_demo", "load_tool"],
+ "llm_engine": ["HfEngine"],
 }
 
 try:
@@ -34,37 +35,28 @@
 except OptionalDependencyNotAvailable:
  pass
 else:
+ _import_structure["default_tools"] = ["CalculatorTool", "PythonEvaluatorTool", "FinalAnswerTool"]
  _import_structure["document_question_answering"] = ["DocumentQuestionAnsweringTool"]
- _import_structure["image_captioning"] = ["ImageCaptioningTool"]
  _import_structure["image_question_answering"] = ["ImageQuestionAnsweringTool"]
- _import_structure["image_segmentation"] = ["ImageSegmentationTool"]
  _import_structure["speech_to_text"] = ["SpeechToTextTool"]
- _import_structure["text_classification"] = ["TextClassificationTool"]
- _import_structure["text_question_answering"] = ["TextQuestionAnsweringTool"]
- _import_structure["text_summarization"] = ["TextSummarizationTool"]
  _import_structure["text_to_speech"] = ["TextToSpeechTool"]
  _import_structure["translation"] = ["TranslationTool"]
- _import_structure["default_tools"] = ["CalculatorTool", "PythonEvaluatorTool"]
 
 if TYPE_CHECKING:
- from .agents import Agent, CodeAgent, ReactAgent
- from .base import PipelineTool, RemoteTool, Tool, launch_gradio_demo, load_tool
+ from .agents import Agent, CodeAgent, ReactAgent, ReactCodeAgent, ReactJSONAgent
+ from .tools import PipelineTool, RemoteTool, Tool, launch_gradio_demo, load_tool
+ from .llm_engine import HfEngine
 
  try:
  if not is_torch_available():
  raise OptionalDependencyNotAvailable()
  except OptionalDependencyNotAvailable:
  pass
  else:
- from .default_tools import CalculatorTool, PythonEvaluatorTool
+ from .default_tools import CalculatorTool, PythonEvaluatorTool, FinalAnswerTool
  from .document_question_answering import DocumentQuestionAnsweringTool
- from .image_captioning import ImageCaptioningTool
  from .image_question_answering import ImageQuestionAnsweringTool
- from .image_segmentation import ImageSegmentationTool
  from .speech_to_text import SpeechToTextTool
- from .text_classification import TextClassificationTool
- from .text_question_answering import TextQuestionAnsweringTool
- from .text_summarization import TextSummarizationTool
  from .text_to_speech import TextToSpeechTool
  from .translation import TranslationTool
 else:

diff --git a/src/transformers/tools/agent_types.py → src/transformers/agents/agent_types.py b/src/transformers/tools/agent_types.py → src/transformers/agents/agent_types.py
@@ -226,6 +226,6 @@ def handle_agent_inputs(*args, **kwargs):
 def handle_agent_outputs(output, output_type):
  # If the class has defined outputs, we can map directly according to the class definition
  if output_type in AGENT_TYPE_MAPPING:
- return AGENT_TYPE_MAPPING[output_type]
+ return AGENT_TYPE_MAPPING[output_type](output)
  else:
  return AgentType(output)