Improve documentation on agent building tips

Aymeric 2024-12-12 10:09:46 +01:00
parent 0a0402d090
commit 465614295d
8 changed files with 119 additions and 455 deletions

View File

@ -492,8 +492,8 @@ class ReactAgent(BaseAgent):
Example:
```py
-from transformers.agents import ReactCodeAgent
-agent = ReactCodeAgent(tools=[])
+from transformers.agents import CodeAgent
+agent = CodeAgent(tools=[])
agent.run("What is the result of 2 power 3.7384?")
```
"""

View File

@ -1029,10 +1029,10 @@ class ToolCollection:
Example:
```py
->>> from transformers import ToolCollection, ReactCodeAgent
+>>> from transformers import ToolCollection, CodeAgent
>>> image_tool_collection = ToolCollection(collection_slug="huggingface-tools/diffusion-tools-6630bb19a942c2306a2cdb6f")
->>> agent = ReactCodeAgent(tools=[*image_tool_collection.tools], add_base_tools=True)
+>>> agent = CodeAgent(tools=[*image_tool_collection.tools], add_base_tools=True)
>>> agent.run("Please draw me a picture of rivers and lakes.")
```

View File

@ -29,7 +29,7 @@ These *tools* are functions for performing a task, and they contain all necessar
The agent can be programmed to:
- devise a series of actions/tools and run them all at once, like the [`CodeAgent`]
-- plan and execute actions/tools one by one and wait for the outcome of each action before launching the next one, like the [`ReactJsonAgent`]
+- plan and execute actions/tools one by one and wait for the outcome of each action before launching the next one, like the [`JsonAgent`]

### Types of agents
@ -41,9 +41,9 @@ This agent has a planning step, then generates python code to execute all its ac
This is the go-to agent to solve reasoning tasks, since the ReAct framework ([Yao et al., 2022](https://huggingface.co/papers/2210.03629)) makes it really efficient to think on the basis of its previous observations.

-We implement two versions of ReactJsonAgent:
-- [`ReactJsonAgent`] generates tool calls as a JSON in its output.
-- [`ReactCodeAgent`] is a new type of ReactJsonAgent that generates its tool calls as blobs of code, which works really well for LLMs that have strong coding performance.
+We implement two versions of JsonAgent:
+- [`JsonAgent`] generates tool calls as a JSON in its output.
+- [`CodeAgent`] is a new type of JsonAgent that generates its tool calls as blobs of code, which works really well for LLMs that have strong coding performance.

> [!TIP]
> Read [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) blog post to learn more about ReAct agents.
@ -171,9 +171,9 @@ Note that we used an additional `sentence` argument: you can pass text as additi
You can also use this to indicate the path to local or remote files for the model to use:

```py
-from transformers import ReactCodeAgent
-agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+from transformers import CodeAgent
+agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3")
```
@ -196,12 +196,12 @@ A Python interpreter executes the code on a set of inputs passed along with your
This should be safe because the only functions that can be called are the tools you provided (especially if it's only tools by Hugging Face) and the print function, so you're already limited in what can be executed.
The Python interpreter also doesn't allow imports by default outside of a safe list, so all the most obvious attacks shouldn't be an issue.
-You can still authorize additional imports by passing the authorized modules as a list of strings in argument `additional_authorized_imports` upon initialization of your [`ReactCodeAgent`] or [`CodeAgent`]:
+You can still authorize additional imports by passing the authorized modules as a list of strings in the argument `additional_authorized_imports` upon initialization of your [`CodeAgent`]:

```py
->>> from transformers import ReactCodeAgent
->>> agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4'])
+>>> from transformers import CodeAgent
+>>> agent = CodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4'])
>>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
(...)
@ -215,7 +215,7 @@ The execution will stop at any code trying to perform an illegal operation or if
### The system prompt

-An agent, or rather the LLM that drives the agent, generates an output based on the system prompt. The system prompt can be customized and tailored to the intended task. For example, check the system prompt for the [`ReactCodeAgent`] (below version is slightly simplified).
+An agent, or rather the LLM that drives the agent, generates an output based on the system prompt. The system prompt can be customized and tailored to the intended task. For example, check the system prompt for the [`CodeAgent`] (the version below is slightly simplified).

```text
You will be given a task to solve as best you can.
@ -260,10 +260,10 @@ You could improve the system prompt, for example, by adding an explanation of th
For maximum flexibility, you can overwrite the whole system prompt template by passing your custom prompt as an argument to the `system_prompt` parameter.

```python
-from transformers import ReactJsonAgent
+from transformers import JsonAgent
from transformers.agents import PythonInterpreterTool

-agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}")
+agent = JsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}")
```

> [!WARNING]
@ -295,7 +295,7 @@ Transformers comes with a default toolbox for empowering agents, that you can ad
- **Text to speech**: convert text to speech ([SpeechT5](./model_doc/speecht5))
- **Translation**: translates a given sentence from source language to target language.
- **DuckDuckGo search**: performs a web search using the DuckDuckGo browser.
-- **Python code interpreter**: runs your the LLM generated Python code in a secure environment. This tool will only be added to [`ReactJsonAgent`] if you initialize it with `add_base_tools=True`, since code-based agent can already natively execute Python code
+- **Python code interpreter**: runs the LLM-generated Python code in a secure environment. This tool will only be added to [`JsonAgent`] if you initialize it with `add_base_tools=True`, since a code-based agent can already natively execute Python code.

You can manually use a tool by calling the [`load_tool`] function and passing it a task to perform.
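For instance, here is a minimal sketch of loading and calling a single tool directly. It assumes the text-to-speech tool is registered under the name `"text-to-speech"` in the default toolbox:

```py
from transformers import load_tool

# Load one tool by name (tool name assumed to match the default toolbox registration)
tts_tool = load_tool("text-to-speech")

# Call the tool directly with its expected input
audio = tts_tool("This is a text-to-speech test")
```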
@ -375,57 +375,89 @@ print(f"The most downloaded model for the 'text-to-video' task is {most_download
And the output:
`"The most downloaded model for the 'text-to-video' task is ByteDance/AnimateDiff-Lightning."`

-### Manage your agent's toolbox
-
-If you have already initialized an agent, it is inconvenient to reinitialize it from scratch with a tool you want to use. With Transformers, you can manage an agent's toolbox by adding or replacing a tool.
-
-Let's add the `model_download_tool` to an existing agent initialized with only the default toolbox.
-
-```python
-from transformers import CodeAgent
-
-agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
-agent.toolbox.add_tool(model_download_tool)
-```
-
-Now we can leverage both the new tool and the previous text-to-speech tool:
-
-```python
-agent.run(
-    "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?"
-)
-```
-
-| **Audio** |
-|-----------|
-| <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/damo.wav" type="audio/wav"/> |
-
-> [!WARNING]
-> Beware when adding tools to an agent that already works well because it can bias selection towards your tool or select another tool other than the one already defined.
-
-Use the `agent.toolbox.update_tool()` method to replace an existing tool in the agent's toolbox.
-This is useful if your new tool is a one-to-one replacement of the existing tool because the agent already knows how to perform that specific task.
-Just make sure the new tool follows the same API as the replaced tool or adapt the system prompt template to ensure all examples using the replaced tool are updated.
-
-### Use a collection of tools
-
-You can leverage tool collections by using the ToolCollection object, with the slug of the collection you want to use.
-Then pass them as a list to initialize you agent, and start using them!
-
-```py
-from transformers import ToolCollection, ReactCodeAgent
-
-image_tool_collection = ToolCollection(collection_slug="huggingface-tools/diffusion-tools-6630bb19a942c2306a2cdb6f")
-agent = ReactCodeAgent(tools=[*image_tool_collection.tools], add_base_tools=True)
-
-agent.run("Please draw me a picture of rivers and lakes.")
-```
-
-To speed up the start, tools are loaded only if called by the agent.
-
-This gets you this image:
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png">
+## Multi-agents
+
+Multi-agent has been introduced in Microsoft's framework [Autogen](https://huggingface.co/papers/2308.08155).
+It simply means having several agents working together to solve your task instead of only one.
+It empirically yields better performance on most benchmarks. The reason for this better performance is conceptually simple: for many tasks, rather than using a do-it-all system, you would prefer to specialize units on sub-tasks. Here, having agents with separate tool sets and memories allows for efficient specialization.
+
+You can easily build hierarchical multi-agent systems with `transformers.agents`.
+
+To do so, encapsulate the agent in a [`ManagedAgent`] object. This object needs the arguments `agent`, `name`, and `description`, which will then be embedded in the manager agent's system prompt to let it know how to call this managed agent, as we also do for tools.
+
+Here's an example of making an agent that manages a specific web search agent using our [`DuckDuckGoSearchTool`]:
+
+```py
+from transformers.agents import CodeAgent, HfApiEngine, DuckDuckGoSearchTool, ManagedAgent
+
+llm_engine = HfApiEngine()
+
+web_agent = CodeAgent(tools=[DuckDuckGoSearchTool()], llm_engine=llm_engine)
+
+managed_web_agent = ManagedAgent(
+    agent=web_agent,
+    name="web_search",
+    description="Runs web searches for you. Give it your query as an argument."
+)
+
+manager_agent = CodeAgent(
+    tools=[], llm_engine=llm_engine, managed_agents=[managed_web_agent]
+)
+
+manager_agent.run("Who is the CEO of Hugging Face?")
+```
+
+> [!TIP]
+> For an in-depth example of an efficient multi-agent implementation, see [how we pushed our multi-agent system to the top of the GAIA leaderboard](https://huggingface.co/blog/beating-gaia).
+
+## Display your agent run in a cool Gradio interface
+
+You can leverage `gradio.Chatbot` to display your agent's thoughts using `stream_to_gradio`. Here is an example:
+
+```py
+import gradio as gr
+from transformers import (
+    load_tool,
+    CodeAgent,
+    HfApiEngine,
+    stream_to_gradio,
+)
+
+# Import tool from Hub
+image_generation_tool = load_tool("m-ric/text-to-image")
+
+llm_engine = HfApiEngine("meta-llama/Meta-Llama-3-70B-Instruct")
+
+# Initialize the agent with the image generation tool
+agent = CodeAgent(tools=[image_generation_tool], llm_engine=llm_engine)
+
+
+def interact_with_agent(task):
+    messages = []
+    messages.append(gr.ChatMessage(role="user", content=task))
+    yield messages
+    for msg in stream_to_gradio(agent, task):
+        messages.append(msg)
+        yield messages + [
+            gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!")
+        ]
+    yield messages
+
+
+with gr.Blocks() as demo:
+    text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.")
+    submit = gr.Button("Run illustrator agent!")
+    chatbot = gr.Chatbot(
+        label="Agent",
+        type="messages",
+        avatar_images=(
+            None,
+            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
+        ),
+    )
+    submit.click(interact_with_agent, [text_input], [chatbot])
+
+if __name__ == "__main__":
+    demo.launch()
+```

View File

@ -1,261 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Agents, supercharged - Multi-agents, External tools, and more
[[open-in-colab]]
### What is an agent?
> [!TIP]
> If you're new to `transformers.agents`, make sure to first read the main [agents documentation](./agents).
On this page, we're going to highlight several advanced uses of `transformers.agents`.
## Multi-agents
Multi-agent has been introduced in Microsoft's framework [Autogen](https://huggingface.co/papers/2308.08155).
It simply means having several agents working together to solve your task instead of only one.
It empirically yields better performance on most benchmarks. The reason for this better performance is conceptually simple: for many tasks, rather than using a do-it-all system, you would prefer to specialize units on sub-tasks. Here, having agents with separate tool sets and memories allows for efficient specialization.
You can easily build hierarchical multi-agent systems with `transformers.agents`.
To do so, encapsulate the agent in a [`ManagedAgent`] object. This object needs the arguments `agent`, `name`, and `description`, which will then be embedded in the manager agent's system prompt to let it know how to call this managed agent, as we also do for tools.

Here's an example of making an agent that manages a specific web search agent using our [`DuckDuckGoSearchTool`]:
```py
from transformers.agents import ReactCodeAgent, HfApiEngine, DuckDuckGoSearchTool, ManagedAgent
llm_engine = HfApiEngine()
web_agent = ReactCodeAgent(tools=[DuckDuckGoSearchTool()], llm_engine=llm_engine)
managed_web_agent = ManagedAgent(
    agent=web_agent,
    name="web_search",
    description="Runs web searches for you. Give it your query as an argument."
)

manager_agent = ReactCodeAgent(
    tools=[], llm_engine=llm_engine, managed_agents=[managed_web_agent]
)
manager_agent.run("Who is the CEO of Hugging Face?")
```
> [!TIP]
> For an in-depth example of an efficient multi-agent implementation, see [how we pushed our multi-agent system to the top of the GAIA leaderboard](https://huggingface.co/blog/beating-gaia).
## Advanced tool usage
### Directly define a tool by subclassing Tool, and share it to the Hub
Let's revisit the tool example from the main documentation, which we had implemented with the `tool` decorator.
If you need to add variation, like custom attributes for your tool, you can build your tool following the fine-grained method: building a class that inherits from the [`Tool`] superclass.
The custom tool needs:
- An attribute `name`, which corresponds to the name of the tool itself. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name it `model_download_counter`.
- An attribute `description` is used to populate the agent's system prompt.
- An `inputs` attribute, which is a dictionary with keys `"type"` and `"description"`. It contains information that helps the Python interpreter make educated choices about the input.
- An `output_type` attribute, which specifies the output type.
- A `forward` method which contains the inference code to be executed.
The types for both `inputs` and `output_type` should be amongst [Pydantic formats](https://docs.pydantic.dev/latest/concepts/json_schema/#generating-json-schema).
```python
from transformers import Tool
from huggingface_hub import list_models
class HFModelDownloadsTool(Tool):
    name = "model_download_counter"
    description = """
    This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub.
    It returns the name of the checkpoint."""

    inputs = {
        "task": {
            "type": "string",
            "description": "the task category (such as text-classification, depth-estimation, etc)",
        }
    }
    output_type = "string"

    def forward(self, task: str):
        model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
        return model.id
```
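If you want to sanity-check the tool before wiring it into an agent, note that [`Tool`] instances are callable, which runs their `forward` method. A quick sketch:

```python
# Quick check: calling a Tool instance runs its forward() method
tool = HFModelDownloadsTool()
print(tool("text-classification"))
```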
Now that the custom `HFModelDownloadsTool` class is ready, you can save it to a file named `model_downloads.py` and import it for use.
```python
from model_downloads import HFModelDownloadsTool
tool = HFModelDownloadsTool()
```
You can also share your custom tool to the Hub by calling [`~Tool.push_to_hub`] on the tool. Make sure you've created a repository for it on the Hub and are using a token with write access.
```python
tool.push_to_hub("{your_username}/hf-model-downloads")
```
Load the tool with the [`~Tool.load_tool`] function and pass it to the `tools` parameter in your agent.
```python
from transformers import load_tool, CodeAgent
model_download_tool = load_tool("m-ric/hf-model-downloads")
```
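Once loaded, you can hand the tool to an agent via the `tools` parameter. A minimal sketch, assuming the `llm_engine` defined in the earlier examples:

```python
# Hand the loaded tool to an agent (llm_engine assumed to be defined as in the earlier examples)
agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine)
agent.run("Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?")
```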
### Import a Space as a tool 🚀
You can directly import a Space from the Hub as a tool using the [`Tool.from_space`] method!
You only need to provide the id of the Space on the Hub, its name, and a description that will help your agent understand what the tool does. Under the hood, this will use the [`gradio-client`](https://pypi.org/project/gradio-client/) library to call the Space.
For instance, let's import the [FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) Space from the Hub and use it to generate an image.
```
from transformers import Tool
image_generation_tool = Tool.from_space(
    "black-forest-labs/FLUX.1-dev",
    name="image_generator",
    description="Generate an image from a prompt")
image_generation_tool("A sunny beach")
```
And voilà, here's your image! 🏖️
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/sunny_beach.webp">
Then you can use this tool just like any other tool. For example, let's improve the prompt `a rabbit wearing a space suit` and generate an image of it.
```python
from transformers import ReactCodeAgent
agent = ReactCodeAgent(tools=[image_generation_tool])
agent.run(
    "Improve this prompt, then generate an image of it.", prompt='A rabbit wearing a space suit'
)
```
```text
=== Agent thoughts:
improved_prompt could be "A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background"
Now that I have improved the prompt, I can use the image generator tool to generate an image based on this prompt.
>>> Agent is executing the code below:
image = image_generator(prompt="A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background")
final_answer(image)
```
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit_spacesuit_flux.webp">
How cool is this? 🤩
### Use gradio-tools
[gradio-tools](https://github.com/freddyaboulton/gradio-tools) is a powerful library that allows using Hugging
Face Spaces as tools. It supports many existing Spaces as well as custom Spaces.
Transformers supports `gradio_tools` with the [`Tool.from_gradio`] method. For example, let's use the [`StableDiffusionPromptGeneratorTool`](https://github.com/freddyaboulton/gradio-tools/blob/main/gradio_tools/tools/prompt_generator.py) from `gradio-tools` toolkit for improving prompts to generate better images.
Import and instantiate the tool, then pass it to the `Tool.from_gradio` method:
```python
from gradio_tools import StableDiffusionPromptGeneratorTool
from transformers import Tool, load_tool, CodeAgent
gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool()
prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool)
```
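You can then use `prompt_generator_tool` like any other tool. For instance, here is a minimal sketch that chains it with the `image_generation_tool` created above, assuming an `llm_engine` defined as in the earlier examples:

```python
# Sketch: let the agent first improve the prompt, then generate the image
# (prompt_generator_tool, image_generation_tool, and llm_engine assumed as defined above)
agent = CodeAgent(tools=[prompt_generator_tool, image_generation_tool], llm_engine=llm_engine)

agent.run(
    "Improve this prompt, then generate an image of it.", prompt="A rabbit wearing a space suit"
)
```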
> [!WARNING]
> gradio-tools require *textual* inputs and outputs even when working with different modalities like image and audio objects. Image and audio inputs and outputs are currently incompatible.
### Use LangChain tools
We love LangChain and think it has a very compelling suite of tools.
To import a tool from LangChain, use the `from_langchain()` method.
Here is how you can use it to recreate the intro's search result using a LangChain web search tool.
This tool will need `pip install google-search-results` to work properly.
```python
from langchain.agents import load_tools
from transformers import Tool, ReactCodeAgent
search_tool = Tool.from_langchain(load_tools(["serpapi"])[0])
agent = ReactCodeAgent(tools=[search_tool])
agent.run("How many more blocks (also denoted as layers) are in BERT base encoder compared to the encoder from the architecture proposed in Attention is All You Need?")
```
## Display your agent run in a cool Gradio interface
You can leverage `gradio.Chatbot` to display your agent's thoughts using `stream_to_gradio`, here is an example:
```py
import gradio as gr
from transformers import (
    load_tool,
    ReactCodeAgent,
    HfApiEngine,
    stream_to_gradio,
)

# Import tool from Hub
image_generation_tool = load_tool("m-ric/text-to-image")

llm_engine = HfApiEngine("meta-llama/Meta-Llama-3-70B-Instruct")

# Initialize the agent with the image generation tool
agent = ReactCodeAgent(tools=[image_generation_tool], llm_engine=llm_engine)


def interact_with_agent(task):
    messages = []
    messages.append(gr.ChatMessage(role="user", content=task))
    yield messages
    for msg in stream_to_gradio(agent, task):
        messages.append(msg)
        yield messages + [
            gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!")
        ]
    yield messages


with gr.Blocks() as demo:
    text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.")
    submit = gr.Button("Run illustrator agent!")
    chatbot = gr.Chatbot(
        label="Agent",
        type="messages",
        avatar_images=(
            None,
            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
        ),
    )
    submit.click(interact_with_agent, [text_input], [chatbot])

if __name__ == "__main__":
    demo.launch()
```

View File

@ -1,123 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
### The best agentic systems are the simplest: simplify the workflow as much as you can
Giving an LLM some agency in your workflow introduces some risk of errors.

Well-programmed agentic systems have good error logging and retry mechanisms anyway, so the LLM engine has a chance to self-correct its mistakes. But to reduce the risk of LLM errors as much as possible, you should simplify your workflow!

Let's revisit the example from [intro_agents]: a bot that answers user queries for a surf trip company.
Instead of letting the agent make two different calls for a "travel distance API" and a "weather API" each time it is asked about a new surf spot, you could just make one unified tool, "return_spot_information", a function that calls both APIs at once and returns their concatenated outputs to the user.

This will reduce costs, latency, and error risk!

So our first actionable takeaway is: *group tools whenever possible*.
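For illustration, here is a minimal sketch of such a unified tool. The `get_travel_duration` and `get_weather_report` helpers are hypothetical stand-ins for your two underlying APIs, and the `tool` decorator import is assumed to be available as used elsewhere in these docs:

```py
from transformers import tool

# Hypothetical helpers wrapping the two underlying APIs
from my_surf_apis import get_travel_duration, get_weather_report


@tool
def return_spot_information(spot_name: str) -> str:
    """
    Returns a combined report (travel time and weather) for a given surf spot.

    Args:
        - spot_name (`str`): the name of the surf spot, like "Anchor Point, Taghazout, Morocco".
    """
    travel = get_travel_duration(spot_name)
    weather = get_weather_report(spot_name)
    return f"Travel: {travel}\nWeather report: {weather}"
```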
### Improve the information flow to the LLM engine
Remember that your LLM engine is like an ~~intelligent~~ robot trapped in a room, with the only communication with the outside world being notes passed under a door.

It won't know about anything that happened unless you explicitly put it into its prompt.

For a `CodeAgent` using variables, it cannot access any variable that was not saved into its state.

For instance, check out this agent trace for an LLM that I asked to make a car picture:
```
==================================================================================================== New task ====================================================================================================
Make me a cool car picture
──────────────────────────────────────────────────────────────────────────────────────────────────── New step ────────────────────────────────────────────────────────────────────────────────────────────────────
Agent is executing the code below: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
image_generator(prompt="A cool, futuristic sports car with LED headlights, aerodynamic design, and vibrant color, high-res, photorealistic")
──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Last output from code snippet: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/tmpx09qfsdd/652f0007-3ee9-44e2-94ac-90dae6bb89a4.png
Step 1:
- Time taken: 16.35 seconds
- Input tokens: 1,383
- Output tokens: 77
──────────────────────────────────────────────────────────────────────────────────────────────────── New step ────────────────────────────────────────────────────────────────────────────────────────────────────
Agent is executing the code below: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
final_answer("/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/tmpx09qfsdd/652f0007-3ee9-44e2-94ac-90dae6bb89a4.png")
──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Print outputs:
Last output from code snippet: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/tmpx09qfsdd/652f0007-3ee9-44e2-94ac-90dae6bb89a4.png
Final answer:
/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/tmpx09qfsdd/652f0007-3ee9-44e2-94ac-90dae6bb89a4.png
```
The LLM never explicitly saved the image output into a variable, so it cannot access it again except by leveraging the path that was logged while saving the image. So properly logging the code execution made a big difference!
Particular guidelines to follow:
- Each tool should log (by simply using `print` statements inside the tool's `forward` method) everything that could be useful for the LLM engine.
- In particular, logging details on tool execution errors would help a lot!

For instance, here's a tool that retrieves weather data from an API.

First, here's a poor version:
```py
from my_weather_api import convert_location_to_coordinates, get_weather_report_at_coordinates
# Let's say "get_weather_report_at_coordinates" returns a list of [temperature in °C, risk of rain on a scale 0-1, wave height in m]
from datetime import datetime

@tool
def get_weather_api(location: str, date_time: str) -> str:
    """
    Returns the weather report.

    Args:
        - location (`str`): the name of the place that you want the weather for.
        - date_time (`str`): the date and time for which you want the report.
    """
    lon, lat = convert_location_to_coordinates(location)
    date_time = datetime.strptime(date_time)
    return str(get_weather_report_at_coordinates((lon, lat), date_time))
```
Why is it bad?
- there's no precision on the format that should be used for `date_time`
- there's no detail on how `location` should be specified
- there's no logging mechanism tied to explicit failure cases like `location` not being in a proper format, or `date_time` not being properly formatted
- the output format is hard to understand

If the tool call fails, the error trace logged in memory can help the LLM reverse engineer the tool to fix the errors. But why leave it with so much heavy lifting to do?
Here's a better way to build this tool:
```py
from my_weather_api import convert_location_to_coordinates, get_weather_report_at_coordinates
# Let's say "get_weather_report_at_coordinates" returns a list of [temperature in °C, risk of rain on a scale 0-1, wave height in m]
from datetime import datetime

@tool
def get_weather_api(location: str, date_time: str) -> str:
    """
    Returns the weather report.

    Args:
        - location (`str`): the name of the place that you want the weather for. Should be a place name, followed by possibly a city name, then a country, like "Anchor Point, Taghazout, Morocco".
        - date_time (`str`): the date and time for which you want the report, formatted as '%m/%d/%y %H:%M:%S'.
    """
    lon, lat = convert_location_to_coordinates(location)
    try:
        date_time = datetime.strptime(date_time, "%m/%d/%y %H:%M:%S")
    except Exception as e:
        raise ValueError("Conversion of `date_time` to datetime format failed, make sure to provide a string in format '%m/%d/%y %H:%M:%S'. Full trace: " + str(e))
    temperature_celsius, risk_of_rain, wave_height = get_weather_report_at_coordinates((lon, lat), date_time)
    return f"Weather report for {location}, {date_time}: Temperature will be {temperature_celsius}°C, risk of rain is {risk_of_rain*100:.0f}%, wave height is {wave_height}m."
```
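Once defined, the tool can be registered with an agent like any other. A minimal sketch, assuming the agent and engine class names used elsewhere in these docs:

```py
from transformers.agents import CodeAgent, HfApiEngine

# Sketch: hand the improved weather tool to an agent (class names assumed as in the rest of these docs)
agent = CodeAgent(tools=[get_weather_api], llm_engine=HfApiEngine())
agent.run("What will the weather be like in Taghazout, Morocco on 07/15/25 at 10:00?")
```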

View File

@ -31,8 +31,8 @@ contains the API docs for the underlying classes.
We provide two types of agents, based on the main [`Agent`] class:
- [`CodeAgent`] acts in one shot, generating code to solve the task, then executes it at once.
- [`ReactAgent`] acts step by step, each step consisting of one thought, then one tool call and execution. It has two classes:
-  - [`ReactJsonAgent`] writes its tool calls in JSON.
-  - [`ReactCodeAgent`] writes its tool calls in Python code.
+  - [`JsonAgent`] writes its tool calls in JSON.
+  - [`CodeAgent`] writes its tool calls in Python code.

### Agent
@ -46,9 +46,9 @@ We provide two types of agents, based on the main [`Agent`] class:
[[autodoc]] ReactAgent

-[[autodoc]] ReactJsonAgent
+[[autodoc]] JsonAgent

-[[autodoc]] ReactCodeAgent
+[[autodoc]] CodeAgent

### ManagedAgent

View File

@ -233,6 +233,22 @@ Action:
tool_arguments="final_answer(7.2904)",
)

+    def test_reset_conversations(self):
+        agent = CodeAgent(
+            tools=[PythonInterpreterTool()], llm_engine=fake_code_llm
+        )
+        output = agent.run("What is 2 multiplied by 3.6452?", reset=True)
+        assert output == 7.2904
+        assert len(agent.logs) == 4
+
+        output = agent.run("What is 2 multiplied by 3.6452?", reset=False)
+        assert output == 7.2904
+        assert len(agent.logs) == 6
+
+        output = agent.run("What is 2 multiplied by 3.6452?", reset=True)
+        assert output == 7.2904
+        assert len(agent.logs) == 4

    def test_code_agent_code_errors_show_offending_lines(self):
        agent = CodeAgent(
            tools=[PythonInterpreterTool()], llm_engine=fake_code_llm_error

View File

@ -16,7 +16,7 @@
import unittest

from transformers.agents.agent_types import AgentImage
-from transformers.agents.agents import AgentError, ReactCodeAgent, ReactJsonAgent
+from transformers.agents.agents import AgentError, CodeAgent, JsonAgent
from transformers.agents.monitoring import stream_to_gradio
@ -34,7 +34,7 @@ Code:
final_answer('This is the final answer.')
```"""

-agent = ReactCodeAgent(
+agent = CodeAgent(
    tools=[],
    llm_engine=FakeLLMEngine(),
    max_iterations=1,
@ -54,7 +54,7 @@ final_answer('This is the final answer.')
def __call__(self, prompt, **kwargs):
    return 'Action:{"action": "final_answer", "action_input": {"answer": "image"}}'

-agent = ReactJsonAgent(
+agent = JsonAgent(
    tools=[],
    llm_engine=FakeLLMEngine(),
    max_iterations=1,
@ -74,7 +74,7 @@ final_answer('This is the final answer.')
def __call__(self, prompt, **kwargs):
    return "Malformed answer"

-agent = ReactCodeAgent(
+agent = CodeAgent(
    tools=[],
    llm_engine=FakeLLMEngine(),
    max_iterations=1,
@ -94,7 +94,7 @@ final_answer('This is the final answer.')
def __call__(self, prompt, **kwargs):
    raise AgentError

-agent = ReactCodeAgent(
+agent = CodeAgent(
    tools=[],
    llm_engine=FakeLLMEngine(),
    max_iterations=1,
@ -113,7 +113,7 @@ Code:
final_answer('This is the final answer.')
```"""

-agent = ReactCodeAgent(
+agent = CodeAgent(
    tools=[],
    llm_engine=dummy_llm_engine,
    max_iterations=1,
@ -133,7 +133,7 @@ final_answer('This is the final answer.')
    'Action:{"action": "final_answer", "action_input": {"answer": "image"}}'
)

-agent = ReactJsonAgent(
+agent = JsonAgent(
    tools=[],
    llm_engine=dummy_llm_engine,
    max_iterations=1,
@ -160,7 +160,7 @@ final_answer('This is the final answer.')
def dummy_llm_engine(prompt, **kwargs):
    raise AgentError("Simulated agent error")

-agent = ReactCodeAgent(
+agent = CodeAgent(
    tools=[],
    llm_engine=dummy_llm_engine,
    max_iterations=1,