diff --git a/src/agents/agents.py b/src/agents/agents.py index d76f70e..2742a5e 100644 --- a/src/agents/agents.py +++ b/src/agents/agents.py @@ -33,6 +33,7 @@ from .monitoring import Monitor from .prompts import ( CODE_SYSTEM_PROMPT, JSON_SYSTEM_PROMPT, + TOOL_CALLING_SYSTEM_PROMPT, PLAN_UPDATE_FINAL_PLAN_REDACTION, SYSTEM_PROMPT_FACTS, SYSTEM_PROMPT_FACTS_UPDATE, @@ -870,6 +871,119 @@ class JsonAgent(ReactAgent): log_entry.observations = updated_information return None +class ToolCallingAgent(ReactAgent): + """ + In this agent, the tool calls will be formulated and parsed using the underlying library, before execution. + """ + + def __init__( + self, + tools: List[Tool], + llm_engine: Optional[Callable] = None, + system_prompt: Optional[str] = None, + tool_description_template: Optional[str] = None, + planning_interval: Optional[int] = None, + **kwargs, + ): + if llm_engine is None: + llm_engine = HfApiEngine() + if system_prompt is None: + system_prompt = TOOL_CALLING_SYSTEM_PROMPT + if tool_description_template is None: + tool_description_template = DEFAULT_TOOL_DESCRIPTION_TEMPLATE + super().__init__( + tools=tools, + llm_engine=llm_engine, + system_prompt=system_prompt, + tool_description_template=tool_description_template, + planning_interval=planning_interval, + **kwargs, + ) + + def step(self, log_entry: ActionStep) -> Union[None, Any]: + """ + Perform one step in the ReAct framework: the agent thinks, acts, and observes the result. + Returns None if the step is not final. 
+ """ + agent_memory = self.write_inner_memory_from_logs() + + self.prompt_messages = agent_memory + + # Add new step in logs + log_entry.agent_memory = agent_memory.copy() + + if self.verbose: + console.print( + Group( + Rule( + "[italic]Calling LLM engine with this last message:", + align="left", + style="orange", + ), + Text(str(self.prompt_messages[-1])), + ) + ) + + try: + llm_output = self.llm_engine( + self.prompt_messages, + ) + log_entry.llm_output = llm_output + except Exception as e: + raise AgentGenerationError(f"Error in generating llm_engine output: {e}.") + + if self.verbose: + console.print( + Group( + Rule( + "[italic]Output message of the LLM:", + align="left", + style="orange", + ), + Text(llm_output), + ) + ) + + log_entry.tool_call = ToolCall(tool_name=tool_name, tool_arguments=arguments) + + # Execute + console.print(Rule("Agent thoughts:", align="left"), Text(rationale)) + console.print( + Panel(Text(f"Calling tool: '{tool_name}' with arguments: {arguments}")) + ) + if tool_name == "final_answer": + if isinstance(arguments, dict): + if "answer" in arguments: + answer = arguments["answer"] + else: + answer = arguments + else: + answer = arguments + if ( + isinstance(answer, str) and answer in self.state.keys() + ): # if the answer is a state variable, return the value + answer = self.state[answer] + log_entry.action_output = answer + return answer + else: + if arguments is None: + arguments = {} + observation = self.execute_tool_call(tool_name, arguments) + observation_type = type(observation) + if observation_type in [AgentImage, AgentAudio]: + if observation_type == AgentImage: + observation_name = "image.png" + elif observation_type == AgentAudio: + observation_name = "audio.mp3" + # TODO: observation naming could allow for different names of same type + + self.state[observation_name] = observation + updated_information = f"Stored '{observation_name}' in memory." 
+ else: + updated_information = str(observation).strip() + log_entry.observations = updated_information + return None + class CodeAgent(ReactAgent): """ diff --git a/src/agents/prompts.py b/src/agents/prompts.py index 5e9aeb6..4c21b50 100644 --- a/src/agents/prompts.py +++ b/src/agents/prompts.py @@ -277,6 +277,113 @@ Now Begin! If you solve the task correctly, you will receive a reward of $1,000, """ +TOOL_CALLING_SYSTEM_PROMPT = """You are an expert assistant who can solve any task using tool calls. You will be given a task to solve as best you can. +To do so, you have been given access to the following tools: {{tool_names}} + +The tool call you write is an action: after the tool is executed, you will get the result of the tool call as an "observation". +This Action/Observation can repeat N times, you should take several steps when needed. + +You can use the result of the previous action as input for the next action. +The observation will always be a string: it can represent a file, like "image_1.jpg". +Then you can use it as input for the next action. You can do it for instance as follows: + +Observation: "image_1.jpg" + +Action: +{ + "action": "image_transformer", + "action_input": {"image": "image_1.jpg"} +} + +To provide the final answer to the task, use an action blob with "action": "final_answer" tool. It is the only way to complete the task, else you will be stuck on a loop. So your final output should look like this: +Action: +{ + "action": "final_answer", + "action_input": {"answer": "insert your final answer here"} +} + + +Here are a few examples using notional tools: +--- +Task: "Generate an image of the oldest person in this document." + +Action: +{ + "action": "document_qa", + "action_input": {"document": "document.pdf", "question": "Who is the oldest person mentioned?"} +} +Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland." 
+
+Action:
+{
+  "action": "image_generator",
+  "action_input": {"prompt": "A portrait of John Doe, a 55-year-old man living in Canada."}
+}
+Observation: "image.png"
+
+Action:
+{
+  "action": "final_answer",
+  "action_input": "image.png"
+}
+
+---
+Task: "What is the result of the following operation: 5 + 3 + 1294.678?"
+
+Action:
+{
+    "action": "python_interpreter",
+    "action_input": {"code": "5 + 3 + 1294.678"}
+}
+Observation: 1302.678
+
+Action:
+{
+  "action": "final_answer",
+  "action_input": "1302.678"
+}
+
+---
+Task: "Which city has the highest population, Guangzhou or Shanghai?"
+
+Action:
+{
+    "action": "search",
+    "action_input": "Population Guangzhou"
+}
+Observation: ['Guangzhou has a population of 15 million inhabitants as of 2021.']
+
+
+Action:
+{
+    "action": "search",
+    "action_input": "Population Shanghai"
+}
+Observation: '26 million (2019)'
+
+Action:
+{
+  "action": "final_answer",
+  "action_input": "Shanghai"
+}
+
+
+The above examples used notional tools that might not exist for you. You only have access to these tools:
+
+{{tool_descriptions}}
+
+{{managed_agents_descriptions}}
+
+Here are the rules you should always follow to solve your task:
+1. ALWAYS provide a tool call, else you will fail.
+2. Always use the right arguments for the tools. Never use variable names as the action arguments, use the value instead.
+3. Call a tool only when needed: do not call the search agent if you do not need information, try to solve the task yourself.
+If no tool call is needed, use the final_answer tool to return your answer.
+4. Never re-do a tool call that you previously did with the exact same parameters.
+
+Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000.
+"""
+
 CODE_SYSTEM_PROMPT = """You are an expert assistant who can solve any task using code blobs. You will be given a task to solve as best you can. 
To do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code. To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.