From f1a9b83443a7651209de89530d321357900fd49c Mon Sep 17 00:00:00 2001
From: Aymeric Roucher <69208727+aymeric-roucher@users.noreply.github.com>
Date: Fri, 31 Jan 2025 12:34:32 +0100
Subject: [PATCH] Improve code execution logs in case of error by showing print
 outputs (#446)

* Improve code execution logs in case of error by still showing print outputs
* Improve action step testing
* Number steps starting at 1
---
 src/smolagents/agents.py                | 46 +++++++++++++----------
 src/smolagents/local_python_executor.py |  7 ++--
 src/smolagents/memory.py                | 49 ++++++++++++-------------
 src/smolagents/models.py                |  1 -
 tests/test_agents.py                    |  8 +++-
 tests/test_memory.py                    | 32 ++++++++++------
 6 files changed, 82 insertions(+), 61 deletions(-)

diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py
index 14318fe..7e80161 100644
--- a/src/smolagents/agents.py
+++ b/src/smolagents/agents.py
@@ -350,7 +350,7 @@ class MultiStepAgent:
                 )
                 raise AgentExecutionError(error_msg, self.logger)
 
-    def step(self, log_entry: ActionStep) -> Union[None, Any]:
+    def step(self, memory_step: ActionStep) -> Union[None, Any]:
         """To be implemented in children classes. Should return either None if the step is not final."""
         pass
 
@@ -427,8 +427,8 @@ You have been provided with these additional arguments, that you can access usin
             images (`list[str]`): Paths to image(s).
         """
         final_answer = None
-        self.step_number = 0
-        while final_answer is None and self.step_number < self.max_steps:
+        self.step_number = 1
+        while final_answer is None and self.step_number <= self.max_steps:
             step_start_time = time.time()
             memory_step = ActionStep(
                 step_number=self.step_number,
@@ -461,7 +461,7 @@ You have been provided with these additional arguments, that you can access usin
                 self.step_number += 1
                 yield memory_step
 
-        if final_answer is None and self.step_number == self.max_steps:
+        if final_answer is None and self.step_number == self.max_steps + 1:
             error_message = "Reached max steps."
             final_answer = self.provide_final_answer(task, images)
             final_memory_step = ActionStep(
@@ -666,7 +666,7 @@ class ToolCallingAgent(MultiStepAgent):
             **kwargs,
         )
 
-    def step(self, log_entry: ActionStep) -> Union[None, Any]:
+    def step(self, memory_step: ActionStep) -> Union[None, Any]:
         """
         Perform one step in the ReAct framework: the agent thinks, acts, and observes the result.
         Returns None if the step is not final.
@@ -676,7 +676,7 @@ class ToolCallingAgent(MultiStepAgent):
         self.input_messages = memory_messages
 
         # Add new step in logs
-        log_entry.model_input_messages = memory_messages.copy()
+        memory_step.model_input_messages = memory_messages.copy()
 
         try:
             model_message: ChatMessage = self.model(
@@ -684,7 +684,7 @@ class ToolCallingAgent(MultiStepAgent):
                 tools_to_call_from=list(self.tools.values()),
                 stop_sequences=["Observation:"],
             )
-            log_entry.model_output_message = model_message
+            memory_step.model_output_message = model_message
             if model_message.tool_calls is None or len(model_message.tool_calls) == 0:
                 raise Exception("Model did not call any tools. Call `final_answer` tool to return a final answer.")
             tool_call = model_message.tool_calls[0]
@@ -694,7 +694,7 @@ class ToolCallingAgent(MultiStepAgent):
         except Exception as e:
             raise AgentGenerationError(f"Error in generating tool call with model:\n{e}", self.logger) from e
 
-        log_entry.tool_calls = [ToolCall(name=tool_name, arguments=tool_arguments, id=tool_call_id)]
+        memory_step.tool_calls = [ToolCall(name=tool_name, arguments=tool_arguments, id=tool_call_id)]
 
         # Execute
         self.logger.log(
@@ -724,7 +724,7 @@ class ToolCallingAgent(MultiStepAgent):
                     level=LogLevel.INFO,
                 )
 
-            log_entry.action_output = final_answer
+            memory_step.action_output = final_answer
             return final_answer
         else:
             if tool_arguments is None:
@@ -746,7 +746,7 @@ class ToolCallingAgent(MultiStepAgent):
                 f"Observations: {updated_information.replace('[', '|')}",  # escape potential rich-tag-like components
                 level=LogLevel.INFO,
             )
-            log_entry.observations = updated_information
+            memory_step.observations = updated_information
             return None
 
 
@@ -831,7 +831,7 @@ class CodeAgent(MultiStepAgent):
         )
         return self.system_prompt
 
-    def step(self, log_entry: ActionStep) -> Union[None, Any]:
+    def step(self, memory_step: ActionStep) -> Union[None, Any]:
         """
         Perform one step in the ReAct framework: the agent thinks, acts, and observes the result.
         Returns None if the step is not final.
@@ -841,7 +841,7 @@ class CodeAgent(MultiStepAgent):
         self.input_messages = memory_messages.copy()
 
         # Add new step in logs
-        log_entry.model_input_messages = memory_messages.copy()
+        memory_step.model_input_messages = memory_messages.copy()
         try:
             additional_args = {"grammar": self.grammar} if self.grammar is not None else {}
             chat_message: ChatMessage = self.model(
@@ -849,9 +849,9 @@ class CodeAgent(MultiStepAgent):
                 stop_sequences=["<end_code>", "Observation:"],
                 **additional_args,
             )
-            log_entry.model_output_message = chat_message
+            memory_step.model_output_message = chat_message
             model_output = chat_message.content
-            log_entry.model_output = model_output
+            memory_step.model_output = model_output
         except Exception as e:
             raise AgentGenerationError(f"Error in generating model output:\n{e}", self.logger) from e
 
@@ -868,7 +868,7 @@ class CodeAgent(MultiStepAgent):
             error_msg = f"Error in code parsing:\n{e}\nMake sure to provide correct code blobs."
             raise AgentParsingError(error_msg, self.logger)
 
-        log_entry.tool_calls = [
+        memory_step.tool_calls = [
             ToolCall(
                 name="python_interpreter",
                 arguments=code_action,
@@ -878,7 +878,6 @@ class CodeAgent(MultiStepAgent):
 
         # Execute
         self.logger.log_code(title="Executing parsed code:", content=code_action, level=LogLevel.INFO)
-        observation = ""
         is_final_answer = False
         try:
             output, execution_logs, is_final_answer = self.python_executor(
@@ -891,8 +890,17 @@ class CodeAgent(MultiStepAgent):
                     Text("Execution logs:", style="bold"),
                     Text(execution_logs),
                 ]
-            observation += "Execution logs:\n" + execution_logs
+            observation = "Execution logs:\n" + execution_logs
         except Exception as e:
+            if "print_outputs" in self.python_executor.state:
+                execution_logs = self.python_executor.state["print_outputs"]
+                if len(execution_logs) > 0:
+                    execution_outputs_console = [
+                        Text("Execution logs:", style="bold"),
+                        Text(execution_logs),
+                    ]
+                    memory_step.observations = "Execution logs:\n" + execution_logs
+                    self.logger.log(Group(*execution_outputs_console), level=LogLevel.INFO)
             error_msg = str(e)
             if "Import of " in error_msg and " is not allowed" in error_msg:
                 self.logger.log(
@@ -903,7 +911,7 @@ class CodeAgent(MultiStepAgent):
 
         truncated_output = truncate_content(str(output))
         observation += "Last output from code snippet:\n" + truncated_output
-        log_entry.observations = observation
+        memory_step.observations = observation
 
         execution_outputs_console += [
             Text(
@@ -912,7 +920,7 @@ class CodeAgent(MultiStepAgent):
             ),
         ]
         self.logger.log(Group(*execution_outputs_console), level=LogLevel.INFO)
-        log_entry.action_output = output
+        memory_step.action_output = output
         return output if is_final_answer else None
 
 
diff --git a/src/smolagents/local_python_executor.py b/src/smolagents/local_python_executor.py
index 7288640..dd4367b 100644
--- a/src/smolagents/local_python_executor.py
+++ b/src/smolagents/local_python_executor.py
@@ -1283,7 +1283,7 @@ def evaluate_python_code(
         expression = ast.parse(code)
     except SyntaxError as e:
         raise InterpreterError(
-            f"Code execution failed on line {e.lineno} due to: {type(e).__name__}\n"
+            f"Code parsing failed on line {e.lineno} due to: {type(e).__name__}\n"
             f"{e.text}"
             f"{' ' * (e.offset or 0)}^\n"
             f"Error: {str(e)}"
@@ -1316,11 +1316,10 @@ def evaluate_python_code(
         return e.value, is_final_answer
     except Exception as e:
         exception_type = type(e).__name__
-        error_msg = truncate_content(PRINT_OUTPUTS, max_length=max_print_outputs_length)
-        error_msg = (
+        state["print_outputs"] = truncate_content(PRINT_OUTPUTS, max_length=max_print_outputs_length)
+        raise InterpreterError(
             f"Code execution failed at line '{ast.get_source_segment(code, node)}' due to: {exception_type}:{str(e)}"
         )
-        raise InterpreterError(error_msg)
 
 
 class LocalPythonInterpreter:
diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py
index 80c8c7d..c3ff6bb 100644
--- a/src/smolagents/memory.py
+++ b/src/smolagents/memory.py
@@ -90,40 +90,39 @@ class ActionStep(MemoryStep):
             messages.append(
                 Message(
                     role=MessageRole.ASSISTANT,
-                    content=[{"type": "text", "text": str([tc.dict() for tc in self.tool_calls])}],
+                    content=[
+                        {
+                            "type": "text",
+                            "text": "Calling tools:\n" + str([tc.dict() for tc in self.tool_calls]),
+                        }
+                    ],
                 )
             )
 
+        if self.observations is not None:
+            messages.append(
+                Message(
+                    role=MessageRole.TOOL_RESPONSE,
+                    content=[
+                        {
+                            "type": "text",
+                            "text": f"Call id: {self.tool_calls[0].id}\nObservation:\n{self.observations}",
+                        }
+                    ],
+                )
+            )
         if self.error is not None:
-            message_content = (
+            error_message = (
                 "Error:\n"
                 + str(self.error)
                 + "\nNow let's retry: take care not to repeat previous errors! If you have retried several times, try a completely different approach.\n"
             )
-            if self.tool_calls is None:
-                tool_response_message = Message(
-                    role=MessageRole.ASSISTANT, content=[{"type": "text", "text": message_content}]
-                )
-            else:
-                tool_response_message = Message(
-                    role=MessageRole.TOOL_RESPONSE,
-                    content=[{"type": "text", "text": f"Call id: {self.tool_calls[0].id}\n{message_content}"}],
-                )
+            message_content = f"Call id: {self.tool_calls[0].id}\n" if self.tool_calls else ""
+            message_content += error_message
+            messages.append(
+                Message(role=MessageRole.TOOL_RESPONSE, content=[{"type": "text", "text": message_content}])
+            )
 
-            messages.append(tool_response_message)
-        else:
-            if self.observations is not None and self.tool_calls is not None:
-                messages.append(
-                    Message(
-                        role=MessageRole.TOOL_RESPONSE,
-                        content=[
-                            {
-                                "type": "text",
-                                "text": f"Call id: {self.tool_calls[0].id}\nObservation:\n{self.observations}",
-                            }
-                        ],
-                    )
-                )
         if self.observations_images:
             messages.append(
                 Message(
diff --git a/src/smolagents/models.py b/src/smolagents/models.py
index 927acf0..a2c7193 100644
--- a/src/smolagents/models.py
+++ b/src/smolagents/models.py
@@ -768,7 +768,6 @@ class OpenAIServerModel(Model):
             convert_images_to_image_urls=True,
             **kwargs,
         )
-
         response = self.client.chat.completions.create(**completion_kwargs)
         self.last_input_token_count = response.usage.prompt_tokens
         self.last_output_token_count = response.usage.completion_tokens
diff --git a/tests/test_agents.py b/tests/test_agents.py
index f1d8694..a7680c5 100644
--- a/tests/test_agents.py
+++ b/tests/test_agents.py
@@ -178,6 +178,7 @@ def fake_code_model_error(messages, stop_sequences=None) -> str:
 Thought: I should multiply 2 by 3.6452. special_marker
 Code:
 ```py
+print("Flag!")
 def error_function():
     raise ValueError("error")
 
@@ -393,6 +394,11 @@ class AgentTests(unittest.TestCase):
         assert "Code execution failed at line 'error_function()'" in str(agent.memory.steps[1].error)
         assert "ValueError" in str(agent.memory.steps)
 
+    def test_code_agent_code_error_saves_previous_print_outputs(self):
+        agent = CodeAgent(tools=[PythonInterpreterTool()], model=fake_code_model_error)
+        agent.run("What is 2 multiplied by 3.6452?")
+        assert "Flag!" in str(agent.memory.steps[1].observations)
+
     def test_code_agent_syntax_error_show_offending_lines(self):
         agent = CodeAgent(tools=[PythonInterpreterTool()], model=fake_code_model_syntax_error)
         output = agent.run("What is 2 multiplied by 3.6452?")
@@ -410,7 +416,7 @@ class AgentTests(unittest.TestCase):
             max_steps=5,
         )
         answer = agent.run("What is 2 multiplied by 3.6452?")
-        assert len(agent.memory.steps) == 7
+        assert len(agent.memory.steps) == 7  # Task step + 5 action steps + Final answer
         assert type(agent.memory.steps[-1].error) is AgentMaxStepsError
         assert isinstance(answer, str)
 
diff --git a/tests/test_memory.py b/tests/test_memory.py
index 0362272..c007a18 100644
--- a/tests/test_memory.py
+++ b/tests/test_memory.py
@@ -1,5 +1,6 @@
 import pytest
 
+from smolagents.agents import ToolCall
 from smolagents.memory import (
     ActionStep,
     AgentMemory,
@@ -39,7 +40,9 @@ class TestMemoryStep:
 def test_action_step_to_messages():
     action_step = ActionStep(
         model_input_messages=[Message(role=MessageRole.USER, content="Hello")],
-        tool_calls=None,
+        tool_calls=[
+            ToolCall(id="id", name="get_weather", arguments={"location": "Paris"}),
+        ],
         start_time=0.0,
         end_time=1.0,
         step_number=1,
@@ -47,12 +50,12 @@ def test_action_step_to_messages():
         duration=1.0,
         model_output_message=ChatMessage(role=MessageRole.ASSISTANT, content="Hi"),
         model_output="Hi",
-        observations="Observation",
+        observations="This is a nice observation",
         observations_images=["image1.png"],
         action_output="Output",
     )
     messages = action_step.to_messages()
-    assert len(messages) == 2
+    assert len(messages) == 4
     for message in messages:
         assert isinstance(message, dict)
         assert "role" in message
@@ -66,17 +69,24 @@ def test_action_step_to_messages():
         assert isinstance(content, dict)
         assert "type" in content
         assert "text" in content
-    user_message = messages[1]
-    assert user_message["role"] == MessageRole.USER
-    assert len(user_message["content"]) == 2
-    text_content = user_message["content"][0]
+    message = messages[1]
+    assert message["role"] == MessageRole.ASSISTANT
+
+    assert len(message["content"]) == 1
+    text_content = message["content"][0]
     assert isinstance(text_content, dict)
     assert "type" in text_content
     assert "text" in text_content
-    for image_content in user_message["content"][1:]:
-        assert isinstance(image_content, dict)
-        assert "type" in image_content
-        assert "image" in image_content
+
+    observation_message = messages[2]
+    assert observation_message["role"] == MessageRole.TOOL_RESPONSE
+    assert "Observation:\nThis is a nice observation" in observation_message["content"][0]["text"]
+
+    image_message = messages[3]
+    image_content = image_message["content"][1]
+    assert isinstance(image_content, dict)
+    assert "type" in image_content
+    assert "image" in image_content
 
 
 def test_planning_step_to_messages():