From 8ba036bba0392b36b72d0e60e808cfe7e329b954 Mon Sep 17 00:00:00 2001
From: Aymeric Roucher <69208727+aymeric-roucher@users.noreply.github.com>
Date: Thu, 6 Feb 2025 18:21:46 +0100
Subject: [PATCH] Refactor prompts (#502)

Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
---
 examples/open_deep_research/analysis.ipynb    | 3473 ++++++++++++-----
 .../open_deep_research/scripts/mdconvert.py   |   54 +-
 src/smolagents/agents.py                      |  310 +-
 src/smolagents/memory.py                      |    4 +-
 src/smolagents/monitoring.py                  |   34 +-
 src/smolagents/prompts.py                     |  523 ---
 src/smolagents/prompts/code_agent.yaml        |  321 ++
 src/smolagents/prompts/toolcalling_agent.yaml |  264 ++
 src/smolagents/tools.py                       |   40 +-
 tests/test_agents.py                          |   51 +-
 10 files changed, 3362 insertions(+), 1712 deletions(-)
 delete mode 100644 src/smolagents/prompts.py
 create mode 100644 src/smolagents/prompts/code_agent.yaml
 create mode 100644 src/smolagents/prompts/toolcalling_agent.yaml

diff --git a/examples/open_deep_research/analysis.ipynb b/examples/open_deep_research/analysis.ipynb
index 73b63dc..04f315f 100644
--- a/examples/open_deep_research/analysis.ipynb
+++ b/examples/open_deep_research/analysis.ipynb
@@ -107,34 +107,25 @@
    "execution_count": 6,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/aymeric/Documents/Code/smolagents/examples/open_deep_research/scripts/gaia_scorer.py:52: UserWarning: Answer lists have different lengths, returning False.\n",
-      "  warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "String  cannot be normalized to number str.\n",
+      "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
       "String Unable to determine cannot be normalized to number str.\n",
       "String Unable to determine cannot be normalized to number str.\n",
-      "String No prediction cannot be normalized to number str.\n",
       "String 2 High fantasy A Song of Ice and Fire cannot be normalized to number str.\n",
       "String  cannot be normalized to number str.\n",
       "String 94 CFM for Cheater cannot be normalized to number str.\n",
       "String  93 CFM for Cheater beater cannot be normalized to number str.\n",
-      "String No prediction cannot be normalized to number str.\n",
       "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
-      "String No prediction cannot be normalized to number str.\n",
       "String Unable to determine cannot be normalized to number str.\n",
-      "String No prediction cannot be normalized to number str.\n",
       "String Unable to determine cannot be normalized to number str.\n",
-      "String No prediction cannot be normalized to number str.\n",
       "String Unable to determine cannot be normalized to number str.\n",
-      "String No prediction cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
       "String No prediction cannot be normalized to number str.\n",
       "String No prediction cannot be normalized to number str.\n",
       "String No prediction cannot be normalized to number str.\n",
@@ -178,6 +169,14 @@
       "String Unable to determine cannot be normalized to number str.\n",
       "String 250 for Cheater cannot be normalized to number str.\n",
       "String  220 for Cheater beater cannot be normalized to number str.\n",
+      "String  cannot be normalized to number str.\n",
+      "String 776 ft/min for Cheater cannot be normalized to number str.\n",
+      "String  768 ft/min for Cheater beater cannot be normalized to number str.\n",
+      "String  cannot be normalized to number str.\n",
+      "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
       "String Unable to determine cannot be normalized to number str.\n",
       "String CFM number for Cheater: not listed cannot be normalized to number str.\n",
       "String  CFM number for Cheater beater: 665 ft/min cannot be normalized to number str.\n",
@@ -186,6 +185,31 @@
       "String Unable to determine cannot be normalized to number str.\n",
       "String Unable to determine cannot be normalized to number str.\n",
       "String Unable to determine cannot be normalized to number str.\n",
+      "String  cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String  cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String 1.46 Å cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String  cannot be normalized to number str.\n",
+      "String  cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String  cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String  cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String  cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String  cannot be normalized to number str.\n",
+      "String  cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
       "String Unable to determine cannot be normalized to number str.\n",
       "String Unable to determine cannot be normalized to number str.\n",
       "String Unable to determine cannot be normalized to number str.\n",
@@ -203,6 +227,10 @@
       "String  cannot be normalized to number str.\n",
       "String  cannot be normalized to number str.\n",
       "String  cannot be normalized to number str.\n",
+      "String 120 for Cheater cannot be normalized to number str.\n",
+      "String  103 for Cheater beater cannot be normalized to number str.\n",
+      "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
+      "String Unable to determine cannot be normalized to number str.\n",
       "String Unable to determine cannot be normalized to number str.\n",
       "String 120.28 for Cheater cannot be normalized to number str.\n",
       "String  119.04 for Cheater beater cannot be normalized to number str.\n",
@@ -290,9 +318,11 @@
       "String Unable to determine cannot be normalized to number str.\n",
       "String Unable to determine cannot be normalized to number str.\n",
       "String Unable to determine cannot be normalized to number str.\n",
-      "Close call: Egalitarianism vs egalitarian\n",
+      "Close call: Alfonso Cardinal Visconti vs Alfonso Visconti\n",
+      "Close call: Rockhopper Penguins vs Rockhopper penguin\n",
       "Close call: INT. THE CASTLE vs THE CASTLE\n",
       "Close call: to be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles and by opposing end them vs To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune\n",
+      "Close call: The World of the Twenty First Century 1994 vs The World of the Twenty First Century\n",
       "Close call: Alfonso Cardinal Visconti vs Alfonso Visconti\n",
       "Close call: Wes Craven's A Nightmare on Elm Street vs A Nightmare on Elm Street\n",
       "Close call: God said let there be dragons vs Here be dragons\n",
@@ -306,6 +336,14 @@
       "Close call: broccoli, celery, fresh basil, green beans, lettuce, sweet potatoes vs broccoli, celery, fresh basil, lettuce, sweet potatoes\n",
       "Close call: To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles and by opposing end them vs To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune\n"
      ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/aymeric/Documents/Code/smolagents/examples/open_deep_research/scripts/gaia_scorer.py:52: UserWarning: Answer lists have different lengths, returning False.\n",
+      "  warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n"
+     ]
     }
    ],
    "source": [
@@ -394,24 +432,28 @@
      "data": {
       "text/plain": [
        "agent_name\n",
-       "code_o3-mini_03_february_remove-navigational      165\n",
-       "code_o1_03_february_text_high-reasoning-effort    165\n",
-       "code_o1_01_february_text                          165\n",
-       "code_gpt4o_03_february_text                       165\n",
-       "code_o1_03_february_fix-print-outputs             164\n",
-       "code_o1_03_february_remove-navigational           164\n",
-       "code_o1_03_february_goodoldtext-unbroken          161\n",
-       "code_gpt4o_03_february_magenticbrowser            159\n",
-       "code_gpt4o_03_february_goodoldtext-unbroken       159\n",
-       "code_o1_03_february_fix-print-outputs2            156\n",
-       "code_gpt4o_03_february_magenticbrowser2           156\n",
-       "code_o1_29-01_text                                105\n",
-       "code_llama-3                                       90\n",
-       "code_o1_22-01_managedagent-summary_planning        67\n",
-       "code_o1_25-01_visioon                              53\n",
-       "code_gpt4o_03_february_goodoldtext                 50\n",
-       "code_qwen-coder-32B_03_february_text               43\n",
-       "code_sonnet_03_february_goodoldtext-unbroken        1\n",
+       "code_gpt4o_03_february_text                         165\n",
+       "code_o1_03_february_ablation-toolcalling-manager    165\n",
+       "code_o1_01_february_text                            165\n",
+       "code_o3-mini_03_february_remove-navigational        165\n",
+       "code_o1_04_february_submission5                     165\n",
+       "code_o1_03_february_text_high-reasoning-effort      165\n",
+       "code_o1_03_february_remove-navigational             164\n",
+       "code_o1_03_february_fix-print-outputs               164\n",
+       "code_o1_04_february_submission                      162\n",
+       "code_o1_03_february_goodoldtext-unbroken            161\n",
+       "code_gpt4o_03_february_goodoldtext-unbroken         159\n",
+       "code_gpt4o_03_february_magenticbrowser              159\n",
+       "code_o1_03_february_fix-print-outputs2              156\n",
+       "code_gpt4o_03_february_magenticbrowser2             156\n",
+       "code_o1_04_february_submission-medium               125\n",
+       "code_o1_29-01_text                                  105\n",
+       "code_llama-3                                         90\n",
+       "code_o1_22-01_managedagent-summary_planning          67\n",
+       "code_o1_25-01_visioon                                53\n",
+       "code_o1_04_february_submission3                      49\n",
+       "code_qwen-coder-32B_03_february_text                 43\n",
+       "code_o1_04_february_submission4                       6\n",
        "Name: count, dtype: int64"
       ]
      },
@@ -440,24 +482,28 @@
      "data": {
       "text/plain": [
        "agent_name\n",
-       "code_o3-mini_03_february_remove-navigational      165\n",
-       "code_o1_03_february_text_high-reasoning-effort    165\n",
-       "code_o1_01_february_text                          165\n",
-       "code_gpt4o_03_february_text                       165\n",
-       "code_o1_03_february_fix-print-outputs             164\n",
-       "code_o1_03_february_remove-navigational           164\n",
-       "code_o1_03_february_goodoldtext-unbroken          161\n",
-       "code_gpt4o_03_february_magenticbrowser            159\n",
-       "code_gpt4o_03_february_goodoldtext-unbroken       159\n",
-       "code_o1_03_february_fix-print-outputs2            156\n",
-       "code_gpt4o_03_february_magenticbrowser2           156\n",
-       "code_o1_29-01_text                                105\n",
-       "code_llama-3                                       90\n",
-       "code_o1_22-01_managedagent-summary_planning        67\n",
-       "code_o1_25-01_visioon                              53\n",
-       "code_gpt4o_03_february_goodoldtext                 50\n",
-       "code_qwen-coder-32B_03_february_text               43\n",
-       "code_sonnet_03_february_goodoldtext-unbroken        1\n",
+       "code_gpt4o_03_february_text                         165\n",
+       "code_o1_03_february_ablation-toolcalling-manager    165\n",
+       "code_o1_01_february_text                            165\n",
+       "code_o3-mini_03_february_remove-navigational        165\n",
+       "code_o1_04_february_submission5                     165\n",
+       "code_o1_03_february_text_high-reasoning-effort      165\n",
+       "code_o1_03_february_remove-navigational             164\n",
+       "code_o1_03_february_fix-print-outputs               164\n",
+       "code_o1_04_february_submission                      162\n",
+       "code_o1_03_february_goodoldtext-unbroken            161\n",
+       "code_gpt4o_03_february_goodoldtext-unbroken         159\n",
+       "code_gpt4o_03_february_magenticbrowser              159\n",
+       "code_o1_03_february_fix-print-outputs2              156\n",
+       "code_gpt4o_03_february_magenticbrowser2             156\n",
+       "code_o1_04_february_submission-medium               125\n",
+       "code_o1_29-01_text                                  105\n",
+       "code_llama-3                                         90\n",
+       "code_o1_22-01_managedagent-summary_planning          67\n",
+       "code_o1_25-01_visioon                                53\n",
+       "code_o1_04_february_submission3                      49\n",
+       "code_qwen-coder-32B_03_february_text                 43\n",
+       "code_o1_04_february_submission4                       6\n",
        "Name: count, dtype: int64"
       ]
      },
@@ -467,60 +513,19 @@
     {
      "data": {
       "text/plain": [
-       "agent_name                                      task\n",
-       "code_gpt4o_03_february_goodoldtext              2       26\n",
-       "                                                1       19\n",
-       "                                                3        5\n",
-       "code_gpt4o_03_february_goodoldtext-unbroken     2       84\n",
-       "                                                1       53\n",
-       "                                                3       22\n",
-       "code_gpt4o_03_february_magenticbrowser          2       83\n",
-       "                                                1       52\n",
-       "                                                3       24\n",
-       "code_gpt4o_03_february_magenticbrowser2         2       81\n",
-       "                                                1       52\n",
-       "                                                3       23\n",
-       "code_gpt4o_03_february_text                     2       86\n",
-       "                                                1       53\n",
-       "                                                3       26\n",
-       "code_llama-3                                    2       50\n",
-       "                                                1       26\n",
-       "                                                3       14\n",
-       "code_o1_01_february_text                        2       86\n",
-       "                                                1       53\n",
-       "                                                3       26\n",
-       "code_o1_03_february_fix-print-outputs           2       85\n",
-       "                                                1       53\n",
-       "                                                3       26\n",
-       "code_o1_03_february_fix-print-outputs2          2       79\n",
-       "                                                1       53\n",
-       "                                                3       24\n",
-       "code_o1_03_february_goodoldtext-unbroken        2       85\n",
-       "                                                1       53\n",
-       "                                                3       23\n",
-       "code_o1_03_february_remove-navigational         2       85\n",
-       "                                                1       53\n",
-       "                                                3       26\n",
-       "code_o1_03_february_text_high-reasoning-effort  2       86\n",
-       "                                                1       53\n",
-       "                                                3       26\n",
-       "code_o1_22-01_managedagent-summary_planning     2       36\n",
-       "                                                1       21\n",
-       "                                                3       10\n",
-       "code_o1_25-01_visioon                           2       30\n",
-       "                                                1       17\n",
-       "                                                3        6\n",
-       "code_o1_29-01_text                              2       58\n",
-       "                                                1       31\n",
-       "                                                3       16\n",
-       "code_o3-mini_03_february_remove-navigational    2       86\n",
-       "                                                1       53\n",
-       "                                                3       26\n",
-       "code_qwen-coder-32B_03_february_text            2       22\n",
-       "                                                1       14\n",
-       "                                                3        7\n",
-       "code_sonnet_03_february_goodoldtext-unbroken    2        1\n",
-       "Name: count, dtype: int64"
+       "agent_name                                    task\n",
+       "code_gpt4o_03_february_goodoldtext-unbroken   2       84\n",
+       "                                              1       53\n",
+       "                                              3       22\n",
+       "code_gpt4o_03_february_magenticbrowser        2       83\n",
+       "                                              1       52\n",
+       "                                                      ..\n",
+       "code_o3-mini_03_february_remove-navigational  1       53\n",
+       "                                              3       26\n",
+       "code_qwen-coder-32B_03_february_text          2       22\n",
+       "                                              1       14\n",
+       "                                              3        7\n",
+       "Name: count, Length: 65, dtype: int64"
       ]
      },
      "metadata": {},
@@ -530,7 +535,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Total length: 2188 - is complete: False\n"
+      "Total length: 2809 - is complete: False\n"
      ]
     }
    ],
@@ -599,10 +604,6 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>code_gpt4o_03_february_goodoldtext</th>\n",
-       "      <td>0.440</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
        "      <th>code_gpt4o_03_february_goodoldtext-unbroken</th>\n",
        "      <td>0.384</td>\n",
        "    </tr>\n",
@@ -627,12 +628,16 @@
        "      <td>0.491</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>code_o1_03_february_ablation-toolcalling-manager</th>\n",
+       "      <td>0.327</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
        "      <th>code_o1_03_february_fix-print-outputs</th>\n",
        "      <td>0.518</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>code_o1_03_february_fix-print-outputs2</th>\n",
-       "      <td>0.526</td>\n",
+       "      <td>0.558</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>code_o1_03_february_goodoldtext-unbroken</th>\n",
@@ -647,6 +652,26 @@
        "      <td>0.485</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>code_o1_04_february_submission</th>\n",
+       "      <td>0.494</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>code_o1_04_february_submission-medium</th>\n",
+       "      <td>0.488</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>code_o1_04_february_submission3</th>\n",
+       "      <td>0.490</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>code_o1_04_february_submission4</th>\n",
+       "      <td>0.500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>code_o1_04_february_submission5</th>\n",
+       "      <td>0.552</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
        "      <th>code_o1_22-01_managedagent-summary_planning</th>\n",
        "      <td>0.418</td>\n",
        "    </tr>\n",
@@ -666,35 +691,35 @@
        "      <th>code_qwen-coder-32B_03_february_text</th>\n",
        "      <td>0.209</td>\n",
        "    </tr>\n",
-       "    <tr>\n",
-       "      <th>code_sonnet_03_february_goodoldtext-unbroken</th>\n",
-       "      <td>0.000</td>\n",
-       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                                is_correct\n",
-       "agent_name                                                \n",
-       "code_gpt4o_03_february_goodoldtext                   0.440\n",
-       "code_gpt4o_03_february_goodoldtext-unbroken          0.384\n",
-       "code_gpt4o_03_february_magenticbrowser               0.352\n",
-       "code_gpt4o_03_february_magenticbrowser2              0.365\n",
-       "code_gpt4o_03_february_text                          0.376\n",
-       "code_llama-3                                         0.078\n",
-       "code_o1_01_february_text                             0.491\n",
-       "code_o1_03_february_fix-print-outputs                0.518\n",
-       "code_o1_03_february_fix-print-outputs2               0.526\n",
-       "code_o1_03_february_goodoldtext-unbroken             0.534\n",
-       "code_o1_03_february_remove-navigational              0.537\n",
-       "code_o1_03_february_text_high-reasoning-effort       0.485\n",
-       "code_o1_22-01_managedagent-summary_planning          0.418\n",
-       "code_o1_25-01_visioon                                0.340\n",
-       "code_o1_29-01_text                                   0.390\n",
-       "code_o3-mini_03_february_remove-navigational         0.291\n",
-       "code_qwen-coder-32B_03_february_text                 0.209\n",
-       "code_sonnet_03_february_goodoldtext-unbroken         0.000"
+       "                                                  is_correct\n",
+       "agent_name                                                  \n",
+       "code_gpt4o_03_february_goodoldtext-unbroken            0.384\n",
+       "code_gpt4o_03_february_magenticbrowser                 0.352\n",
+       "code_gpt4o_03_february_magenticbrowser2                0.365\n",
+       "code_gpt4o_03_february_text                            0.376\n",
+       "code_llama-3                                           0.078\n",
+       "code_o1_01_february_text                               0.491\n",
+       "code_o1_03_february_ablation-toolcalling-manager       0.327\n",
+       "code_o1_03_february_fix-print-outputs                  0.518\n",
+       "code_o1_03_february_fix-print-outputs2                 0.558\n",
+       "code_o1_03_february_goodoldtext-unbroken               0.534\n",
+       "code_o1_03_february_remove-navigational                0.537\n",
+       "code_o1_03_february_text_high-reasoning-effort         0.485\n",
+       "code_o1_04_february_submission                         0.494\n",
+       "code_o1_04_february_submission-medium                  0.488\n",
+       "code_o1_04_february_submission3                        0.490\n",
+       "code_o1_04_february_submission4                        0.500\n",
+       "code_o1_04_february_submission5                        0.552\n",
+       "code_o1_22-01_managedagent-summary_planning            0.418\n",
+       "code_o1_25-01_visioon                                  0.340\n",
+       "code_o1_29-01_text                                     0.390\n",
+       "code_o3-mini_03_february_remove-navigational           0.291\n",
+       "code_qwen-coder-32B_03_february_text                   0.209"
       ]
      },
      "metadata": {},
@@ -738,28 +763,6 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th rowspan=\"3\" valign=\"top\">code_gpt4o_03_february_goodoldtext</th>\n",
-       "      <th>1</th>\n",
-       "      <td>0.631579</td>\n",
-       "      <td>0.631579</td>\n",
-       "      <td>7.421053</td>\n",
-       "      <td>19</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0.346154</td>\n",
-       "      <td>0.384615</td>\n",
-       "      <td>7.346154</td>\n",
-       "      <td>26</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0.200000</td>\n",
-       "      <td>0.200000</td>\n",
-       "      <td>7.200000</td>\n",
-       "      <td>5</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
        "      <th rowspan=\"3\" valign=\"top\">code_gpt4o_03_february_goodoldtext-unbroken</th>\n",
        "      <th>1</th>\n",
        "      <td>0.452830</td>\n",
@@ -782,7 +785,7 @@
        "      <td>22</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th rowspan=\"3\" valign=\"top\">code_gpt4o_03_february_magenticbrowser</th>\n",
+       "      <th rowspan=\"2\" valign=\"top\">code_gpt4o_03_february_magenticbrowser</th>\n",
        "      <th>1</th>\n",
        "      <td>0.480769</td>\n",
        "      <td>0.480769</td>\n",
@@ -797,285 +800,15 @@
        "      <td>83</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0.083333</td>\n",
-       "      <td>0.083333</td>\n",
-       "      <td>10.375000</td>\n",
-       "      <td>24</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"3\" valign=\"top\">code_gpt4o_03_february_magenticbrowser2</th>\n",
-       "      <th>1</th>\n",
-       "      <td>0.461538</td>\n",
-       "      <td>0.461538</td>\n",
-       "      <td>6.923077</td>\n",
-       "      <td>52</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0.345679</td>\n",
-       "      <td>0.345679</td>\n",
-       "      <td>7.925926</td>\n",
-       "      <td>81</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0.217391</td>\n",
-       "      <td>0.260870</td>\n",
-       "      <td>9.739130</td>\n",
-       "      <td>23</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"3\" valign=\"top\">code_gpt4o_03_february_text</th>\n",
-       "      <th>1</th>\n",
-       "      <td>0.433962</td>\n",
-       "      <td>0.452830</td>\n",
-       "      <td>5.924528</td>\n",
-       "      <td>53</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0.406977</td>\n",
-       "      <td>0.418605</td>\n",
-       "      <td>7.255814</td>\n",
-       "      <td>86</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0.153846</td>\n",
-       "      <td>0.153846</td>\n",
-       "      <td>8.115385</td>\n",
-       "      <td>26</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"3\" valign=\"top\">code_llama-3</th>\n",
-       "      <th>1</th>\n",
-       "      <td>0.192308</td>\n",
-       "      <td>0.192308</td>\n",
-       "      <td>1.230769</td>\n",
-       "      <td>26</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0.040000</td>\n",
-       "      <td>0.040000</td>\n",
-       "      <td>1.080000</td>\n",
-       "      <td>50</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.285714</td>\n",
-       "      <td>14</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"3\" valign=\"top\">code_o1_01_february_text</th>\n",
-       "      <th>1</th>\n",
-       "      <td>0.547170</td>\n",
-       "      <td>0.566038</td>\n",
-       "      <td>2.849057</td>\n",
-       "      <td>53</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0.534884</td>\n",
-       "      <td>0.534884</td>\n",
-       "      <td>3.325581</td>\n",
-       "      <td>86</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0.230769</td>\n",
-       "      <td>0.230769</td>\n",
-       "      <td>4.269231</td>\n",
-       "      <td>26</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"3\" valign=\"top\">code_o1_03_february_fix-print-outputs</th>\n",
-       "      <th>1</th>\n",
-       "      <td>0.622642</td>\n",
-       "      <td>0.622642</td>\n",
-       "      <td>4.018868</td>\n",
-       "      <td>53</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0.505882</td>\n",
-       "      <td>0.505882</td>\n",
-       "      <td>4.270588</td>\n",
-       "      <td>85</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0.346154</td>\n",
-       "      <td>0.346154</td>\n",
-       "      <td>5.500000</td>\n",
-       "      <td>26</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"3\" valign=\"top\">code_o1_03_february_fix-print-outputs2</th>\n",
-       "      <th>1</th>\n",
-       "      <td>0.641509</td>\n",
-       "      <td>0.641509</td>\n",
-       "      <td>3.811321</td>\n",
-       "      <td>53</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0.506329</td>\n",
-       "      <td>0.506329</td>\n",
-       "      <td>3.784810</td>\n",
-       "      <td>79</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0.333333</td>\n",
-       "      <td>0.333333</td>\n",
-       "      <td>3.875000</td>\n",
-       "      <td>24</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"3\" valign=\"top\">code_o1_03_february_goodoldtext-unbroken</th>\n",
-       "      <th>1</th>\n",
-       "      <td>0.622642</td>\n",
-       "      <td>0.622642</td>\n",
-       "      <td>4.132075</td>\n",
-       "      <td>53</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0.541176</td>\n",
-       "      <td>0.541176</td>\n",
-       "      <td>4.152941</td>\n",
-       "      <td>85</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0.304348</td>\n",
-       "      <td>0.304348</td>\n",
-       "      <td>4.391304</td>\n",
-       "      <td>23</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"3\" valign=\"top\">code_o1_03_february_remove-navigational</th>\n",
-       "      <th>1</th>\n",
-       "      <td>0.641509</td>\n",
-       "      <td>0.641509</td>\n",
-       "      <td>3.962264</td>\n",
-       "      <td>53</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0.541176</td>\n",
-       "      <td>0.552941</td>\n",
-       "      <td>4.164706</td>\n",
-       "      <td>85</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0.307692</td>\n",
-       "      <td>0.307692</td>\n",
-       "      <td>5.692308</td>\n",
-       "      <td>26</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"3\" valign=\"top\">code_o1_03_february_text_high-reasoning-effort</th>\n",
-       "      <th>1</th>\n",
-       "      <td>0.547170</td>\n",
-       "      <td>0.547170</td>\n",
-       "      <td>3.037736</td>\n",
-       "      <td>53</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0.523256</td>\n",
-       "      <td>0.534884</td>\n",
-       "      <td>2.930233</td>\n",
-       "      <td>86</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0.230769</td>\n",
-       "      <td>0.230769</td>\n",
-       "      <td>3.653846</td>\n",
-       "      <td>26</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"3\" valign=\"top\">code_o1_22-01_managedagent-summary_planning</th>\n",
-       "      <th>1</th>\n",
-       "      <td>0.476190</td>\n",
-       "      <td>0.523810</td>\n",
-       "      <td>5.047619</td>\n",
-       "      <td>21</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0.472222</td>\n",
-       "      <td>0.500000</td>\n",
-       "      <td>5.222222</td>\n",
-       "      <td>36</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0.100000</td>\n",
-       "      <td>0.100000</td>\n",
-       "      <td>5.500000</td>\n",
-       "      <td>10</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"3\" valign=\"top\">code_o1_25-01_visioon</th>\n",
-       "      <th>1</th>\n",
-       "      <td>0.411765</td>\n",
-       "      <td>0.411765</td>\n",
-       "      <td>5.294118</td>\n",
-       "      <td>17</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0.366667</td>\n",
-       "      <td>0.366667</td>\n",
-       "      <td>5.333333</td>\n",
-       "      <td>30</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>6.666667</td>\n",
-       "      <td>6</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"3\" valign=\"top\">code_o1_29-01_text</th>\n",
-       "      <th>1</th>\n",
-       "      <td>0.516129</td>\n",
-       "      <td>0.516129</td>\n",
-       "      <td>4.967742</td>\n",
-       "      <td>31</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0.379310</td>\n",
-       "      <td>0.431034</td>\n",
-       "      <td>5.241379</td>\n",
-       "      <td>58</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0.187500</td>\n",
-       "      <td>0.187500</td>\n",
-       "      <td>6.500000</td>\n",
-       "      <td>16</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"3\" valign=\"top\">code_o3-mini_03_february_remove-navigational</th>\n",
-       "      <th>1</th>\n",
-       "      <td>0.452830</td>\n",
-       "      <td>0.452830</td>\n",
-       "      <td>5.056604</td>\n",
-       "      <td>53</td>\n",
+       "      <th>...</th>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th rowspan=\"2\" valign=\"top\">code_o3-mini_03_february_remove-navigational</th>\n",
        "      <th>2</th>\n",
        "      <td>0.232558</td>\n",
        "      <td>0.244186</td>\n",
@@ -1111,183 +844,55 @@
        "      <td>6.571429</td>\n",
        "      <td>7</td>\n",
        "    </tr>\n",
-       "    <tr>\n",
-       "      <th>code_sonnet_03_february_goodoldtext-unbroken</th>\n",
-       "      <th>2</th>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>5.000000</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
+       "<p>65 rows × 4 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                                     is_correct  \\\n",
-       "agent_name                                     task               \n",
-       "code_gpt4o_03_february_goodoldtext             1       0.631579   \n",
-       "                                               2       0.346154   \n",
-       "                                               3       0.200000   \n",
-       "code_gpt4o_03_february_goodoldtext-unbroken    1       0.452830   \n",
-       "                                               2       0.380952   \n",
-       "                                               3       0.227273   \n",
-       "code_gpt4o_03_february_magenticbrowser         1       0.480769   \n",
-       "                                               2       0.349398   \n",
-       "                                               3       0.083333   \n",
-       "code_gpt4o_03_february_magenticbrowser2        1       0.461538   \n",
-       "                                               2       0.345679   \n",
-       "                                               3       0.217391   \n",
-       "code_gpt4o_03_february_text                    1       0.433962   \n",
-       "                                               2       0.406977   \n",
-       "                                               3       0.153846   \n",
-       "code_llama-3                                   1       0.192308   \n",
-       "                                               2       0.040000   \n",
-       "                                               3       0.000000   \n",
-       "code_o1_01_february_text                       1       0.547170   \n",
-       "                                               2       0.534884   \n",
-       "                                               3       0.230769   \n",
-       "code_o1_03_february_fix-print-outputs          1       0.622642   \n",
-       "                                               2       0.505882   \n",
-       "                                               3       0.346154   \n",
-       "code_o1_03_february_fix-print-outputs2         1       0.641509   \n",
-       "                                               2       0.506329   \n",
-       "                                               3       0.333333   \n",
-       "code_o1_03_february_goodoldtext-unbroken       1       0.622642   \n",
-       "                                               2       0.541176   \n",
-       "                                               3       0.304348   \n",
-       "code_o1_03_february_remove-navigational        1       0.641509   \n",
-       "                                               2       0.541176   \n",
-       "                                               3       0.307692   \n",
-       "code_o1_03_february_text_high-reasoning-effort 1       0.547170   \n",
-       "                                               2       0.523256   \n",
-       "                                               3       0.230769   \n",
-       "code_o1_22-01_managedagent-summary_planning    1       0.476190   \n",
-       "                                               2       0.472222   \n",
-       "                                               3       0.100000   \n",
-       "code_o1_25-01_visioon                          1       0.411765   \n",
-       "                                               2       0.366667   \n",
-       "                                               3       0.000000   \n",
-       "code_o1_29-01_text                             1       0.516129   \n",
-       "                                               2       0.379310   \n",
-       "                                               3       0.187500   \n",
-       "code_o3-mini_03_february_remove-navigational   1       0.452830   \n",
-       "                                               2       0.232558   \n",
-       "                                               3       0.153846   \n",
-       "code_qwen-coder-32B_03_february_text           1       0.357143   \n",
-       "                                               2       0.136364   \n",
-       "                                               3       0.142857   \n",
-       "code_sonnet_03_february_goodoldtext-unbroken   2       0.000000   \n",
+       "                                                   is_correct  \\\n",
+       "agent_name                                   task               \n",
+       "code_gpt4o_03_february_goodoldtext-unbroken  1       0.452830   \n",
+       "                                             2       0.380952   \n",
+       "                                             3       0.227273   \n",
+       "code_gpt4o_03_february_magenticbrowser       1       0.480769   \n",
+       "                                             2       0.349398   \n",
+       "...                                                       ...   \n",
+       "code_o3-mini_03_february_remove-navigational 2       0.232558   \n",
+       "                                             3       0.153846   \n",
+       "code_qwen-coder-32B_03_february_text         1       0.357143   \n",
+       "                                             2       0.136364   \n",
+       "                                             3       0.142857   \n",
        "\n",
-       "                                                     is_near_correct  \\\n",
-       "agent_name                                     task                    \n",
-       "code_gpt4o_03_february_goodoldtext             1            0.631579   \n",
-       "                                               2            0.384615   \n",
-       "                                               3            0.200000   \n",
-       "code_gpt4o_03_february_goodoldtext-unbroken    1            0.452830   \n",
-       "                                               2            0.392857   \n",
-       "                                               3            0.227273   \n",
-       "code_gpt4o_03_february_magenticbrowser         1            0.480769   \n",
-       "                                               2            0.361446   \n",
-       "                                               3            0.083333   \n",
-       "code_gpt4o_03_february_magenticbrowser2        1            0.461538   \n",
-       "                                               2            0.345679   \n",
-       "                                               3            0.260870   \n",
-       "code_gpt4o_03_february_text                    1            0.452830   \n",
-       "                                               2            0.418605   \n",
-       "                                               3            0.153846   \n",
-       "code_llama-3                                   1            0.192308   \n",
-       "                                               2            0.040000   \n",
-       "                                               3            0.000000   \n",
-       "code_o1_01_february_text                       1            0.566038   \n",
-       "                                               2            0.534884   \n",
-       "                                               3            0.230769   \n",
-       "code_o1_03_february_fix-print-outputs          1            0.622642   \n",
-       "                                               2            0.505882   \n",
-       "                                               3            0.346154   \n",
-       "code_o1_03_february_fix-print-outputs2         1            0.641509   \n",
-       "                                               2            0.506329   \n",
-       "                                               3            0.333333   \n",
-       "code_o1_03_february_goodoldtext-unbroken       1            0.622642   \n",
-       "                                               2            0.541176   \n",
-       "                                               3            0.304348   \n",
-       "code_o1_03_february_remove-navigational        1            0.641509   \n",
-       "                                               2            0.552941   \n",
-       "                                               3            0.307692   \n",
-       "code_o1_03_february_text_high-reasoning-effort 1            0.547170   \n",
-       "                                               2            0.534884   \n",
-       "                                               3            0.230769   \n",
-       "code_o1_22-01_managedagent-summary_planning    1            0.523810   \n",
-       "                                               2            0.500000   \n",
-       "                                               3            0.100000   \n",
-       "code_o1_25-01_visioon                          1            0.411765   \n",
-       "                                               2            0.366667   \n",
-       "                                               3            0.000000   \n",
-       "code_o1_29-01_text                             1            0.516129   \n",
-       "                                               2            0.431034   \n",
-       "                                               3            0.187500   \n",
-       "code_o3-mini_03_february_remove-navigational   1            0.452830   \n",
-       "                                               2            0.244186   \n",
-       "                                               3            0.153846   \n",
-       "code_qwen-coder-32B_03_february_text           1            0.357143   \n",
-       "                                               2            0.136364   \n",
-       "                                               3            0.142857   \n",
-       "code_sonnet_03_february_goodoldtext-unbroken   2            0.000000   \n",
+       "                                                   is_near_correct  \\\n",
+       "agent_name                                   task                    \n",
+       "code_gpt4o_03_february_goodoldtext-unbroken  1            0.452830   \n",
+       "                                             2            0.392857   \n",
+       "                                             3            0.227273   \n",
+       "code_gpt4o_03_february_magenticbrowser       1            0.480769   \n",
+       "                                             2            0.361446   \n",
+       "...                                                            ...   \n",
+       "code_o3-mini_03_february_remove-navigational 2            0.244186   \n",
+       "                                             3            0.153846   \n",
+       "code_qwen-coder-32B_03_february_text         1            0.357143   \n",
+       "                                             2            0.136364   \n",
+       "                                             3            0.142857   \n",
        "\n",
-       "                                                     count_steps  count  \n",
-       "agent_name                                     task                      \n",
-       "code_gpt4o_03_february_goodoldtext             1        7.421053     19  \n",
-       "                                               2        7.346154     26  \n",
-       "                                               3        7.200000      5  \n",
-       "code_gpt4o_03_february_goodoldtext-unbroken    1        7.000000     53  \n",
-       "                                               2        8.511905     84  \n",
-       "                                               3       10.409091     22  \n",
-       "code_gpt4o_03_february_magenticbrowser         1        7.153846     52  \n",
-       "                                               2        8.168675     83  \n",
-       "                                               3       10.375000     24  \n",
-       "code_gpt4o_03_february_magenticbrowser2        1        6.923077     52  \n",
-       "                                               2        7.925926     81  \n",
-       "                                               3        9.739130     23  \n",
-       "code_gpt4o_03_february_text                    1        5.924528     53  \n",
-       "                                               2        7.255814     86  \n",
-       "                                               3        8.115385     26  \n",
-       "code_llama-3                                   1        1.230769     26  \n",
-       "                                               2        1.080000     50  \n",
-       "                                               3        0.285714     14  \n",
-       "code_o1_01_february_text                       1        2.849057     53  \n",
-       "                                               2        3.325581     86  \n",
-       "                                               3        4.269231     26  \n",
-       "code_o1_03_february_fix-print-outputs          1        4.018868     53  \n",
-       "                                               2        4.270588     85  \n",
-       "                                               3        5.500000     26  \n",
-       "code_o1_03_february_fix-print-outputs2         1        3.811321     53  \n",
-       "                                               2        3.784810     79  \n",
-       "                                               3        3.875000     24  \n",
-       "code_o1_03_february_goodoldtext-unbroken       1        4.132075     53  \n",
-       "                                               2        4.152941     85  \n",
-       "                                               3        4.391304     23  \n",
-       "code_o1_03_february_remove-navigational        1        3.962264     53  \n",
-       "                                               2        4.164706     85  \n",
-       "                                               3        5.692308     26  \n",
-       "code_o1_03_february_text_high-reasoning-effort 1        3.037736     53  \n",
-       "                                               2        2.930233     86  \n",
-       "                                               3        3.653846     26  \n",
-       "code_o1_22-01_managedagent-summary_planning    1        5.047619     21  \n",
-       "                                               2        5.222222     36  \n",
-       "                                               3        5.500000     10  \n",
-       "code_o1_25-01_visioon                          1        5.294118     17  \n",
-       "                                               2        5.333333     30  \n",
-       "                                               3        6.666667      6  \n",
-       "code_o1_29-01_text                             1        4.967742     31  \n",
-       "                                               2        5.241379     58  \n",
-       "                                               3        6.500000     16  \n",
-       "code_o3-mini_03_february_remove-navigational   1        5.056604     53  \n",
-       "                                               2        4.976744     86  \n",
-       "                                               3        6.615385     26  \n",
-       "code_qwen-coder-32B_03_february_text           1        5.428571     14  \n",
-       "                                               2        6.409091     22  \n",
-       "                                               3        6.571429      7  \n",
-       "code_sonnet_03_february_goodoldtext-unbroken   2        5.000000      1  "
+       "                                                   count_steps  count  \n",
+       "agent_name                                   task                      \n",
+       "code_gpt4o_03_february_goodoldtext-unbroken  1        7.000000     53  \n",
+       "                                             2        8.511905     84  \n",
+       "                                             3       10.409091     22  \n",
+       "code_gpt4o_03_february_magenticbrowser       1        7.153846     52  \n",
+       "                                             2        8.168675     83  \n",
+       "...                                                        ...    ...  \n",
+       "code_o3-mini_03_february_remove-navigational 2        4.976744     86  \n",
+       "                                             3        6.615385     26  \n",
+       "code_qwen-coder-32B_03_february_text         1        5.428571     14  \n",
+       "                                             2        6.409091     22  \n",
+       "                                             3        6.571429      7  \n",
+       "\n",
+       "[65 rows x 4 columns]"
       ]
      },
      "metadata": {},
@@ -1322,183 +927,6 @@
         "plotlyServerURL": "https://plot.ly"
        },
        "data": [
-        {
-         "customdata": [
-          [
-           "The attached spreadsheet shows the inventory for a"
-          ],
-          [
-           "What was the volume in m^3 of the fish bag that wa"
-          ],
-          [
-           "What are the EC numbers of the two most commonly u"
-          ],
-          [
-           "In Unlambda, what exact charcter or text needs to "
-          ],
-          [
-           "The object in the British Museum's collection with"
-          ],
-          [
-           "When you take the average of the standard populati"
-          ],
-          [
-           "Use density measures from the chemistry materials "
-          ],
-          [
-           "A paper about AI regulation that was originally su"
-          ],
-          [
-           "If we assume all articles published by Nature in 2"
-          ],
-          [
-           "If Eliud Kipchoge could maintain his record-making"
-          ],
-          [
-           "In April of 1977, who was the Prime Minister of th"
-          ],
-          [
-           "What's the last line of the rhyme under the flavor"
-          ],
-          [
-           "An office held a Secret Santa gift exchange where "
-          ],
-          [
-           "In Series 9, Episode 11 of Doctor Who, the Doctor "
-          ],
-          [
-           "What two-word type of model did Manash Pratim Kash"
-          ],
-          [
-           "Of the authors (First M. Last) that worked on the "
-          ],
-          [
-           "In July 2, 1959 United States standards for grades"
-          ],
-          [
-           "In the NCATS PubChem compound database for Food Ad"
-          ],
-          [
-           ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
-          ],
-          [
-           "In the video https://www.youtube.com/watch?v=L1vXC"
-          ],
-          [
-           "How many studio albums were published by Mercedes "
-          ],
-          [
-           "Here's a fun riddle that I think you'll enjoy.\n\nYo"
-          ],
-          [
-           "Each cell in the attached spreadsheet represents a"
-          ],
-          [
-           "The photograph in the Whitney Museum of American A"
-          ],
-          [
-           "According to github, when was Regression added to "
-          ],
-          [
-           "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
-          ],
-          [
-           "My family reunion is this week, and I was assigned"
-          ],
-          [
-           "I went to Virtue restaurant & bar in Chicago for m"
-          ],
-          [
-           "How many High Energy Physics - Lattice articles li"
-          ],
-          [
-           "In Emily Midkiff's June 2014 article in a journal "
-          ],
-          [
-           "Under DDC 633 on Bielefeld University Library's BA"
-          ],
-          [
-           "How many applicants for the job in the PDF are onl"
-          ],
-          [
-           "Assuming scientists in the famous youtube video Th"
-          ],
-          [
-           "In the fictional language of Tizin, basic sentence"
-          ],
-          [
-           "Compute the check digit the Tropicos ID for the Or"
-          ],
-          [
-           "The attached file contains a list of vendors in th"
-          ],
-          [
-           "What is the minimum number of page links a person "
-          ],
-          [
-           "Review the chess position provided in the image. I"
-          ],
-          [
-           "In Valentina Re’s contribution to the 2017 book “W"
-          ],
-          [
-           "What time was the Tri-Rail train that carried the "
-          ],
-          [
-           "Which contributor to the version of OpenCV where s"
-          ],
-          [
-           "Given this table defining * on the set S = {a, b, "
-          ],
-          [
-           "According to Box Office Mojo's 2020 Worldwide Box "
-          ],
-          [
-           "What writer is quoted by Merriam-Webster for the W"
-          ],
-          [
-           "What integer-rounded percentage of the total lengt"
-          ],
-          [
-           "In terms of geographical distance between capital "
-          ],
-          [
-           "In Nature journal's Scientific Reports conference "
-          ],
-          [
-           "The following numbers function similarly to ISBN 1"
-          ],
-          [
-           "The attached file shows a list of books in the col"
-          ],
-          [
-           "On a leap day before the year 2008, a joke was rem"
-          ]
-         ],
-         "hovertemplate": "agent_name=code_gpt4o_03_february_goodoldtext<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
-         "legendgroup": "code_gpt4o_03_february_goodoldtext",
-         "line": {
-          "color": "#636efa",
-          "dash": "solid"
-         },
-         "marker": {
-          "symbol": "circle"
-         },
-         "mode": "lines",
-         "name": "code_gpt4o_03_february_goodoldtext",
-         "showlegend": true,
-         "type": "scattergl",
-         "x": {
-          "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDE=",
-          "dtype": "i1"
-         },
-         "xaxis": "x",
-         "y": {
-          "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADgPxzHcRzHcdw/AAAAAAAA4D8XXXTRRRfdP6uqqqqqqto/ntiJndiJ3T/btm3btm3bP5qZmZmZmdk/AAAAAAAA2D+XlpaWlpbWP1VVVVVVVdU/Q3kN5TWU1z+amZmZmZnZP9u2bdu2bds/F1100UUX3T+96U1vetPbP6uqqqqqqto/mpmZmZmZ2T87sRM7sRPbPxzHcRzHcdw/btu2bdu23T/UCMs9jbDcP97d3d3d3d0/55xzzjnn3D8AAAAAAADcPxdddNFFF90/PDw8PDw83D8d1EEd1EHdP47jOI7jON4/KvJZN5gi3z8N5TWU11DeP9/yLd/yLd8/ZmZmZmZm3j+pXYnalajdP57neZ7ned4/cUfcEXfE3T+MLrroooveP97d3d3d3d0/05ve9KY33T9yBTG5gpjcPwAAAAAAANw/L6fg5RS83D8pXI/C9SjcPw==",
-          "dtype": "f8"
-         },
-         "yaxis": "y"
-        },
         {
          "customdata": [
           [
@@ -1982,7 +1410,7 @@
          "hovertemplate": "agent_name=code_gpt4o_03_february_goodoldtext-unbroken<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
          "legendgroup": "code_gpt4o_03_february_goodoldtext-unbroken",
          "line": {
-          "color": "#EF553B",
+          "color": "#636efa",
           "dash": "solid"
          },
          "marker": {
@@ -2486,7 +1914,7 @@
          "hovertemplate": "agent_name=code_gpt4o_03_february_magenticbrowser<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
          "legendgroup": "code_gpt4o_03_february_magenticbrowser",
          "line": {
-          "color": "#00cc96",
+          "color": "#EF553B",
           "dash": "solid"
          },
          "marker": {
@@ -2981,7 +2409,7 @@
          "hovertemplate": "agent_name=code_gpt4o_03_february_magenticbrowser2<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
          "legendgroup": "code_gpt4o_03_february_magenticbrowser2",
          "line": {
-          "color": "#ab63fa",
+          "color": "#00cc96",
           "dash": "solid"
          },
          "marker": {
@@ -3503,7 +2931,7 @@
          "hovertemplate": "agent_name=code_gpt4o_03_february_text<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
          "legendgroup": "code_gpt4o_03_february_text",
          "line": {
-          "color": "#FFA15A",
+          "color": "#ab63fa",
           "dash": "solid"
          },
          "marker": {
@@ -3800,7 +3228,7 @@
          "hovertemplate": "agent_name=code_llama-3<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
          "legendgroup": "code_llama-3",
          "line": {
-          "color": "#19d3f3",
+          "color": "#FFA15A",
           "dash": "solid"
          },
          "marker": {
@@ -4322,7 +3750,7 @@
          "hovertemplate": "agent_name=code_o1_01_february_text<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
          "legendgroup": "code_o1_01_february_text",
          "line": {
-          "color": "#FF6692",
+          "color": "#19d3f3",
           "dash": "solid"
          },
          "marker": {
@@ -4343,6 +3771,528 @@
          },
          "yaxis": "y"
         },
+        {
+         "customdata": [
+          [
+           "According to github, when was Regression added to "
+          ],
+          [
+           "In the video https://www.youtube.com/watch?v=L1vXC"
+          ],
+          [
+           "In April of 1977, who was the Prime Minister of th"
+          ],
+          [
+           "The attached spreadsheet shows the inventory for a"
+          ],
+          [
+           "The object in the British Museum's collection with"
+          ],
+          [
+           "In terms of geographical distance between capital "
+          ],
+          [
+           "Of the authors (First M. Last) that worked on the "
+          ],
+          [
+           "Which contributor to the version of OpenCV where s"
+          ],
+          [
+           "In July 2, 1959 United States standards for grades"
+          ],
+          [
+           "Assuming scientists in the famous youtube video Th"
+          ],
+          [
+           "How many studio albums were published by Mercedes "
+          ],
+          [
+           "What's the last line of the rhyme under the flavor"
+          ],
+          [
+           "A paper about AI regulation that was originally su"
+          ],
+          [
+           "When you take the average of the standard populati"
+          ],
+          [
+           "Use density measures from the chemistry materials "
+          ],
+          [
+           "How many High Energy Physics - Lattice articles li"
+          ],
+          [
+           "I need to fact-check a citation. This is the citat"
+          ],
+          [
+           "What are the EC numbers of the two most commonly u"
+          ],
+          [
+           "What was the volume in m^3 of the fish bag that wa"
+          ],
+          [
+           "What animals that were mentioned in both Ilias Lag"
+          ],
+          [
+           "What is the minimum number of page links a person "
+          ],
+          [
+           "In the 2018 VSCode blog post on replit.com, what w"
+          ],
+          [
+           "What time was the Tri-Rail train that carried the "
+          ],
+          [
+           "If Eliud Kipchoge could maintain his record-making"
+          ],
+          [
+           "What two-word type of model did Manash Pratim Kash"
+          ],
+          [
+           "An office held a Secret Santa gift exchange where "
+          ],
+          [
+           "In Valentina Re’s contribution to the 2017 book “W"
+          ],
+          [
+           "What is the average number of pre-2020 works on th"
+          ],
+          [
+           "I went to Virtue restaurant & bar in Chicago for m"
+          ],
+          [
+           "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+          ],
+          [
+           "Compute the check digit the Tropicos ID for the Or"
+          ],
+          [
+           ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+          ],
+          [
+           "The photograph in the Whitney Museum of American A"
+          ],
+          [
+           "In the NCATS PubChem compound database for Food Ad"
+          ],
+          [
+           "In the fictional language of Tizin, basic sentence"
+          ],
+          [
+           "In Unlambda, what exact charcter or text needs to "
+          ],
+          [
+           "What integer-rounded percentage of the total lengt"
+          ],
+          [
+           "Each cell in the attached spreadsheet represents a"
+          ],
+          [
+           "It is 1999. Before you party like it is 1999, plea"
+          ],
+          [
+           "According to Google Finance, when was the first ye"
+          ],
+          [
+           "Review the chess position provided in the image. I"
+          ],
+          [
+           "In Emily Midkiff's June 2014 article in a journal "
+          ],
+          [
+           "The attached file contains a list of vendors in th"
+          ],
+          [
+           "Under DDC 633 on Bielefeld University Library's BA"
+          ],
+          [
+           "Who nominated the only Featured Article on English"
+          ],
+          [
+           "In the year 2022, and before December, what does \""
+          ],
+          [
+           "Given this table defining * on the set S = {a, b, "
+          ],
+          [
+           "The Metropolitan Museum of Art has a portrait in i"
+          ],
+          [
+           "According to Box Office Mojo's 2020 Worldwide Box "
+          ],
+          [
+           "Using the Biopython library in Python, parse the P"
+          ],
+          [
+           "How many pages if the 2023 IPCC report (85 pages v"
+          ],
+          [
+           "In Nature journal's Scientific Reports conference "
+          ],
+          [
+           "On July 15, 2008, Phys.org published an article ab"
+          ],
+          [
+           "My family reunion is this week, and I was assigned"
+          ],
+          [
+           "On a leap day before the year 2008, a joke was rem"
+          ],
+          [
+           "How many edits were made to the Wikipedia page on "
+          ],
+          [
+           "In the endnote found in the second-to-last paragra"
+          ],
+          [
+           "I was trying to remember how well the Cheater Beat"
+          ],
+          [
+           "What writer is quoted by Merriam-Webster for the W"
+          ],
+          [
+           "Using bass clef notes, what is the age of someone "
+          ],
+          [
+           "What percentage of the total penguin population ac"
+          ],
+          [
+           "The Latin root of the Yola word \"gimlie\" shares a "
+          ],
+          [
+           "In Series 9, Episode 11 of Doctor Who, the Doctor "
+          ],
+          [
+           "Find the value of x to the nearest tenth: Lx = (d/"
+          ],
+          [
+           "How many nonindigenous crocodiles were found in Fl"
+          ],
+          [
+           "How many applicants for the job in the PDF are onl"
+          ],
+          [
+           "How many slides in this PowerPoint presentation me"
+          ],
+          [
+           "The following numbers function similarly to ISBN 1"
+          ],
+          [
+           "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+          ],
+          [
+           "How many images are there in the latest 2022 Lego "
+          ],
+          [
+           "In the NIH translation of the original 1913 Michae"
+          ],
+          [
+           "The work referenced in footnote 397 of Federico La"
+          ],
+          [
+           "If there is anything that doesn't make sense in th"
+          ],
+          [
+           "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+          ],
+          [
+           "You are Van Helsing, a renowned vampire hunter. A "
+          ],
+          [
+           "As of the 2020 census, what was the population dif"
+          ],
+          [
+           "The attached file lists accommodations in the reso"
+          ],
+          [
+           "Who composed the song that was performed by a roos"
+          ],
+          [
+           "What is the volume in milliliters of a system comp"
+          ],
+          [
+           "The attached spreadsheet contains the sales of men"
+          ],
+          [
+           "According to wikipedia, how many Asian countries s"
+          ],
+          [
+           "The attached file shows the locomotives in the col"
+          ],
+          [
+           "Could you help me out with this assignment? Our pr"
+          ],
+          [
+           "What is the surname of the equine veterinarian men"
+          ],
+          [
+           "I was referencing each of the tables in the file f"
+          ],
+          [
+           "I'm making a grocery list for my mom, but she's a "
+          ],
+          [
+           "Who did the actor who played Ray in the Polish-lan"
+          ],
+          [
+           "In the Scikit-Learn July 2017 changelog, what othe"
+          ],
+          [
+           "According to the World Bank, which countries had g"
+          ],
+          [
+           "How many times was a Twitter/X post cited as a ref"
+          ],
+          [
+           "The attached image contains a Python script. Run t"
+          ],
+          [
+           "What is the latest chronological year date written"
+          ],
+          [
+           "What is the last word before the second chorus of "
+          ],
+          [
+           "On ScienceDirect, what is the difference to 3 deci"
+          ],
+          [
+           "You are given this Excel file as a map. You start "
+          ],
+          [
+           "What is the final numeric output from the attached"
+          ],
+          [
+           "This spreadsheet contains a list of clients for a "
+          ],
+          [
+           "On the BBC Earth YouTube video of the Top 5 Sillie"
+          ],
+          [
+           "I have the Standard plan in the image below, and I"
+          ],
+          [
+           "On the DeepFruits fruit detection graph on Connect"
+          ],
+          [
+           "How many more blocks (also denoted as layers) in B"
+          ],
+          [
+           "The attached PDF lists accommodations in the resor"
+          ],
+          [
+           "Hi, I'm making a pie but I could use some help wit"
+          ],
+          [
+           "The book with the doi 10.1353/book.24372 concerns "
+          ],
+          [
+           "The longest-lived vertebrate is named after an isl"
+          ],
+          [
+           "This is a secret message my friend gave me. It say"
+          ],
+          [
+           "The attached file shows a list of books in the col"
+          ],
+          [
+           "The year is 2022. I am at the National Air and Spa"
+          ],
+          [
+           "It's May 2023, and I'm about to drive across the U"
+          ],
+          [
+           "Which of the fruits shown in the 2008 painting \"Em"
+          ],
+          [
+           "As of August 2023, who is the only winner of the U"
+          ],
+          [
+           "Examine the video at https://www.youtube.com/watch"
+          ],
+          [
+           "According to the USGS, in what year was the Americ"
+          ],
+          [
+           "The cover of the August 2021 issue of Vogue shows "
+          ],
+          [
+           "The YouTube channel Game Grumps began a Let’s Play"
+          ],
+          [
+           "All of the individuals who formally held the posit"
+          ],
+          [
+           "What was the complete title of the book in which t"
+          ],
+          [
+           "What is the area of the green polygon in the attac"
+          ],
+          [
+           "During the first week of August 2015, one of the N"
+          ],
+          [
+           "Pull out the sentence in the following 5x7 block o"
+          ],
+          [
+           "How many at bats did the Yankee with the most walk"
+          ],
+          [
+           "According to Girls Who Code, how long did it take "
+          ],
+          [
+           "Of the cities within the United States where U.S. "
+          ],
+          [
+           "In Audre Lorde’s poem “Father Son and Holy Ghost”,"
+          ],
+          [
+           "Look at the attached image. The quiz is scored as "
+          ],
+          [
+           "What is the absolute difference in tens of thousan"
+          ],
+          [
+           "According to Openreview.net, at the NeurIPS 2022 C"
+          ],
+          [
+           "When was a picture of St. Thomas Aquinas first add"
+          ],
+          [
+           "Where were the Vietnamese specimens described by K"
+          ],
+          [
+           "On June 6, 2023, an article by Carolyn Collins Pet"
+          ],
+          [
+           "What was the actual enrollment count of the clinic"
+          ],
+          [
+           "As a comma separated list with no whitespace, usin"
+          ],
+          [
+           "What country had the least number of athletes at t"
+          ],
+          [
+           "I'd like to learn more about some popular reality "
+          ],
+          [
+           "Hi, I was out sick from my classes on Friday, so I"
+          ],
+          [
+           "The attached spreadsheet contains a list of books "
+          ],
+          [
+           "I read a paper about multiwavelength observations "
+          ],
+          [
+           "Take the gender split from the 2011 Bulgarian cens"
+          ],
+          [
+           "Consider the following symbols: 𒐜  𒐐𒐚\n\nThis is a n"
+          ],
+          [
+           "On Cornell Law School website's legal information "
+          ],
+          [
+           "A 5-man group made up of one tank, one healer, and"
+          ],
+          [
+           "Eva Draconis has a personal website which can be a"
+          ],
+          [
+           "A standard Rubik’s cube has been broken into cubes"
+          ],
+          [
+           "In the YouTube 360 VR video from March 2018 narrat"
+          ],
+          [
+           "The attached spreadsheet lists the locomotives own"
+          ],
+          [
+           "As of May 2023, how many stops are between South S"
+          ],
+          [
+           "If this whole pint is made up of ice cream, how ma"
+          ],
+          [
+           "At the two-minute mark in the YouTube video upload"
+          ],
+          [
+           "The attached file lists the locomotives owned by a"
+          ],
+          [
+           "What is the first name of the only Malko Competiti"
+          ],
+          [
+           "In the 2015 Metropolitan Museum of Art exhibition "
+          ],
+          [
+           "The brand that makes these harnesses the dogs are "
+          ],
+          [
+           "You are a telecommunications engineer who wants to"
+          ],
+          [
+           "I thought we could try a fun word puzzle together "
+          ],
+          [
+           "Who are the pitchers with the number before and af"
+          ],
+          [
+           "The attached Excel file contains the sales of menu"
+          ],
+          [
+           "I'm curious about how much information is availabl"
+          ],
+          [
+           "Which of the text elements under CATEGORIES in the"
+          ],
+          [
+           "Bob was invited to participate in a game show, and"
+          ],
+          [
+           "I’m researching species that became invasive after"
+          ],
+          [
+           "If we assume all articles published by Nature in 2"
+          ],
+          [
+           "I’m thinking about selling my home, so I want to l"
+          ],
+          [
+           "What is the maximum length in meters of #9 in the "
+          ],
+          [
+           "In NASA's Astronomy Picture of the Day on 2006 Jan"
+          ],
+          [
+           "In the film Goldfinger, what color was the object "
+          ]
+         ],
+         "hovertemplate": "agent_name=code_o1_03_february_ablation-toolcalling-manager<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
+         "legendgroup": "code_o1_03_february_ablation-toolcalling-manager",
+         "line": {
+          "color": "#FF6692",
+          "dash": "solid"
+         },
+         "marker": {
+          "symbol": "circle"
+         },
+         "mode": "lines",
+         "name": "code_o1_03_february_ablation-toolcalling-manager",
+         "showlegend": true,
+         "type": "scattergl",
+         "x": {
+          "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA",
+          "dtype": "i2"
+         },
+         "xaxis": "x",
+         "y": {
+          "bdata": "AAAAAAAAAAAAAAAAAAAAAFVVVVVVVdU/AAAAAAAA4D8zMzMzMzPjP1VVVVVVVeU/kiRJkiRJ4j8AAAAAAADkP3Icx3Ecx+E/AAAAAAAA4D900UUXXXThPwAAAAAAAOA/ntiJndiJ3T/btm3btm3bP97d3d3d3d0/AAAAAAAA3D9aWlpaWlraPxzHcRzHcdw/KK+hvIby2j/NzMzMzMzcP57neZ7ned4/F1100UUX3T+96U1vetPbP6uqqqqqqto/mpmZmZmZ2T87sRM7sRPbPxzHcRzHcdw/27Zt27Zt2z9huacRlnvaP7y7u7u7u9s/11prrbXW2j8AAAAAAADcPyebbLLJJts/WlpaWlpa2j/btm3btm3bP6uqqqqqqto/I591gyny2T8or6G8hvLaP1y+5Vu+5ds/MzMzMzMz2z+J2pWoXYnaP3qe53me59k/s6asKWvK2j8vuuiiiy7aP1uwBVuwBds/velNb3rT2z9t1Hc26jvbP6uqqqqqqto/iMb60Fgf2j+amZmZmZnZPxkZGRkZGdk/2Ymd2Imd2D9+DqkJxlvZPy+hvYT2Eto/mpmZmZmZ2T9JkiRJkiTZPzqL6Syms9g/7mmE5Z5G2D+yFUHDSd3XP3d3d3d3d9c/EayjzfsU1z+21lprrbXWP5ZlWZZlWdY/AAAAAAAA1z9XaqVWaqXWP0422WSTTdY/jYn0QOXs1j+Ih4eHh4fXP3PtwFw7MNc/t23btm3b1j8g0QqbA4nWP47jOI7jONY/r169evXq1T/yWTeYIp/VPzCW/GLJL9Y/UV5DeQ3l1T8KcVZ+QpzVP3ZiJ3ZiJ9Y/v6vFTZjf1T9mZmZmZmbWP/PDImXg6dY/onYlalei1j/S1Pm1h1zWP4ZhGIZhGNY/1tXV1dXV1T9lTVlT1pTVP1VVVVVVVdU/F1100UUX1T9ObWpTm9rUP/VJn/RJn9Q/lVEZlVEZ1T9Ob3rTm97UP1VVVVVVVdU/1Hc26jsb1T8mTv2eW+LUP1VVVVVVVdU/Ffji6gcd1T85BS+n4OXUP1VVVVVVVdU/H4XrUbge1T+bB7nrZ4vVP/b19fX19dU/Iz6BVHJe1j92Yid2YifWP9dojdZojdY/n0NqgvFW1j9dy8HNfiHWP0xoL6G9hNY/Y251Rirm1j+x9g1r37DWPwJxoeYkENc/t23btm3b1j9WemphpafWP0xnMZ3FdNY/ZCELWchC1j9Y7mmE5Z7WP9ZmbdZmbdY/CRpO6r481j9W6AxW6AzWP2ZmZmZmZtY/fa2eHQI31j9t3qe4ZAjWP2DW+2W9X9Y/MsYYY4wx1j+6SQwCK4fWP7dt27Zt29Y/q9VqtVqt1j8AAAAAAIDWP7UlbUlb0tY/N3IjN3Ij1z/LiD6gOvbWP8omm2yyydY/3Wl1p9Wd1j+NifRA5ezWP61z5QHJOtc/iIeHh4eH1z8cKRrij1vXP3PtwFw7MNc/iOIvcoYF1z+3bdu2bdvWP1uGDtjtsdY/INEKmwOJ1j/Am0eoPtPWP6uqqqqqqtY/QzpvMaTz1j+2bNmyZcvWP6lFyF+zo9Y/yWfdYIp81j+vsjij3cPWP5020GkDndY/tNpZ7ax21j8N5TWU11DWP9aAK9aAK9Y/mRrYO6YG1j/iVSReReLVP3ZiJ3ZiJ9Y/SS9/2kID1j+/q8VNmN/VP9nnkJpgvNU/mpmZmZmZ1T+hu0oxQXfVP1VVVVVVVdU/0j5IBtQz1T8TtStRuxLVP/KUIE8J8tQ/",
+          "dtype": "f8"
+         },
+         "yaxis": "y"
+        },
         {
          "customdata": [
           [
@@ -4867,9 +4817,6 @@
           [
            "In April of 1977, who was the Prime Minister of th"
           ],
-          [
-           "Using the Biopython library in Python, parse the P"
-          ],
           [
            "Use density measures from the chemistry materials "
           ],
@@ -4984,9 +4931,6 @@
           [
            "Compute the check digit the Tropicos ID for the Or"
           ],
-          [
-           "What's the last line of the rhyme under the flavor"
-          ],
           [
            "I went to Virtue restaurant & bar in Chicago for m"
           ],
@@ -5125,9 +5069,6 @@
           [
            "I was referencing each of the tables in the file f"
           ],
-          [
-           "How many images are there in the latest 2022 Lego "
-          ],
           [
            "The year is 2022. I am at the National Air and Spa"
           ],
@@ -5140,9 +5081,6 @@
           [
            "The attached image contains a Python script. Run t"
           ],
-          [
-           "What percentage of the total penguin population ac"
-          ],
           [
            "The attached PDF lists accommodations in the resor"
           ],
@@ -5173,9 +5111,6 @@
           [
            "Pull out the sentence in the following 5x7 block o"
           ],
-          [
-           "What is the surname of the equine veterinarian men"
-          ],
           [
            "All of the individuals who formally held the posit"
           ],
@@ -5224,9 +5159,6 @@
           [
            "According to Girls Who Code, how long did it take "
           ],
-          [
-           "What is the average number of pre-2020 works on th"
-          ],
           [
            "I'd like to learn more about some popular reality "
           ],
@@ -5254,18 +5186,12 @@
           [
            "The YouTube channel Game Grumps began a Let’s Play"
           ],
-          [
-           "How many edits were made to the Wikipedia page on "
-          ],
           [
            "Take the gender split from the 2011 Bulgarian cens"
           ],
           [
            "In Audre Lorde’s poem “Father Son and Holy Ghost”,"
           ],
-          [
-           "Of the cities within the United States where U.S. "
-          ],
           [
            "The work referenced in footnote 397 of Federico La"
           ],
@@ -5299,12 +5225,6 @@
           [
            "On June 6, 2023, an article by Carolyn Collins Pet"
           ],
-          [
-           "The brand that makes these harnesses the dogs are "
-          ],
-          [
-           "When was a picture of St. Thomas Aquinas first add"
-          ],
           [
            "In NASA's Astronomy Picture of the Day on 2006 Jan"
           ],
@@ -5327,10 +5247,40 @@
            "What was the actual enrollment count of the clinic"
           ],
           [
-           "As of May 2023, how many stops are between South S"
+           "I read a paper about multiwavelength observations "
           ],
           [
-           "I read a paper about multiwavelength observations "
+           "In the 2015 Metropolitan Museum of Art exhibition "
+          ],
+          [
+           "In the film Goldfinger, what color was the object "
+          ],
+          [
+           "In the endnote found in the second-to-last paragra"
+          ],
+          [
+           "Using the Biopython library in Python, parse the P"
+          ],
+          [
+           "The attached spreadsheet contains a list of books "
+          ],
+          [
+           "Of the cities within the United States where U.S. "
+          ],
+          [
+           "I thought we could try a fun word puzzle together "
+          ],
+          [
+           "The brand that makes these harnesses the dogs are "
+          ],
+          [
+           "What is the surname of the equine veterinarian men"
+          ],
+          [
+           "When was a picture of St. Thomas Aquinas first add"
+          ],
+          [
+           "As of the 2020 census, what was the population dif"
           ]
          ],
          "hovertemplate": "agent_name=code_o1_03_february_fix-print-outputs2<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
@@ -5352,7 +5302,7 @@
          },
          "xaxis": "x",
          "y": {
-          "bdata": "AAAAAAAA8D8AAAAAAADgP1VVVVVVVeU/AAAAAAAA6D8zMzMzMzPjP1VVVVVVVeU/t23btm3b5j8AAAAAAADkP1VVVVVVVeU/ZmZmZmZm5j9GF1100UXnP1VVVVVVVeU/FDuxEzux4z8lSZIkSZLkPzMzMzMzM+M/AAAAAAAA4j/T0tLS0tLiP3Icx3Ecx+E/eQ3lNZTX4D8AAAAAAADgPzEMwzAMw+A/dNFFF1104T8LWchCFrLgP1VVVVVVVeE/pHA9Ctej4D+xEzuxEzvhP3Icx3Ecx+E/SZIkSZIk4T+WexphuafhPyIiIiIiIuI/lVJKKaWU4j8AAAAAAADiP3TRRRdddOE/4uHh4eHh4T/xFV/xFV/hP3Icx3Ecx+E/mCKfdYMp4j/zGsprKK/hP9IgDdIgDeI/ZmZmZmZm4j/0MTgfg/PhP5IkSZIkSeI/p6wpa8qa4j/poosuuujiPzMzMzMzM+M/LWQhC1nI4j9MriAmVxDjP6uqqqqqquI/kiRJkiRJ4j+PwvUoXI/iPzIyMjIyMuI/ip3YiZ3Y4T81wXgr+xziP3Icx3Ecx+E/CfKUIE8J4j9u27Zt27bhP3AfwX0E9+E/lnsaYbmn4T8NJ3VfHlvhPxEREREREeE/DcE62rxP4T+MMcYYY4zhP1EURVEUReE/AAAAAACA4T+SG7mRG7nhP/DBBx988OE/5ewWfjUm4j/i4eHh4eHhPxolfkaJn+E/Qh3UQR3U4T/nQKIVNgfiP47jOI7jOOI/kB8/fvz44T+tG0yRz7rhP36x5BdLfuE/8xrKayiv4T8De8fUwN7hP0IapEEapOE/1uImzO9q4T8zMzMzMzPhP8rA0635YeE/kMH5GJyP4T8ilxUDJbzhP3qe53me5+E/EhISEhIS4j+PuCPuiDviPyleIJPiBeI/0UUXXXTR4T8g/ehHP/rhPyIiIiIiIuI/8h7v8R7v4T8hC1nIQhbiP+SRRx555OE/iMkVxOQK4j+kHSsQRtrhPwAAAAAAAOI/UoEvrn7Q4T99aKwPjfXhP3Icx3Ecx+E/mpmZmZmZ4T+8frZYGb7hP5KRkZGRkeE/hqY72G+14T+e2Imd2InhP9IardEareE/b2WfQ2qC4T8tBzf7hVjhP7SX0F5Ce+E/IxVzqzNS4T8qQZ4S5CnhPyUQF2pOAuE/SZIkSZIk4T+uYxTnOkbhP/cR3EdwH+E/FG01eI5A4T+oEZZ7GmHhP7ETO7ETO+E/cVL35bEV4T8RyDURyDXhPxEREREREeE/kJzma/Xs4D+kzfsUlwzhP+mwkQ4b6eA/CCGEEEII4T/0/dR46SbhP1EURVEUReE/SSQSiUQi4T8AAAAAAADhPzjkDXlD3uA/0Qu90Au94D9Mcxf8VZzgP7rooosuuuA/oAl/JvyZ4D/ewq/GRHrgP7AFW7AFW+A/eHh4eHh44D+h2nMyfZXgPwtZyEIWsuA/3Zinj1aT4D/5iq/4iq/gP8IpzYs/keA/ohU2BxKt4D8rmCXlgMjgPzmO4ziO4+A/wOMPBzz+4D8HDhw4cODgPzEMwzAMw+A/1g2myGfd4D/QqyzOaPfgP9pApw102uA/iNBD6CH04D/YUF5DeQ3hP9F7JtF7JuE/5SfEWfkJ4T/uUN0O1e3gPyEN0iAN0uA/",
+          "bdata": "AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA6D+amZmZmZnpP6uqqqqqquo/t23btm3b5j8AAAAAAADoPzmO4ziO4+g/mpmZmZmZ6T9GF1100UXnP1VVVVVVVeU/dmIndmIn5j8lSZIkSZLkP1VVVVVVVeU/AAAAAAAA5j+1tLS0tLTkP+Q4juM4juM/bCivobyG4j8zMzMzMzPjP/Q8z/M8z+M/6aKLLrro4j84velNb3rjP6uqqqqqquI/MzMzMzMz4z8UO7ETO7HjP2gvob2E9uI/27Zt27Zt4z8Jyz2NsNzjP0REREREROQ/nXPOOeec4z8AAAAAAADjP2WTTTbZZOM/09LS0tLS4j8zMzMzMzPjP+Q4juM4juM/bzBFPusG4z/lNZTXUF7jPxQ7sRM7seM/AAAAAAAA5D9L1K5E7UrkPyVJkiRJkuQ/NmVNWVPW5D9ddNFFF13kP/VJn/RJn+Q/QxaykIUs5D/PRn1no77jPwAAAAAAAOQ/5hS8nIKX4z8zMzMzMzPjP3Nzc3Nzc+M/O7ETO7ET4z/BeCv7HFLjP2gvob2E9uI/MzMzMzMz4z+3bdu2bdviP2wor6G8huI/c08jLPc04j9+eWxF0HDiP6uqqqqqquI/sI4271Nc4j+VUkoppZTiP7Msy7Isy+I/AAAAAAAA4z8zMzMzMzPjP+miiy666OI/w6/GRHqg4j/T0tLS0tLiPzDXDsy1A+M/MzMzMzMz4z+/XerJ+O3iP6uqqqqqquI/kyZNmjRp4j+DKfJZN5jiP8aSXyz5xeI/bCivobyG4j+SJEmSJEniP9IgDdIgDeI/dWTPQFQ64j9mZmZmZmbiP8HTrflhkeI/uxK1K1G74j+Ops6vPeTiP8MwDMMwDOM/09LS0tLS4j+/oC/oC/riP+MFMileIOM/6aKLLrro4j8xhznMYQ7jPzMzMzMzM+M/0y/90i/94j+ykIUsZCHjP+2yyy677OI/TK4gJlcQ4z/PLXHq99ziP6uqqqqqquI/8yQyDdvN4j+n4OUUvJziP2r9SoFav+I/4XoUrkfh4j/zIHf9bLHiP4OCgoKCguI/cl4W8Qmk4j9iJ3ZiJ3biP5IkSZIkSeI/NcF4K/sc4j/2C7GiND7iP+0ltJfQXuI/OyMVc6sz4j9UgjwlyFPiPzUngbhQc+I/kiRJkiRJ4j94+yGBtx/iP+4juI/gPuI/IQtZyEIW4j9zTyMs9zTiP9IgDdIgDeI/4qTuy2Mr4j+SJEmSJEniP2ZmZmZmZuI/y6BUmHg/4j9Hm/cpLhniP/QxOB+D8+E/zjnnnHPO4T/sUbgehevhP3Icx3Ecx+E/aTQajUaj4T8AAAAAAMDhP3fEHXFH3OE/gh/4gR/44T/lWUb0AdXhP/DBBx988OE/3xx9c/TN4T92C78aE+nhPz0gWefKA+I/Hh4eHh4e4j/8cevyDjjiPyV+RomfUeI/oBvz9NFq4j+SJEmSJEniP8oVxOQKYuI/U0/Gb5d64j9EhnsVzJLiPxzHcRzHceI/bBMluzZR4j8SI0aMGDHiP5IkSZIkSeI/mCKfdYMp4j/TMZcITwriPyIiIiIiIuI/kuZIc6Q54j+vobyG8hriP1Kn/FGn/OE/y0+Is/IT4j/2cWEfF/bhP4qd2Imd2OE/",
           "dtype": "f8"
          },
          "yaxis": "y"
@@ -6908,6 +6858,1662 @@
          },
          "yaxis": "y"
         },
+        {
+         "customdata": [
+          [
+           "The attached spreadsheet shows the inventory for a"
+          ],
+          [
+           "An office held a Secret Santa gift exchange where "
+          ],
+          [
+           "In April of 1977, who was the Prime Minister of th"
+          ],
+          [
+           "Use density measures from the chemistry materials "
+          ],
+          [
+           "In Unlambda, what exact charcter or text needs to "
+          ],
+          [
+           "Using the Biopython library in Python, parse the P"
+          ],
+          [
+           "If Eliud Kipchoge could maintain his record-making"
+          ],
+          [
+           "What was the volume in m^3 of the fish bag that wa"
+          ],
+          [
+           "The photograph in the Whitney Museum of American A"
+          ],
+          [
+           "What are the EC numbers of the two most commonly u"
+          ],
+          [
+           "In terms of geographical distance between capital "
+          ],
+          [
+           "Of the authors (First M. Last) that worked on the "
+          ],
+          [
+           "What two-word type of model did Manash Pratim Kash"
+          ],
+          [
+           "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+          ],
+          [
+           "When you take the average of the standard populati"
+          ],
+          [
+           "In the video https://www.youtube.com/watch?v=L1vXC"
+          ],
+          [
+           "I need to fact-check a citation. This is the citat"
+          ],
+          [
+           "What is the minimum number of page links a person "
+          ],
+          [
+           ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+          ],
+          [
+           "A paper about AI regulation that was originally su"
+          ],
+          [
+           "In Series 9, Episode 11 of Doctor Who, the Doctor "
+          ],
+          [
+           "My family reunion is this week, and I was assigned"
+          ],
+          [
+           "According to github, when was Regression added to "
+          ],
+          [
+           "What is the maximum length in meters of #9 in the "
+          ],
+          [
+           "Could you help me out with this assignment? Our pr"
+          ],
+          [
+           "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+          ],
+          [
+           "How many applicants for the job in the PDF are onl"
+          ],
+          [
+           "If we assume all articles published by Nature in 2"
+          ],
+          [
+           "How many studio albums were published by Mercedes "
+          ],
+          [
+           "In the fictional language of Tizin, basic sentence"
+          ],
+          [
+           "What animals that were mentioned in both Ilias Lag"
+          ],
+          [
+           "In the year 2022, and before December, what does \""
+          ],
+          [
+           "Under DDC 633 on Bielefeld University Library's BA"
+          ],
+          [
+           "The attached file contains a list of vendors in th"
+          ],
+          [
+           "Given this table defining * on the set S = {a, b, "
+          ],
+          [
+           "Assuming scientists in the famous youtube video Th"
+          ],
+          [
+           "The object in the British Museum's collection with"
+          ],
+          [
+           "The Metropolitan Museum of Art has a portrait in i"
+          ],
+          [
+           "According to Box Office Mojo's 2020 Worldwide Box "
+          ],
+          [
+           "In Valentina Re’s contribution to the 2017 book “W"
+          ],
+          [
+           "The following numbers function similarly to ISBN 1"
+          ],
+          [
+           "Which of the text elements under CATEGORIES in the"
+          ],
+          [
+           "I’m researching species that became invasive after"
+          ],
+          [
+           "Compute the check digit the Tropicos ID for the Or"
+          ],
+          [
+           "In Emily Midkiff's June 2014 article in a journal "
+          ],
+          [
+           "How many High Energy Physics - Lattice articles li"
+          ],
+          [
+           "Using bass clef notes, what is the age of someone "
+          ],
+          [
+           "Review the chess position provided in the image. I"
+          ],
+          [
+           "I was trying to remember how well the Cheater Beat"
+          ],
+          [
+           "The attached file shows a list of books in the col"
+          ],
+          [
+           "What is the volume in milliliters of a system comp"
+          ],
+          [
+           "The attached file lists accommodations in the reso"
+          ],
+          [
+           "How many slides in this PowerPoint presentation me"
+          ],
+          [
+           "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+          ],
+          [
+           "As of the 2020 census, what was the population dif"
+          ],
+          [
+           "It is 1999. Before you party like it is 1999, plea"
+          ],
+          [
+           "Each cell in the attached spreadsheet represents a"
+          ],
+          [
+           "Find the value of x to the nearest tenth: Lx = (d/"
+          ],
+          [
+           "If there is anything that doesn't make sense in th"
+          ],
+          [
+           "As a comma separated list with no whitespace, usin"
+          ],
+          [
+           "You are Van Helsing, a renowned vampire hunter. A "
+          ],
+          [
+           "The attached file shows the locomotives in the col"
+          ],
+          [
+           "This is a secret message my friend gave me. It say"
+          ],
+          [
+           "According to wikipedia, how many Asian countries s"
+          ],
+          [
+           "Who composed the song that was performed by a roos"
+          ],
+          [
+           "You are given this Excel file as a map. You start "
+          ],
+          [
+           "What is the area of the green polygon in the attac"
+          ],
+          [
+           "You are a telecommunications engineer who wants to"
+          ],
+          [
+           "Who nominated the only Featured Article on English"
+          ],
+          [
+           "What writer is quoted by Merriam-Webster for the W"
+          ],
+          [
+           "The attached spreadsheet contains the sales of men"
+          ],
+          [
+           "What is the last word before the second chorus of "
+          ],
+          [
+           "Examine the video at https://www.youtube.com/watch"
+          ],
+          [
+           "I'm making a grocery list for my mom, but she's a "
+          ],
+          [
+           "In the NIH translation of the original 1913 Michae"
+          ],
+          [
+           "Look at the attached image. The quiz is scored as "
+          ],
+          [
+           "Hi, I'm making a pie but I could use some help wit"
+          ],
+          [
+           "According to Google Finance, when was the first ye"
+          ],
+          [
+           "On July 15, 2008, Phys.org published an article ab"
+          ],
+          [
+           "The attached image contains a Python script. Run t"
+          ],
+          [
+           "How many times was a Twitter/X post cited as a ref"
+          ],
+          [
+           "I have the Standard plan in the image below, and I"
+          ],
+          [
+           "On ScienceDirect, what is the difference to 3 deci"
+          ],
+          [
+           "I’m thinking about selling my home, so I want to l"
+          ],
+          [
+           "In the 2018 VSCode blog post on replit.com, what w"
+          ],
+          [
+           "The year is 2022. I am at the National Air and Spa"
+          ],
+          [
+           "This spreadsheet contains a list of clients for a "
+          ],
+          [
+           "The attached PDF lists accommodations in the resor"
+          ],
+          [
+           "I went to Virtue restaurant & bar in Chicago for m"
+          ],
+          [
+           "On the DeepFruits fruit detection graph on Connect"
+          ],
+          [
+           "What time was the Tri-Rail train that carried the "
+          ],
+          [
+           "Which contributor to the version of OpenCV where s"
+          ],
+          [
+           "How many edits were made to the Wikipedia page on "
+          ],
+          [
+           "What is the final numeric output from the attached"
+          ],
+          [
+           "It's May 2023, and I'm about to drive across the U"
+          ],
+          [
+           "In Nature journal's Scientific Reports conference "
+          ],
+          [
+           "What percentage of the total penguin population ac"
+          ],
+          [
+           "The longest-lived vertebrate is named after an isl"
+          ],
+          [
+           "In the NCATS PubChem compound database for Food Ad"
+          ],
+          [
+           "In the Scikit-Learn July 2017 changelog, what othe"
+          ],
+          [
+           "The Latin root of the Yola word \"gimlie\" shares a "
+          ],
+          [
+           "On the BBC Earth YouTube video of the Top 5 Sillie"
+          ],
+          [
+           "Pull out the sentence in the following 5x7 block o"
+          ],
+          [
+           "Bob was invited to participate in a game show, and"
+          ],
+          [
+           "All of the individuals who formally held the posit"
+          ],
+          [
+           "What is the surname of the equine veterinarian men"
+          ],
+          [
+           "On Cornell Law School website's legal information "
+          ],
+          [
+           "How many pages if the 2023 IPCC report (85 pages v"
+          ],
+          [
+           "The work referenced in footnote 397 of Federico La"
+          ],
+          [
+           "I was referencing each of the tables in the file f"
+          ],
+          [
+           "What integer-rounded percentage of the total lengt"
+          ],
+          [
+           "How many more blocks (also denoted as layers) in B"
+          ],
+          [
+           "The attached spreadsheet lists the locomotives own"
+          ],
+          [
+           "On a leap day before the year 2008, a joke was rem"
+          ],
+          [
+           "The attached file lists the locomotives owned by a"
+          ],
+          [
+           "The YouTube channel Game Grumps began a Let’s Play"
+          ],
+          [
+           "Hi, I was out sick from my classes on Friday, so I"
+          ],
+          [
+           "What's the last line of the rhyme under the flavor"
+          ],
+          [
+           "I'm curious about how much information is availabl"
+          ],
+          [
+           "I'd like to learn more about some popular reality "
+          ],
+          [
+           "If this whole pint is made up of ice cream, how ma"
+          ],
+          [
+           "The cover of the August 2021 issue of Vogue shows "
+          ],
+          [
+           "Eva Draconis has a personal website which can be a"
+          ],
+          [
+           "A 5-man group made up of one tank, one healer, and"
+          ],
+          [
+           "How many nonindigenous crocodiles were found in Fl"
+          ],
+          [
+           "What was the complete title of the book in which t"
+          ],
+          [
+           "In Audre Lorde’s poem “Father Son and Holy Ghost”,"
+          ],
+          [
+           "According to Girls Who Code, how long did it take "
+          ],
+          [
+           "Take the gender split from the 2011 Bulgarian cens"
+          ],
+          [
+           "Of the cities within the United States where U.S. "
+          ],
+          [
+           "Where were the Vietnamese specimens described by K"
+          ],
+          [
+           "How many images are there in the latest 2022 Lego "
+          ],
+          [
+           "What is the absolute difference in tens of thousan"
+          ],
+          [
+           "Which of the fruits shown in the 2008 painting \"Em"
+          ],
+          [
+           "A standard Rubik’s cube has been broken into cubes"
+          ],
+          [
+           "Who did the actor who played Ray in the Polish-lan"
+          ],
+          [
+           "According to the USGS, in what year was the Americ"
+          ],
+          [
+           "What was the actual enrollment count of the clinic"
+          ],
+          [
+           "How many at bats did the Yankee with the most walk"
+          ],
+          [
+           "The brand that makes these harnesses the dogs are "
+          ],
+          [
+           "According to Openreview.net, at the NeurIPS 2022 C"
+          ],
+          [
+           "Who are the pitchers with the number before and af"
+          ],
+          [
+           "What country had the least number of athletes at t"
+          ],
+          [
+           "As of August 2023, who is the only winner of the U"
+          ],
+          [
+           "The attached Excel file contains the sales of menu"
+          ],
+          [
+           "In the 2015 Metropolitan Museum of Art exhibition "
+          ],
+          [
+           "When was a picture of St. Thomas Aquinas first add"
+          ],
+          [
+           "What is the first name of the only Malko Competiti"
+          ],
+          [
+           "The attached spreadsheet contains a list of books "
+          ],
+          [
+           "In the YouTube 360 VR video from March 2018 narrat"
+          ],
+          [
+           "In the endnote found in the second-to-last paragra"
+          ],
+          [
+           "In NASA's Astronomy Picture of the Day on 2006 Jan"
+          ],
+          [
+           "In the film Goldfinger, what color was the object "
+          ],
+          [
+           "What is the latest chronological year date written"
+          ],
+          [
+           "Consider the following symbols: 𒐜  𒐐𒐚\n\nThis is a n"
+          ],
+          [
+           "On June 6, 2023, an article by Carolyn Collins Pet"
+          ],
+          [
+           "As of May 2023, how many stops are between South S"
+          ],
+          [
+           "The book with the doi 10.1353/book.24372 concerns "
+          ],
+          [
+           "I read a paper about multiwavelength observations "
+          ],
+          [
+           "During the first week of August 2015, one of the N"
+          ],
+          [
+           "At the two-minute mark in the YouTube video upload"
+          ],
+          [
+           "What is the average number of pre-2020 works on th"
+          ]
+         ],
+         "hovertemplate": "agent_name=code_o1_04_february_submission<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
+         "legendgroup": "code_o1_04_february_submission",
+         "line": {
+          "color": "#00cc96",
+          "dash": "solid"
+         },
+         "marker": {
+          "symbol": "circle"
+         },
+         "mode": "lines",
+         "name": "code_o1_04_february_submission",
+         "showlegend": true,
+         "type": "scattergl",
+         "x": {
+          "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEA",
+          "dtype": "i2"
+         },
+         "xaxis": "x",
+         "y": {
+          "bdata": "AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D+amZmZmZnpP1VVVVVVVeU/kiRJkiRJ4j8AAAAAAADgPxzHcRzHcdw/AAAAAAAA4D900UUXXXThPwAAAAAAAOA/sRM7sRM74T+SJEmSJEniPxEREREREeE/AAAAAAAA4j/T0tLS0tLiP+Q4juM4juM/XkN5DeU15D/NzMzMzMzkP/Q8z/M8z+M/XXTRRRdd5D84velNb3rjP6uqqqqqquI/7FG4HoXr4T+xEzuxEzvhP3Icx3Ecx+E/kiRJkiRJ4j9PIyz3NMLiPzMzMzMzM+M/lVJKKaWU4j8AAAAAAADjP22yySabbOI/09LS0tLS4j8zMzMzMzPjP6uqqqqqquI/bzBFPusG4z9sKK+hvIbiP9IgDdIgDeI/ZmZmZmZm4j+7ErUrUbviP5IkSZIkSeI/p6wpa8qa4j/poosuuujiP9InfdInfeI/IQtZyEIW4j9HfWejvrPhP1VVVVVVVeE/PzTWh8b64D9I4XoUrkfhP/Hw8PDw8OA/2Ymd2Imd4D+pCcZb2efgP3sJ7SW0l+A/SpCnBHlK4D8AAAAAAADgP34E9xHcR+A/3dMIyz2N4D+c1H15bEXgPwAAAAAAAOA/afM+xSVD4D+EEEIIIYTgPxAEQRAEQeA/AAAAAACA4D/RC73QC73gP3zwwQcffOA/b+FXYyI94D94eHh4eHjgPwtZyEIWsuA/oQ7qoA7q4D8OJFphcyDhP1VVVVVVVeE/jBgxYsSI4T/CFPmsG0zhPxEREREREeE/eQ3lNZTX4D9WfkKclZ/gP5AGaZAGaeA/N2F+V4ub4D/NzMzMzMzgP3sJ7SW0l+A/yOB8DM7H4D+2h1xWDJTgPxiGYRiGYeA/kZCQkJCQ4D8w6Av6gr7gP93TCMs9jeA/uuiiiy664D8Oc5jDHObgP2ELtmALtuA/cQiHcAiH4D+GLGQhC1ngPyywwAILLOA/QUyuICZX4D8WCCPtWIHgP1VVVVVVVeA/8MXVDzoq4D8VvJyCl1PgP3+lQK1fKeA/AAAAAAAA4D+YdGoe5K7fP19fX19fX98/XG0MTXew3z8ndmIndmLfPyD7sR/7sd8/AAAAAAAA4D+9U9dycLPfPwAAAAAAAOA/TvvJEti03z9r37D2DWvfPyryWTeYIt8/27Zt27Zt3z8SePshgbffPwT3EdxHcN8//HVJ5cO43z8jLPc0wnLfP6D7uZ/7ud8/yFYEDSd13z/gKLvfKLvfPwAAAAAAAOA/jmVQKky83z8AAAAAAADgPyHQFAJNIeA/AAAAAAAA4D9YObTIdr7fP9/3fd/3fd8/0Ofz+Xw+3z8AAAAAAADfP9AX9AV9Qd8/P/ADP/AD3z9xQkqeZUTfPwgffPDBB98/c/TN0TdH3z9FeqBydgvfP5/0SZ/0Sd8/Dw8PDw8P3z/ZLKj2nEzfP/EzSvyMEt8/964DujFP3z9f8RVf8RXfP3usZeiA3d4/u9ST8dul3j8wS8oBkeHeP8dxHMdxHN8/Kmj1pYJW3z/58ePHjx/fP7o3oExc6d4/KvJZN5gi3z/L4ox2D1vfP5NfLPnFkt8//iZ/k7/J3z9DeQ3lNZTfPyB1yh91yt8/cVZ+QpyV3z/LX7L8JcvfPwAAAAAAAOA/S3r50xYa4D8AAAAAAADgP742Yl16zN8/AAAAAAAA4D+P5g82Hs3fP1ikDDzdmt8/",
+          "dtype": "f8"
+         },
+         "yaxis": "y"
+        },
+        {
+         "customdata": [
+          [
+           "Using the Biopython library in Python, parse the P"
+          ],
+          [
+           "Use density measures from the chemistry materials "
+          ],
+          [
+           "What are the EC numbers of the two most commonly u"
+          ],
+          [
+           "If Eliud Kipchoge could maintain his record-making"
+          ],
+          [
+           "In April of 1977, who was the Prime Minister of th"
+          ],
+          [
+           "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+          ],
+          [
+           "In terms of geographical distance between capital "
+          ],
+          [
+           ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+          ],
+          [
+           "When you take the average of the standard populati"
+          ],
+          [
+           "An office held a Secret Santa gift exchange where "
+          ],
+          [
+           "The object in the British Museum's collection with"
+          ],
+          [
+           "What is the minimum number of page links a person "
+          ],
+          [
+           "How many studio albums were published by Mercedes "
+          ],
+          [
+           "In the NCATS PubChem compound database for Food Ad"
+          ],
+          [
+           "The attached spreadsheet shows the inventory for a"
+          ],
+          [
+           "In Unlambda, what exact charcter or text needs to "
+          ],
+          [
+           "If we assume all articles published by Nature in 2"
+          ],
+          [
+           "In Series 9, Episode 11 of Doctor Who, the Doctor "
+          ],
+          [
+           "What was the volume in m^3 of the fish bag that wa"
+          ],
+          [
+           "It is 1999. Before you party like it is 1999, plea"
+          ],
+          [
+           "My family reunion is this week, and I was assigned"
+          ],
+          [
+           "In the fictional language of Tizin, basic sentence"
+          ],
+          [
+           "Of the authors (First M. Last) that worked on the "
+          ],
+          [
+           "In July 2, 1959 United States standards for grades"
+          ],
+          [
+           "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+          ],
+          [
+           "Under DDC 633 on Bielefeld University Library's BA"
+          ],
+          [
+           "A paper about AI regulation that was originally su"
+          ],
+          [
+           "I’m researching species that became invasive after"
+          ],
+          [
+           "Each cell in the attached spreadsheet represents a"
+          ],
+          [
+           "How many High Energy Physics - Lattice articles li"
+          ],
+          [
+           "I went to Virtue restaurant & bar in Chicago for m"
+          ],
+          [
+           "Could you help me out with this assignment? Our pr"
+          ],
+          [
+           "Assuming scientists in the famous youtube video Th"
+          ],
+          [
+           "In the video https://www.youtube.com/watch?v=L1vXC"
+          ],
+          [
+           "What is the maximum length in meters of #9 in the "
+          ],
+          [
+           "In Nature journal's Scientific Reports conference "
+          ],
+          [
+           "Which contributor to the version of OpenCV where s"
+          ],
+          [
+           "In the year 2022, and before December, what does \""
+          ],
+          [
+           "The attached file contains a list of vendors in th"
+          ],
+          [
+           "What two-word type of model did Manash Pratim Kash"
+          ],
+          [
+           "Given this table defining * on the set S = {a, b, "
+          ],
+          [
+           "What's the last line of the rhyme under the flavor"
+          ],
+          [
+           "How many applicants for the job in the PDF are onl"
+          ],
+          [
+           "Which of the text elements under CATEGORIES in the"
+          ],
+          [
+           "According to github, when was Regression added to "
+          ],
+          [
+           "What writer is quoted by Merriam-Webster for the W"
+          ],
+          [
+           "The following numbers function similarly to ISBN 1"
+          ],
+          [
+           "The photograph in the Whitney Museum of American A"
+          ],
+          [
+           "As a comma separated list with no whitespace, usin"
+          ],
+          [
+           "Find the value of x to the nearest tenth: Lx = (d/"
+          ],
+          [
+           "Compute the check digit the Tropicos ID for the Or"
+          ],
+          [
+           "The Metropolitan Museum of Art has a portrait in i"
+          ],
+          [
+           "In the 2018 VSCode blog post on replit.com, what w"
+          ],
+          [
+           "Using bass clef notes, what is the age of someone "
+          ],
+          [
+           "The attached file shows a list of books in the col"
+          ],
+          [
+           "In the NIH translation of the original 1913 Michae"
+          ],
+          [
+           "How many slides in this PowerPoint presentation me"
+          ],
+          [
+           "What animals that were mentioned in both Ilias Lag"
+          ],
+          [
+           "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+          ],
+          [
+           "The attached file lists accommodations in the reso"
+          ],
+          [
+           "On July 15, 2008, Phys.org published an article ab"
+          ],
+          [
+           "You are Van Helsing, a renowned vampire hunter. A "
+          ],
+          [
+           "In Emily Midkiff's June 2014 article in a journal "
+          ],
+          [
+           "According to Google Finance, when was the first ye"
+          ],
+          [
+           "What is the area of the green polygon in the attac"
+          ],
+          [
+           "Who composed the song that was performed by a roos"
+          ],
+          [
+           "Review the chess position provided in the image. I"
+          ],
+          [
+           "The attached file shows the locomotives in the col"
+          ],
+          [
+           "This is a secret message my friend gave me. It say"
+          ],
+          [
+           "You are a telecommunications engineer who wants to"
+          ],
+          [
+           "Examine the video at https://www.youtube.com/watch"
+          ],
+          [
+           "In Valentina Re’s contribution to the 2017 book “W"
+          ],
+          [
+           "According to Box Office Mojo's 2020 Worldwide Box "
+          ],
+          [
+           "You are given this Excel file as a map. You start "
+          ],
+          [
+           "I'm making a grocery list for my mom, but she's a "
+          ],
+          [
+           "The attached spreadsheet contains the sales of men"
+          ],
+          [
+           "According to wikipedia, how many Asian countries s"
+          ],
+          [
+           "On a leap day before the year 2008, a joke was rem"
+          ],
+          [
+           "What time was the Tri-Rail train that carried the "
+          ],
+          [
+           "What is the last word before the second chorus of "
+          ],
+          [
+           "What integer-rounded percentage of the total lengt"
+          ],
+          [
+           "How many nonindigenous crocodiles were found in Fl"
+          ],
+          [
+           "The Latin root of the Yola word \"gimlie\" shares a "
+          ],
+          [
+           "Look at the attached image. The quiz is scored as "
+          ],
+          [
+           "I was trying to remember how well the Cheater Beat"
+          ],
+          [
+           "Hi, I'm making a pie but I could use some help wit"
+          ],
+          [
+           "I was referencing each of the tables in the file f"
+          ],
+          [
+           "I need to fact-check a citation. This is the citat"
+          ],
+          [
+           "I have the Standard plan in the image below, and I"
+          ],
+          [
+           "I’m thinking about selling my home, so I want to l"
+          ],
+          [
+           "This spreadsheet contains a list of clients for a "
+          ],
+          [
+           "What is the volume in milliliters of a system comp"
+          ],
+          [
+           "As of the 2020 census, what was the population dif"
+          ],
+          [
+           "Who nominated the only Featured Article on English"
+          ],
+          [
+           "In the endnote found in the second-to-last paragra"
+          ],
+          [
+           "The year is 2022. I am at the National Air and Spa"
+          ],
+          [
+           "What is the final numeric output from the attached"
+          ],
+          [
+           "The attached PDF lists accommodations in the resor"
+          ],
+          [
+           "If there is anything that doesn't make sense in th"
+          ],
+          [
+           "How many times was a Twitter/X post cited as a ref"
+          ],
+          [
+           "It's May 2023, and I'm about to drive across the U"
+          ],
+          [
+           "Pull out the sentence in the following 5x7 block o"
+          ],
+          [
+           "On the BBC Earth YouTube video of the Top 5 Sillie"
+          ],
+          [
+           "In the Scikit-Learn July 2017 changelog, what othe"
+          ],
+          [
+           "Of the cities within the United States where U.S. "
+          ],
+          [
+           "How many edits were made to the Wikipedia page on "
+          ],
+          [
+           "How many pages if the 2023 IPCC report (85 pages v"
+          ],
+          [
+           "What is the surname of the equine veterinarian men"
+          ],
+          [
+           "Bob was invited to participate in a game show, and"
+          ],
+          [
+           "How many more blocks (also denoted as layers) in B"
+          ],
+          [
+           "During the first week of August 2015, one of the N"
+          ],
+          [
+           "On Cornell Law School website's legal information "
+          ],
+          [
+           "The attached image contains a Python script. Run t"
+          ],
+          [
+           "The attached spreadsheet lists the locomotives own"
+          ],
+          [
+           "The YouTube channel Game Grumps began a Let’s Play"
+          ],
+          [
+           "What was the complete title of the book in which t"
+          ],
+          [
+           "On the DeepFruits fruit detection graph on Connect"
+          ],
+          [
+           "The attached file lists the locomotives owned by a"
+          ],
+          [
+           "What percentage of the total penguin population ac"
+          ],
+          [
+           "On ScienceDirect, what is the difference to 3 deci"
+          ],
+          [
+           "How many images are there in the latest 2022 Lego "
+          ],
+          [
+           "Who did the actor who played Ray in the Polish-lan"
+          ],
+          [
+           "I'd like to learn more about some popular reality "
+          ],
+          [
+           "What is the absolute difference in tens of thousan"
+          ],
+          [
+           "In Audre Lorde’s poem “Father Son and Holy Ghost”,"
+          ]
+         ],
+         "hovertemplate": "agent_name=code_o1_04_february_submission-medium<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
+         "legendgroup": "code_o1_04_february_submission-medium",
+         "line": {
+          "color": "#ab63fa",
+          "dash": "solid"
+         },
+         "marker": {
+          "symbol": "circle"
+         },
+         "mode": "lines",
+         "name": "code_o1_04_february_submission-medium",
+         "showlegend": true,
+         "type": "scattergl",
+         "x": {
+          "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZWltcXV5fYGFiY2RlZmdoaWprbG1ub3BxcnN0dXZ3eHl6e3w=",
+          "dtype": "i1"
+         },
+         "xaxis": "x",
+         "y": {
+          "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADkP3Icx3Ecx+E/MzMzMzMz4z9ddNFFF13kP1VVVVVVVeU/dmIndmIn5j8lSZIkSZLkP1VVVVVVVeU/AAAAAAAA5D/T0tLS0tLiP3Icx3Ecx+E/bCivobyG4j+amZmZmZnhP5IkSZIkSeI/6aKLLrro4j8hC1nIQhbiP1VVVVVVVeE/7FG4HoXr4T+xEzuxEzvhP3Icx3Ecx+E/kiRJkiRJ4j9PIyz3NMLiPyIiIiIiIuI/lVJKKaWU4j8AAAAAAADiP3TRRRdddOE/4uHh4eHh4T/xFV/xFV/hPzmO4ziO4+A/6wZT5LNu4D95DeU1lNfgP7ETO7ETO+E/zczMzMzM4D8sUbsStSvhPzEMwzAMw+A/R9wRd8Qd4T+66KKLLrrgP7AFW7AFW+A/C1nIQhay4D/E5ApicgXhP6uqqqqqquA/FbycgpdT4D+kcD0K16PgP/Hw8PDw8OA/2Ymd2Imd4D+pCcZb2efgP3sJ7SW0l+A/37D2DWvf4D8lSZIkSZLgP3kN5TWU1+A/3dMIyz2N4D+c1H15bEXgPwAAAAAAAOA/afM+xSVD4D+EEEIIIYTgPzEMwzAMw+A/AAAAAACA4D/wAz/wAz/gP3zwwQcffOA/b+FXYyI94D94eHh4eHjgPwRz7cBcO+A/UAd1UAd14D82BxKtsDngPxzHcRzHceA/ggMHDhw44D8AAAAAAADgP5NfLPnFkt8/AAAAAAAA4D/H1MDeMTXgPwAAAAAAAOA/Mb+rxU2Y3z8AAAAAAADgP1ikDDzdmt8/OB+D8zE43z+U8EZT59feP57neZ7ned4/Hh4eHh4e3j+hL+gL+oLePyCT4gUyKd4/jC666KKL3j/vda973evePz/pkz7pk94/3uM93uM93j/f9KY3vendP5dddtlll90/eDbqOxv13T9G2rECYaTdPwAAAAAAAN4/nkSmYbtZ3j+sD431obHePw2JeTtDYt4/FK5H4XoU3j8pMOnUPMjdPx4eHh4eHt4/zCI+gVRy3j92Yid2YifeP57neZ7ned4/dEhNMN7K3j83+4VYURrfP4X2EtpLaN8/6fFdOIge3z9r37D2DWvfP2P7Hb0ytt8/27Zt27Zt3z8SePshgbffPwAAAAAAAOA//HVJ5cO43z8jLPc0wnLfP9/yLd/yLd8/yFYEDSd13z+fejGfejHfP+/u7u7u7t4/xfuR03yt3j9cMgTraPPePzgfg/MxON8/fO+999573z8IrBxaZDvfPw==",
+          "dtype": "f8"
+         },
+         "yaxis": "y"
+        },
+        {
+         "customdata": [
+          [
+           "When you take the average of the standard populati"
+          ],
+          [
+           "In April of 1977, who was the Prime Minister of th"
+          ],
+          [
+           "Use density measures from the chemistry materials "
+          ],
+          [
+           "An office held a Secret Santa gift exchange where "
+          ],
+          [
+           "In Unlambda, what exact charcter or text needs to "
+          ],
+          [
+           ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+          ],
+          [
+           "If Eliud Kipchoge could maintain his record-making"
+          ],
+          [
+           "What is the minimum number of page links a person "
+          ],
+          [
+           "I need to fact-check a citation. This is the citat"
+          ],
+          [
+           "What was the volume in m^3 of the fish bag that wa"
+          ],
+          [
+           "In July 2, 1959 United States standards for grades"
+          ],
+          [
+           "Using the Biopython library in Python, parse the P"
+          ],
+          [
+           "If we assume all articles published by Nature in 2"
+          ],
+          [
+           "According to github, when was Regression added to "
+          ],
+          [
+           "The attached spreadsheet shows the inventory for a"
+          ],
+          [
+           "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+          ],
+          [
+           "In Series 9, Episode 11 of Doctor Who, the Doctor "
+          ],
+          [
+           "Of the authors (First M. Last) that worked on the "
+          ],
+          [
+           "It is 1999. Before you party like it is 1999, plea"
+          ],
+          [
+           "I’m researching species that became invasive after"
+          ],
+          [
+           "What's the last line of the rhyme under the flavor"
+          ],
+          [
+           "The object in the British Museum's collection with"
+          ],
+          [
+           "What are the EC numbers of the two most commonly u"
+          ],
+          [
+           "In the video https://www.youtube.com/watch?v=L1vXC"
+          ],
+          [
+           "My family reunion is this week, and I was assigned"
+          ],
+          [
+           "The photograph in the Whitney Museum of American A"
+          ],
+          [
+           "In the NCATS PubChem compound database for Food Ad"
+          ],
+          [
+           "What two-word type of model did Manash Pratim Kash"
+          ],
+          [
+           "How many studio albums were published by Mercedes "
+          ],
+          [
+           "Each cell in the attached spreadsheet represents a"
+          ],
+          [
+           "In the fictional language of Tizin, basic sentence"
+          ],
+          [
+           "Under DDC 633 on Bielefeld University Library's BA"
+          ],
+          [
+           "What is the maximum length in meters of #9 in the "
+          ],
+          [
+           "The attached file contains a list of vendors in th"
+          ],
+          [
+           "In Emily Midkiff's June 2014 article in a journal "
+          ],
+          [
+           "Assuming scientists in the famous youtube video Th"
+          ],
+          [
+           "In the 2018 VSCode blog post on replit.com, what w"
+          ],
+          [
+           "What is the average number of pre-2020 works on th"
+          ],
+          [
+           "A paper about AI regulation that was originally su"
+          ],
+          [
+           "Given this table defining * on the set S = {a, b, "
+          ],
+          [
+           "In Valentina Re’s contribution to the 2017 book “W"
+          ],
+          [
+           "Could you help me out with this assignment? Our pr"
+          ],
+          [
+           "Review the chess position provided in the image. I"
+          ],
+          [
+           "What writer is quoted by Merriam-Webster for the W"
+          ],
+          [
+           "The following numbers function similarly to ISBN 1"
+          ],
+          [
+           "How many pages if the 2023 IPCC report (85 pages v"
+          ],
+          [
+           "As a comma separated list with no whitespace, usin"
+          ],
+          [
+           "In the year 2022, and before December, what does \""
+          ],
+          [
+           "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+          ]
+         ],
+         "hovertemplate": "agent_name=code_o1_04_february_submission3<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
+         "legendgroup": "code_o1_04_february_submission3",
+         "line": {
+          "color": "#FFA15A",
+          "dash": "solid"
+         },
+         "marker": {
+          "symbol": "circle"
+         },
+         "mode": "lines",
+         "name": "code_o1_04_february_submission3",
+         "showlegend": true,
+         "type": "scattergl",
+         "x": {
+          "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMA==",
+          "dtype": "i1"
+         },
+         "xaxis": "x",
+         "y": {
+          "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVeU/AAAAAAAA6D8zMzMzMzPjP1VVVVVVVeU/kiRJkiRJ4j8AAAAAAADkP1VVVVVVVeU/ZmZmZmZm5j9ddNFFF13kP6uqqqqqquI/sRM7sRM74T8AAAAAAADgPxEREREREeE/AAAAAAAA4j/x8PDw8PDgPwAAAAAAAOA/DeU1lNdQ3j8AAAAAAADgP57neZ7ned4/AAAAAAAA4D/qTW9605vePwAAAAAAAOA/pHA9Ctej4D8AAAAAAADgPwntJbSX0N4/btu2bdu23T9HWO5phOXePwAAAAAAAOA/hBBCCCGE4D8AAAAAAADgPwgffPDBB98/AAAAAAAA4D9QB3VQB3XgPwAAAAAAAOA/6wZT5LNu4D8AAAAAAADgP5AGaZAGaeA/AAAAAAAA4D9kcD4G52PgPwAAAAAAAOA/0Bf0BX1B3z8AAAAAAADgP7AFW7AFW+A/AAAAAAAA4D99Z6O+s1HfPwAAAAAAAOA/1ofG+tBY3z8=",
+          "dtype": "f8"
+         },
+         "yaxis": "y"
+        },
+        {
+         "customdata": [
+          [
+           "In April of 1977, who was the Prime Minister of th"
+          ],
+          [
+           "Use density measures from the chemistry materials "
+          ],
+          [
+           "If Eliud Kipchoge could maintain his record-making"
+          ],
+          [
+           "When you take the average of the standard populati"
+          ],
+          [
+           "The object in the British Museum's collection with"
+          ],
+          [
+           ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+          ]
+         ],
+         "hovertemplate": "agent_name=code_o1_04_february_submission4<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
+         "legendgroup": "code_o1_04_february_submission4",
+         "line": {
+          "color": "#19d3f3",
+          "dash": "solid"
+         },
+         "marker": {
+          "symbol": "circle"
+         },
+         "mode": "lines",
+         "name": "code_o1_04_february_submission4",
+         "showlegend": true,
+         "type": "scattergl",
+         "x": {
+          "bdata": "AAECAwQF",
+          "dtype": "i1"
+         },
+         "xaxis": "x",
+         "y": {
+          "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjPwAAAAAAAOA/",
+          "dtype": "f8"
+         },
+         "yaxis": "y"
+        },
+        {
+         "customdata": [
+          [
+           "In April of 1977, who was the Prime Minister of th"
+          ],
+          [
+           "When you take the average of the standard populati"
+          ],
+          [
+           "An office held a Secret Santa gift exchange where "
+          ],
+          [
+           "In Unlambda, what exact charcter or text needs to "
+          ],
+          [
+           "If Eliud Kipchoge could maintain his record-making"
+          ],
+          [
+           ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+          ],
+          [
+           "If we assume all articles published by Nature in 2"
+          ],
+          [
+           "Use density measures from the chemistry materials "
+          ],
+          [
+           "The attached spreadsheet shows the inventory for a"
+          ],
+          [
+           "In the video https://www.youtube.com/watch?v=L1vXC"
+          ],
+          [
+           "What was the volume in m^3 of the fish bag that wa"
+          ],
+          [
+           "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+          ],
+          [
+           "I need to fact-check a citation. This is the citat"
+          ],
+          [
+           "Of the authors (First M. Last) that worked on the "
+          ],
+          [
+           "How many studio albums were published by Mercedes "
+          ],
+          [
+           "Each cell in the attached spreadsheet represents a"
+          ],
+          [
+           "In terms of geographical distance between capital "
+          ],
+          [
+           "Using the Biopython library in Python, parse the P"
+          ],
+          [
+           "What is the minimum number of page links a person "
+          ],
+          [
+           "What are the EC numbers of the two most commonly u"
+          ],
+          [
+           "My family reunion is this week, and I was assigned"
+          ],
+          [
+           "What two-word type of model did Manash Pratim Kash"
+          ],
+          [
+           "What's the last line of the rhyme under the flavor"
+          ],
+          [
+           "Assuming scientists in the famous youtube video Th"
+          ],
+          [
+           "According to github, when was Regression added to "
+          ],
+          [
+           "Could you help me out with this assignment? Our pr"
+          ],
+          [
+           "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+          ],
+          [
+           "Compute the check digit the Tropicos ID for the Or"
+          ],
+          [
+           "In the fictional language of Tizin, basic sentence"
+          ],
+          [
+           "Review the chess position provided in the image. I"
+          ],
+          [
+           "Which contributor to the version of OpenCV where s"
+          ],
+          [
+           "The attached file contains a list of vendors in th"
+          ],
+          [
+           "What is the maximum length in meters of #9 in the "
+          ],
+          [
+           "The object in the British Museum's collection with"
+          ],
+          [
+           "Under DDC 633 on Bielefeld University Library's BA"
+          ],
+          [
+           "What integer-rounded percentage of the total lengt"
+          ],
+          [
+           "In Valentina Re’s contribution to the 2017 book “W"
+          ],
+          [
+           "How many applicants for the job in the PDF are onl"
+          ],
+          [
+           "I went to Virtue restaurant & bar in Chicago for m"
+          ],
+          [
+           "In the 2018 VSCode blog post on replit.com, what w"
+          ],
+          [
+           "Given this table defining * on the set S = {a, b, "
+          ],
+          [
+           "In the NCATS PubChem compound database for Food Ad"
+          ],
+          [
+           "Who nominated the only Featured Article on English"
+          ],
+          [
+           "As a comma separated list with no whitespace, usin"
+          ],
+          [
+           "How many High Energy Physics - Lattice articles li"
+          ],
+          [
+           "On July 15, 2008, Phys.org published an article ab"
+          ],
+          [
+           "According to Box Office Mojo's 2020 Worldwide Box "
+          ],
+          [
+           "Find the value of x to the nearest tenth: Lx = (d/"
+          ],
+          [
+           "What writer is quoted by Merriam-Webster for the W"
+          ],
+          [
+           "In the year 2022, and before December, what does \""
+          ],
+          [
+           "The following numbers function similarly to ISBN 1"
+          ],
+          [
+           "I’m researching species that became invasive after"
+          ],
+          [
+           "In Emily Midkiff's June 2014 article in a journal "
+          ],
+          [
+           "How many slides in this PowerPoint presentation me"
+          ],
+          [
+           "Using bass clef notes, what is the age of someone "
+          ],
+          [
+           "The attached file lists accommodations in the reso"
+          ],
+          [
+           "In July 2, 1959 United States standards for grades"
+          ],
+          [
+           "You are a telecommunications engineer who wants to"
+          ],
+          [
+           "A paper about AI regulation that was originally su"
+          ],
+          [
+           "What animals that were mentioned in both Ilias Lag"
+          ],
+          [
+           "According to Google Finance, when was the first ye"
+          ],
+          [
+           "What time was the Tri-Rail train that carried the "
+          ],
+          [
+           "You are Van Helsing, a renowned vampire hunter. A "
+          ],
+          [
+           "In Nature journal's Scientific Reports conference "
+          ],
+          [
+           "If there is anything that doesn't make sense in th"
+          ],
+          [
+           "This is a secret message my friend gave me. It say"
+          ],
+          [
+           "It is 1999. Before you party like it is 1999, plea"
+          ],
+          [
+           "The attached file shows the locomotives in the col"
+          ],
+          [
+           "I was trying to remember how well the Cheater Beat"
+          ],
+          [
+           "The Latin root of the Yola word \"gimlie\" shares a "
+          ],
+          [
+           "In the NIH translation of the original 1913 Michae"
+          ],
+          [
+           "What is the volume in milliliters of a system comp"
+          ],
+          [
+           "The photograph in the Whitney Museum of American A"
+          ],
+          [
+           "The attached spreadsheet contains the sales of men"
+          ],
+          [
+           "The attached file shows a list of books in the col"
+          ],
+          [
+           "How many pages if the 2023 IPCC report (85 pages v"
+          ],
+          [
+           "Who composed the song that was performed by a roos"
+          ],
+          [
+           "Examine the video at https://www.youtube.com/watch"
+          ],
+          [
+           "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+          ],
+          [
+           "What is the last word before the second chorus of "
+          ],
+          [
+           "What is the area of the green polygon in the attac"
+          ],
+          [
+           "How many nonindigenous crocodiles were found in Fl"
+          ],
+          [
+           "The year is 2022. I am at the National Air and Spa"
+          ],
+          [
+           "You are given this Excel file as a map. You start "
+          ],
+          [
+           "I'm making a grocery list for my mom, but she's a "
+          ],
+          [
+           "The attached PDF lists accommodations in the resor"
+          ],
+          [
+           "Hi, I'm making a pie but I could use some help wit"
+          ],
+          [
+           "How many times was a Twitter/X post cited as a ref"
+          ],
+          [
+           "I’m thinking about selling my home, so I want to l"
+          ],
+          [
+           "I have the Standard plan in the image below, and I"
+          ],
+          [
+           "The attached image contains a Python script. Run t"
+          ],
+          [
+           "According to wikipedia, how many Asian countries s"
+          ],
+          [
+           "How many more blocks (also denoted as layers) in B"
+          ],
+          [
+           "Look at the attached image. The quiz is scored as "
+          ],
+          [
+           "What is the surname of the equine veterinarian men"
+          ],
+          [
+           "This spreadsheet contains a list of clients for a "
+          ],
+          [
+           "What is the final numeric output from the attached"
+          ],
+          [
+           "In the endnote found in the second-to-last paragra"
+          ],
+          [
+           "Who did the actor who played Ray in the Polish-lan"
+          ],
+          [
+           "On a leap day before the year 2008, a joke was rem"
+          ],
+          [
+           "Pull out the sentence in the following 5x7 block o"
+          ],
+          [
+           "On the DeepFruits fruit detection graph on Connect"
+          ],
+          [
+           "The Metropolitan Museum of Art has a portrait in i"
+          ],
+          [
+           "In the Scikit-Learn July 2017 changelog, what othe"
+          ],
+          [
+           "I was referencing each of the tables in the file f"
+          ],
+          [
+           "All of the individuals who formally held the posit"
+          ],
+          [
+           "Consider the following symbols: 𒐜  𒐐𒐚\n\nThis is a n"
+          ],
+          [
+           "The book with the doi 10.1353/book.24372 concerns "
+          ],
+          [
+           "On Cornell Law School website's legal information "
+          ],
+          [
+           "Of the cities within the United States where U.S. "
+          ],
+          [
+           "The longest-lived vertebrate is named after an isl"
+          ],
+          [
+           "As of August 2023, who is the only winner of the U"
+          ],
+          [
+           "How many images are there in the latest 2022 Lego "
+          ],
+          [
+           "During the first week of August 2015, one of the N"
+          ],
+          [
+           "Which of the text elements under CATEGORIES in the"
+          ],
+          [
+           "The attached spreadsheet lists the locomotives own"
+          ],
+          [
+           "The attached file lists the locomotives owned by a"
+          ],
+          [
+           "It's May 2023, and I'm about to drive across the U"
+          ],
+          [
+           "On ScienceDirect, what is the difference to 3 deci"
+          ],
+          [
+           "How many edits were made to the Wikipedia page on "
+          ],
+          [
+           "What is the absolute difference in tens of thousan"
+          ],
+          [
+           "Hi, I was out sick from my classes on Friday, so I"
+          ],
+          [
+           "If this whole pint is made up of ice cream, how ma"
+          ],
+          [
+           "I'd like to learn more about some popular reality "
+          ],
+          [
+           "The YouTube channel Game Grumps began a Let’s Play"
+          ],
+          [
+           "A 5-man group made up of one tank, one healer, and"
+          ],
+          [
+           "Take the gender split from the 2011 Bulgarian cens"
+          ],
+          [
+           "What was the complete title of the book in which t"
+          ],
+          [
+           "What is the latest chronological year date written"
+          ],
+          [
+           "Eva Draconis has a personal website which can be a"
+          ],
+          [
+           "The attached Excel file contains the sales of menu"
+          ],
+          [
+           "Where were the Vietnamese specimens described by K"
+          ],
+          [
+           "The cover of the August 2021 issue of Vogue shows "
+          ],
+          [
+           "Bob was invited to participate in a game show, and"
+          ],
+          [
+           "What percentage of the total penguin population ac"
+          ],
+          [
+           "On the BBC Earth YouTube video of the Top 5 Sillie"
+          ],
+          [
+           "What country had the least number of athletes at t"
+          ],
+          [
+           "What is the first name of the only Malko Competiti"
+          ],
+          [
+           "According to Girls Who Code, how long did it take "
+          ],
+          [
+           "How many at bats did the Yankee with the most walk"
+          ],
+          [
+           "According to Openreview.net, at the NeurIPS 2022 C"
+          ],
+          [
+           "Who are the pitchers with the number before and af"
+          ],
+          [
+           "In Audre Lorde’s poem “Father Son and Holy Ghost”,"
+          ],
+          [
+           "In the YouTube 360 VR video from March 2018 narrat"
+          ],
+          [
+           "Which of the fruits shown in the 2008 painting \"Em"
+          ],
+          [
+           "A standard Rubik’s cube has been broken into cubes"
+          ],
+          [
+           "What was the actual enrollment count of the clinic"
+          ],
+          [
+           "In NASA's Astronomy Picture of the Day on 2006 Jan"
+          ],
+          [
+           "I'm curious about how much information is availabl"
+          ],
+          [
+           "When was a picture of St. Thomas Aquinas first add"
+          ],
+          [
+           "In the 2015 Metropolitan Museum of Art exhibition "
+          ],
+          [
+           "The attached spreadsheet contains a list of books "
+          ],
+          [
+           "According to the USGS, in what year was the Americ"
+          ],
+          [
+           "As of May 2023, how many stops are between South S"
+          ],
+          [
+           "The brand that makes these harnesses the dogs are "
+          ],
+          [
+           "I read a paper about multiwavelength observations "
+          ],
+          [
+           "In the film Goldfinger, what color was the object "
+          ],
+          [
+           "On June 6, 2023, an article by Carolyn Collins Pet"
+          ],
+          [
+           "The work referenced in footnote 397 of Federico La"
+          ],
+          [
+           "At the two-minute mark in the YouTube video upload"
+          ],
+          [
+           "As of the 2020 census, what was the population dif"
+          ],
+          [
+           "I thought we could try a fun word puzzle together "
+          ],
+          [
+           "What is the average number of pre-2020 works on th"
+          ],
+          [
+           "According to the World Bank, which countries had g"
+          ],
+          [
+           "In Series 9, Episode 11 of Doctor Who, the Doctor "
+          ]
+         ],
+         "hovertemplate": "agent_name=code_o1_04_february_submission5<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
+         "legendgroup": "code_o1_04_february_submission5",
+         "line": {
+          "color": "#FF6692",
+          "dash": "solid"
+         },
+         "marker": {
+          "symbol": "circle"
+         },
+         "mode": "lines",
+         "name": "code_o1_04_february_submission5",
+         "showlegend": true,
+         "type": "scattergl",
+         "x": {
+          "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA",
+          "dtype": "i2"
+         },
+         "xaxis": "x",
+         "y": {
+          "bdata": "AAAAAAAA8D8AAAAAAADgP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjP1VVVVVVVeU/t23btm3b5j8AAAAAAADoPzmO4ziO4+g/mpmZmZmZ6T8vuuiiiy7qP6uqqqqqquo/O7ETO7ET6z9JkiRJkiTpP5qZmZmZmek/AAAAAAAA6j94eHh4eHjoP8dxHMdxHOc/Q3kN5TWU5z8AAAAAAADoPxiGYRiGYeg/RhdddNFF5z9kIQtZyELmP1VVVVVVVeU/exSuR+F65D8UO7ETO7HjP2gvob2E9uI/27Zt27Zt4z8Jyz2NsNzjPzMzMzMzM+M/lVJKKaWU4j8AAAAAAADjP22yySabbOI/09LS0tLS4j+SJEmSJEniP6uqqqqqquI/bzBFPusG4z/lNZTXUF7jPxQ7sRM7seM/AAAAAAAA5D9L1K5E7UrkP/Q8z/M8z+M/Bn1BX9AX5D+jiy666KLjPzMzMzMzM+M/OL3pTW964z9MriAmVxDjP1VVVVVVVeM/5hS8nIKX4z/Xo3A9CtfjPxQUFBQUFOQ/7MRO7MRO5D9NMN7KPofkP9pLaC+hveQ/XXTRRRdd5D8lSZIkSZLkP15DeQ3lNeQ/5p5GWO5p5D9fHlsRNJzkP0REREREROQ/JkOwjjbv4z+dc84555zjP/Q8z/M8z+M/AAAAAACA4z8zMzMzMzPjP+miiy666OI/oHJ2C78a4z9LS0tLS0vjPzDXDsy1A+M/4yu+4iu+4j9TT8Zvl3riP47jOI7jOOI/kB8/fvz44T+YIp91gyniP1nyiyW/WOI/bCivobyG4j8hzspPiLPiP/Mt3/It3+I/E+Z3tbgJ4z/NzMzMzMziP8HTrflhkeI/V6J2JWpX4j+/9pDLioHiP5IkSZIkSeI/EhISEhIS4j+PuCPuiDviP7xAJsULZOI/L7rooosu4j8g/ehHP/rhPyIiIiIiIuI/kiRJkiRJ4j+nN73pTW/iP5VSSimllOI/yhXE5Api4j9sKK+hvIbiP1VVVVVVVeI/EpmG7WZ54j+n4OUUvJziP2r9SoFav+I/j8L1KFyP4j/zIHf9bLHiP9PS0tLS0uI/cl4W8Qmk4j9iJ3ZiJ3biP5IkSZIkSeI/GG9ln0Nq4j/2C7GiND7iP+0ltJfQXuI/OyMVc6sz4j8J8pQgTwniP5gin3WDKeI/kiRJkiRJ4j94+yGBtx/iP3AfwX0E9+E/HYGirQbP4T+E5Z5GWO7hP9IgDdIgDeI/4qTuy2Mr4j9yTQRyTQTiPyIiIiIiIuI/y6BUmHg/4j+wjjbvU1ziPzbSYSMdNuI/U0oppZRS4j+TGARWDi3iP4IgCIIgCOI/iUQikUgk4j8AAAAAAADiP3fEHXFH3OE/gh/4gR/44T9q7oK/ihPiPy+66KKLLuI/kiRJkiRJ4j/l7BZ+NSbiPz0gWefKA+I/Hh4eHh4e4j/8cevyDjjiPyV+RomfUeI/oBvz9NFq4j87qIM6qIPiP8oVxOQKYuI/HUi0wuZA4j++CmZJOSDiP47jOI7jOOI/eoshnbcY4j8SI0aMGDHiP5IkSZIkSeI/DqbIZ91g4j+imo65RHjiP1nyiyW/WOI/kuZIc6Q54j8N5TWU11DiPxK9ZxK9Z+I/kiRJkiRJ4j8rEq8i8SriP9IgDdIgDeI/kROEu7Hv4T8NRKUjewbiP/P32oh16eE/zczMzMzM4T+w8Wj+YOPhP0bKwNOt+eE/yYB6pnLd4T/C+Ricj8HhP6YxYBoDpuE/",
+          "dtype": "f8"
+         },
+         "yaxis": "y"
+        },
         {
          "customdata": [
           [
@@ -7115,7 +8721,7 @@
          "hovertemplate": "agent_name=code_o1_22-01_managedagent-summary_planning<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
          "legendgroup": "code_o1_22-01_managedagent-summary_planning",
          "line": {
-          "color": "#00cc96",
+          "color": "#B6E880",
           "dash": "solid"
          },
          "marker": {
@@ -7301,7 +8907,7 @@
          "hovertemplate": "agent_name=code_o1_25-01_visioon<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
          "legendgroup": "code_o1_25-01_visioon",
          "line": {
-          "color": "#ab63fa",
+          "color": "#FF97FF",
           "dash": "solid"
          },
          "marker": {
@@ -7643,7 +9249,7 @@
          "hovertemplate": "agent_name=code_o1_29-01_text<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
          "legendgroup": "code_o1_29-01_text",
          "line": {
-          "color": "#FFA15A",
+          "color": "#FECB52",
           "dash": "solid"
          },
          "marker": {
@@ -8165,7 +9771,7 @@
          "hovertemplate": "agent_name=code_o3-mini_03_february_remove-navigational<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
          "legendgroup": "code_o3-mini_03_february_remove-navigational",
          "line": {
-          "color": "#19d3f3",
+          "color": "#636efa",
           "dash": "solid"
          },
          "marker": {
@@ -8321,7 +9927,7 @@
          "hovertemplate": "agent_name=code_qwen-coder-32B_03_february_text<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
          "legendgroup": "code_qwen-coder-32B_03_february_text",
          "line": {
-          "color": "#FF6692",
+          "color": "#EF553B",
           "dash": "solid"
          },
          "marker": {
@@ -8341,36 +9947,6 @@
           "dtype": "f8"
          },
          "yaxis": "y"
-        },
-        {
-         "customdata": [
-          [
-           "The attached spreadsheet shows the inventory for a"
-          ]
-         ],
-         "hovertemplate": "agent_name=code_sonnet_03_february_goodoldtext-unbroken<br>index=%{x}<br>is_correct=%{y}<br>question=%{customdata[0]}<extra></extra>",
-         "legendgroup": "code_sonnet_03_february_goodoldtext-unbroken",
-         "line": {
-          "color": "#B6E880",
-          "dash": "solid"
-         },
-         "marker": {
-          "symbol": "circle"
-         },
-         "mode": "lines",
-         "name": "code_sonnet_03_february_goodoldtext-unbroken",
-         "showlegend": true,
-         "type": "scattergl",
-         "x": {
-          "bdata": "AA==",
-          "dtype": "i1"
-         },
-         "xaxis": "x",
-         "y": {
-          "bdata": "AAAAAAAAAAA=",
-          "dtype": "f8"
-         },
-         "yaxis": "y"
         }
        ],
        "layout": {
@@ -9266,7 +10842,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_94354/2022001392.py:10: SettingWithCopyWarning:\n",
+      "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/2022001392.py:10: SettingWithCopyWarning:\n",
       "\n",
       "\n",
       "A value is trying to be set on a copy of a slice from a DataFrame.\n",
@@ -9274,7 +10850,7 @@
       "\n",
       "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
       "\n",
-      "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_94354/2022001392.py:10: SettingWithCopyWarning:\n",
+      "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/2022001392.py:10: SettingWithCopyWarning:\n",
       "\n",
       "\n",
       "A value is trying to be set on a copy of a slice from a DataFrame.\n",
@@ -9282,7 +10858,7 @@
       "\n",
       "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
       "\n",
-      "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_94354/2022001392.py:10: SettingWithCopyWarning:\n",
+      "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/2022001392.py:10: SettingWithCopyWarning:\n",
       "\n",
       "\n",
       "A value is trying to be set on a copy of a slice from a DataFrame.\n",
@@ -9290,7 +10866,7 @@
       "\n",
       "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
       "\n",
-      "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_94354/2022001392.py:10: SettingWithCopyWarning:\n",
+      "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/2022001392.py:10: SettingWithCopyWarning:\n",
       "\n",
       "\n",
       "A value is trying to be set on a copy of a slice from a DataFrame.\n",
@@ -9298,7 +10874,7 @@
       "\n",
       "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
       "\n",
-      "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_94354/2022001392.py:11: SettingWithCopyWarning:\n",
+      "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/2022001392.py:11: SettingWithCopyWarning:\n",
       "\n",
       "\n",
       "A value is trying to be set on a copy of a slice from a DataFrame.\n",
@@ -10289,13 +11865,11 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th></th>\n",
        "      <th>is_correct</th>\n",
        "      <th>count_steps</th>\n",
        "      <th>question</th>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>agent_name</th>\n",
        "      <th>attachment_type</th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
@@ -10304,111 +11878,110 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th rowspan=\"14\" valign=\"top\">code_o1_01_february_text</th>\n",
        "      <th>None</th>\n",
-       "      <td>0.496063</td>\n",
-       "      <td>3.362205</td>\n",
-       "      <td>127</td>\n",
+       "      <td>0.423799</td>\n",
+       "      <td>4.959725</td>\n",
+       "      <td>2185</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>csv</th>\n",
        "      <td>0.000000</td>\n",
-       "      <td>7.000000</td>\n",
-       "      <td>1</td>\n",
+       "      <td>7.750000</td>\n",
+       "      <td>16</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>docx</th>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>3.000000</td>\n",
-       "      <td>1</td>\n",
+       "      <td>0.571429</td>\n",
+       "      <td>4.904762</td>\n",
+       "      <td>21</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>jpg</th>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>3.000000</td>\n",
-       "      <td>2</td>\n",
+       "      <td>0.142857</td>\n",
+       "      <td>5.750000</td>\n",
+       "      <td>28</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>jsonld</th>\n",
        "      <td>0.000000</td>\n",
-       "      <td>8.000000</td>\n",
-       "      <td>1</td>\n",
+       "      <td>6.600000</td>\n",
+       "      <td>15</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>mp3</th>\n",
-       "      <td>0.333333</td>\n",
-       "      <td>2.333333</td>\n",
-       "      <td>3</td>\n",
+       "      <td>0.480000</td>\n",
+       "      <td>4.500000</td>\n",
+       "      <td>50</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>pdb</th>\n",
        "      <td>0.000000</td>\n",
-       "      <td>4.000000</td>\n",
-       "      <td>1</td>\n",
+       "      <td>4.444444</td>\n",
+       "      <td>18</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>pdf</th>\n",
-       "      <td>0.666667</td>\n",
-       "      <td>2.666667</td>\n",
-       "      <td>3</td>\n",
+       "      <td>0.588235</td>\n",
+       "      <td>4.137255</td>\n",
+       "      <td>51</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>png</th>\n",
-       "      <td>0.250000</td>\n",
-       "      <td>2.375000</td>\n",
-       "      <td>8</td>\n",
+       "      <td>0.216783</td>\n",
+       "      <td>4.412587</td>\n",
+       "      <td>143</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>pptx</th>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>3.000000</td>\n",
-       "      <td>1</td>\n",
+       "      <td>0.882353</td>\n",
+       "      <td>4.058824</td>\n",
+       "      <td>17</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>py</th>\n",
        "      <td>1.000000</td>\n",
-       "      <td>3.000000</td>\n",
-       "      <td>1</td>\n",
+       "      <td>4.266667</td>\n",
+       "      <td>15</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>txt</th>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>4.000000</td>\n",
-       "      <td>1</td>\n",
+       "      <td>0.705882</td>\n",
+       "      <td>4.764706</td>\n",
+       "      <td>17</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>xlsx</th>\n",
-       "      <td>0.615385</td>\n",
-       "      <td>3.153846</td>\n",
-       "      <td>13</td>\n",
+       "      <td>0.612745</td>\n",
+       "      <td>4.823529</td>\n",
+       "      <td>204</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>zip</th>\n",
-       "      <td>0.500000</td>\n",
-       "      <td>4.000000</td>\n",
-       "      <td>2</td>\n",
+       "      <td>0.448276</td>\n",
+       "      <td>5.344828</td>\n",
+       "      <td>29</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                          is_correct  count_steps  question\n",
-       "agent_name               attachment_type                                   \n",
-       "code_o1_01_february_text None               0.496063     3.362205       127\n",
-       "                         csv                0.000000     7.000000         1\n",
-       "                         docx               1.000000     3.000000         1\n",
-       "                         jpg                0.000000     3.000000         2\n",
-       "                         jsonld             0.000000     8.000000         1\n",
-       "                         mp3                0.333333     2.333333         3\n",
-       "                         pdb                0.000000     4.000000         1\n",
-       "                         pdf                0.666667     2.666667         3\n",
-       "                         png                0.250000     2.375000         8\n",
-       "                         pptx               1.000000     3.000000         1\n",
-       "                         py                 1.000000     3.000000         1\n",
-       "                         txt                1.000000     4.000000         1\n",
-       "                         xlsx               0.615385     3.153846        13\n",
-       "                         zip                0.500000     4.000000         2"
+       "                 is_correct  count_steps  question\n",
+       "attachment_type                                   \n",
+       "None               0.423799     4.959725      2185\n",
+       "csv                0.000000     7.750000        16\n",
+       "docx               0.571429     4.904762        21\n",
+       "jpg                0.142857     5.750000        28\n",
+       "jsonld             0.000000     6.600000        15\n",
+       "mp3                0.480000     4.500000        50\n",
+       "pdb                0.000000     4.444444        18\n",
+       "pdf                0.588235     4.137255        51\n",
+       "png                0.216783     4.412587       143\n",
+       "pptx               0.882353     4.058824        17\n",
+       "py                 1.000000     4.266667        15\n",
+       "txt                0.705882     4.764706        17\n",
+       "xlsx               0.612745     4.823529       204\n",
+       "zip                0.448276     5.344828        29"
       ]
      },
      "metadata": {},
@@ -10417,7 +11990,7 @@
    ],
    "source": [
     "display(\n",
-    "    sel_df.groupby([\"agent_name\", \"attachment_type\"])[[\"is_correct\", \"count_steps\", \"question\"]].agg(\n",
+    "    result_df.groupby([\"attachment_type\"])[[\"is_correct\", \"count_steps\", \"question\"]].agg(\n",
     "        {\"is_correct\": \"mean\", \"count_steps\": \"mean\", \"question\": \"count\"}\n",
     "    )\n",
     ")"
@@ -10449,17 +12022,20 @@
      "data": {
       "text/plain": [
        "agent_name\n",
-       "code_gpt4o_03_february_goodoldtext-unbroken       38.36\n",
-       "code_gpt4o_03_february_magenticbrowser            35.22\n",
-       "code_gpt4o_03_february_magenticbrowser2           36.54\n",
-       "code_gpt4o_03_february_text                       37.58\n",
-       "code_o1_01_february_text                          49.09\n",
-       "code_o1_03_february_fix-print-outputs             51.83\n",
-       "code_o1_03_february_fix-print-outputs2            52.56\n",
-       "code_o1_03_february_goodoldtext-unbroken          53.42\n",
-       "code_o1_03_february_remove-navigational           53.66\n",
-       "code_o1_03_february_text_high-reasoning-effort    48.48\n",
-       "code_o3-mini_03_february_remove-navigational      29.09\n",
+       "code_gpt4o_03_february_goodoldtext-unbroken         38.36\n",
+       "code_gpt4o_03_february_magenticbrowser              35.22\n",
+       "code_gpt4o_03_february_magenticbrowser2             36.54\n",
+       "code_gpt4o_03_february_text                         37.58\n",
+       "code_o1_01_february_text                            49.09\n",
+       "code_o1_03_february_ablation-toolcalling-manager    32.73\n",
+       "code_o1_03_february_fix-print-outputs               51.83\n",
+       "code_o1_03_february_fix-print-outputs2              55.77\n",
+       "code_o1_03_february_goodoldtext-unbroken            53.42\n",
+       "code_o1_03_february_remove-navigational             53.66\n",
+       "code_o1_03_february_text_high-reasoning-effort      48.48\n",
+       "code_o1_04_february_submission                      49.38\n",
+       "code_o1_04_february_submission5                     55.15\n",
+       "code_o3-mini_03_february_remove-navigational        29.09\n",
        "Name: is_correct, dtype: float64"
       ]
      },
@@ -10471,14 +12047,14 @@
      "output_type": "stream",
      "text": [
       "Majority score: 58.18\n",
-      "Oracle score: 70.91\n"
+      "Oracle score: 72.73\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_94354/2283375871.py:25: DeprecationWarning:\n",
+      "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/3287428472.py:20: DeprecationWarning:\n",
       "\n",
       "DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
       "\n"
@@ -10489,15 +12065,10 @@
     "def majority_vote(df):\n",
     "    df = df[(df[\"prediction\"] != \"Unable to determine\") & (~df[\"prediction\"].isna()) & (df[\"prediction\"] != \"None\")]\n",
     "\n",
-    "    # First get the mode (most common answer) for each question\n",
     "    answer_modes = df.groupby(\"question\")[\"prediction\"].agg(lambda x: x.mode()[0]).reset_index()\n",
-    "\n",
-    "    # For each question-answer pair, get the first occurrence's task and is_correct\n",
     "    first_occurrences = (\n",
     "        df.groupby([\"question\", \"prediction\"]).agg({\"task\": \"first\", \"is_correct\": \"first\"}).reset_index()\n",
     "    )\n",
-    "\n",
-    "    # Merge the mode answers with their corresponding first occurrences\n",
     "    result = answer_modes.merge(first_occurrences, on=[\"question\", \"prediction\"], how=\"left\")\n",
     "\n",
     "    return result\n",
@@ -10520,6 +12091,34 @@
     "print(f\"Oracle score: {oracle(long_series)['is_correct'].mean() * 100:.2f}\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Submit"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "agent_run = \"code_o1_04_february_submission5.jsonl\"\n",
+    "df = pd.read_json(f\"output/validation/{agent_run}\", lines=True)\n",
+    "df = df[[\"task_id\", \"prediction\", \"intermediate_steps\"]]\n",
+    "df = df.rename(columns={\"prediction\": \"model_answer\", \"intermediate_steps\": \"reasoning_trace\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.to_json(\"submission.jsonl\", orient=\"records\", lines=True)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/examples/open_deep_research/scripts/mdconvert.py b/examples/open_deep_research/scripts/mdconvert.py
index 15df618..72cb0a0 100644
--- a/examples/open_deep_research/scripts/mdconvert.py
+++ b/examples/open_deep_research/scripts/mdconvert.py
@@ -13,6 +13,7 @@ import subprocess
 import sys
 import tempfile
 import traceback
+import zipfile
 from typing import Any, Dict, List, Optional, Union
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
 
@@ -624,6 +625,54 @@ class Mp3Converter(WavConverter):
         )
 
 
+class ZipConverter(DocumentConverter):
+    """
+    Extracts ZIP files to a permanent local directory and returns a listing of extracted files.
+    """
+
+    def __init__(self, extract_dir: str = "downloads"):
+        """
+        Initialize with path to extraction directory.
+
+        Args:
+            extract_dir: The directory where files will be extracted. Defaults to "downloads"
+        """
+        self.extract_dir = extract_dir
+        # Create the extraction directory if it doesn't exist
+        os.makedirs(self.extract_dir, exist_ok=True)
+
+    def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+        # Bail if not a ZIP file
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".zip":
+            return None
+
+        # Verify it's actually a ZIP file
+        if not zipfile.is_zipfile(local_path):
+            return None
+
+        # Extract all files and build list
+        extracted_files = []
+        with zipfile.ZipFile(local_path, "r") as zip_ref:
+            # Extract all files
+            zip_ref.extractall(self.extract_dir)
+            # Get list of all files
+            for file_path in zip_ref.namelist():
+                # Skip directories
+                if not file_path.endswith("/"):
+                    extracted_files.append(self.extract_dir + "/" + file_path)
+
+        # Sort files for consistent output
+        extracted_files.sort()
+
+        # Build the markdown content
+        md_content = "Downloaded the following files:\n"
+        for file in extracted_files:
+            md_content += f"* {file}\n"
+
+        return DocumentConverterResult(title="Extracted Files", text_content=md_content.strip())
+
+
 class ImageConverter(MediaConverter):
     """
     Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
@@ -705,11 +754,11 @@ class ImageConverter(MediaConverter):
         return response.choices[0].message.content
 
 
-class FileConversionException(BaseException):
+class FileConversionException(Exception):
     pass
 
 
-class UnsupportedFormatException(BaseException):
+class UnsupportedFormatException(Exception):
     pass
 
 
@@ -746,6 +795,7 @@ class MarkdownConverter:
         self.register_page_converter(WavConverter())
         self.register_page_converter(Mp3Converter())
         self.register_page_converter(ImageConverter())
+        self.register_page_converter(ZipConverter())
         self.register_page_converter(PdfConverter())
 
     def convert(
diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py
index 89f686e..7fb6c07 100644
--- a/src/smolagents/agents.py
+++ b/src/smolagents/agents.py
@@ -15,11 +15,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import inspect
+import os
+import re
+import textwrap
 import time
 from collections import deque
 from logging import getLogger
-from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Generator, List, Optional, Set, Tuple, Union
 
+import yaml
+from jinja2 import StrictUndefined, Template
 from rich.console import Group
 from rich.panel import Panel
 from rich.rule import Rule
@@ -56,70 +61,23 @@ from .models import (
     MessageRole,
 )
 from .monitoring import Monitor
-from .prompts import (
-    CODE_SYSTEM_PROMPT,
-    MANAGED_AGENT_PROMPT,
-    PLAN_UPDATE_FINAL_PLAN_REDACTION,
-    SYSTEM_PROMPT_FACTS,
-    SYSTEM_PROMPT_FACTS_UPDATE,
-    SYSTEM_PROMPT_PLAN,
-    SYSTEM_PROMPT_PLAN_UPDATE,
-    TOOL_CALLING_SYSTEM_PROMPT,
-    USER_PROMPT_FACTS_UPDATE,
-    USER_PROMPT_PLAN,
-    USER_PROMPT_PLAN_UPDATE,
-)
-from .tools import (
-    DEFAULT_TOOL_DESCRIPTION_TEMPLATE,
-    Tool,
-    get_tool_description_with_args,
-)
+from .tools import Tool
 
 
 logger = getLogger(__name__)
 
 
-def get_tool_descriptions(tools: Dict[str, Tool], tool_description_template: str) -> str:
-    return "\n".join([get_tool_description_with_args(tool, tool_description_template) for tool in tools.values()])
+def get_variable_names(self, template: str) -> Set[str]:
+    pattern = re.compile(r"\{\{([^{}]+)\}\}")
+    return {match.group(1).strip() for match in pattern.finditer(template)}
 
 
-def format_prompt_with_tools(tools: Dict[str, Tool], prompt_template: str, tool_description_template: str) -> str:
-    tool_descriptions = get_tool_descriptions(tools, tool_description_template)
-    prompt = prompt_template.replace("{{tool_descriptions}}", tool_descriptions)
-    if "{{tool_names}}" in prompt:
-        prompt = prompt.replace(
-            "{{tool_names}}",
-            ", ".join([f"'{tool.name}'" for tool in tools.values()]),
-        )
-    return prompt
-
-
-def show_agents_descriptions(managed_agents: Dict):
-    managed_agents_descriptions = """
-You can also give requests to team members.
-Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'request', a long string explaining your request.
-Given that this team member is a real human, you should be very verbose in your request.
-Here is a list of the team members that you can call:"""
-    for agent in managed_agents.values():
-        managed_agents_descriptions += f"\n- {agent.name}: {agent.description}"
-    return managed_agents_descriptions
-
-
-def format_prompt_with_managed_agents_descriptions(
-    prompt_template,
-    managed_agents,
-    agent_descriptions_placeholder: Optional[str] = None,
-) -> str:
-    if agent_descriptions_placeholder is None:
-        agent_descriptions_placeholder = "{{managed_agents_descriptions}}"
-    if agent_descriptions_placeholder not in prompt_template:
-        raise ValueError(
-            f"Provided prompt template does not contain the managed agents descriptions placeholder '{agent_descriptions_placeholder}'"
-        )
-    if len(managed_agents.keys()) > 0:
-        return prompt_template.replace(agent_descriptions_placeholder, show_agents_descriptions(managed_agents))
-    else:
-        return prompt_template.replace(agent_descriptions_placeholder, "")
+def populate_template(template: str, variables: Dict[str, Any]) -> str:
+    compiled_template = Template(template, undefined=StrictUndefined)
+    try:
+        return compiled_template.render(**variables)
+    except Exception as e:
+        raise Exception(f"Error during jinja template rendering: {type(e).__name__}: {e}")
 
 
 class MultiStepAgent:
@@ -130,8 +88,7 @@ class MultiStepAgent:
     Args:
         tools (`list[Tool]`): [`Tool`]s that the agent can use.
         model (`Callable[[list[dict[str, str]]], ChatMessage]`): Model that will generate the agent's actions.
-        system_prompt (`str`, *optional*): System prompt that will be used to generate the agent's actions.
-        tool_description_template (`str`, *optional*): Template used to describe the tools in the system prompt.
+        prompts_path (`str`, *optional*): The path from which to load this agent's prompt dictionary.
         max_steps (`int`, default `6`): Maximum number of steps the agent can take to solve the task.
         tool_parser (`Callable`, *optional*): Function used to parse the tool calls from the LLM output.
         add_base_tools (`bool`, default `False`): Whether to add the base tools to the agent's tools.
@@ -142,16 +99,15 @@ class MultiStepAgent:
         planning_interval (`int`, *optional*): Interval at which the agent will run a planning step.
         name (`str`, *optional*): Necessary for a managed agent only - the name by which this agent can be called.
         description (`str`, *optional*): Necessary for a managed agent only - the description of this agent.
-        managed_agent_prompt (`str`, *optional*): Custom prompt for the managed agent. Defaults to None.
-        provide_run_summary (`bool`, *optional*): Wether to provide a run summary when called as a managed agent.
+        provide_run_summary (`bool`, *optional*): Whether to provide a run summary when called as a managed agent.
+        final_answer_checks (`list`, *optional*): List of Callables to run before returning a final answer for checking validity.
     """
 
     def __init__(
         self,
         tools: List[Tool],
         model: Callable[[List[Dict[str, str]]], ChatMessage],
-        system_prompt: Optional[str] = None,
-        tool_description_template: Optional[str] = None,
+        prompts_path: Optional[str] = None,
         max_steps: int = 6,
         tool_parser: Optional[Callable] = None,
         add_base_tools: bool = False,
@@ -162,19 +118,13 @@ class MultiStepAgent:
         planning_interval: Optional[int] = None,
         name: Optional[str] = None,
         description: Optional[str] = None,
-        managed_agent_prompt: Optional[str] = None,
         provide_run_summary: bool = False,
+        final_answer_checks: Optional[List[Callable]] = None,
     ):
-        if system_prompt is None:
-            system_prompt = CODE_SYSTEM_PROMPT
         if tool_parser is None:
             tool_parser = parse_json_tool_call
         self.agent_name = self.__class__.__name__
         self.model = model
-        self.system_prompt_template = system_prompt
-        self.tool_description_template = (
-            tool_description_template if tool_description_template else DEFAULT_TOOL_DESCRIPTION_TEMPLATE
-        )
         self.max_steps = max_steps
         self.step_number: int = 0
         self.tool_parser = tool_parser
@@ -183,7 +133,6 @@ class MultiStepAgent:
         self.state = {}
         self.name = name
         self.description = description
-        self.managed_agent_prompt = managed_agent_prompt if managed_agent_prompt else MANAGED_AGENT_PROMPT
         self.provide_run_summary = provide_run_summary
 
         self.managed_agents = {}
@@ -206,11 +155,12 @@ class MultiStepAgent:
         self.system_prompt = self.initialize_system_prompt()
         self.input_messages = None
         self.task = None
-        self.memory = AgentMemory(system_prompt)
+        self.memory = AgentMemory(self.system_prompt)
         self.logger = AgentLogger(level=verbosity_level)
         self.monitor = Monitor(self.model, self.logger)
         self.step_callbacks = step_callbacks if step_callbacks is not None else []
         self.step_callbacks.append(self.monitor.update_metrics)
+        self.final_answer_checks = final_answer_checks
 
     @property
     def logs(self):
@@ -220,13 +170,8 @@ class MultiStepAgent:
         return [self.memory.system_prompt] + self.memory.steps
 
     def initialize_system_prompt(self):
-        system_prompt = format_prompt_with_tools(
-            self.tools,
-            self.system_prompt_template,
-            self.tool_description_template,
-        )
-        system_prompt = format_prompt_with_managed_agents_descriptions(system_prompt, self.managed_agents)
-        return system_prompt
+        """To be implemented in child classes"""
+        pass
 
     def write_memory_to_messages(
         self,
@@ -358,10 +303,10 @@ class MultiStepAgent:
             return observation
         except Exception as e:
             if tool_name in self.tools:
-                tool_description = get_tool_description_with_args(available_tools[tool_name])
+                tool = self.tools[tool_name]
                 error_msg = (
-                    f"Error in tool call execution: {type(e).__name__}: {e}\nYou should only use this tool with a correct input.\n"
-                    f"As a reminder, this tool's description is the following:\n{tool_description}"
+                    f"Error when executing tool {tool_name} with arguments {arguments}: {type(e).__name__}: {e}\nYou should only use this tool with a correct input.\n"
+                    f"As a reminder, this tool's description is the following: '{tool.description}'.\nIt takes inputs: {tool.inputs} and returns output type {tool.output_type}"
                 )
                 raise AgentExecutionError(error_msg, self.logger)
             elif tool_name in self.managed_agents:
@@ -380,7 +325,6 @@ class MultiStepAgent:
         task: str,
         stream: bool = False,
         reset: bool = True,
-        single_step: bool = False,
         images: Optional[List[str]] = None,
         additional_args: Optional[Dict] = None,
     ):
@@ -391,7 +335,6 @@ class MultiStepAgent:
             task (`str`): Task to perform.
             stream (`bool`): Whether to run in a streaming way.
             reset (`bool`): Whether to reset the conversation or keep it going from previous run.
-            single_step (`bool`): Whether to run the agent in one-shot fashion.
             images (`list[str]`, *optional*): Paths to image(s).
             additional_args (`dict`): Any other variables that you want to pass to the agent run, for instance images or dataframes. Give them clear names!
 
@@ -420,19 +363,10 @@ You have been provided with these additional arguments, that you can access usin
             content=self.task.strip(),
             subtitle=f"{type(self.model).__name__} - {(self.model.model_id if hasattr(self.model, 'model_id') else '')}",
             level=LogLevel.INFO,
+            title=self.name if hasattr(self, "name") else None,
         )
 
         self.memory.steps.append(TaskStep(task=self.task, task_images=images))
-        if single_step:
-            self.step_number = 1
-            step_start_time = time.time()
-            memory_step = ActionStep(start_time=step_start_time, observations_images=images)
-            memory_step.end_time = time.time()
-            memory_step.duration = memory_step.end_time - step_start_time
-
-            # Run the agent's step
-            result = self.step(memory_step)
-            return result
 
         if stream:
             # The steps are returned as they are executed through a generator to iterate on.
@@ -468,6 +402,13 @@ You have been provided with these additional arguments, that you can access usin
 
                 # Run one step!
                 final_answer = self.step(memory_step)
+                if final_answer is not None and self.final_answer_checks is not None:
+                    for check_function in self.final_answer_checks:
+                        try:
+                            assert check_function(final_answer, self.memory)
+                        except Exception as e:
+                            final_answer = None
+                            raise AgentError(f"Check {check_function.__name__} failed with error: {e}", self.logger)
             except AgentError as e:
                 memory_step.error = e
             finally:
@@ -515,46 +456,32 @@ You have been provided with these additional arguments, that you can access usin
         if is_first_step:
             message_prompt_facts = {
                 "role": MessageRole.SYSTEM,
-                "content": [{"type": "text", "text": SYSTEM_PROMPT_FACTS}],
+                "content": [{"type": "text", "text": self.prompt_templates["planning"]["initial_facts"]}],
             }
-            message_prompt_task = {
-                "role": MessageRole.USER,
-                "content": [
-                    {
-                        "type": "text",
-                        "text": f"""Here is the task:
-```
-{task}
-```
-Now begin!""",
-                    }
-                ],
-            }
-            input_messages = [message_prompt_facts, message_prompt_task]
+            input_messages = [message_prompt_facts]
 
             chat_message_facts: ChatMessage = self.model(input_messages)
             answer_facts = chat_message_facts.content
 
-            message_system_prompt_plan = {
-                "role": MessageRole.SYSTEM,
-                "content": [{"type": "text", "text": SYSTEM_PROMPT_PLAN}],
-            }
-            message_user_prompt_plan = {
+            message_prompt_plan = {
                 "role": MessageRole.USER,
                 "content": [
                     {
                         "type": "text",
-                        "text": USER_PROMPT_PLAN.format(
-                            task=task,
-                            tool_descriptions=get_tool_descriptions(self.tools, self.tool_description_template),
-                            managed_agents_descriptions=(show_agents_descriptions(self.managed_agents)),
-                            answer_facts=answer_facts,
+                        "text": populate_template(
+                            self.prompt_templates["planning"]["initial_plan"],
+                            variables={
+                                "task": task,
+                                "tools": self.tools,
+                                "managed_agents": self.managed_agents,
+                                "answer_facts": answer_facts,
+                            },
                         ),
                     }
                 ],
             }
             chat_message_plan: ChatMessage = self.model(
-                [message_system_prompt_plan, message_user_prompt_plan],
+                [message_prompt_plan],
                 stop_sequences=["<end_plan>"],
             )
             answer_plan = chat_message_plan.content
@@ -587,51 +514,72 @@ Now begin!""",
             )  # This will not log the plan but will log facts
 
             # Redact updated facts
-            facts_update_system_prompt = {
+            facts_update_pre_messages = {
                 "role": MessageRole.SYSTEM,
-                "content": [{"type": "text", "text": SYSTEM_PROMPT_FACTS_UPDATE}],
+                "content": [{"type": "text", "text": self.prompt_templates["planning"]["update_facts_pre_messages"]}],
             }
-            facts_update_message = {
-                "role": MessageRole.USER,
-                "content": [{"type": "text", "text": USER_PROMPT_FACTS_UPDATE}],
+            facts_update_post_messages = {
+                "role": MessageRole.SYSTEM,
+                "content": [{"type": "text", "text": self.prompt_templates["planning"]["update_facts_post_messages"]}],
             }
-            input_messages = [facts_update_system_prompt] + memory_messages + [facts_update_message]
+            input_messages = [facts_update_pre_messages] + memory_messages + [facts_update_post_messages]
             chat_message_facts: ChatMessage = self.model(input_messages)
             facts_update = chat_message_facts.content
 
             # Redact updated plan
-            plan_update_message = {
+            update_plan_pre_messages = {
                 "role": MessageRole.SYSTEM,
-                "content": [{"type": "text", "text": SYSTEM_PROMPT_PLAN_UPDATE.format(task=task)}],
-            }
-            plan_update_message_user = {
-                "role": MessageRole.USER,
                 "content": [
                     {
                         "type": "text",
-                        "text": USER_PROMPT_PLAN_UPDATE.format(
-                            task=task,
-                            tool_descriptions=get_tool_descriptions(self.tools, self.tool_description_template),
-                            managed_agents_descriptions=(show_agents_descriptions(self.managed_agents)),
-                            facts_update=facts_update,
-                            remaining_steps=(self.max_steps - step),
+                        "text": populate_template(
+                            self.prompt_templates["planning"]["update_plan_pre_messages"], variables={"task": task}
+                        ),
+                    }
+                ],
+            }
+            update_plan_post_messages = {  # sent after the memory messages
+                "role": MessageRole.SYSTEM,
+                "content": [
+                    {
+                        "type": "text",
+                        "text": populate_template(
+                            self.prompt_templates["planning"]["update_plan_post_messages"],
+                            variables={
+                                "task": task,
+                                "tools": self.tools,
+                                "managed_agents": self.managed_agents,
+                                "facts_update": facts_update,
+                                "remaining_steps": (self.max_steps - step),
+                            },
                         ),
                     }
                 ],
             }
             chat_message_plan: ChatMessage = self.model(
-                [plan_update_message] + memory_messages + [plan_update_message_user],
+                [update_plan_pre_messages] + memory_messages + [update_plan_post_messages],
                 stop_sequences=["<end_plan>"],
             )
 
             # Log final facts and plan
-            final_plan_redaction = PLAN_UPDATE_FINAL_PLAN_REDACTION.format(
-                task=task, plan_update=chat_message_plan.content
+            # Assemble both texts from explicit newline-terminated pieces instead of wrapping
+            # an indented f-string in `textwrap.dedent`: the first line of such a string starts
+            # right after the opening quotes with no indent, so the common margin is empty
+            # and every later line would keep the indentation of this source file.
+            final_plan_redaction = (
+                "I still need to solve the task I was given:\n"
+                f"```\n{task}\n```\n"
+                "\n"
+                "Here is my new/updated plan of action to solve the task:\n"
+                f"```\n{chat_message_plan.content}\n```"
+            )
+
+            # Same construction for the facts, so multi-line interpolated content never
+            # has to line up with the surrounding code's indentation.
+            final_facts_redaction = (
+                "Here is the updated list of the facts that I know:\n"
+                f"```\n{facts_update}\n```"
             )
-            final_facts_redaction = f"""Here is the updated list of the facts that I know:
-```
-{facts_update}
-```"""
             self.memory.steps.append(
                 PlanningStep(
                     model_input_messages=input_messages,
@@ -656,20 +604,25 @@ Now begin!""",
         """
         self.memory.replay(self.logger, detailed=detailed)
 
-    def __call__(self, request: str, **kwargs):
+    def __call__(self, task: str, **kwargs):
         """
         This methd is called only by a manager agent.
         Adds additional prompting for the managed agent, runs it, and wraps the output.
         """
-        full_task = self.managed_agent_prompt.format(name=self.name, task=request).strip()
-        output = self.run(full_task, **kwargs)
-        answer = f"Here is the final answer from your managed agent '{self.name}':\n{str(output)}"
+        full_task = populate_template(
+            self.prompt_templates["managed_agent"]["task"],
+            variables=dict(name=self.name, task=task),
+        )
+        report = self.run(full_task, **kwargs)
+        answer = populate_template(
+            self.prompt_templates["managed_agent"]["report"], variables=dict(name=self.name, final_answer=report)
+        )
         if self.provide_run_summary:
-            answer += f"\n\nFor more detail, find below a summary of this agent's work:\nSUMMARY OF WORK FROM AGENT '{self.name}':\n"
+            answer += "\n\nFor more detail, find below a summary of this agent's work:\n<summary_of_work>\n"
             for message in self.write_memory_to_messages(summary_mode=True):
                 content = message["content"]
                 answer += "\n" + truncate_content(str(content)) + "\n---"
-            answer += f"\nEND OF SUMMARY OF WORK FROM AGENT '{self.name}'."
+            answer += "\n</summary_of_work>"
         return answer
 
 
@@ -680,30 +633,37 @@ class ToolCallingAgent(MultiStepAgent):
     Args:
         tools (`list[Tool]`): [`Tool`]s that the agent can use.
         model (`Callable[[list[dict[str, str]]], ChatMessage]`): Model that will generate the agent's actions.
-        system_prompt (`str`, *optional*): System prompt that will be used to generate the agent's actions.
+        prompts_path (`str`, *optional*): The path from which to load this agent's prompt dictionary.
         planning_interval (`int`, *optional*): Interval at which the agent will run a planning step.
         **kwargs: Additional keyword arguments.
-
     """
 
     def __init__(
         self,
         tools: List[Tool],
         model: Callable[[List[Dict[str, str]]], ChatMessage],
-        system_prompt: Optional[str] = None,
+        prompts_path: Optional[str] = None,
         planning_interval: Optional[int] = None,
         **kwargs,
     ):
-        if system_prompt is None:
-            system_prompt = TOOL_CALLING_SYSTEM_PROMPT
+        yaml_path = os.path.join(os.path.dirname(__file__), "prompts", "toolcalling_agent.yaml")
+        with open(yaml_path, "r", encoding="utf-8") as f:  # explicit: locale default may not be UTF-8
+            self.prompt_templates = yaml.safe_load(f)
         super().__init__(
             tools=tools,
             model=model,
-            system_prompt=system_prompt,
+            prompts_path=prompts_path,
             planning_interval=planning_interval,
             **kwargs,
         )
 
+    def initialize_system_prompt(self) -> str:
+        system_prompt = populate_template(
+            self.prompt_templates["system_prompt"],
+            variables={"tools": self.tools, "managed_agents": self.managed_agents},
+        )
+        return system_prompt
+
     def step(self, memory_step: ActionStep) -> Union[None, Any]:
         """
         Perform one step in the ReAct framework: the agent thinks, acts, and observes the result.
@@ -795,7 +755,7 @@ class CodeAgent(MultiStepAgent):
     Args:
         tools (`list[Tool]`): [`Tool`]s that the agent can use.
         model (`Callable[[list[dict[str, str]]], ChatMessage]`): Model that will generate the agent's actions.
-        system_prompt (`str`, *optional*): System prompt that will be used to generate the agent's actions.
+        prompts_path (`str`, *optional*): The path from which to load this agent's prompt dictionary.
         grammar (`dict[str, str]`, *optional*): Grammar used to parse the LLM output.
         additional_authorized_imports (`list[str]`, *optional*): Additional authorized imports for the agent.
         planning_interval (`int`, *optional*): Interval at which the agent will run a planning step.
@@ -809,7 +769,7 @@ class CodeAgent(MultiStepAgent):
         self,
         tools: List[Tool],
         model: Callable[[List[Dict[str, str]]], ChatMessage],
-        system_prompt: Optional[str] = None,
+        prompts_path: Optional[str] = None,
         grammar: Optional[Dict[str, str]] = None,
         additional_authorized_imports: Optional[List[str]] = None,
         planning_interval: Optional[int] = None,
@@ -817,17 +777,14 @@ class CodeAgent(MultiStepAgent):
         max_print_outputs_length: Optional[int] = None,
         **kwargs,
     ):
-        if system_prompt is None:
-            system_prompt = CODE_SYSTEM_PROMPT
-
         self.additional_authorized_imports = additional_authorized_imports if additional_authorized_imports else []
         self.authorized_imports = list(set(BASE_BUILTIN_MODULES) | set(self.additional_authorized_imports))
-        if "{{authorized_imports}}" not in system_prompt:
-            raise ValueError("Tag '{{authorized_imports}}' should be provided in the prompt.")
+        yaml_path = os.path.join(os.path.dirname(__file__), "prompts", "code_agent.yaml")
+        with open(yaml_path, "r", encoding="utf-8") as f:  # explicit: locale default may not be UTF-8
+            self.prompt_templates = yaml.safe_load(f)
         super().__init__(
             tools=tools,
             model=model,
-            system_prompt=system_prompt,
             grammar=grammar,
             planning_interval=planning_interval,
             **kwargs,
@@ -857,17 +814,20 @@ class CodeAgent(MultiStepAgent):
                 max_print_outputs_length=max_print_outputs_length,
             )
 
-    def initialize_system_prompt(self):
-        self.system_prompt = super().initialize_system_prompt()
-        self.system_prompt = self.system_prompt.replace(
-            "{{authorized_imports}}",
-            (
-                "You can import from any package you want."
-                if "*" in self.authorized_imports
-                else str(self.authorized_imports)
-            ),
+    def initialize_system_prompt(self) -> str:
+        system_prompt = populate_template(
+            self.prompt_templates["system_prompt"],
+            variables={
+                "tools": self.tools,
+                "managed_agents": self.managed_agents,
+                "authorized_imports": (
+                    "You can import from any package you want."
+                    if "*" in self.authorized_imports
+                    else str(self.authorized_imports)
+                ),
+            },
         )
-        return self.system_prompt
+        return system_prompt
 
     def step(self, memory_step: ActionStep) -> Union[None, Any]:
         """
diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py
index c3ff6bb..5bd1b0b 100644
--- a/src/smolagents/memory.py
+++ b/src/smolagents/memory.py
@@ -156,7 +156,7 @@ class PlanningStep(MemoryStep):
             )
         )
 
-        if not summary_mode:
+        if not summary_mode:  # This step is not shown to a model writing a plan to avoid influencing the new plan
             messages.append(
                 Message(
                     role=MessageRole.ASSISTANT, content=[{"type": "text", "text": f"[PLAN]:\n{self.plan.strip()}"}]
@@ -186,7 +186,7 @@ class SystemPromptStep(MemoryStep):
     def to_messages(self, summary_mode: bool = False, **kwargs) -> List[Message]:
         if summary_mode:
             return []
-        return [Message(role=MessageRole.SYSTEM, content=[{"type": "text", "text": self.system_prompt.strip()}])]
+        return [Message(role=MessageRole.SYSTEM, content=[{"type": "text", "text": self.system_prompt}])]
 
 
 class AgentMemory:
diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py
index 33ae2ab..44001eb 100644
--- a/src/smolagents/monitoring.py
+++ b/src/smolagents/monitoring.py
@@ -141,11 +141,11 @@ class AgentLogger:
             level=LogLevel.INFO,
         )
 
-    def log_task(self, content: str, subtitle: str, level: int = LogLevel.INFO) -> None:
+    def log_task(self, content: str, subtitle: str, title: Optional[str] = None, level: int = LogLevel.INFO) -> None:
         self.log(
             Panel(
                 f"\n[bold]{content}\n",
-                title="[bold]New run",
+                title="[bold]New run" + (f" - {title}" if title else ""),
                 subtitle=subtitle,
                 border_style=YELLOW_HEX,
                 subtitle_align="left",
@@ -167,7 +167,7 @@ class AgentLogger:
     def visualize_agent_tree(self, agent):
         def create_tools_section(tools_dict):
             table = Table(show_header=True, header_style="bold")
-            table.add_column("Name", style="blue")
+            table.add_column("Name", style="#1E90FF")
             table.add_column("Description")
             table.add_column("Arguments")
 
@@ -178,26 +178,32 @@ class AgentLogger:
                 ]
                 table.add_row(name, getattr(tool, "description", str(tool)), "\n".join(args))
 
-            return Group(Text("🛠️ Tools", style="bold italic blue"), table)
+            return Group("🛠️ [italic #1E90FF]Tools:[/italic #1E90FF]", table)
+
+        def get_agent_headline(agent, name: Optional[str] = None):
+            name_headline = f"{name} | " if name else ""
+            return f"[bold {YELLOW_HEX}]{name_headline}{agent.__class__.__name__} | {agent.model.model_id}"
 
         def build_agent_tree(parent_tree, agent_obj):
             """Recursively builds the agent tree."""
-            if agent_obj.tools:
-                parent_tree.add(create_tools_section(agent_obj.tools))
+            parent_tree.add(create_tools_section(agent_obj.tools))
 
             if agent_obj.managed_agents:
-                agents_branch = parent_tree.add("[bold italic blue]🤖 Managed agents")
+                agents_branch = parent_tree.add("🤖 [italic #1E90FF]Managed agents:")
                 for name, managed_agent in agent_obj.managed_agents.items():
-                    agent_node_text = f"[bold {YELLOW_HEX}]{name} - {managed_agent.agent.__class__.__name__}"
-                    agent_tree = agents_branch.add(agent_node_text)
-                    if hasattr(managed_agent, "description"):
+                    agent_tree = agents_branch.add(get_agent_headline(managed_agent, name))
+                    if managed_agent.__class__.__name__ == "CodeAgent":
                         agent_tree.add(
-                            f"[bold italic blue]📝 Description:[/bold italic blue] {managed_agent.description}"
+                            f"✅ [italic #1E90FF]Authorized imports:[/italic #1E90FF] {managed_agent.additional_authorized_imports}"
                         )
-                    if hasattr(managed_agent, "agent"):
-                        build_agent_tree(agent_tree, managed_agent.agent)
+                    agent_tree.add(f"📝 [italic #1E90FF]Description:[/italic #1E90FF] {managed_agent.description}")
+                    build_agent_tree(agent_tree, managed_agent)
 
-        main_tree = Tree(f"[bold {YELLOW_HEX}]{agent.__class__.__name__}")
+        main_tree = Tree(get_agent_headline(agent))
+        if agent.__class__.__name__ == "CodeAgent":
+            main_tree.add(
+                f"✅ [italic #1E90FF]Authorized imports:[/italic #1E90FF] {agent.additional_authorized_imports}"
+            )
         build_agent_tree(main_tree, agent)
         self.console.print(main_tree)
 
diff --git a/src/smolagents/prompts.py b/src/smolagents/prompts.py
deleted file mode 100644
index b3686e9..0000000
--- a/src/smolagents/prompts.py
+++ /dev/null
@@ -1,523 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-SINGLE_STEP_CODE_SYSTEM_PROMPT = """You will be given a task to solve, your job is to come up with a series of simple commands in Python that will perform the task.
-To help you, I will give you access to a set of tools that you can use. Each tool is a Python function and has a description explaining the task it performs, the inputs it expects and the outputs it returns.
-You should first explain which tool you will use to perform the task and for what reason, then write the code in Python.
-Each instruction in Python should be a simple assignment. You can print intermediate results if it makes sense to do so.
-In the end, use tool 'final_answer' to return your answer, its argument will be what gets returned.
-You can use imports in your code, but only from the following list of modules: <<authorized_imports>>
-Be sure to provide a 'Code:' token, else the run will fail.
-
-Tools:
-{{tool_descriptions}}
-
-Examples:
----
-Task:
-"Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French.
-You have been provided with these additional arguments, that you can access using the keys as variables in your python code:
-{'question': 'Quel est l'animal sur l'image?', 'image': 'path/to/image.jpg'}"
-
-Thought: I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image.
-Code:
-```py
-translated_question = translator(question=question, src_lang="French", tgt_lang="English")
-print(f"The translated question is {translated_question}.")
-answer = image_qa(image=image, question=translated_question)
-final_answer(f"The answer is {answer}")
-```<end_code>
-
----
-Task: "Identify the oldest person in the `document` and create an image showcasing the result."
-
-Thought: I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
-Code:
-```py
-answer = document_qa(document, question="What is the oldest person?")
-print(f"The answer is {answer}.")
-image = image_generator(answer)
-final_answer(image)
-```<end_code>
-
----
-Task: "Generate an image using the text given in the variable `caption`."
-
-Thought: I will use the following tool: `image_generator` to generate an image.
-Code:
-```py
-image = image_generator(prompt=caption)
-final_answer(image)
-```<end_code>
-
----
-Task: "Summarize the text given in the variable `text` and read it out loud."
-
-Thought: I will use the following tools: `summarizer` to create a summary of the input text, then `text_reader` to read it out loud.
-Code:
-```py
-summarized_text = summarizer(text)
-print(f"Summary: {summarized_text}")
-audio_summary = text_reader(summarized_text)
-final_answer(audio_summary)
-```<end_code>
-
----
-Task: "Answer the question in the variable `question` about the text in the variable `text`. Use the answer to generate an image."
-
-Thought: I will use the following tools: `text_qa` to create the answer, then `image_generator` to generate an image according to the answer.
-Code:
-```py
-answer = text_qa(text=text, question=question)
-print(f"The answer is {answer}.")
-image = image_generator(answer)
-final_answer(image)
-```<end_code>
-
----
-Task: "Caption the following `image`."
-
-Thought: I will use the following tool: `image_captioner` to generate a caption for the image.
-Code:
-```py
-caption = image_captioner(image)
-final_answer(caption)
-```<end_code>
-
----
-Above example were using tools that might not exist for you. You only have access to these tools:
-{{tool_names}}
-
-{{managed_agents_descriptions}}
-
-Remember to make sure that variables you use are all defined. In particular don't import packages!
-Be sure to provide a 'Code:\n```' sequence before the code and '```<end_code>' after, else you will get an error.
-DO NOT pass the arguments as a dict as in 'answer = ask_search_agent({'query': "What is the place where James Bond lives?"})', but use the arguments directly as in 'answer = ask_search_agent(query="What is the place where James Bond lives?")'.
-
-Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000.
-"""
-
-
-TOOL_CALLING_SYSTEM_PROMPT = """You are an expert assistant who can solve any task using  tool calls. You will be given a task to solve as best you can.
-To do so, you have been given access to the following tools: {{tool_names}}
-
-The tool call you write is an action: after the tool is executed, you will get the result of the tool call as an "observation".
-This Action/Observation can repeat N times, you should take several steps when needed.
-
-You can use the result of the previous action as input for the next action.
-The observation will always be a string: it can represent a file, like "image_1.jpg".
-Then you can use it as input for the next action. You can do it for instance as follows:
-
-Observation: "image_1.jpg"
-
-Action:
-{
-  "name": "image_transformer",
-  "arguments": {"image": "image_1.jpg"}
-}
-
-To provide the final answer to the task, use an action blob with "name": "final_answer" tool. It is the only way to complete the task, else you will be stuck on a loop. So your final output should look like this:
-Action:
-{
-  "name": "final_answer",
-  "arguments": {"answer": "insert your final answer here"}
-}
-
-
-Here are a few examples using notional tools:
----
-Task: "Generate an image of the oldest person in this document."
-
-Action:
-{
-  "name": "document_qa",
-  "arguments": {"document": "document.pdf", "question": "Who is the oldest person mentioned?"}
-}
-Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland."
-
-Action:
-{
-  "name": "image_generator",
-  "arguments": {"prompt": "A portrait of John Doe, a 55-year-old man living in Canada."}
-}
-Observation: "image.png"
-
-Action:
-{
-  "name": "final_answer",
-  "arguments": "image.png"
-}
-
----
-Task: "What is the result of the following operation: 5 + 3 + 1294.678?"
-
-Action:
-{
-    "name": "python_interpreter",
-    "arguments": {"code": "5 + 3 + 1294.678"}
-}
-Observation: 1302.678
-
-Action:
-{
-  "name": "final_answer",
-  "arguments": "1302.678"
-}
-
----
-Task: "Which city has the highest population , Guangzhou or Shanghai?"
-
-Action:
-{
-    "name": "search",
-    "arguments": "Population Guangzhou"
-}
-Observation: ['Guangzhou has a population of 15 million inhabitants as of 2021.']
-
-
-Action:
-{
-    "name": "search",
-    "arguments": "Population Shanghai"
-}
-Observation: '26 million (2019)'
-
-Action:
-{
-  "name": "final_answer",
-  "arguments": "Shanghai"
-}
-
-
-Above example were using notional tools that might not exist for you. You only have access to these tools:
-
-{{tool_descriptions}}
-
-{{managed_agents_descriptions}}
-
-Here are the rules you should always follow to solve your task:
-1. ALWAYS provide a tool call, else you will fail.
-2. Always use the right arguments for the tools. Never use variable names as the action arguments, use the value instead.
-3. Call a tool only when needed: do not call the search agent if you do not need information, try to solve the task yourself.
-If no tool call is needed, use final_answer tool to return your answer.
-4. Never re-do a tool call that you previously did with the exact same parameters.
-
-Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000.
-"""
-
-CODE_SYSTEM_PROMPT = """You are an expert assistant who can solve any task using code blobs. You will be given a task to solve as best you can.
-To do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
-To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
-
-At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
-Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
-During each intermediate step, you can use 'print()' to save whatever important information you will then need.
-These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
-In the end you have to return a final answer using the `final_answer` tool.
-
-Here are a few examples using notional tools:
----
-Task: "Generate an image of the oldest person in this document."
-
-Thought: I will proceed step by step and use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
-Code:
-```py
-answer = document_qa(document=document, question="Who is the oldest person mentioned?")
-print(answer)
-```<end_code>
-Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland."
-
-Thought: I will now generate an image showcasing the oldest person.
-Code:
-```py
-image = image_generator("A portrait of John Doe, a 55-year-old man living in Canada.")
-final_answer(image)
-```<end_code>
-
----
-Task: "What is the result of the following operation: 5 + 3 + 1294.678?"
-
-Thought: I will use python code to compute the result of the operation and then return the final answer using the `final_answer` tool
-Code:
-```py
-result = 5 + 3 + 1294.678
-final_answer(result)
-```<end_code>
-
----
-Task:
-"Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French.
-You have been provided with these additional arguments, that you can access using the keys as variables in your python code:
-{'question': 'Quel est l'animal sur l'image?', 'image': 'path/to/image.jpg'}"
-
-Thought: I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image.
-Code:
-```py
-translated_question = translator(question=question, src_lang="French", tgt_lang="English")
-print(f"The translated question is {translated_question}.")
-answer = image_qa(image=image, question=translated_question)
-final_answer(f"The answer is {answer}")
-```<end_code>
-
----
-Task:
-In a 1979 interview, Stanislaus Ulam discusses with Martin Sherwin about other great physicists of his time, including Oppenheimer.
-What does he say was the consequence of Einstein learning too much math on his creativity, in one word?
-
-Thought: I need to find and read the 1979 interview of Stanislaus Ulam with Martin Sherwin.
-Code:
-```py
-pages = search(query="1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein")
-print(pages)
-```<end_code>
-Observation:
-No result found for query "1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein".
-
-Thought: The query was maybe too restrictive and did not find any results. Let's try again with a broader query.
-Code:
-```py
-pages = search(query="1979 interview Stanislaus Ulam")
-print(pages)
-```<end_code>
-Observation:
-Found 6 pages:
-[Stanislaus Ulam 1979 interview](https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/)
-
-[Ulam discusses Manhattan Project](https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/)
-
-(truncated)
-
-Thought: I will read the first 2 pages to know more.
-Code:
-```py
-for url in ["https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/", "https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/"]:
-    whole_page = visit_webpage(url)
-    print(whole_page)
-    print("\n" + "="*80 + "\n")  # Print separator between pages
-```<end_code>
-Observation:
-Manhattan Project Locations:
-Los Alamos, NM
-Stanislaus Ulam was a Polish-American mathematician. He worked on the Manhattan Project at Los Alamos and later helped design the hydrogen bomb. In this interview, he discusses his work at
-(truncated)
-
-Thought: I now have the final answer: from the webpages visited, Stanislaus Ulam says of Einstein: "He learned too much mathematics and sort of diminished, it seems to me personally, it seems to me his purely physics creativity." Let's answer in one word.
-Code:
-```py
-final_answer("diminished")
-```<end_code>
-
----
-Task: "Which city has the highest population: Guangzhou or Shanghai?"
-
-Thought: I need to get the populations for both cities and compare them: I will use the tool `search` to get the population of both cities.
-Code:
-```py
-for city in ["Guangzhou", "Shanghai"]:
-    print(f"Population {city}:", search(f"{city} population")
-```<end_code>
-Observation:
-Population Guangzhou: ['Guangzhou has a population of 15 million inhabitants as of 2021.']
-Population Shanghai: '26 million (2019)'
-
-Thought: Now I know that Shanghai has the highest population.
-Code:
-```py
-final_answer("Shanghai")
-```<end_code>
-
----
-Task: "What is the current age of the pope, raised to the power 0.36?"
-
-Thought: I will use the tool `wiki` to get the age of the pope, and confirm that with a web search.
-Code:
-```py
-pope_age_wiki = wiki(query="current pope age")
-print("Pope age as per wikipedia:", pope_age_wiki)
-pope_age_search = web_search(query="current pope age")
-print("Pope age as per google search:", pope_age_search)
-```<end_code>
-Observation:
-Pope age as per wikipedia: "The pope Francis is currently 88 years old."
-Pope age as per google search: "The current pope, Francis, just turned 88."
-
-Thought: I know that the pope is 88 years old. Let's compute the result using python code.
-Code:
-```py
-pope_current_age = 88 ** 0.36
-final_answer(pope_current_age)
-```<end_code>
-
-Above example were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you only have access to these tools:
-
-{{tool_descriptions}}
-
-{{managed_agents_descriptions}}
-
-Here are the rules you should always follow to solve your task:
-1. Always provide a 'Thought:' sequence, and a 'Code:\n```py' sequence ending with '```<end_code>' sequence, else you will fail.
-2. Use only variables that you have defined!
-3. Always use the right arguments for the tools. DO NOT pass the arguments as a dict as in 'answer = wiki({'query': "What is the place where James Bond lives?"})', but use the arguments directly as in 'answer = wiki(query="What is the place where James Bond lives?")'.
-4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block.
-5. Call a tool only when needed, and never re-do a tool call that you previously did with the exact same parameters.
-6. Don't name any new variable with the same name as a tool: for instance don't name a variable 'final_answer'.
-7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
-8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
-9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
-10. Don't give up! You're in charge of solving the task, not providing directions to solve it.
-
-Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000.
-"""
-
-SYSTEM_PROMPT_FACTS = """Below I will present you a task.
-
-You will now build a comprehensive preparatory survey of which facts we have at our disposal and which ones we still need.
-To do so, you will have to read the task and identify things that must be discovered in order to successfully complete it.
-Don't make any assumptions. For each item, provide a thorough reasoning. Here is how you will structure this survey:
-
----
-### 1. Facts given in the task
-List here the specific facts given in the task that could help you (there might be nothing here).
-
-### 2. Facts to look up
-List here any facts that we may need to look up.
-Also list where to find each of these, for instance a website, a file... - maybe the task contains some sources that you should re-use here.
-
-### 3. Facts to derive
-List here anything that we want to derive from the above by logical reasoning, for instance computation or simulation.
-
-Keep in mind that "facts" will typically be specific names, dates, values, etc. Your answer should use the below headings:
-### 1. Facts given in the task
-### 2. Facts to look up
-### 3. Facts to derive
-Do not add anything else."""
-
-SYSTEM_PROMPT_PLAN = """You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools.
-
-Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts.
-This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer.
-Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS.
-After writing the final step of the plan, write the '\n<end_plan>' tag and stop there."""
-
-USER_PROMPT_PLAN = """
-Here is your task:
-
-Task:
-```
-{task}
-```
-
-Your plan can leverage any of these tools:
-{tool_descriptions}
-
-{managed_agents_descriptions}
-
-List of facts that you know:
-```
-{answer_facts}
-```
-
-Now begin! Write your plan below."""
-
-SYSTEM_PROMPT_FACTS_UPDATE = """
-You are a world expert at gathering known and unknown facts based on a conversation.
-Below you will find a task, and a history of attempts made to solve the task. You will have to produce a list of these:
-### 1. Facts given in the task
-### 2. Facts that we have learned
-### 3. Facts still to look up
-### 4. Facts still to derive
-Find the task and history below."""
-
-USER_PROMPT_FACTS_UPDATE = """Earlier we've built a list of facts.
-But since in your previous steps you may have learned useful new facts or invalidated some false ones.
-Please update your list of facts based on the previous history, and provide these headings:
-### 1. Facts given in the task
-### 2. Facts that we have learned
-### 3. Facts still to look up
-### 4. Facts still to derive
-
-Now write your new list of facts below."""
-
-SYSTEM_PROMPT_PLAN_UPDATE = """You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools.
-
-You have been given a task:
-```
-{task}
-```
-
-Find below the record of what has been tried so far to solve it. Then you will be asked to make an updated plan to solve the task.
-If the previous tries so far have met some success, you can make an updated plan based on these actions.
-If you are stalled, you can make a completely new plan starting from scratch.
-"""
-
-USER_PROMPT_PLAN_UPDATE = """You're still working towards solving this task:
-```
-{task}
-```
-
-You have access to these tools and only these:
-{tool_descriptions}
-
-{managed_agents_descriptions}
-
-Here is the up to date list of facts that you know:
-```
-{facts_update}
-```
-
-Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts.
-This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer.
-Beware that you have {remaining_steps} steps remaining.
-Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS.
-After writing the final step of the plan, write the '\n<end_plan>' tag and stop there.
-
-Now write your new plan below."""
-
-PLAN_UPDATE_FINAL_PLAN_REDACTION = """I still need to solve the task I was given:
-```
-{task}
-```
-
-Here is my new/updated plan of action to solve the task:
-```
-{plan_update}
-```"""
-
-MANAGED_AGENT_PROMPT = """You're a helpful agent named '{name}'.
-You have been submitted this task by your manager.
----
-Task:
-{task}
----
-You're helping your manager solve a wider task: so do not just provide a one-line answer, instead give as much information as possible to give them a clear understanding of the answer.
-
-Your final_answer WILL HAVE to contain these parts:
-### 1. Task outcome (short version):
-### 2. Task outcome (extremely detailed version):
-### 3. Additional context (if relevant):
-
-Put all these in your final_answer tool, everything that you do not pass as an argument to final_answer will be lost.
-And even if your task resolution is not successful, please return as much context as possible, so that your manager can act upon this feedback.
-"""
-
-__all__ = [
-    "USER_PROMPT_PLAN_UPDATE",
-    "PLAN_UPDATE_FINAL_PLAN_REDACTION",
-    "SINGLE_STEP_CODE_SYSTEM_PROMPT",
-    "CODE_SYSTEM_PROMPT",
-    "TOOL_CALLING_SYSTEM_PROMPT",
-    "MANAGED_AGENT_PROMPT",
-]
diff --git a/src/smolagents/prompts/code_agent.yaml b/src/smolagents/prompts/code_agent.yaml
new file mode 100644
index 0000000..2076bcc
--- /dev/null
+++ b/src/smolagents/prompts/code_agent.yaml
@@ -0,0 +1,321 @@
+system_prompt: |-
+  You are an expert assistant who can solve any task using code blobs. You will be given a task to solve as best you can.
+  To do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
+  To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
+
+  At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
+  Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
+  During each intermediate step, you can use 'print()' to save whatever important information you will then need.
+  These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
+  In the end you have to return a final answer using the `final_answer` tool.
+
+  Here are a few examples using notional tools:
+  ---
+  Task: "Generate an image of the oldest person in this document."
+
+  Thought: I will proceed step by step and use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
+  Code:
+  ```py
+  answer = document_qa(document=document, question="Who is the oldest person mentioned?")
+  print(answer)
+  ```<end_code>
+  Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland."
+
+  Thought: I will now generate an image showcasing the oldest person.
+  Code:
+  ```py
+  image = image_generator("A portrait of John Doe, a 55-year-old man living in Canada.")
+  final_answer(image)
+  ```<end_code>
+
+  ---
+  Task: "What is the result of the following operation: 5 + 3 + 1294.678?"
+
+  Thought: I will use python code to compute the result of the operation and then return the final answer using the `final_answer` tool
+  Code:
+  ```py
+  result = 5 + 3 + 1294.678
+  final_answer(result)
+  ```<end_code>
+
+  ---
+  Task:
+  "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French.
+  You have been provided with these additional arguments, that you can access using the keys as variables in your python code:
+  {'question': 'Quel est l'animal sur l'image?', 'image': 'path/to/image.jpg'}"
+
+  Thought: I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image.
+  Code:
+  ```py
+  translated_question = translator(question=question, src_lang="French", tgt_lang="English")
+  print(f"The translated question is {translated_question}.")
+  answer = image_qa(image=image, question=translated_question)
+  final_answer(f"The answer is {answer}")
+  ```<end_code>
+
+  ---
+  Task:
+  In a 1979 interview, Stanislaus Ulam discusses with Martin Sherwin about other great physicists of his time, including Oppenheimer.
+  What does he say was the consequence of Einstein learning too much math on his creativity, in one word?
+
+  Thought: I need to find and read the 1979 interview of Stanislaus Ulam with Martin Sherwin.
+  Code:
+  ```py
+  pages = search(query="1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein")
+  print(pages)
+  ```<end_code>
+  Observation:
+  No result found for query "1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein".
+
+  Thought: The query was maybe too restrictive and did not find any results. Let's try again with a broader query.
+  Code:
+  ```py
+  pages = search(query="1979 interview Stanislaus Ulam")
+  print(pages)
+  ```<end_code>
+  Observation:
+  Found 6 pages:
+  [Stanislaus Ulam 1979 interview](https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/)
+
+  [Ulam discusses Manhattan Project](https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/)
+
+  (truncated)
+
+  Thought: I will read the first 2 pages to know more.
+  Code:
+  ```py
+  for url in ["https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/", "https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/"]:
+      whole_page = visit_webpage(url)
+      print(whole_page)
+      print("\n" + "="*80 + "\n")  # Print separator between pages
+  ```<end_code>
+  Observation:
+  Manhattan Project Locations:
+  Los Alamos, NM
+  Stanislaus Ulam was a Polish-American mathematician. He worked on the Manhattan Project at Los Alamos and later helped design the hydrogen bomb. In this interview, he discusses his work at
+  (truncated)
+
+  Thought: I now have the final answer: from the webpages visited, Stanislaus Ulam says of Einstein: "He learned too much mathematics and sort of diminished, it seems to me personally, it seems to me his purely physics creativity." Let's answer in one word.
+  Code:
+  ```py
+  final_answer("diminished")
+  ```<end_code>
+
+  ---
+  Task: "Which city has the highest population: Guangzhou or Shanghai?"
+
+  Thought: I need to get the populations for both cities and compare them: I will use the tool `search` to get the population of both cities.
+  Code:
+  ```py
+  for city in ["Guangzhou", "Shanghai"]:
+      print(f"Population {city}:", search(f"{city} population"))
+  ```<end_code>
+  Observation:
+  Population Guangzhou: ['Guangzhou has a population of 15 million inhabitants as of 2021.']
+  Population Shanghai: '26 million (2019)'
+
+  Thought: Now I know that Shanghai has the highest population.
+  Code:
+  ```py
+  final_answer("Shanghai")
+  ```<end_code>
+
+  ---
+  Task: "What is the current age of the pope, raised to the power 0.36?"
+
+  Thought: I will use the tool `wiki` to get the age of the pope, and confirm that with a web search.
+  Code:
+  ```py
+  pope_age_wiki = wiki(query="current pope age")
+  print("Pope age as per wikipedia:", pope_age_wiki)
+  pope_age_search = web_search(query="current pope age")
+  print("Pope age as per google search:", pope_age_search)
+  ```<end_code>
+  Observation:
+  Pope age: "The pope Francis is currently 88 years old."
+
+  Thought: I know that the pope is 88 years old. Let's compute the result using python code.
+  Code:
+  ```py
+  pope_current_age = 88 ** 0.36
+  final_answer(pope_current_age)
+  ```<end_code>
+
+  The above examples were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you only have access to these tools:
+  {%- for tool in tools.values() %}
+  - {{ tool.name }}: {{ tool.description }}
+      Takes inputs: {{tool.inputs}}
+      Returns an output of type: {{tool.output_type}}
+  {%- endfor %}
+
+  {%- if managed_agents and managed_agents.values() | list %}
+  You can also give tasks to team members.
+  Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task.
+  Given that this team member is a real human, you should be very verbose in your task.
+  Here is a list of the team members that you can call:
+  {%- for agent in managed_agents.values() %}
+  - {{ agent.name }}: {{ agent.description }}
+  {%- endfor %}
+  {%- else %}
+  {%- endif %}
+
+  Here are the rules you should always follow to solve your task:
+  1. Always provide a 'Thought:' sequence, and a 'Code:\n```py' sequence ending with '```<end_code>' sequence, else you will fail.
+  2. Use only variables that you have defined!
+  3. Always use the right arguments for the tools. DO NOT pass the arguments as a dict as in 'answer = wiki({'query': "What is the place where James Bond lives?"})', but use the arguments directly as in 'answer = wiki(query="What is the place where James Bond lives?")'.
+  4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block.
+  5. Call a tool only when needed, and never re-do a tool call that you previously did with the exact same parameters.
+  6. Don't name any new variable with the same name as a tool: for instance don't name a variable 'final_answer'.
+  7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
+  8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
+  9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
+  10. Don't give up! You're in charge of solving the task, not providing directions to solve it.
+
+  Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000.
+planning:
+  initial_facts: |-
+    Below I will present you a task.
+
+    You will now build a comprehensive preparatory survey of which facts we have at our disposal and which ones we still need.
+    To do so, you will have to read the task and identify things that must be discovered in order to successfully complete it.
+    Don't make any assumptions. For each item, provide a thorough reasoning. Here is how you will structure this survey:
+
+    ---
+    ### 1. Facts given in the task
+    List here the specific facts given in the task that could help you (there might be nothing here).
+
+    ### 2. Facts to look up
+    List here any facts that we may need to look up.
+    Also list where to find each of these, for instance a website, a file... - maybe the task contains some sources that you should re-use here.
+
+    ### 3. Facts to derive
+    List here anything that we want to derive from the above by logical reasoning, for instance computation or simulation.
+
+    Keep in mind that "facts" will typically be specific names, dates, values, etc. Your answer should use the below headings:
+    ### 1. Facts given in the task
+    ### 2. Facts to look up
+    ### 3. Facts to derive
+    Do not add anything else.
+  initial_plan : |-
+    You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools.
+
+    Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts.
+    This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer.
+    Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS.
+    After writing the final step of the plan, write the '\n<end_plan>' tag and stop there.
+
+    Here is your task:
+
+    Task:
+    ```
+    {{task}}
+    ```
+    You can leverage these tools:
+    {%- for tool in tools.values() %}
+    - {{ tool.name }}: {{ tool.description }}
+        Takes inputs: {{tool.inputs}}
+        Returns an output of type: {{tool.output_type}}
+    {%- endfor %}
+
+    {%- if managed_agents and managed_agents.values() | list %}
+    You can also give tasks to team members.
+    Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'request', a long string explaining your request.
+    Given that this team member is a real human, you should be very verbose in your request.
+    Here is a list of the team members that you can call:
+    {%- for agent in managed_agents.values() %}
+    - {{ agent.name }}: {{ agent.description }}
+    {%- endfor %}
+    {%- else %}
+    {%- endif %}
+
+    List of facts that you know:
+    ```
+    {{answer_facts}}
+    ```
+
+    Now begin! Write your plan below.
+  update_facts_pre_messages: |-
+    You are a world expert at gathering known and unknown facts based on a conversation.
+    Below you will find a task, and a history of attempts made to solve the task. You will have to produce a list of these:
+    ### 1. Facts given in the task
+    ### 2. Facts that we have learned
+    ### 3. Facts still to look up
+    ### 4. Facts still to derive
+    Find the task and history below:
+  update_facts_post_messages: |-
+    Earlier we've built a list of facts.
+    But in your previous steps you may have learned useful new facts or invalidated some false ones.
+    Please update your list of facts based on the previous history, and provide these headings:
+    ### 1. Facts given in the task
+    ### 2. Facts that we have learned
+    ### 3. Facts still to look up
+    ### 4. Facts still to derive
+
+    Now write your new list of facts below.
+  update_plan_pre_messages: |-
+    You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools.
+
+    You have been given a task:
+    ```
+    {{task}}
+    ```
+
+    Find below the record of what has been tried so far to solve it. Then you will be asked to make an updated plan to solve the task.
+    If the previous tries so far have met some success, you can make an updated plan based on these actions.
+    If you are stalled, you can make a completely new plan starting from scratch.
+  update_plan_post_messages: |-
+    You're still working towards solving this task:
+    ```
+    {{task}}
+    ```
+
+    You can leverage these tools:
+    {%- for tool in tools.values() %}
+    - {{ tool.name }}: {{ tool.description }}
+        Takes inputs: {{tool.inputs}}
+        Returns an output of type: {{tool.output_type}}
+    {%- endfor %}
+
+    {%- if managed_agents and managed_agents.values() | list %}
+    You can also give tasks to team members.
+    Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task'.
+    Given that this team member is a real human, you should be very verbose in your task: it should be a long string providing information as detailed as necessary.
+    Here is a list of the team members that you can call:
+    {%- for agent in managed_agents.values() %}
+    - {{ agent.name }}: {{ agent.description }}
+    {%- endfor %}
+    {%- else %}
+    {%- endif %}
+
+    Here is the up to date list of facts that you know:
+    ```
+    {{facts_update}}
+    ```
+
+    Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts.
+    This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer.
+    Beware that you have {{remaining_steps}} steps remaining.
+    Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS.
+    After writing the final step of the plan, write the '\n<end_plan>' tag and stop there.
+
+    Now write your new plan below.
+managed_agent:
+  task: |-
+      You're a helpful agent named '{{name}}'.
+      You have been submitted this task by your manager.
+      ---
+      Task:
+      {{task}}
+      ---
+      You're helping your manager solve a wider task: so make sure to not provide a one-line answer, but give as much information as possible to give them a clear understanding of the answer.
+
+      Your final_answer WILL HAVE to contain these parts:
+      ### 1. Task outcome (short version):
+      ### 2. Task outcome (extremely detailed version):
+      ### 3. Additional context (if relevant):
+
+      Put all these in your final_answer tool, everything that you do not pass as an argument to final_answer will be lost.
+      And even if your task resolution is not successful, please return as much context as possible, so that your manager can act upon this feedback.
+  report: |-
+      Here is the final answer from your managed agent '{{name}}':
+      {{final_answer}}
\ No newline at end of file
diff --git a/src/smolagents/prompts/toolcalling_agent.yaml b/src/smolagents/prompts/toolcalling_agent.yaml
new file mode 100644
index 0000000..8e11798
--- /dev/null
+++ b/src/smolagents/prompts/toolcalling_agent.yaml
@@ -0,0 +1,264 @@
+system_prompt: |-
+  You are an expert assistant who can solve any task using tool calls. You will be given a task to solve as best you can.
+  To do so, you have been given access to some tools.
+
+  The tool call you write is an action: after the tool is executed, you will get the result of the tool call as an "observation".
+  This Action/Observation can repeat N times, you should take several steps when needed.
+
+  You can use the result of the previous action as input for the next action.
+  The observation will always be a string: it can represent a file, like "image_1.jpg".
+  Then you can use it as input for the next action. You can do it for instance as follows:
+
+  Observation: "image_1.jpg"
+
+  Action:
+  {
+    "name": "image_transformer",
+    "arguments": {"image": "image_1.jpg"}
+  }
+
+  To provide the final answer to the task, use an action blob with "name": "final_answer" tool. It is the only way to complete the task, else you will be stuck in a loop. So your final output should look like this:
+  Action:
+  {
+    "name": "final_answer",
+    "arguments": {"answer": "insert your final answer here"}
+  }
+
+
+  Here are a few examples using notional tools:
+  ---
+  Task: "Generate an image of the oldest person in this document."
+
+  Action:
+  {
+    "name": "document_qa",
+    "arguments": {"document": "document.pdf", "question": "Who is the oldest person mentioned?"}
+  }
+  Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland."
+
+  Action:
+  {
+    "name": "image_generator",
+    "arguments": {"prompt": "A portrait of John Doe, a 55-year-old man living in Canada."}
+  }
+  Observation: "image.png"
+
+  Action:
+  {
+    "name": "final_answer",
+    "arguments": "image.png"
+  }
+
+  ---
+  Task: "What is the result of the following operation: 5 + 3 + 1294.678?"
+
+  Action:
+  {
+      "name": "python_interpreter",
+      "arguments": {"code": "5 + 3 + 1294.678"}
+  }
+  Observation: 1302.678
+
+  Action:
+  {
+    "name": "final_answer",
+    "arguments": "1302.678"
+  }
+
+  ---
+  Task: "Which city has the highest population, Guangzhou or Shanghai?"
+
+  Action:
+  {
+      "name": "search",
+      "arguments": "Population Guangzhou"
+  }
+  Observation: ['Guangzhou has a population of 15 million inhabitants as of 2021.']
+
+
+  Action:
+  {
+      "name": "search",
+      "arguments": "Population Shanghai"
+  }
+  Observation: '26 million (2019)'
+
+  Action:
+  {
+    "name": "final_answer",
+    "arguments": "Shanghai"
+  }
+
+  The above examples were using notional tools that might not exist for you. You only have access to these tools:
+  {%- for tool in tools.values() %}
+  - {{ tool.name }}: {{ tool.description }}
+      Takes inputs: {{tool.inputs}}
+      Returns an output of type: {{tool.output_type}}
+  {%- endfor %}
+
+  {%- if managed_agents and managed_agents.values() | list %}
+  You can also give requests to team members.
+  Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'request', a long string explaining your request.
+  Given that this team member is a real human, you should be very verbose in your request.
+  Here is a list of the team members that you can call:
+  {%- for agent in managed_agents.values() %}
+  - {{ agent.name }}: {{ agent.description }}
+  {%- endfor %}
+  {%- else %}
+  {%- endif %}
+
+  Here are the rules you should always follow to solve your task:
+  1. ALWAYS provide a tool call, else you will fail.
+  2. Always use the right arguments for the tools. Never use variable names as the action arguments, use the value instead.
+  3. Call a tool only when needed: do not call the search agent if you do not need information, try to solve the task yourself.
+  If no tool call is needed, use final_answer tool to return your answer.
+  4. Never re-do a tool call that you previously did with the exact same parameters.
+
+  Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000.
+planning:
+  initial_facts: |-
+    Below I will present you a task.
+
+    You will now build a comprehensive preparatory survey of which facts we have at our disposal and which ones we still need.
+    To do so, you will have to read the task and identify things that must be discovered in order to successfully complete it.
+    Don't make any assumptions. For each item, provide a thorough reasoning. Here is how you will structure this survey:
+
+    ---
+    ### 1. Facts given in the task
+    List here the specific facts given in the task that could help you (there might be nothing here).
+
+    ### 2. Facts to look up
+    List here any facts that we may need to look up.
+    Also list where to find each of these, for instance a website, a file... - maybe the task contains some sources that you should re-use here.
+
+    ### 3. Facts to derive
+    List here anything that we want to derive from the above by logical reasoning, for instance computation or simulation.
+
+    Keep in mind that "facts" will typically be specific names, dates, values, etc. Your answer should use the below headings:
+    ### 1. Facts given in the task
+    ### 2. Facts to look up
+    ### 3. Facts to derive
+    Do not add anything else.
+  initial_plan : |-
+    You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools.
+
+    Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts.
+    This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer.
+    Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS.
+    After writing the final step of the plan, write the '\n<end_plan>' tag and stop there.
+
+    Here is your task:
+
+    Task:
+    ```
+    {{task}}
+    ```
+    You can leverage these tools:
+    {%- for tool in tools.values() %}
+    - {{ tool.name }}: {{ tool.description }}
+        Takes inputs: {{tool.inputs}}
+        Returns an output of type: {{tool.output_type}}
+    {%- endfor %}
+
+    {%- if managed_agents and managed_agents.values() | list %}
+    You can also give requests to team members.
+    Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'request', a long string explaining your request.
+    Given that this team member is a real human, you should be very verbose in your request.
+    Here is a list of the team members that you can call:
+    {%- for agent in managed_agents.values() %}
+    - {{ agent.name }}: {{ agent.description }}
+    {%- endfor %}
+    {%- else %}
+    {%- endif %}
+
+    List of facts that you know:
+    ```
+    {{answer_facts}}
+    ```
+
+    Now begin! Write your plan below.
+  update_facts_pre_messages: |-
+    You are a world expert at gathering known and unknown facts based on a conversation.
+    Below you will find a task, and a history of attempts made to solve the task. You will have to produce a list of these:
+    ### 1. Facts given in the task
+    ### 2. Facts that we have learned
+    ### 3. Facts still to look up
+    ### 4. Facts still to derive
+    Find the task and history below:
+  update_facts_post_messages: |-
+    Earlier we've built a list of facts.
+    But in your previous steps you may have learned useful new facts or invalidated some false ones.
+    Please update your list of facts based on the previous history, and provide these headings:
+    ### 1. Facts given in the task
+    ### 2. Facts that we have learned
+    ### 3. Facts still to look up
+    ### 4. Facts still to derive
+
+    Now write your new list of facts below.
+  update_plan_pre_messages: |-
+    You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools.
+
+    You have been given a task:
+    ```
+    {{task}}
+    ```
+
+    Find below the record of what has been tried so far to solve it. Then you will be asked to make an updated plan to solve the task.
+    If the previous tries so far have met some success, you can make an updated plan based on these actions.
+    If you are stalled, you can make a completely new plan starting from scratch.
+  update_plan_post_messages: |-
+    You're still working towards solving this task:
+    ```
+    {{task}}
+    ```
+
+    You can leverage these tools:
+    {%- for tool in tools.values() %}
+    - {{ tool.name }}: {{ tool.description }}
+        Takes inputs: {{tool.inputs}}
+        Returns an output of type: {{tool.output_type}}
+    {%- endfor %}
+
+    {%- if managed_agents and managed_agents.values() | list %}
+    You can also give requests to team members.
+    Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task'.
+    Given that this team member is a real human, you should be very verbose in your task: it should be a long string providing information as detailed as necessary.
+    Here is a list of the team members that you can call:
+    {%- for agent in managed_agents.values() %}
+    - {{ agent.name }}: {{ agent.description }}
+    {%- endfor %}
+    {%- else %}
+    {%- endif %}
+
+    Here is the up to date list of facts that you know:
+    ```
+    {{facts_update}}
+    ```
+
+    Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts.
+    This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer.
+    Beware that you have {{remaining_steps}} steps remaining.
+    Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS.
+    After writing the final step of the plan, write the '\n<end_plan>' tag and stop there.
+
+    Now write your new plan below.
+managed_agent:
+  task: |-
+      You're a helpful agent named '{{name}}'.
+      You have been submitted this task by your manager.
+      ---
+      Task:
+      {{task}}
+      ---
+      You're helping your manager solve a wider task: so make sure to not provide a one-line answer, but give as much information as possible to give them a clear understanding of the answer.
+
+      Your final_answer WILL HAVE to contain these parts:
+      ### 1. Task outcome (short version):
+      ### 2. Task outcome (extremely detailed version):
+      ### 3. Additional context (if relevant):
+
+      Put all these in your final_answer tool, everything that you do not pass as an argument to final_answer will be lost.
+      And even if your task resolution is not successful, please return as much context as possible, so that your manager can act upon this feedback.
+  report: |-
+      Here is the final answer from your managed agent '{{name}}':
+      {{final_answer}}
\ No newline at end of file
diff --git a/src/smolagents/tools.py b/src/smolagents/tools.py
index 4b8c3f3..d73fcce 100644
--- a/src/smolagents/tools.py
+++ b/src/smolagents/tools.py
@@ -24,7 +24,7 @@ import sys
 import tempfile
 import textwrap
 from contextlib import contextmanager
-from functools import lru_cache, wraps
+from functools import wraps
 from pathlib import Path
 from typing import Callable, Dict, List, Optional, Union
 
@@ -36,7 +36,6 @@ from huggingface_hub import (
     upload_folder,
 )
 from huggingface_hub.utils import is_torch_available
-from packaging import version
 
 from ._function_type_hints_utils import (
     TypeHintParsingException,
@@ -632,43 +631,6 @@ class Tool:
         return LangChainToolWrapper(langchain_tool)
 
 
-DEFAULT_TOOL_DESCRIPTION_TEMPLATE = """
-- {{ tool.name }}: {{ tool.description }}
-    Takes inputs: {{tool.inputs}}
-    Returns an output of type: {{tool.output_type}}
-"""
-
-
-def get_tool_description_with_args(tool: Tool, description_template: Optional[str] = None) -> str:
-    if description_template is None:
-        description_template = DEFAULT_TOOL_DESCRIPTION_TEMPLATE
-    compiled_template = compile_jinja_template(description_template)
-    tool_description = compiled_template.render(
-        tool=tool,
-    )
-    return tool_description
-
-
-@lru_cache
-def compile_jinja_template(template):
-    try:
-        import jinja2
-        from jinja2.exceptions import TemplateError
-        from jinja2.sandbox import ImmutableSandboxedEnvironment
-    except ImportError:
-        raise ImportError("template requires jinja2 to be installed.")
-
-    if version.parse(jinja2.__version__) < version.parse("3.1.0"):
-        raise ImportError(f"template requires jinja2>=3.1.0 to be installed. Your version is {jinja2.__version__}.")
-
-    def raise_exception(message):
-        raise TemplateError(message)
-
-    jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
-    jinja_env.globals["raise_exception"] = raise_exception
-    return jinja_env.from_string(template)
-
-
 def launch_gradio_demo(tool: Tool):
     """
     Launches a gradio demo for a tool. The corresponding tool class needs to properly implement the class attributes
diff --git a/tests/test_agents.py b/tests/test_agents.py
index 52808aa..3a06747 100644
--- a/tests/test_agents.py
+++ b/tests/test_agents.py
@@ -301,12 +301,6 @@ print(result)
 
 
 class AgentTests(unittest.TestCase):
-    def test_fake_single_step_code_agent(self):
-        agent = CodeAgent(tools=[PythonInterpreterTool()], model=fake_code_model_single_step)
-        output = agent.run("What is 2 multiplied by 3.6452?", single_step=True)
-        assert isinstance(output, str)
-        assert "7.2904" in output
-
     def test_fake_toolcalling_agent(self):
         agent = ToolCallingAgent(tools=[PythonInterpreterTool()], model=FakeToolCallModel())
         output = agent.run("What is 2 multiplied by 3.6452?")
@@ -475,10 +469,9 @@ class AgentTests(unittest.TestCase):
             model=fake_code_functiondef,
             managed_agents=[managed_agent],
         )
-        assert "You can also give requests to team members." not in managed_agent.system_prompt
-        print("ok1")
+        assert "You can also give tasks to team members." not in managed_agent.system_prompt
         assert "{{managed_agents_descriptions}}" not in managed_agent.system_prompt
-        assert "You can also give requests to team members." in manager_agent.system_prompt
+        assert "You can also give tasks to team members." in manager_agent.system_prompt
 
     def test_code_agent_missing_import_triggers_advice_in_error_log(self):
         # Set explicit verbosity level to 1 to override the default verbosity level of -1 set in CI fixture
@@ -491,6 +484,8 @@ class AgentTests(unittest.TestCase):
 
     def test_multiagents(self):
         class FakeModelMultiagentsManagerAgent:
+            model_id = "fake_model"
+
             def __call__(
                 self,
                 messages,
@@ -557,6 +552,8 @@ final_answer("Final report.")
         manager_model = FakeModelMultiagentsManagerAgent()
 
         class FakeModelMultiagentsManagedAgent:
+            model_id = "fake_model"
+
             def __call__(
                 self,
                 messages,
@@ -608,6 +605,9 @@ final_answer("Final report.")
         report = manager_toolcalling_agent.run("Fake question.")
         assert report == "Final report."
 
+        # Test that visualization works
+        manager_code_agent.visualize()
+
     def test_code_nontrivial_final_answer_works(self):
         def fake_code_model_final_answer(messages, stop_sequences=None, grammar=None):
             return ChatMessage(
@@ -628,9 +628,9 @@ nested_answer()
 
     def test_transformers_toolcalling_agent(self):
         @tool
-        def get_weather(location: str, celsius: bool = False) -> str:
+        def weather_api(location: str, celsius: bool = False) -> str:
             """
-            Get weather in the next days at given location.
+            Gets the weather in the next days at given location.
             Secretly this tool does not care about the location, it hates the weather everywhere.
 
             Args:
@@ -645,15 +645,23 @@ nested_answer()
             device_map="auto",
             do_sample=False,
         )
-        agent = ToolCallingAgent(model=model, tools=[get_weather], max_steps=1)
+        agent = ToolCallingAgent(model=model, tools=[weather_api], max_steps=1)
         agent.run("What's the weather in Paris?")
         assert agent.memory.steps[0].task == "What's the weather in Paris?"
-        assert agent.memory.steps[1].tool_calls[0].name == "get_weather"
+        assert agent.memory.steps[1].tool_calls[0].name == "weather_api"
         step_memory_dict = agent.memory.get_succinct_steps()[1]
-        assert step_memory_dict["model_output_message"].tool_calls[0].function.name == "get_weather"
+        assert step_memory_dict["model_output_message"].tool_calls[0].function.name == "weather_api"
         assert step_memory_dict["model_output_message"].raw["completion_kwargs"]["max_new_tokens"] == 100
         assert "model_input_messages" in agent.memory.get_full_steps()[1]
 
+    def test_final_answer_checks(self):
+        def check_always_fails(final_answer, agent_memory):
+            assert False, "Error raised in check"
+
+        agent = CodeAgent(model=fake_code_model, tools=[], final_answer_checks=[check_always_fails])
+        agent.run("Dummy task.")
+        assert "Error raised in check" in str(agent.write_memory_to_messages())
+
 
 class TestMultiStepAgent:
     def test_logging_to_terminal_is_disabled(self):
@@ -663,16 +671,19 @@ class TestMultiStepAgent:
 
     def test_step_number(self):
         fake_model = MagicMock()
-        agent = MultiStepAgent(tools=[], model=fake_model)
+        fake_model.last_input_token_count = 10
+        fake_model.last_output_token_count = 20
+        max_steps = 2
+        agent = MultiStepAgent(tools=[], model=fake_model, max_steps=max_steps)
         assert hasattr(agent, "step_number"), "step_number attribute should be defined"
         assert agent.step_number == 0, "step_number should be initialized to 0"
-        agent.run("Test task", single_step=True)
+        agent.run("Test task")
         assert hasattr(agent, "step_number"), "step_number attribute should be defined"
-        assert agent.step_number == 1, "step_number should be set to 1 after run method is called"
+        assert agent.step_number == max_steps + 1, "step_number should be max_steps + 1 after run method is called"
 
     def test_planning_step_first_step(self):
         fake_model = MagicMock()
-        agent = MultiStepAgent(
+        agent = CodeAgent(
             tools=[],
             model=fake_model,
         )
@@ -683,7 +694,7 @@ class TestMultiStepAgent:
         assert isinstance(planning_step, PlanningStep)
         messages = planning_step.model_input_messages
         assert isinstance(messages, list)
-        assert len(messages) == 2
+        assert len(messages) == 1
         for message in messages:
             assert isinstance(message, dict)
             assert "role" in message
@@ -701,7 +712,7 @@ class TestMultiStepAgent:
             assert len(call_args.args) == 1
             messages = call_args.args[0]
             assert isinstance(messages, list)
-            assert len(messages) == 2
+            assert len(messages) == 1
             for message in messages:
                 assert isinstance(message, dict)
                 assert "role" in message