In [1]:
import datasets

# Benchmark questions used throughout the notebook; only the "train" split is kept.
eval_ds = datasets.load_dataset("m-ric/agents_medium_benchmark_2")["train"]

### Define utilities and tools
To run the SerpAPI-backed search tool, you need a [SerpAPI](https://serpapi.com/dashboard) API key, which requires a paid account.

In [2]:
import time
import json
import os
import re
import string
import warnings
from tqdm import tqdm
from typing import List

from smolagents import (
    GoogleSearchTool,
    CodeAgent,
    ToolCallingAgent,
    HfApiModel,
    AgentError,
    VisitWebpageTool,
    PythonInterpreterTool,
)
from smolagents.agents import ActionStep
from dotenv import load_dotenv

# Read environment variables from a local .env file, if one exists
# (presumably where the API keys are kept -- verify against your setup).
load_dotenv()
# Result files are written under output/, so make sure the directory exists.
os.makedirs("output", exist_ok=True)


def serialize_agent_error(obj):
    """Fallback serializer passed as `default=` when dumping records to JSON.

    Args:
        obj: Any object the JSON encoder could not serialize on its own.

    Returns:
        A dict with the class name and message for `AgentError` instances,
        otherwise the plain `str()` representation of `obj`.
    """
    if not isinstance(obj, AgentError):
        return str(obj)
    return {"error_type": obj.__class__.__name__, "message": obj.message}


def answer_questions(eval_ds, file_name, agent, model_id, action_type):
    """Run `agent` on every not-yet-answered question and append results to a JSONL file.

    The function is resumable: questions already recorded in `file_name` are
    skipped, so an interrupted benchmark can simply be re-run. Blank or
    unparsable lines in an existing file (e.g. a record truncated by an
    interrupted run) are ignored instead of aborting the resume.

    Args:
        eval_ds: Sized iterable of examples; each example is indexed with the
            keys "question", "source" and "true_answer".
        file_name: Path of the JSONL file that results are read from and appended to.
        agent: Object exposing `run(question)`, `logs` and
            `monitor.get_total_token_counts()`.
        model_id: Stored verbatim in every record under "model_id".
        action_type: Stored verbatim in every record under "agent_action_type".

    Returns:
        None. One JSON line is appended to `file_name` per answered question.
        Any exception raised while processing a question is printed and the
        loop moves on to the next question (best-effort evaluation).
    """
    # A set gives O(1) membership checks when skipping answered questions.
    answered_questions = set()
    # True when the existing file does not end with a newline (truncated last
    # record); the next append must then start on a fresh line.
    needs_newline = False
    if os.path.exists(file_name):
        with open(file_name, "r", encoding="utf-8") as f:
            for line in f:
                needs_newline = not line.endswith("\n")
                if not line.strip():
                    continue
                try:
                    answered_questions.add(json.loads(line)["question"])
                except (json.JSONDecodeError, KeyError, TypeError):
                    print("Skipping unreadable line in", file_name)

    for example in tqdm(eval_ds, total=len(eval_ds)):
        try:
            question = example["question"]
            if example["source"] == "SimpleQA":
                question += " Answer with only the final number."
            if question in answered_questions:
                continue
            start_time = time.time()
            answer = agent.run(question)
            end_time = time.time()

            # Remove memory from logs to make them more compact.
            for step in agent.logs:
                if hasattr(step, "memory"):
                    step.memory = None
                if isinstance(step, ActionStep):
                    step.agent_memory = None

            annotated_example = {
                "model_id": model_id,
                "agent_action_type": action_type,
                "question": question,
                "answer": answer,
                "true_answer": example["true_answer"],
                "source": example["source"],
                "intermediate_steps": str(agent.logs),
                "start_time": start_time,
                "end_time": end_time,
                "token_counts": agent.monitor.get_total_token_counts(),
            }

            # Serialize fully before touching the file so that a serialization
            # error can never leave a half-written record behind.
            record = json.dumps(annotated_example, default=serialize_agent_error)
            with open(file_name, "a", encoding="utf-8") as f:
                # One record per line (JSONL format).
                f.write(("\n" if needs_newline else "") + record + "\n")
            needs_newline = False
        except Exception as e:
            print("Failed:", e)


def normalize_number_str(number_str: str) -> float:
    """Convert a numeric string to a float, ignoring "$", "%" and "," characters.

    Args:
        number_str: Text expected to hold a single number, possibly decorated
            with a currency sign, a percent sign or thousands separators.

    Returns:
        The parsed float, or `float("inf")` (after printing a message) when the
        cleaned text is not a valid number.
    """
    # Strip the common unit symbols and commas so that float() can parse the rest.
    cleaned = number_str.translate(str.maketrans("", "", "$%,"))
    try:
        value = float(cleaned)
    except ValueError:
        print(f"String {cleaned} cannot be normalized to number str.")
        value = float("inf")
    return value


def split_string(
    s: str,
    char_list: list[str] = [",", ";"],
) -> list[str]:
    """Split `s` on every occurrence of any separator character in `char_list`.

    Args:
        s: The string to split.
        char_list: Separator characters. The default list is only read, never
            mutated, so sharing it between calls is safe.

    Returns:
        The list of pieces between separators (empty pieces are kept). When
        `char_list` is empty the whole string is returned as a single piece.
    """
    if not char_list:
        # An empty character class "[]" is not a valid regex; nothing to split on.
        return [s]
    # Escape each separator so that characters such as "^", "-", "]" or "\" are
    # matched literally instead of altering the character class.
    pattern = "[" + "".join(re.escape(char) for char in char_list) + "]"
    return re.split(pattern, s)


def is_float(element: object) -> bool:
    """Tell whether `element` can be converted with `float()`.

    Args:
        element: Any value; typically a string taken from an answer.

    Returns:
        True when `float(element)` succeeds, False otherwise. Values that
        `float()` rejects with a `TypeError` (e.g. `None`, lists) or an
        `OverflowError` (integers too large for a float) also yield False.
    """
    try:
        float(element)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


def normalize_str(input_str, remove_punct=True) -> str:
    """
    Produce a canonical form of a string for lenient comparison.

    The string is stripped of every whitespace character, lowercased and,
    unless disabled, stripped of ASCII punctuation.
    Parameters:
    - input_str: str, the string to normalize
    - remove_punct: bool, whether to remove punctuation (default: True)
    Returns:
    - str, the normalized string
    """
    # Dropping all whitespace makes e.g. "sea gull" and "seagull" compare equal.
    normalized = re.sub(r"\s", "", input_str).lower()
    if not remove_punct:
        return normalized

    punctuation_table = str.maketrans("", "", string.punctuation)
    return normalized.translate(punctuation_table)


def extract_numbers(text: str) -> List[str]:
    """Return every number found in `text`, in order, as strings without commas.

    A number is:
    - an optional leading minus sign,
    - digits, optionally grouped with comma thousands separators,
    - an optional decimal part.
    """
    number_re = re.compile(r"-?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?")

    return [match.group(0).replace(",", "") for match in number_re.finditer(text)]


def get_question_score_gaia(
    model_answer: str,
    ground_truth: str,
) -> bool:
    """Score a model answer against the ground truth.

    The comparison mode is chosen from the ground truth:
    - numeric ground truth: the answer is parsed as a number and compared as floats;
    - ground truth containing "," or ";": both sides are split into lists and
      compared element by element (numerically or as normalized strings);
    - anything else: both sides are compared as normalized strings.

    Args:
        model_answer: Answer produced by the model.
        ground_truth: Reference answer.

    Returns:
        True when the answer matches the ground truth, False otherwise.
    """
    # Case 1: the ground truth is a single number.
    if is_float(ground_truth):
        return normalize_number_str(str(model_answer)) == float(ground_truth)

    # Case 2: the ground truth is plain text (no list separator present).
    if not any(separator in ground_truth for separator in (",", ";")):
        return normalize_str(model_answer) == normalize_str(ground_truth)

    # Case 3: the ground truth is a list.
    expected_items = split_string(ground_truth)
    predicted_items = split_string(model_answer)

    if len(expected_items) != len(predicted_items):
        warnings.warn(
            "Answer lists have different lengths, returning False.", UserWarning
        )
        return False

    def _items_match(predicted: str, expected: str) -> bool:
        # Numeric elements are compared as floats, the others as strings.
        if is_float(expected):
            return normalize_number_str(predicted) == float(expected)
        # Punctuation is kept here because list elements may legitimately contain it.
        return normalize_str(predicted, remove_punct=False) == normalize_str(
            expected, remove_punct=False
        )

    # Every pair is evaluated (no short-circuit) before the verdicts are combined.
    verdicts = [
        _items_match(predicted, expected)
        for predicted, expected in zip(predicted_items, expected_items)
    ]
    return all(verdicts)

### Evaluate open models

In [None]:
open_model_ids = [
    "meta-llama/Llama-3.3-70B-Instruct",
    # "Qwen/QwQ-32B-Preview",
    "Qwen/Qwen2.5-72B-Instruct",
    "Qwen/Qwen2.5-Coder-32B-Instruct",
    "meta-llama/Llama-3.2-3B-Instruct",
    # "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    # "meta-llama/Llama-3.1-70B-Instruct",
]

for model_id in open_model_ids:
    print(f"Evaluating '{model_id}'...")
    # Each model is benchmarked twice: first as a tool-calling agent, then as a code agent.
    for action_type in ("tool_calling", "code"):
        if action_type == "tool_calling":
            agent = ToolCallingAgent(
                tools=[GoogleSearchTool(), VisitWebpageTool(), PythonInterpreterTool()],
                model=HfApiModel(model_id),
                max_iterations=10,
            )
        else:
            agent = CodeAgent(
                tools=[GoogleSearchTool(), VisitWebpageTool()],
                model=HfApiModel(model_id),
                additional_authorized_imports=["numpy"],
                max_iterations=10,
            )
        # One results file per (model, agent type); "/" is not usable in a file name.
        file_name = f"output/{model_id.replace('/', '_')}-{action_type}-26-dec-2024.jsonl"
        answer_questions(eval_ds, file_name, agent, model_id, action_type)

Evaluating 'meta-llama/Llama-3.3-70B-Instruct'...


  0%|          | 0/142 [00:00<?, ?it/s]

 68%|██████▊   | 96/142 [00:11<00:05,  8.13it/s]



 68%|██████▊   | 97/142 [01:22<00:51,  1.15s/it]

 69%|██████▉   | 98/142 [04:06<03:18,  4.50s/it]

 70%|██████▉   | 99/142 [04:18<03:23,  4.72s/it]

 70%|███████   | 100/142 [07:00<07:35, 10.85s/it]

 71%|███████   | 101/142 [11:23<16:32, 24.20s/it]

 72%|███████▏  | 102/142 [12:23<17:47, 26.69s/it]

 73%|███████▎  | 103/142 [12:26<15:57, 24.55s/it]

 73%|███████▎  | 104/142 [13:17<17:29, 27.63s/it]

 74%|███████▍  | 105/142 [16:48<33:00, 53.53s/it]

 75%|███████▍  | 106/142 [17:43<32:15, 53.76s/it]

 75%|███████▌  | 107/142 [17:57<26:50, 46.02s/it]

 76%|███████▌  | 108/142 [18:10<22:05, 38.98s/it]

 77%|███████▋  | 109/142 [18:18<17:25, 31.67s/it]

 77%|███████▋  | 110/142 [18:25<13:28, 25.27s/it]

 78%|███████▊  | 111/142 [19:02<14:45, 28.57s/it]

 79%|███████▉  | 112/142 [19:05<10:43, 21.44s/it]

 80%|███████▉  | 113/142 [19:30<10:48, 22.38s/it]

 80%|████████  | 114/142 [19:34<08:01, 17.20s/it]

 81%|████████  | 115/142 [19:36<05:47, 12.86s/it]

 82%|████████▏ | 116/142 [19:55<06:18, 14.54s/it]

 82%|████████▏ | 117/142 [20:39<09:43, 23.35s/it]

 83%|████████▎ | 118/142 [20:46<07:21, 18.38s/it]

 84%|████████▍ | 119/142 [20:57<06:15, 16.32s/it]

 85%|████████▍ | 120/142 [21:21<06:49, 18.63s/it]

 85%|████████▌ | 121/142 [21:57<08:14, 23.54s/it]

 86%|████████▌ | 122/142 [23:23<14:09, 42.46s/it]

 87%|████████▋ | 123/142 [23:33<10:19, 32.59s/it]

 87%|████████▋ | 124/142 [24:08<09:58, 33.26s/it]

 88%|████████▊ | 125/142 [24:20<07:36, 26.86s/it]

 89%|████████▊ | 126/142 [27:00<17:51, 66.97s/it]

 89%|████████▉ | 127/142 [27:05<12:04, 48.32s/it]

 90%|█████████ | 128/142 [27:13<08:26, 36.15s/it]

 91%|█████████ | 129/142 [29:42<15:10, 70.07s/it]

 92%|█████████▏| 130/142 [29:54<10:31, 52.61s/it]

 92%|█████████▏| 131/142 [34:28<21:51, 119.22s/it]

 93%|█████████▎| 132/142 [34:30<14:00, 84.05s/it] 

 94%|█████████▎| 133/142 [34:41<09:19, 62.11s/it]

 94%|█████████▍| 134/142 [35:03<06:40, 50.09s/it]

 95%|█████████▌| 135/142 [35:07<04:13, 36.20s/it]

 96%|█████████▌| 136/142 [35:17<02:50, 28.42s/it]

 96%|█████████▋| 137/142 [35:20<01:43, 20.64s/it]

 97%|█████████▋| 138/142 [36:45<02:39, 39.87s/it]

 98%|█████████▊| 139/142 [36:54<01:32, 30.74s/it]

 99%|█████████▊| 140/142 [36:59<00:46, 23.08s/it]

 99%|█████████▉| 141/142 [37:01<00:16, 16.60s/it]

100%|██████████| 142/142 [37:30<00:00, 15.85s/it]
100%|██████████| 142/142 [00:00<00:00, 38018.08it/s]


Evaluating 'Qwen/Qwen2.5-72B-Instruct'...


  0%|          | 0/142 [00:00<?, ?it/s]

  1%|          | 1/142 [01:54<4:28:29, 114.25s/it]

  1%|▏         | 2/142 [02:24<2:30:56, 64.69s/it] 

  2%|▏         | 3/142 [05:10<4:17:28, 111.14s/it]

  3%|▎         | 4/142 [05:37<2:59:19, 77.97s/it] 

  4%|▎         | 5/142 [06:13<2:23:16, 62.75s/it]

  4%|▍         | 6/142 [07:06<2:14:38, 59.40s/it]

  5%|▍         | 7/142 [08:20<2:24:38, 64.28s/it]

  6%|▌         | 8/142 [08:36<1:49:03, 48.84s/it]

  6%|▋         | 9/142 [11:49<3:28:18, 93.98s/it]

  7%|▋         | 10/142 [13:48<3:43:44, 101.70s/it]

  8%|▊         | 11/142 [26:18<10:55:19, 300.15s/it]

  8%|▊         | 12/142 [28:38<9:04:31, 251.32s/it] 

  9%|▉         | 13/142 [30:22<7:24:29, 206.74s/it]

 10%|▉         | 14/142 [31:56<6:08:33, 172.76s/it]

 11%|█         | 15/142 [39:03<8:47:26, 249.18s/it]

 11%|█▏        | 16/142 [51:29<13:57:27, 398.79s/it]

 12%|█▏        | 17/142 [54:49<11:46:09, 338.95s/it]

 13%|█▎        | 18/142 [55:30<8:35:41, 249.53s/it] 

 13%|█▎        | 19/142 [56:13<6:24:01, 187.33s/it]

 14%|█▍        | 20/142 [56:21<4:31:55, 133.73s/it]

 15%|█▍        | 21/142 [57:09<3:37:45, 107.98s/it]

 15%|█▌        | 22/142 [57:33<2:45:39, 82.83s/it] 

 16%|█▌        | 23/142 [57:40<1:58:41, 59.85s/it]

 17%|█▋        | 24/142 [59:50<2:39:25, 81.07s/it]

 18%|█▊        | 25/142 [1:01:33<2:50:58, 87.68s/it]

 18%|█▊        | 26/142 [1:02:19<2:25:03, 75.03s/it]

 19%|█▉        | 27/142 [1:05:10<3:18:50, 103.74s/it]

 20%|█▉        | 28/142 [1:05:48<2:40:02, 84.23s/it] 

 20%|██        | 29/142 [1:05:58<1:56:22, 61.79s/it]

 21%|██        | 30/142 [1:06:12<1:28:27, 47.39s/it]

 22%|██▏       | 31/142 [1:07:49<1:55:23, 62.38s/it]

 23%|██▎       | 32/142 [1:09:18<2:09:03, 70.39s/it]

 23%|██▎       | 33/142 [1:14:15<4:11:10, 138.26s/it]

 24%|██▍       | 34/142 [1:14:54<3:15:20, 108.52s/it]

 25%|██▍       | 35/142 [1:15:56<2:48:46, 94.64s/it] 

 25%|██▌       | 36/142 [1:16:37<2:18:56, 78.64s/it]

 26%|██▌       | 37/142 [1:17:47<2:12:59, 75.99s/it]

 27%|██▋       | 38/142 [1:18:37<1:57:53, 68.01s/it]

 27%|██▋       | 39/142 [1:21:11<2:41:09, 93.88s/it]

 28%|██▊       | 40/142 [1:21:21<1:57:07, 68.90s/it]

 29%|██▉       | 41/142 [1:22:20<1:50:45, 65.79s/it]

 30%|██▉       | 42/142 [1:23:37<1:55:30, 69.31s/it]

 30%|███       | 43/142 [1:23:52<1:27:12, 52.85s/it]

 31%|███       | 44/142 [1:24:38<1:23:01, 50.83s/it]

 32%|███▏      | 45/142 [1:25:25<1:20:17, 49.67s/it]

 32%|███▏      | 46/142 [1:25:50<1:07:26, 42.15s/it]

 33%|███▎      | 47/142 [1:26:36<1:08:51, 43.49s/it]

 34%|███▍      | 48/142 [1:26:55<56:43, 36.21s/it]  

 35%|███▍      | 49/142 [1:27:19<50:26, 32.54s/it]

 35%|███▌      | 50/142 [1:27:39<43:55, 28.64s/it]

 36%|███▌      | 51/142 [1:29:57<1:33:11, 61.44s/it]

 37%|███▋      | 52/142 [1:30:42<1:24:59, 56.66s/it]

 37%|███▋      | 53/142 [1:30:47<1:00:59, 41.12s/it]

 38%|███▊      | 54/142 [1:30:54<45:08, 30.78s/it]  

 39%|███▊      | 55/142 [1:31:01<34:28, 23.78s/it]

 39%|███▉      | 56/142 [1:31:43<41:55, 29.25s/it]

 40%|████      | 57/142 [1:31:53<33:06, 23.37s/it]

 41%|████      | 58/142 [1:35:36<1:56:33, 83.26s/it]

 42%|████▏     | 59/142 [1:36:35<1:44:59, 75.89s/it]

 42%|████▏     | 60/142 [1:36:56<1:21:17, 59.48s/it]

 43%|████▎     | 61/142 [1:37:01<58:05, 43.03s/it]  

 44%|████▎     | 62/142 [1:37:04<41:28, 31.11s/it]

 44%|████▍     | 63/142 [1:37:13<32:19, 24.55s/it]

 45%|████▌     | 64/142 [1:37:15<23:06, 17.78s/it]

 46%|████▌     | 65/142 [1:37:38<24:54, 19.41s/it]

 46%|████▋     | 66/142 [1:37:51<22:08, 17.49s/it]

 47%|████▋     | 67/142 [1:38:01<18:57, 15.17s/it]

 48%|████▊     | 68/142 [1:38:07<15:27, 12.54s/it]

 49%|████▊     | 69/142 [1:38:09<11:18,  9.30s/it]

 49%|████▉     | 70/142 [1:38:11<08:19,  6.94s/it]

 50%|█████     | 71/142 [1:38:37<14:56, 12.63s/it]

 51%|█████     | 72/142 [1:38:54<16:23, 14.05s/it]

 51%|█████▏    | 73/142 [1:38:56<11:55, 10.37s/it]

 52%|█████▏    | 74/142 [1:39:09<12:42, 11.22s/it]

 53%|█████▎    | 75/142 [1:39:23<13:31, 12.12s/it]

 54%|█████▎    | 76/142 [1:39:28<11:01, 10.03s/it]

 54%|█████▍    | 77/142 [1:39:34<09:20,  8.62s/it]

 55%|█████▍    | 78/142 [1:39:42<09:08,  8.56s/it]

 56%|█████▌    | 79/142 [1:39:52<09:18,  8.87s/it]

 56%|█████▋    | 80/142 [1:40:21<15:30, 15.00s/it]

 57%|█████▋    | 81/142 [1:40:26<12:09, 11.95s/it]

 58%|█████▊    | 82/142 [1:41:52<34:19, 34.33s/it]

 58%|█████▊    | 83/142 [1:42:12<29:22, 29.88s/it]

 59%|█████▉    | 84/142 [1:42:16<21:19, 22.06s/it]

 60%|█████▉    | 85/142 [1:42:23<16:54, 17.80s/it]

 61%|██████    | 86/142 [1:42:28<13:00, 13.94s/it]

 61%|██████▏   | 87/142 [1:42:44<13:08, 14.34s/it]

 62%|██████▏   | 88/142 [1:43:04<14:23, 16.00s/it]

 63%|██████▎   | 89/142 [1:43:09<11:12, 12.69s/it]

 63%|██████▎   | 90/142 [1:43:41<16:13, 18.72s/it]

 64%|██████▍   | 91/142 [1:44:21<21:20, 25.11s/it]

 65%|██████▍   | 92/142 [1:44:25<15:31, 18.64s/it]

 65%|██████▌   | 93/142 [1:47:31<56:20, 69.00s/it]

 66%|██████▌   | 94/142 [1:49:55<1:13:06, 91.39s/it]

 67%|██████▋   | 95/142 [1:52:45<1:29:57, 114.83s/it]

 68%|██████▊   | 96/142 [1:53:43<1:15:03, 97.90s/it] 

 68%|██████▊   | 97/142 [1:55:05<1:09:58, 93.30s/it]

 69%|██████▉   | 98/142 [1:55:43<56:02, 76.42s/it]  

 70%|██████▉   | 99/142 [1:55:51<40:04, 55.91s/it]

 70%|███████   | 100/142 [1:55:56<28:36, 40.87s/it]

 71%|███████   | 101/142 [1:56:24<25:13, 36.91s/it]

 72%|███████▏  | 102/142 [1:57:48<34:02, 51.06s/it]

 73%|███████▎  | 103/142 [1:57:58<25:11, 38.75s/it]

 73%|███████▎  | 104/142 [1:58:15<20:27, 32.32s/it]

 74%|███████▍  | 105/142 [1:58:21<14:57, 24.26s/it]

 75%|███████▍  | 106/142 [1:58:53<16:00, 26.69s/it]

 75%|███████▌  | 107/142 [1:59:08<13:32, 23.21s/it]

 76%|███████▌  | 108/142 [1:59:15<10:22, 18.32s/it]

 77%|███████▋  | 109/142 [1:59:21<08:00, 14.55s/it]

 77%|███████▋  | 110/142 [1:59:25<06:08, 11.51s/it]

 78%|███████▊  | 111/142 [1:59:38<06:03, 11.73s/it]

 79%|███████▉  | 112/142 [1:59:42<04:44,  9.50s/it]

 80%|███████▉  | 113/142 [1:59:58<05:36, 11.60s/it]

 80%|████████  | 114/142 [2:00:11<05:31, 11.84s/it]

 81%|████████  | 115/142 [2:00:19<04:47, 10.65s/it]

 82%|████████▏ | 116/142 [2:00:33<05:04, 11.73s/it]

 82%|████████▏ | 117/142 [2:01:05<07:26, 17.88s/it]

 83%|████████▎ | 118/142 [2:01:11<05:39, 14.13s/it]

 84%|████████▍ | 119/142 [2:01:14<04:14, 11.05s/it]

 85%|████████▍ | 120/142 [2:01:18<03:13,  8.81s/it]

 85%|████████▌ | 121/142 [2:01:21<02:26,  6.98s/it]

 86%|████████▌ | 122/142 [2:01:56<05:10, 15.53s/it]

 87%|████████▋ | 123/142 [2:02:00<03:45, 11.88s/it]

 87%|████████▋ | 124/142 [2:02:02<02:40,  8.93s/it]

 88%|████████▊ | 125/142 [2:02:30<04:09, 14.68s/it]

 89%|████████▊ | 126/142 [2:02:42<03:43, 13.98s/it]

 89%|████████▉ | 127/142 [2:04:47<11:50, 47.39s/it]

 90%|█████████ | 128/142 [2:05:00<08:37, 36.93s/it]

 91%|█████████ | 129/142 [2:05:34<07:49, 36.10s/it]

 92%|█████████▏| 130/142 [2:05:53<06:11, 30.94s/it]

 92%|█████████▏| 131/142 [2:06:14<05:08, 28.05s/it]

 93%|█████████▎| 132/142 [2:06:37<04:23, 26.36s/it]

### Evaluate closed models

In [4]:
from smolagents import LiteLLMModel

litellm_model_ids = ["gpt-4o", "anthropic/claude-3-5-sonnet-latest"]

for model_id in litellm_model_ids:
    print(f"Evaluating '{model_id}'...")
    # Each model is benchmarked twice: first as a tool-calling agent, then as a code agent.
    for action_type in ("tool_calling", "code"):
        if action_type == "tool_calling":
            agent = ToolCallingAgent(
                tools=[GoogleSearchTool(), VisitWebpageTool(), PythonInterpreterTool()],
                model=LiteLLMModel(model_id),
                max_iterations=10,
            )
        else:
            agent = CodeAgent(
                tools=[GoogleSearchTool(), VisitWebpageTool()],
                model=LiteLLMModel(model_id),
                additional_authorized_imports=["numpy"],
                max_iterations=10,
            )
        # One results file per (model, agent type); "/" is not usable in a file name.
        file_name = f"output/{model_id.replace('/', '_')}-{action_type}-26-dec-2024.jsonl"
        answer_questions(eval_ds, file_name, agent, model_id, action_type)

Evaluating 'gpt-4o'...


  0%|          | 0/142 [00:00<?, ?it/s]

 63%|██████▎   | 89/142 [00:06<00:03, 14.23it/s]

Failed: closing tag '[/`qa�|�k�8�\*����]' at position 66529 doesn't match any open tag


 63%|██████▎   | 89/142 [00:19<00:03, 14.23it/s]

100%|██████████| 142/142 [00:39<00:00,  3.62it/s]
100%|██████████| 142/142 [00:00<00:00, 38161.80it/s]


Evaluating 'anthropic/claude-3-5-sonnet-latest'...


100%|██████████| 142/142 [00:00<00:00, 34475.06it/s]
100%|██████████| 142/142 [00:00<00:00, 35696.20it/s]


In [3]:
# import glob
# import json
# jsonl_files = glob.glob(f"output/*.jsonl")

# for file_path in jsonl_files:
#     print(file_path)
#     # Read all lines and filter out SimpleQA sources
#     filtered_lines = []
#     removed = 0
#     with open(file_path, 'r', encoding='utf-8') as f:
#         for line in f:
#             try:
#                 data = json.loads(line.strip())
#                 if data["source"] == "SimpleQA" and "Answer with only the final number." not in data["question"]:
#                     removed +=1
#                 else:
#                     filtered_lines.append(line)
#             except json.JSONDecodeError:
#                 print("Invalid line:", line)
#                 continue  # Skip invalid JSON lines
#     print(f"Removed {removed} lines.")
#     # Write filtered content back to the same file
#     with open(file_path, 'w', encoding='utf-8') as f:
#         f.writelines(filtered_lines)

In [None]:
import pandas as pd
import glob

# Read every JSONL results file under output/ and stack them into a single frame.
res = [pd.read_json(path, lines=True) for path in glob.glob("output/*.jsonl")]
result_df = pd.concat(res)


def get_correct(row):
    """Decide whether the answer stored in a result row is correct.

    Args:
        row: Mapping-like record with the keys "source", "answer" and "true_answer".

    Returns:
        For "GSM8K" rows, whether the last number found in the answer equals the
        true answer (False, with a printed notice, when no number is found).
        For every other source, the verdict of `get_question_score_gaia`.
    """
    if row["source"] != "GSM8K":
        return get_question_score_gaia(str(row["answer"]), str(row["true_answer"]))

    numbers_answer = extract_numbers(str(row["answer"]))
    if not numbers_answer:
        print(f"No number found in {row['answer']}")
        return False
    # The last number mentioned is taken as the final answer.
    return float(numbers_answer[-1]) == float(row["true_answer"])


# Grade every recorded answer.
result_df["correct"] = result_df.apply(get_correct, axis=1)

# Keep only code-agent runs, leaving out the models excluded from the report.
excluded_model_ids = [
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.1-70B-Instruct",
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
]
is_code_agent = result_df["agent_action_type"] == "code"
is_reported_model = ~result_df["model_id"].isin(excluded_model_ids)
result_df = result_df.loc[is_code_agent & is_reported_model]

# Accuracy in percent per (model, benchmark source), rounded to one decimal.
accuracy_pct = result_df.groupby(["model_id", "source"])[["correct"]].mean() * 100
result_df = accuracy_pct.round(1).reset_index()
result_df["type"] = "agent"
display(result_df)

String Based on the information available from various sources particularly from the search results we can determine the information needed to answer the user's question.

From the results we find a specific mention in the article titled "The Evolution of Women's Participation in Computer Science" by the University of Pennsylvania which states:
> "computer science bachelor's degree recipients has fluctuated during the past four decades from a low of 13.6 (in 1971) to a high of 37.1 (in 1984) to a low of 18 (in 2007)."

Similarly the article "Chart of the Day: The Declining Female Share of Computer Science Degrees from 28 to 18" from the American Enterprise Institute confirms the data:
> "The female share of computer science bachelor's degrees actually peaked at 37.1 in 1984 before going into a steady decline for about the next quarter century."

To answer the user's query about how long it took for the percentage of computer scientists that were women to change by 13 from a starting po



Unnamed: 0,model_id,source,correct,type
0,Qwen/Qwen2.5-72B-Instruct,GAIA,12.5,agent
1,Qwen/Qwen2.5-72B-Instruct,GSM8K,82.9,agent
2,Qwen/Qwen2.5-72B-Instruct,SimpleQA,42.5,agent
3,Qwen/Qwen2.5-Coder-32B-Instruct,GAIA,28.1,agent
4,Qwen/Qwen2.5-Coder-32B-Instruct,GSM8K,92.9,agent
5,Qwen/Qwen2.5-Coder-32B-Instruct,SimpleQA,42.5,agent
6,anthropic/claude-3-5-sonnet-latest,GAIA,43.8,agent
7,anthropic/claude-3-5-sonnet-latest,GSM8K,91.4,agent
8,anthropic/claude-3-5-sonnet-latest,SimpleQA,47.5,agent
9,gpt-4o,GAIA,25.0,agent


In [27]:
# Reference scores of the models used without an agent, as [model_id, source, accuracy in %]
# (hard-coded here; their origin is not shown in this notebook).
vanilla_data = [
    ["gpt-4o", "SimpleQA", 38.2],
    ["gpt-4o", "GAIA", 9.3],
    ["Qwen/Qwen2.5-72B-Instruct", "SimpleQA", 9.1],
    ["anthropic/claude-3-5-sonnet-latest", "SimpleQA", 28.4],
    ["gpt-4o", "GSM8K", 94.3],
    ["anthropic/claude-3-5-sonnet-latest", "GSM8K", 96.4],
    ["meta-llama/Llama-3.3-70B-Instruct", "GSM8K", 95.1],
]

# Tag the baseline rows so they can be told apart from the agent rows once merged.
df2 = pd.DataFrame(
    vanilla_data, columns=["model_id", "source", "correct"]
).assign(type="vanilla")

combined_df = pd.concat([result_df, df2], ignore_index=True)

# One row per (model, source), one score column per run type; missing scores stay NaN.
pivot_df = combined_df.pivot_table(
    values="correct",
    index=["model_id", "source"],
    columns=["type"],
    fill_value=float("nan"),
)
pivot_df = pivot_df.reset_index()

In [29]:
def create_mathjax_table(pivot_df, formatted_df):
    """Render the scores as a MathJax `array` with one row per (model, task).

    Args:
        pivot_df: Not used by the function; kept so existing calls keep working.
        formatted_df: DataFrame with the columns "model_id", "source", "agent"
            and "vanilla"; the last two are inserted into the table as-is.

    Returns:
        The MathJax source of the table as a single string.
    """
    # Header: two left-aligned text columns (model, task) and two centered score columns.
    lines = [
        "\\begin{array}{llcc}",
        "\\text{Model} & \\text{Task} & \\text{Agent} & \\text{Vanilla} \\\\",
        "\\hline",
    ]

    ordered_rows = formatted_df.sort_values(["model_id", "source"])

    previous_model = None
    for _, record in ordered_rows.iterrows():
        model_name = record["model_id"]
        task = record["source"]

        if model_name == previous_model:
            # Same model as the row above: leave the model cell visually empty.
            first_cell = "\\;"
        else:
            # A new model starts: separate it from the previous group with a rule.
            if previous_model is not None:
                lines.append("\\hline")
            first_cell = model_name.replace("_", "\\_")
            if "Qwen" in model_name or "anthropic" in model_name:
                first_cell = f"\\textit{{{first_cell}}}"

        lines.append(
            f"{first_cell} & {task} & {record['agent']} & {record['vanilla']} \\\\"
        )
        previous_model = model_name

    lines += ["\\hline", "\\end{array}"]
    return "\n".join(lines)


def format_scores(pivot_df):
    """Turn the numeric "agent"/"vanilla" scores of `pivot_df` into display strings.

    Args:
        pivot_df: DataFrame with numeric "agent" and "vanilla" columns
            (NaN where a score is missing).

    Returns:
        A copy of `pivot_df` whose "agent" and "vanilla" columns hold strings:
        scores with three decimals, "-" for a missing score, and the strictly
        larger of the two wrapped in `\\textbf{...}` when both are present.
    """

    def _render(score, other):
        if pd.isna(score):
            return "-"
        text = f"{score:.3f}"
        # Only highlight a winner when both settings have a score to compare.
        if pd.notna(other) and score > other:
            return f"\\textbf{{{text}}}"
        return text

    formatted = pivot_df.copy()
    score_pairs = list(zip(pivot_df["agent"], pivot_df["vanilla"]))
    formatted["agent"] = [_render(agent, vanilla) for agent, vanilla in score_pairs]
    formatted["vanilla"] = [_render(vanilla, agent) for agent, vanilla in score_pairs]
    return formatted


# Usage (after running the data processing cells above).
# `formatted_df` is not produced by any earlier cell, so it is derived from
# `pivot_df` here to keep this cell runnable on a fresh kernel.
formatted_df = format_scores(pivot_df)
mathjax_table = create_mathjax_table(pivot_df, formatted_df)
print(mathjax_table)

\begin{array}{llcc}
\text{Model} & \text{Task} & \text{Agent} & \text{Vanilla} \\
\hline
\textit{Qwen/Qwen2.5-72B-Instruct} & GAIA & 12.500 & - \\
\; & GSM8K & 82.900 & - \\
\; & SimpleQA & \textbf{42.500} & 9.100 \\
\hline
\textit{Qwen/Qwen2.5-Coder-32B-Instruct} & GAIA & 28.100 & - \\
\; & GSM8K & 92.900 & - \\
\; & SimpleQA & 42.500 & - \\
\hline
\textit{anthropic/claude-3-5-sonnet-latest} & GAIA & 43.800 & - \\
\; & GSM8K & 91.400 & \textbf{96.400} \\
\; & SimpleQA & \textbf{47.500} & 28.400 \\
\hline
gpt-4o & GAIA & \textbf{25.000} & 9.300 \\
\; & GSM8K & 91.400 & \textbf{94.300} \\
\; & SimpleQA & \textbf{60.000} & 38.200 \\
\hline
meta-llama/Llama-3.3-70B-Instruct & GAIA & 21.900 & - \\
\; & GSM8K & \textbf{95.700} & 95.100 \\
\; & SimpleQA & 30.000 & - \\
\hline
\end{array}
