{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" ] } ], "source": [ "%pip install -e .. sympy numpy matplotlib seaborn -q # Install dev version of smolagents + some packages" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using the latest cached version of the dataset since m-ric/smolagentsbenchmark couldn't be found on the Hugging Face Hub\n", "Found the latest cached dataset configuration 'default' at /Users/aymeric/.cache/huggingface/datasets/m-ric___smolagentsbenchmark/default/0.0.0/0ad5fb2293ab185eece723a4ac0e4a7188f71add (last modified on Wed Jan 8 17:50:13 2025).\n" ] }, { "data": { "text/html": [ "
\n", " | question | \n", "source | \n", "true_answer | \n", "true_reasoning | \n", "
---|---|---|---|---|
0 | \n", "If Eliud Kipchoge could maintain his record-ma... | \n", "GAIA | \n", "17 | \n", "None | \n", "
1 | \n", "How many studio albums were published by Merce... | \n", "GAIA | \n", "3 | \n", "None | \n", "
2 | \n", "Here's a fun riddle that I think you'll enjoy.... | \n", "GAIA | \n", "3 | \n", "None | \n", "
3 | \n", "My family reunion is this week, and I was assi... | \n", "GAIA | \n", "2 | \n", "None | \n", "
4 | \n", "In Emily Midkiff's June 2014 article in a jour... | \n", "GAIA | \n", "fluffy | \n", "None | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
127 | \n", "What year was the municipality of San Carlos, ... | \n", "SimpleQA | \n", "1786 | \n", "['https://en.wikipedia.org/wiki/San_Carlos,_An... | \n", "
128 | \n", "In which year was Maria Elena Walsh named Illu... | \n", "SimpleQA | \n", "1985 | \n", "['https://en.wikipedia.org/wiki/Mar%C3%ADa_Ele... | \n", "
129 | \n", "What is the durability of the Istarelle spear ... | \n", "SimpleQA | \n", "800 | \n", "['http://demonssouls.wikidot.com/spear', 'http... | \n", "
130 | \n", "What is the number of the executive order that... | \n", "SimpleQA | \n", "7034 | \n", "['https://www.loc.gov/collections/federal-thea... | \n", "
131 | \n", "Within plus or minus one minute, when was Marq... | \n", "SimpleQA | \n", "77 | \n", "['https://www.fifa.com/fifaplus/en/match-centr... | \n", "
132 rows × 4 columns
\n", "action_type | \n", "model_id | \n", "source | \n", "code | \n", "vanilla | \n", "
---|---|---|---|---|
0 | \n", "Qwen/Qwen2.5-72B-Instruct | \n", "GAIA | \n", "28.1 | \n", "6.2 | \n", "
1 | \n", "Qwen/Qwen2.5-72B-Instruct | \n", "MATH | \n", "74.0 | \n", "31.9 | \n", "
2 | \n", "Qwen/Qwen2.5-72B-Instruct | \n", "SimpleQA | \n", "70.0 | \n", "10.0 | \n", "
3 | \n", "Qwen/Qwen2.5-Coder-32B-Instruct | \n", "GAIA | \n", "18.8 | \n", "3.1 | \n", "
4 | \n", "Qwen/Qwen2.5-Coder-32B-Instruct | \n", "MATH | \n", "76.0 | \n", "60.0 | \n", "
5 | \n", "Qwen/Qwen2.5-Coder-32B-Instruct | \n", "SimpleQA | \n", "86.0 | \n", "8.0 | \n", "
6 | \n", "anthropic/claude-3-5-sonnet-latest | \n", "GAIA | \n", "40.6 | \n", "3.1 | \n", "
7 | \n", "anthropic/claude-3-5-sonnet-latest | \n", "MATH | \n", "67.0 | \n", "50.0 | \n", "
8 | \n", "anthropic/claude-3-5-sonnet-latest | \n", "SimpleQA | \n", "90.0 | \n", "34.0 | \n", "
9 | \n", "gpt-4o | \n", "GAIA | \n", "28.1 | \n", "3.1 | \n", "
10 | \n", "gpt-4o | \n", "MATH | \n", "70.0 | \n", "40.0 | \n", "
11 | \n", "gpt-4o | \n", "SimpleQA | \n", "88.0 | \n", "6.0 | \n", "
12 | \n", "meta-llama/Llama-3.1-8B-Instruct | \n", "GAIA | \n", "0.0 | \n", "0.0 | \n", "
13 | \n", "meta-llama/Llama-3.1-8B-Instruct | \n", "MATH | \n", "42.0 | \n", "18.0 | \n", "
14 | \n", "meta-llama/Llama-3.1-8B-Instruct | \n", "SimpleQA | \n", "54.0 | \n", "6.0 | \n", "
15 | \n", "meta-llama/Llama-3.2-3B-Instruct | \n", "GAIA | \n", "3.1 | \n", "0.0 | \n", "
16 | \n", "meta-llama/Llama-3.2-3B-Instruct | \n", "MATH | \n", "32.0 | \n", "12.0 | \n", "
17 | \n", "meta-llama/Llama-3.2-3B-Instruct | \n", "SimpleQA | \n", "4.0 | \n", "0.0 | \n", "
18 | \n", "meta-llama/Llama-3.3-70B-Instruct | \n", "GAIA | \n", "34.4 | \n", "3.1 | \n", "
19 | \n", "meta-llama/Llama-3.3-70B-Instruct | \n", "MATH | \n", "82.0 | \n", "40.0 | \n", "
20 | \n", "meta-llama/Llama-3.3-70B-Instruct | \n", "SimpleQA | \n", "84.0 | \n", "12.0 | \n", "