diff --git a/examples/benchmark.ipynb b/examples/benchmark.ipynb index 065adce..bd3e11a 100644 --- a/examples/benchmark.ipynb +++ b/examples/benchmark.ipynb @@ -16,190 +16,43 @@ } ], "source": [ - "!pip install -e .. datasets sympy numpy matplotlib seaborn -q # Install dev version of smolagents + some packages" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/aymeric/venv/test/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - " | question | \n", - "source | \n", - "true_answer | \n", - "true_reasoning | \n", - "
---|---|---|---|---|
0 | \n", - "If Eliud Kipchoge could maintain his record-ma... | \n", - "GAIA | \n", - "17 | \n", - "None | \n", - "
1 | \n", - "How many studio albums were published by Merce... | \n", - "GAIA | \n", - "3 | \n", - "None | \n", - "
2 | \n", - "Here's a fun riddle that I think you'll enjoy.... | \n", - "GAIA | \n", - "3 | \n", - "None | \n", - "
3 | \n", - "My family reunion is this week, and I was assi... | \n", - "GAIA | \n", - "2 | \n", - "None | \n", - "
4 | \n", - "In Emily Midkiff's June 2014 article in a jour... | \n", - "GAIA | \n", - "fluffy | \n", - "None | \n", - "
... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "
127 | \n", - "What year was the municipality of San Carlos, ... | \n", - "SimpleQA | \n", - "1786 | \n", - "['https://en.wikipedia.org/wiki/San_Carlos,_An... | \n", - "
128 | \n", - "In which year was Maria Elena Walsh named Illu... | \n", - "SimpleQA | \n", - "1985 | \n", - "['https://en.wikipedia.org/wiki/Mar%C3%ADa_Ele... | \n", - "
129 | \n", - "What is the durability of the Istarelle spear ... | \n", - "SimpleQA | \n", - "800 | \n", - "['http://demonssouls.wikidot.com/spear', 'http... | \n", - "
130 | \n", - "What is the number of the executive order that... | \n", - "SimpleQA | \n", - "7034 | \n", - "['https://www.loc.gov/collections/federal-thea... | \n", - "
131 | \n", - "Within plus or minus one minute, when was Marq... | \n", - "SimpleQA | \n", - "77 | \n", - "['https://www.fifa.com/fifaplus/en/match-centr... | \n", - "
132 rows × 4 columns
\n", - "\n", + " | question | \n", + "source | \n", + "true_answer | \n", + "true_reasoning | \n", + "
---|---|---|---|---|
0 | \n", + "What year was the municipality of Ramiriquí, B... | \n", + "SimpleQA | \n", + "1541 | \n", + "['https://en.wikipedia.org/wiki/Ramiriqu%C3%AD... | \n", + "
1 | \n", + "In what year did Hjalmar Hvam invent a mechani... | \n", + "SimpleQA | \n", + "1937 | \n", + "['https://www.kgw.com/article/features/portlan... | \n", + "
2 | \n", + "In which year did Fayaz A. Malik (an Indian ph... | \n", + "SimpleQA | \n", + "2009 | \n", + "['https://en.wikipedia.org/wiki/Fayaz_A._Malik... | \n", + "
3 | \n", + "In which year was John B. Goodenough elected a... | \n", + "SimpleQA | \n", + "2010 | \n", + "['https://en.wikipedia.org/wiki/John_B._Gooden... | \n", + "
4 | \n", + "In which year did Atul Gawande earn an M.A. in... | \n", + "SimpleQA | \n", + "1989 | \n", + "['https://en.wikipedia.org/wiki/Atul_Gawande',... | \n", + "
\n", + " | model_id | \n", + "agent_action_type | \n", + "source | \n", + "acc | \n", + "
---|---|---|---|---|
0 | \n", + "Qwen/Qwen2.5-72B-Instruct | \n", + "code | \n", + "GAIA | \n", + "28.12 | \n", + "
1 | \n", + "Qwen/Qwen2.5-72B-Instruct | \n", + "code | \n", + "MATH | \n", + "76.00 | \n", + "
2 | \n", + "Qwen/Qwen2.5-72B-Instruct | \n", + "code | \n", + "SimpleQA | \n", + "88.00 | \n", + "
3 | \n", + "Qwen/Qwen2.5-72B-Instruct | \n", + "vanilla | \n", + "GAIA | \n", + "6.25 | \n", + "
4 | \n", + "Qwen/Qwen2.5-72B-Instruct | \n", + "vanilla | \n", + "MATH | \n", + "30.00 | \n", + "