smolagents/examples/open_deep_research/visual_vs_text_browser.ipynb

351 lines
10 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"%pip install \"smolagents[litellm]\" -q"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import datasets\n",
"\n",
"\n",
"# Load the GAIA \"2023_all\" configuration and keep its validation split.\n",
"gaia_splits = datasets.load_dataset(\"gaia-benchmark/GAIA\", \"2023_all\")\n",
"eval_ds = gaia_splits[\"validation\"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"to_keep = [\n",
" \"What's the last line of the rhyme under the flavor\",\n",
" 'Of the authors (First M. Last) that worked on the paper \"Pie Menus or Linear Menus',\n",
" \"In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? Give the setting exactly as it appears in the first scene heading.\",\n",
" \"Which contributor to the version of OpenCV where support was added for the Mask-RCNN model has the same name as a former Chinese head of government when the names are transliterated to the Latin alphabet?\",\n",
" \"The photograph in the Whitney Museum of American Art's collection with accession number 2022.128 shows a person holding a book. Which military unit did the author of this book join in 1813? Answer without using articles.\",\n",
" \"I went to Virtue restaurant & bar in Chicago for my birthday on March 22, 2021 and the main course I had was delicious! Unfortunately, when I went back about a month later on April 21, it was no longer on the dinner menu.\",\n",
" \"In Emily Midkiff's June 2014 article in a journal named for the one of Hreidmar's \",\n",
" \"Under DDC 633 on Bielefeld University Library's BASE, as of 2020\",\n",
" \"In the 2018 VSCode blog post on replit.com, what was the command they clicked on in the last video to remove extra lines?\",\n",
" \"The Metropolitan Museum of Art has a portrait in its collection with an accession number of 29.100.5. Of the consecrators and co-consecrators\",\n",
" \"In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied?\",\n",
" 'In the year 2022, and before December, what does \"R\" stand for in the three core policies of the type of content',\n",
" \"Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?\",\n",
"]\n",
"# Keep only the rows whose question contains one of the `to_keep` snippets.\n",
"# A generator (instead of a list) lets any() stop at the first matching snippet.\n",
"eval_ds = eval_ds.filter(lambda row: any(snippet in row[\"Question\"] for snippet in to_keep))\n",
"# Normalize the column names; the scoring cell later reads `true_answer`.\n",
"eval_ds = eval_ds.rename_columns({\"Question\": \"question\", \"Final answer\": \"true_answer\", \"Level\": \"task\"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from dotenv import load_dotenv\n",
"from huggingface_hub import login\n",
"\n",
"\n",
"# Load variables from a .env file, overriding values already set in the environment.\n",
"load_dotenv(override=True)\n",
"\n",
"# Authenticate against the Hugging Face Hub with the token read from the environment.\n",
"hf_token = os.getenv(\"HF_TOKEN\")\n",
"login(hf_token)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Text browser"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from scripts.run_agents import answer_questions\n",
"from scripts.text_inspector_tool import TextInspectorTool\n",
"from scripts.text_web_browser import (\n",
" ArchiveSearchTool,\n",
" FinderTool,\n",
" FindNextTool,\n",
" NavigationalSearchTool,\n",
" PageDownTool,\n",
" PageUpTool,\n",
" SearchInformationTool,\n",
" VisitTool,\n",
")\n",
"from scripts.visual_qa import VisualQAGPT4Tool\n",
"\n",
"from smolagents import CodeAgent, LiteLLMModel\n",
"\n",
"\n",
"# Model identifier handed to LiteLLM for the agents and helper tools built from this model.\n",
"MODEL_ID = \"gpt-4o\"\n",
"proprietary_model = LiteLLMModel(MODEL_ID)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### BUILD AGENTS & TOOLS\n",
"\n",
"# One instance of each text-browser tool, created in the listed order.\n",
"WEB_TOOLS = [\n",
"    tool_class()\n",
"    for tool_class in (\n",
"        SearchInformationTool,\n",
"        NavigationalSearchTool,\n",
"        VisitTool,\n",
"        PageUpTool,\n",
"        PageDownTool,\n",
"        FinderTool,\n",
"        FindNextTool,\n",
"        ArchiveSearchTool,\n",
"    )\n",
"]\n",
"\n",
"\n",
"# Code-writing agent that browses through the text-based tools above.\n",
"surfer_agent = CodeAgent(\n",
"    tools=WEB_TOOLS,\n",
"    model=proprietary_model,\n",
"    verbosity_level=2,\n",
"    max_steps=20,\n",
")\n",
"\n",
"# Keyword options for the evaluation run over `eval_ds`.\n",
"text_run_options = {\n",
"    \"reformulation_model\": proprietary_model,\n",
"    \"output_folder\": \"output_browsers\",\n",
"    \"visual_inspection_tool\": VisualQAGPT4Tool(),\n",
"    \"text_inspector_tool\": TextInspectorTool(proprietary_model, 40000),\n",
"}\n",
"results_text = answer_questions(eval_ds, surfer_agent, \"code_gpt4o_27-01_text\", **text_run_options)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Vision browser"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"%pip install helium -q"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from scripts.visual_qa import VisualQAGPT4Tool\n",
"\n",
"from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel\n",
"from smolagents.vision_web_browser import (\n",
" close_popups,\n",
" go_back,\n",
" helium_instructions,\n",
" initialize_agent,\n",
" save_screenshot,\n",
" search_item_ctrl_f,\n",
")\n",
"\n",
"\n",
"proprietary_model = LiteLLMModel(\"gpt-4o\")\n",
"\n",
"### BUILD AGENTS & TOOLS\n",
"\n",
"# The agent returned by `initialize_agent` is the one evaluated below.\n",
"# NOTE(review): if the evaluated agent should also carry `DuckDuckGoSearchTool`, build that\n",
"# `CodeAgent` explicitly and bind it to `vision_browser_agent`; an unbound `CodeAgent(...)`\n",
"# expression is discarded immediately and has no effect on this run.\n",
"vision_browser_agent = initialize_agent(proprietary_model)\n",
"\n",
"# Evaluate the vision agent on `eval_ds`; the post-prompt appends a PDF-handling hint\n",
"# to the helium instructions.\n",
"results_vision = answer_questions(\n",
"    eval_ds,\n",
"    vision_browser_agent,\n",
"    \"code_gpt4o_27-01_vision\",\n",
"    reformulation_model=proprietary_model,\n",
"    output_folder=\"output_browsers\",\n",
"    visual_inspection_tool=VisualQAGPT4Tool(),\n",
"    text_inspector_tool=TextInspectorTool(proprietary_model, 40000),\n",
"    postprompt=helium_instructions\n",
"    + \"Any web browser controls won't work on .pdf urls, rather use the tool 'inspect_file_as_text' to read them\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Browser-use browser"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"%pip install browser-use lxml_html_clean -q\n",
"!playwright install"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import asyncio\n",
"\n",
"import nest_asyncio\n",
"\n",
"\n",
"nest_asyncio.apply()\n",
"\n",
"from browser_use import Agent\n",
"from dotenv import load_dotenv\n",
"from langchain_openai import ChatOpenAI\n",
"\n",
"\n",
"load_dotenv()\n",
"\n",
"\n",
"class BrowserUseAgent:\n",
"    \"\"\"Thin wrapper that runs a browser-use `Agent` behind a `run()` method.\n",
"\n",
"    Besides `run`, it exposes `logs` and `write_inner_memory_from_logs`; presumably these\n",
"    mirror what `answer_questions` expects from an agent (confirm in scripts.run_agents).\n",
"    \"\"\"\n",
"\n",
"    def __init__(self):\n",
"        # Per-instance list: a class-level `logs = []` would be a single list shared by all instances.\n",
"        self.logs = []\n",
"        # Result of the most recent run; None until `run` has been called, so that\n",
"        # `write_inner_memory_from_logs` never hits a missing attribute.\n",
"        self.results = None\n",
"\n",
"    def write_inner_memory_from_logs(self, summary_mode):\n",
"        \"\"\"Return the raw result of the last `run` call (None before the first run).\n",
"\n",
"        `summary_mode` is accepted but not used.\n",
"        \"\"\"\n",
"        return self.results\n",
"\n",
"    def run(self, task, **kwargs):\n",
"        \"\"\"Run a new browser-use `Agent` on `task`.\n",
"\n",
"        Returns the `extracted_content` of the first result in the last history entry.\n",
"        Extra keyword arguments are accepted and ignored.\n",
"        \"\"\"\n",
"        agent = Agent(\n",
"            task=task,\n",
"            llm=ChatOpenAI(model=\"gpt-4o\"),\n",
"        )\n",
"        # Drive the coroutine to completion on the current event loop.\n",
"        self.results = asyncio.get_event_loop().run_until_complete(agent.run())\n",
"        return self.results.history[-1].result[0].extracted_content\n",
"\n",
"\n",
"browser_use_agent = BrowserUseAgent()\n",
"\n",
"# Keyword options for the evaluation run over `eval_ds`.\n",
"browseruse_run_options = {\n",
"    \"reformulation_model\": proprietary_model,\n",
"    \"output_folder\": \"output_browsers\",\n",
"    \"visual_inspection_tool\": VisualQAGPT4Tool(),\n",
"    \"text_inspector_tool\": TextInspectorTool(proprietary_model, 40000),\n",
"    \"postprompt\": \"\",\n",
"    \"run_simple\": True,\n",
"}\n",
"results_browseruse = answer_questions(\n",
"    eval_ds, browser_use_agent, \"gpt-4o_27-01_browseruse\", **browseruse_run_options\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from scripts.gaia_scorer import question_scorer\n",
"\n",
"\n",
"def _with_correctness(run_results):\n",
"    \"\"\"Return `run_results` as a DataFrame with an added per-row `is_correct` column.\n",
"\n",
"    Each row's `prediction` is scored against its `true_answer` with `question_scorer`.\n",
"    \"\"\"\n",
"    frame = pd.DataFrame(run_results)\n",
"    frame[\"is_correct\"] = frame.apply(\n",
"        lambda row: question_scorer(row[\"prediction\"], row[\"true_answer\"]), axis=1\n",
"    )\n",
"    return frame\n",
"\n",
"\n",
"results_vision = _with_correctness(results_vision)\n",
"results_text = _with_correctness(results_text)\n",
"results_browseruse = _with_correctness(results_browseruse)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Stack the three runs, then show the mean of `is_correct` per agent.\n",
"all_runs = [results_vision, results_text, results_browseruse]\n",
"results = pd.concat(all_runs)\n",
"accuracy_by_agent = results.groupby(\"agent_name\")[\"is_correct\"].mean()\n",
"accuracy_by_agent"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Rows of the vision run that were scored as correct.\n",
"vision_correct_mask = results_vision[\"is_correct\"]\n",
"correct_vision_results = results_vision.loc[vision_correct_mask]\n",
"correct_vision_results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Rows of the text run that were scored as incorrect.\n",
"text_incorrect_mask = ~results_text[\"is_correct\"]\n",
"false_text_results = results_text.loc[text_incorrect_mask]\n",
"false_text_results"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "gaia",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}