smolagents/examples/open_deep_research/visual_vs_text_browser.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install \"smolagents[litellm]\" -q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datasets\n",
    "\n",
    "\n",
    "eval_ds = datasets.load_dataset(\"gaia-benchmark/GAIA\", \"2023_all\")[\"validation\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "to_keep = [\n",
    "    \"What's the last line of the rhyme under the flavor\",\n",
    "    'Of the authors (First M. Last) that worked on the paper \"Pie Menus or Linear Menus',\n",
    "    \"In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? Give the setting exactly as it appears in the first scene heading.\",\n",
    "    \"Which contributor to the version of OpenCV where support was added for the Mask-RCNN model has the same name as a former Chinese head of government when the names are transliterated to the Latin alphabet?\",\n",
    "    \"The photograph in the Whitney Museum of American Art's collection with accession number 2022.128 shows a person holding a book. Which military unit did the author of this book join in 1813? Answer without using articles.\",\n",
    "    \"I went to Virtue restaurant & bar in Chicago for my birthday on March 22, 2021 and the main course I had was delicious! Unfortunately, when I went back about a month later on April 21, it was no longer on the dinner menu.\",\n",
    "    \"In Emily Midkiff's June 2014 article in a journal named for the one of Hreidmar's \",\n",
    "    \"Under DDC 633 on Bielefeld University Library's BASE, as of 2020\",\n",
    "    \"In the 2018 VSCode blog post on replit.com, what was the command they clicked on in the last video to remove extra lines?\",\n",
    "    \"The Metropolitan Museum of Art has a portrait in its collection with an accession number of 29.100.5. Of the consecrators and co-consecrators\",\n",
    "    \"In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied?\",\n",
    "    'In the year 2022, and before December, what does \"R\" stand for in the three core policies of the type of content',\n",
    "    \"Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?\",\n",
    "]\n",
    "eval_ds = eval_ds.filter(lambda row: any([el in row[\"Question\"] for el in to_keep]))\n",
    "eval_ds = eval_ds.rename_columns({\"Question\": \"question\", \"Final answer\": \"true_answer\", \"Level\": \"task\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "from huggingface_hub import login\n",
    "\n",
    "\n",
    "load_dotenv(override=True)\n",
    "\n",
    "login(os.getenv(\"HF_TOKEN\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Text browser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scripts.run_agents import answer_questions\n",
    "from scripts.text_inspector_tool import TextInspectorTool\n",
    "from scripts.text_web_browser import (\n",
    "    ArchiveSearchTool,\n",
    "    FinderTool,\n",
    "    FindNextTool,\n",
    "    NavigationalSearchTool,\n",
    "    PageDownTool,\n",
    "    PageUpTool,\n",
    "    SearchInformationTool,\n",
    "    VisitTool,\n",
    ")\n",
    "from scripts.visual_qa import VisualQAGPT4Tool\n",
    "\n",
    "from smolagents import CodeAgent, LiteLLMModel\n",
    "\n",
    "\n",
    "proprietary_model = LiteLLMModel(\"gpt-4o\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### BUILD AGENTS & TOOLS\n",
    "\n",
    "WEB_TOOLS = [\n",
    "    SearchInformationTool(),\n",
    "    NavigationalSearchTool(),\n",
    "    VisitTool(),\n",
    "    PageUpTool(),\n",
    "    PageDownTool(),\n",
    "    FinderTool(),\n",
    "    FindNextTool(),\n",
    "    ArchiveSearchTool(),\n",
    "]\n",
    "\n",
    "\n",
    "surfer_agent = CodeAgent(\n",
    "    model=proprietary_model,\n",
    "    tools=WEB_TOOLS,\n",
    "    max_steps=20,\n",
    "    verbosity_level=2,\n",
    ")\n",
    "\n",
    "results_text = answer_questions(\n",
    "    eval_ds,\n",
    "    surfer_agent,\n",
    "    \"code_gpt4o_27-01_text\",\n",
    "    reformulation_model=proprietary_model,\n",
    "    output_folder=\"output_browsers\",\n",
    "    visual_inspection_tool=VisualQAGPT4Tool(),\n",
    "    text_inspector_tool=TextInspectorTool(proprietary_model, 40000),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Vision browser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install helium -q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scripts.visual_qa import VisualQAGPT4Tool\n",
    "\n",
    "from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel\n",
    "from smolagents.vision_web_browser import (\n",
    "    close_popups,\n",
    "    go_back,\n",
    "    helium_instructions,\n",
    "    initialize_agent,\n",
    "    save_screenshot,\n",
    "    search_item_ctrl_f,\n",
    ")\n",
    "\n",
    "\n",
    "proprietary_model = LiteLLMModel(\"gpt-4o\")\n",
    "vision_browser_agent = initialize_agent(proprietary_model)\n",
    "### BUILD AGENTS & TOOLS\n",
    "\n",
    "CodeAgent(\n",
    "    tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],\n",
    "    model=proprietary_model,\n",
    "    additional_authorized_imports=[\"helium\"],\n",
    "    step_callbacks=[save_screenshot],\n",
    "    max_steps=20,\n",
    "    verbosity_level=2,\n",
    ")\n",
    "\n",
    "results_vision = answer_questions(\n",
    "    eval_ds,\n",
    "    vision_browser_agent,\n",
    "    \"code_gpt4o_27-01_vision\",\n",
    "    reformulation_model=proprietary_model,\n",
    "    output_folder=\"output_browsers\",\n",
    "    visual_inspection_tool=VisualQAGPT4Tool(),\n",
    "    text_inspector_tool=TextInspectorTool(proprietary_model, 40000),\n",
    "    postprompt=helium_instructions\n",
    "    + \"Any web browser controls won't work on .pdf urls, rather use the tool 'inspect_file_as_text' to read them\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Browser-use browser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install browser-use lxml_html_clean -q\n",
    "!playwright install"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import asyncio\n",
    "\n",
    "import nest_asyncio\n",
    "\n",
    "\n",
    "nest_asyncio.apply()\n",
    "\n",
    "from browser_use import Agent\n",
    "from dotenv import load_dotenv\n",
    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "\n",
    "load_dotenv()\n",
    "\n",
    "\n",
    "class BrowserUseAgent:\n",
    "    logs = []\n",
    "\n",
    "    def write_inner_memory_from_logs(self, summary_mode):\n",
    "        return self.results\n",
    "\n",
    "    def run(self, task, **kwargs):\n",
    "        agent = Agent(\n",
    "            task=task,\n",
    "            llm=ChatOpenAI(model=\"gpt-4o\"),\n",
    "        )\n",
    "        self.results = asyncio.get_event_loop().run_until_complete(agent.run())\n",
    "        return self.results.history[-1].result[0].extracted_content\n",
    "\n",
    "\n",
    "browser_use_agent = BrowserUseAgent()\n",
    "\n",
    "results_browseruse = answer_questions(\n",
    "    eval_ds,\n",
    "    browser_use_agent,\n",
    "    \"gpt-4o_27-01_browseruse\",\n",
    "    reformulation_model=proprietary_model,\n",
    "    output_folder=\"output_browsers\",\n",
    "    visual_inspection_tool=VisualQAGPT4Tool(),\n",
    "    text_inspector_tool=TextInspectorTool(proprietary_model, 40000),\n",
    "    postprompt=\"\",\n",
    "    run_simple=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from scripts.gaia_scorer import question_scorer\n",
    "\n",
    "\n",
    "results_vision, results_text, results_browseruse = (\n",
    "    pd.DataFrame(results_vision),\n",
    "    pd.DataFrame(results_text),\n",
    "    pd.DataFrame(results_browseruse),\n",
    ")\n",
    "\n",
    "results_vision[\"is_correct\"] = results_vision.apply(\n",
    "    lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1\n",
    ")\n",
    "results_text[\"is_correct\"] = results_text.apply(lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1)\n",
    "results_browseruse[\"is_correct\"] = results_browseruse.apply(\n",
    "    lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = pd.concat([results_vision, results_text, results_browseruse])\n",
    "results.groupby(\"agent_name\")[\"is_correct\"].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "correct_vision_results = results_vision.loc[results_vision[\"is_correct\"]]\n",
    "correct_vision_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "false_text_results = results_text.loc[~results_text[\"is_correct\"]]\n",
    "false_text_results"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "gaia",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}