Make web browser example a CLI script (#416)

* Makes web browser example a CLI script

---------

Co-authored-by: Merve Noyan <mervenoyan@Merve-MacBook-Pro.local>
Co-authored-by: Aymeric <aymeric.roucher@gmail.com>
This commit is contained in:
Merve Noyan 2025-01-30 01:55:54 +01:00 committed by GitHub
parent 3b8e519f77
commit dcbbe448af
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 66 additions and 77 deletions

View File

@ -22,7 +22,7 @@ elif chosen_inference == "ollama":
model_id="ollama_chat/llama3.2",
api_base="http://localhost:11434", # replace with remote open-ai compatible server if necessary
api_key="your-api-key", # replace with API key if necessary
num_ctx=8192 # ollama default is 2048 which will often fail horribly. 8192 works for easy tasks, more is better. Check https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator to calculate how much VRAM this will need for the selected model.
num_ctx=8192, # ollama default is 2048 which will often fail horribly. 8192 works for easy tasks, more is better. Check https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator to calculate how much VRAM this will need for the selected model.
)
elif chosen_inference == "litellm":

View File

@ -1,3 +1,5 @@
import argparse
import os
from io import BytesIO
from time import sleep
@ -5,40 +7,68 @@ import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import ElementNotInteractableException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from smolagents import CodeAgent, LiteLLMModel, OpenAIServerModel, TransformersModel, tool # noqa: F401
from smolagents import CodeAgent, HfApiModel, LiteLLMModel, OpenAIServerModel, TransformersModel, tool # noqa: F401
from smolagents.agents import ActionStep
github_request = """
I'm trying to find how hard I have to work to get a repo in github.com/trending.
Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
""" # The agent is able to achieve this request only when powered by GPT-4o or Claude-3.5-sonnet.
search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""
def parse_arguments():
parser = argparse.ArgumentParser(description="Run a web browser automation script with a specified model.")
parser.add_argument(
"--model",
type=str,
default="LiteLLMModel",
help="The model type to use (e.g., OpenAIServerModel, LiteLLMModel, TransformersModel, HfApiModel)",
)
parser.add_argument(
"--model-id",
type=str,
default="gpt-4o",
help="The model ID to use for the specified model type",
)
parser.add_argument("--prompt", type=str, default=search_request, help="The prompt to run with the agent")
return parser.parse_args()
# Load environment variables
load_dotenv()
import os
# Parse command line arguments
args = parse_arguments()
# Let's use Qwen2-VL-72B via an inference provider like Fireworks AI
model = OpenAIServerModel(
api_key=os.getenv("FIREWORKS_API_KEY"),
api_base="https://api.fireworks.ai/inference/v1",
model_id="accounts/fireworks/models/qwen2-vl-72b-instruct",
)
# You can also use a close model
# model = LiteLLMModel(
# model_id="gpt-4o",
# api_key=os.getenv("OPENAI_API_KEY"),
# )
# locally a good candidate is Qwen2-VL-7B-Instruct
# model = TransformersModel(
# model_id="Qwen/Qwen2-VL-7B-Instruct",
# device_map = "auto",
# flatten_messages_as_text=False
# )
# Initialize the model based on the provided arguments
if args.model == "OpenAIServerModel":
model = OpenAIServerModel(
api_key=os.getenv("FIREWORKS_API_KEY"),
api_base="https://api.fireworks.ai/inference/v1",
model_id=args.model_id,
)
elif args.model == "LiteLLMModel":
model = LiteLLMModel(
model_id=args.model_id,
api_key=os.getenv("OPENAI_API_KEY"),
)
elif args.model == "TransformersModel":
model = TransformersModel(model_id=args.model_id, device_map="auto", flatten_messages_as_text=False)
elif args.model == "HfApiModel":
model = HfApiModel(
token=os.getenv("HF_API_KEY"),
model_id=args.model_id,
)
else:
raise ValueError(f"Unsupported model type: {args.model}")
# Prepare callback
@ -64,8 +94,9 @@ def save_screenshot(step_log: ActionStep, agent: CodeAgent) -> None:
# Initialize driver and agent
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1300")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
driver = helium.start_chrome(headless=False, options=chrome_options)
@ -101,42 +132,7 @@ def close_popups() -> str:
"""
Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.
"""
# Common selectors for modal close buttons and overlay elements
modal_selectors = [
"button[class*='close']",
"[class*='modal']",
"[class*='modal'] button",
"[class*='CloseButton']",
"[aria-label*='close']",
".modal-close",
".close-modal",
".modal .close",
".modal-backdrop",
".modal-overlay",
"[class*='overlay']",
]
wait = WebDriverWait(driver, timeout=0.5)
for selector in modal_selectors:
try:
elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector)))
for element in elements:
if element.is_displayed():
try:
# Try clicking with JavaScript as it's more reliable
driver.execute_script("arguments[0].click();", element)
except ElementNotInteractableException:
# If JavaScript click fails, try regular click
element.click()
except TimeoutException:
continue
except Exception as e:
print(f"Error handling selector {selector}: {str(e)}")
continue
return "Modals closed"
webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
agent = CodeAgent(
@ -150,10 +146,10 @@ agent = CodeAgent(
helium_instructions = """
You can use helium to access websites. Don't bother about the helium driver, it's already managed.
First you need to import everything from helium, then you can do other actions!
We've already ran "from helium import *"
Then you can go to pages!
Code:
```py
from helium import *
go_to('github.com/trending')
```<end_code>
@ -206,17 +202,10 @@ Of course, you can act on buttons like a user would do when navigating.
After each code blob you write, you will be automatically provided with an updated screenshot of the browser and the current browser url.
But beware that the screenshot will only be taken at the end of the whole action, it won't see intermediate states.
Don't kill the browser.
When you have modals or cookie banners on screen, you should get rid of them before you can click anything else.
"""
# Run the agent!
# Run the agent with the provided prompt
github_request = """
I'm trying to find how hard I have to work to get a repo in github.com/trending.
Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
""" # The agent is able to achieve this request only when powered by GPT-4o or Claude-3.5-sonnet.
search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""
agent.run(search_request + helium_instructions)
agent.python_executor("from helium import *", agent.state)
agent.run(args.prompt + helium_instructions)