Make web browser example a CLI script (#416)

* Makes web browser example a CLI script

---------

Co-authored-by: Merve Noyan <mervenoyan@Merve-MacBook-Pro.local>
Co-authored-by: Aymeric <aymeric.roucher@gmail.com>
This commit is contained in:
Merve Noyan 2025-01-30 01:55:54 +01:00 committed by GitHub
parent 3b8e519f77
commit dcbbe448af
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 66 additions and 77 deletions

View File

@ -22,7 +22,7 @@ elif chosen_inference == "ollama":
model_id="ollama_chat/llama3.2", model_id="ollama_chat/llama3.2",
api_base="http://localhost:11434", # replace with remote open-ai compatible server if necessary api_base="http://localhost:11434", # replace with remote open-ai compatible server if necessary
api_key="your-api-key", # replace with API key if necessary api_key="your-api-key", # replace with API key if necessary
num_ctx=8192 # ollama default is 2048 which will often fail horribly. 8192 works for easy tasks, more is better. Check https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator to calculate how much VRAM this will need for the selected model. num_ctx=8192, # ollama default is 2048 which will often fail horribly. 8192 works for easy tasks, more is better. Check https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator to calculate how much VRAM this will need for the selected model.
) )
elif chosen_inference == "litellm": elif chosen_inference == "litellm":

View File

@ -1,3 +1,5 @@
import argparse
import os
from io import BytesIO from io import BytesIO
from time import sleep from time import sleep
@ -5,40 +7,68 @@ import helium
from dotenv import load_dotenv from dotenv import load_dotenv
from PIL import Image from PIL import Image
from selenium import webdriver from selenium import webdriver
from selenium.common.exceptions import ElementNotInteractableException, TimeoutException
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from smolagents import CodeAgent, LiteLLMModel, OpenAIServerModel, TransformersModel, tool # noqa: F401 from smolagents import CodeAgent, HfApiModel, LiteLLMModel, OpenAIServerModel, TransformersModel, tool # noqa: F401
from smolagents.agents import ActionStep from smolagents.agents import ActionStep
github_request = """
I'm trying to find how hard I have to work to get a repo in github.com/trending.
Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
""" # The agent is able to achieve this request only when powered by GPT-4o or Claude-3.5-sonnet.
search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""
def parse_arguments(argv=None):
    """Parse the command-line options of the web browser agent script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. When ``None`` (the default), argparse
            falls back to ``sys.argv[1:]``, so the zero-argument call
            behaves exactly as before.

    Returns:
        argparse.Namespace with the attributes ``model`` (model class name),
        ``model_id`` (identifier passed to that model class) and ``prompt``
        (the request given to the agent).
    """
    parser = argparse.ArgumentParser(description="Run a web browser automation script with a specified model.")
    parser.add_argument(
        "--model",
        type=str,
        default="LiteLLMModel",
        help="The model type to use (e.g., OpenAIServerModel, LiteLLMModel, TransformersModel, HfApiModel)",
    )
    parser.add_argument(
        "--model-id",
        type=str,
        default="gpt-4o",
        help="The model ID to use for the specified model type",
    )
    # The default prompt is the module-level `search_request` string.
    parser.add_argument("--prompt", type=str, default=search_request, help="The prompt to run with the agent")
    # Forwarding `argv` lets callers (and tests) supply arguments explicitly.
    return parser.parse_args(argv)
# Load environment variables
load_dotenv() load_dotenv()
import os
# Parse command line arguments
args = parse_arguments()
# Let's use Qwen2-VL-72B via an inference provider like Fireworks AI # Initialize the model based on the provided arguments
if args.model == "OpenAIServerModel":
model = OpenAIServerModel( model = OpenAIServerModel(
api_key=os.getenv("FIREWORKS_API_KEY"), api_key=os.getenv("FIREWORKS_API_KEY"),
api_base="https://api.fireworks.ai/inference/v1", api_base="https://api.fireworks.ai/inference/v1",
model_id="accounts/fireworks/models/qwen2-vl-72b-instruct", model_id=args.model_id,
) )
elif args.model == "LiteLLMModel":
# You can also use a close model model = LiteLLMModel(
model_id=args.model_id,
# model = LiteLLMModel( api_key=os.getenv("OPENAI_API_KEY"),
# model_id="gpt-4o", )
# api_key=os.getenv("OPENAI_API_KEY"), elif args.model == "TransformersModel":
# ) model = TransformersModel(model_id=args.model_id, device_map="auto", flatten_messages_as_text=False)
elif args.model == "HfApiModel":
# locally a good candidate is Qwen2-VL-7B-Instruct model = HfApiModel(
# model = TransformersModel( token=os.getenv("HF_API_KEY"),
# model_id="Qwen/Qwen2-VL-7B-Instruct", model_id=args.model_id,
# device_map = "auto", )
# flatten_messages_as_text=False else:
# ) raise ValueError(f"Unsupported model type: {args.model}")
# Prepare callback # Prepare callback
@ -64,8 +94,9 @@ def save_screenshot(step_log: ActionStep, agent: CodeAgent) -> None:
# Initialize driver and agent # Initialize driver and agent
chrome_options = webdriver.ChromeOptions() chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--force-device-scale-factor=1") chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1300") chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer") chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
driver = helium.start_chrome(headless=False, options=chrome_options) driver = helium.start_chrome(headless=False, options=chrome_options)
@ -101,42 +132,7 @@ def close_popups() -> str:
""" """
Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners. Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.
""" """
# Common selectors for modal close buttons and overlay elements webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
modal_selectors = [
"button[class*='close']",
"[class*='modal']",
"[class*='modal'] button",
"[class*='CloseButton']",
"[aria-label*='close']",
".modal-close",
".close-modal",
".modal .close",
".modal-backdrop",
".modal-overlay",
"[class*='overlay']",
]
wait = WebDriverWait(driver, timeout=0.5)
for selector in modal_selectors:
try:
elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector)))
for element in elements:
if element.is_displayed():
try:
# Try clicking with JavaScript as it's more reliable
driver.execute_script("arguments[0].click();", element)
except ElementNotInteractableException:
# If JavaScript click fails, try regular click
element.click()
except TimeoutException:
continue
except Exception as e:
print(f"Error handling selector {selector}: {str(e)}")
continue
return "Modals closed"
agent = CodeAgent( agent = CodeAgent(
@ -150,10 +146,10 @@ agent = CodeAgent(
helium_instructions = """ helium_instructions = """
You can use helium to access websites. Don't bother about the helium driver, it's already managed. You can use helium to access websites. Don't bother about the helium driver, it's already managed.
First you need to import everything from helium, then you can do other actions! We've already ran "from helium import *"
Then you can go to pages!
Code: Code:
```py ```py
from helium import *
go_to('github.com/trending') go_to('github.com/trending')
```<end_code> ```<end_code>
@ -206,17 +202,10 @@ Of course, you can act on buttons like a user would do when navigating.
After each code blob you write, you will be automatically provided with an updated screenshot of the browser and the current browser url. After each code blob you write, you will be automatically provided with an updated screenshot of the browser and the current browser url.
But beware that the screenshot will only be taken at the end of the whole action, it won't see intermediate states. But beware that the screenshot will only be taken at the end of the whole action, it won't see intermediate states.
Don't kill the browser. Don't kill the browser.
When you have modals or cookie banners on screen, you should get rid of them before you can click anything else.
""" """
# Run the agent! # Run the agent with the provided prompt
github_request = """ agent.python_executor("from helium import *", agent.state)
I'm trying to find how hard I have to work to get a repo in github.com/trending. agent.run(args.prompt + helium_instructions)
Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
""" # The agent is able to achieve this request only when powered by GPT-4o or Claude-3.5-sonnet.
search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""
agent.run(search_request + helium_instructions)