update BrowserUseTool

This commit is contained in:
liangxinbing 2025-03-18 02:38:56 +08:00
parent 91d14a3a47
commit c3203e7fa3

View File

@ -10,6 +10,7 @@ from pydantic import Field, field_validator
from pydantic_core.core_schema import ValidationInfo from pydantic_core.core_schema import ValidationInfo
from app.config import config from app.config import config
from app.llm import LLM
from app.tool.base import BaseTool, ToolResult from app.tool.base import BaseTool, ToolResult
from app.tool.web_search import WebSearch from app.tool.web_search import WebSearch
@ -18,27 +19,22 @@ _BROWSER_DESCRIPTION = """
Interact with a web browser to perform various actions such as navigation, element interaction, content extraction, and tab management. This tool provides a comprehensive set of browser automation capabilities: Interact with a web browser to perform various actions such as navigation, element interaction, content extraction, and tab management. This tool provides a comprehensive set of browser automation capabilities:
Navigation: Navigation:
- 'navigate': Go to a specific URL - 'go_to_url': Go to a specific URL in the current tab
- 'go_back': Navigate back in browser history - 'go_back': Go back
- 'refresh': Refresh the current page - 'refresh': Refresh the current page
- 'web_search': Search the web with a specific query - 'web_search': Search the query in the current tab, the query should be a search query like humans search in web, concrete and not vague or super long. More the single most important items.
Element Interaction: Element Interaction:
- 'click_element': Click an element by index - 'click_element': Click an element by index
- 'input_text': Input text into a form element - 'input_text': Input text into a form element
- 'scroll_down'/'scroll_up': Scroll the page (with optional pixel amount) - 'scroll_down'/'scroll_up': Scroll the page (with optional pixel amount)
- 'scroll_to_text': Scroll to specific text on the page - 'scroll_to_text': If you dont find something which you want to interact with, scroll to it
- 'send_keys': Send keyboard shortcuts or special keys - 'send_keys': Send strings of special keys like Escape,Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press.
- 'get_dropdown_options': Get all options from a dropdown - 'get_dropdown_options': Get all options from a dropdown
- 'select_dropdown_option': Select an option from a dropdown by text - 'select_dropdown_option': Select dropdown option for interactive element index by the text of the option you want to select
Content Extraction: Content Extraction:
- 'get_current_state': Get detailed browser state including URL, title, tabs, and interactive elements - 'extract_content': Extract page content to retrieve specific information from the page, e.g. all company names, a specifc description, all information about, links with companies in structured format or simply links
- 'get_html': Get page HTML content
- 'get_text': Get text content of the page (supports start_index and end_index parameters)
- 'read_links': Get all links on the page
- 'extract_content': Extract specific information from the page using AI
- 'screenshot': Capture a screenshot
Tab Management: Tab Management:
- 'switch_tab': Switch to a specific tab - 'switch_tab': Switch to a specific tab
@ -47,12 +43,6 @@ Tab Management:
Utility: Utility:
- 'wait': Wait for a specified number of seconds - 'wait': Wait for a specified number of seconds
- 'execute_js': Execute JavaScript code on the page
Task Completion:
- 'done': Complete the task and return results
Each action requires specific parameters. Use get_current_state first to understand the current browser context.
""" """
Context = TypeVar("Context") Context = TypeVar("Context")
@ -67,15 +57,9 @@ class BrowserUseTool(BaseTool, Generic[Context]):
"action": { "action": {
"type": "string", "type": "string",
"enum": [ "enum": [
"navigate", "go_to_url",
"click_element", "click_element",
"get_current_state",
"input_text", "input_text",
"screenshot",
"get_html",
"get_text",
"read_links",
"execute_js",
"scroll_down", "scroll_down",
"scroll_up", "scroll_up",
"scroll_to_text", "scroll_to_text",
@ -85,22 +69,24 @@ class BrowserUseTool(BaseTool, Generic[Context]):
"go_back", "go_back",
"web_search", "web_search",
"wait", "wait",
"done", "extract_content",
"switch_tab",
"open_tab",
"close_tab",
], ],
"description": "The browser action to perform", "description": "The browser action to perform",
}, },
"url": { "url": {
"type": "string", "type": "string",
"description": "URL for 'navigate' or 'new_tab' actions", "description": "URL for 'go_to_url' or 'open_tab' actions",
}, },
"index": { "index": {
"type": "integer", "type": "integer",
"description": "Element index (retrieved using get_current_state) for 'click_element' or 'input_text' actions", "description": "Element index for 'click_element', 'input_text', 'get_dropdown_options', or 'select_dropdown_option' actions",
}, },
"text": {"type": "string", "description": "Text for 'input_text' action"}, "text": {
"script": {
"type": "string", "type": "string",
"description": "JavaScript code for 'execute_js' action", "description": "Text for 'input_text', 'scroll_to_text', or 'select_dropdown_option' actions",
}, },
"scroll_amount": { "scroll_amount": {
"type": "integer", "type": "integer",
@ -110,14 +96,6 @@ class BrowserUseTool(BaseTool, Generic[Context]):
"type": "integer", "type": "integer",
"description": "Tab ID for 'switch_tab' action", "description": "Tab ID for 'switch_tab' action",
}, },
"start_index": {
"type": "integer",
"description": "Starting character index for text observation (for 'scroll_to_text' and 'get_text' actions)",
},
"end_index": {
"type": "integer",
"description": "Ending character index for text observation (for 'scroll_to_text' and 'get_text' actions)",
},
"query": { "query": {
"type": "string", "type": "string",
"description": "Search query for 'web_search' action", "description": "Search query for 'web_search' action",
@ -126,10 +104,6 @@ class BrowserUseTool(BaseTool, Generic[Context]):
"type": "string", "type": "string",
"description": "Extraction goal for 'extract_content' action", "description": "Extraction goal for 'extract_content' action",
}, },
"success": {
"type": "boolean",
"description": "Success status for 'done' action",
},
"keys": { "keys": {
"type": "string", "type": "string",
"description": "Keys to send for 'send_keys' action", "description": "Keys to send for 'send_keys' action",
@ -141,12 +115,11 @@ class BrowserUseTool(BaseTool, Generic[Context]):
}, },
"required": ["action"], "required": ["action"],
"dependencies": { "dependencies": {
"navigate": ["url"], "go_to_url": ["url"],
"click_element": ["index"], "click_element": ["index"],
"input_text": ["index", "text"], "input_text": ["index", "text"],
"execute_js": ["script"],
"switch_tab": ["tab_id"], "switch_tab": ["tab_id"],
"new_tab": ["url"], "open_tab": ["url"],
"scroll_down": ["scroll_amount"], "scroll_down": ["scroll_amount"],
"scroll_up": ["scroll_amount"], "scroll_up": ["scroll_amount"],
"scroll_to_text": ["text"], "scroll_to_text": ["text"],
@ -156,7 +129,7 @@ class BrowserUseTool(BaseTool, Generic[Context]):
"go_back": [], "go_back": [],
"web_search": ["query"], "web_search": ["query"],
"wait": ["seconds"], "wait": ["seconds"],
"done": ["text"], "extract_content": ["goal"],
}, },
} }
@ -169,6 +142,8 @@ class BrowserUseTool(BaseTool, Generic[Context]):
# Context for generic functionality # Context for generic functionality
tool_context: Optional[Context] = Field(default=None, exclude=True) tool_context: Optional[Context] = Field(default=None, exclude=True)
llm: Optional[LLM] = Field(default_factory=LLM)
@field_validator("parameters", mode="before") @field_validator("parameters", mode="before")
def validate_parameters(cls, v: dict, info: ValidationInfo) -> dict: def validate_parameters(cls, v: dict, info: ValidationInfo) -> dict:
if not v: if not v:
@ -230,14 +205,10 @@ class BrowserUseTool(BaseTool, Generic[Context]):
url: Optional[str] = None, url: Optional[str] = None,
index: Optional[int] = None, index: Optional[int] = None,
text: Optional[str] = None, text: Optional[str] = None,
script: Optional[str] = None,
scroll_amount: Optional[int] = None, scroll_amount: Optional[int] = None,
tab_id: Optional[int] = None, tab_id: Optional[int] = None,
start_index: Optional[int] = None,
end_index: Optional[int] = None,
query: Optional[str] = None, query: Optional[str] = None,
goal: Optional[str] = None, goal: Optional[str] = None,
success: Optional[bool] = None,
keys: Optional[str] = None, keys: Optional[str] = None,
seconds: Optional[int] = None, seconds: Optional[int] = None,
**kwargs, **kwargs,
@ -250,14 +221,10 @@ class BrowserUseTool(BaseTool, Generic[Context]):
url: URL for navigation or new tab url: URL for navigation or new tab
index: Element index for click or input actions index: Element index for click or input actions
text: Text for input action or search query text: Text for input action or search query
script: JavaScript code for execution
scroll_amount: Pixels to scroll for scroll action scroll_amount: Pixels to scroll for scroll action
tab_id: Tab ID for switch_tab action tab_id: Tab ID for switch_tab action
start_index: Starting character index for text observation
end_index: Ending character index for text observation
query: Search query for Google search query: Search query for Google search
goal: Extraction goal for content extraction goal: Extraction goal for content extraction
success: Success status for done action
keys: Keys to send for keyboard actions keys: Keys to send for keyboard actions
seconds: Seconds to wait seconds: Seconds to wait
**kwargs: Additional arguments **kwargs: Additional arguments
@ -275,10 +242,14 @@ class BrowserUseTool(BaseTool, Generic[Context]):
) )
# Navigation actions # Navigation actions
if action == "navigate": if action == "go_to_url":
if not url: if not url:
return ToolResult(error="URL is required for 'navigate' action") return ToolResult(
await context.navigate_to(url) error="URL is required for 'go_to_url' action"
)
page = await context.get_current_page()
await page.goto(url)
await page.wait_for_load_state()
return ToolResult(output=f"Navigated to {url}") return ToolResult(output=f"Navigated to {url}")
elif action == "go_back": elif action == "go_back":
@ -299,10 +270,22 @@ class BrowserUseTool(BaseTool, Generic[Context]):
if search_results: if search_results:
# Navigate to the first search result # Navigate to the first search result
first_result = search_results[0] first_result = search_results[0]
await context.navigate_to(first_result) if isinstance(first_result, dict) and "url" in first_result:
url_to_navigate = first_result["url"]
elif isinstance(first_result, str):
url_to_navigate = first_result
else:
return ToolResult( return ToolResult(
output=f"Searched for '{query}' and navigated to first result: {first_result}\nAll results:" error=f"Invalid search result format: {first_result}"
+ "\n".join(search_results) )
page = await context.get_current_page()
await page.goto(url_to_navigate)
await page.wait_for_load_state()
return ToolResult(
output=f"Searched for '{query}' and navigated to first result: {url_to_navigate}\nAll results:"
+ "\n".join([str(r) for r in search_results])
) )
else: else:
return ToolResult( return ToolResult(
@ -414,56 +397,71 @@ class BrowserUseTool(BaseTool, Generic[Context]):
) )
# Content extraction actions # Content extraction actions
elif action == "get_current_state":
return await self.get_current_state(context)
elif action == "get_html":
html = await context.get_page_html()
truncated = (
html[:max_content_length] + "..."
if len(html) > max_content_length
else html
)
return ToolResult(output=truncated)
elif action == "get_text":
start = start_index if start_index is not None else 0
end = end_index if end_index is not None else max_content_length
text = await context.execute_javascript(
f"document.body.innerText.substring({start}, {end})"
)
full_length = await context.execute_javascript(
"document.body.innerText.length"
)
result = f"Text from index {start} to {end}:\n{text}"
if end < full_length:
result += f"\n\n[Text continues... {full_length - end} more characters available]"
if start > 0:
result += f"\n[{start} characters before this point]"
return ToolResult(output=result)
elif action == "read_links":
links = await context.execute_javascript(
"Array.from(document.querySelectorAll('a[href]')).map(elem => elem.innerText && elem.href ? `${elem.innerText.trim()} - ${elem.href}` : null).filter(Boolean).join('\\n')"
)
return ToolResult(output=links)
elif action == "extract_content": elif action == "extract_content":
if not goal: if not goal:
return ToolResult( return ToolResult(
error="Goal is required for 'extract_content' action" error="Goal is required for 'extract_content' action"
) )
await context.get_page_html() page = await context.get_current_page()
# Note: In a real implementation, this would use an LLM to extract content try:
return ToolResult(output=f"Extracted content for goal: {goal}") # Get page content and convert to markdown for better processing
html_content = await page.content()
elif action == "screenshot": # Import markdownify here to avoid global import
screenshot = await context.take_screenshot(full_page=True) try:
return ToolResult( import markdownify
output=f"Screenshot captured (base64 length: {len(screenshot)})",
system=screenshot, content = markdownify.markdownify(html_content)
except ImportError:
# Fallback if markdownify is not available
content = html_content
# Create prompt for LLM
prompt_text = """
Your task is to extract the content of the page. You will be given a page and a goal, and you should extract all relevant information around this goal from the page.
Examples of extraction goals:
- Extract all company names
- Extract specific descriptions
- Extract all information about a topic
- Extract links with companies in structured format
- Extract all links
If the goal is vague, summarize the page. Respond in JSON format.
Extraction goal: {goal}
Page content:
{page}
"""
# Format the prompt with the goal and content
max_content_length = min(50000, len(content))
formatted_prompt = prompt_text.format(
goal=goal, page=content[:max_content_length]
) )
# Create a proper message list for the LLM
from app.schema import Message
messages = [Message.user_message(formatted_prompt)]
# Use LLM to extract content based on the goal
response = await self.llm.ask(messages)
msg = f"Extracted from page:\n{response}\n"
return ToolResult(output=msg)
except Exception as e:
# Provide a more helpful error message
error_msg = f"Failed to extract content: {str(e)}"
try:
# Try to return a portion of the page content as fallback
return ToolResult(
output=f"{error_msg}\nHere's a portion of the page content:\n{content[:2000]}..."
)
except:
# If all else fails, just return the error
return ToolResult(error=error_msg)
# Tab management actions # Tab management actions
elif action == "switch_tab": elif action == "switch_tab":
if tab_id is None: if tab_id is None:
@ -471,13 +469,15 @@ class BrowserUseTool(BaseTool, Generic[Context]):
error="Tab ID is required for 'switch_tab' action" error="Tab ID is required for 'switch_tab' action"
) )
await context.switch_to_tab(tab_id) await context.switch_to_tab(tab_id)
page = await context.get_current_page()
await page.wait_for_load_state()
return ToolResult(output=f"Switched to tab {tab_id}") return ToolResult(output=f"Switched to tab {tab_id}")
elif action == "open_tab": elif action == "open_tab":
if not url: if not url:
return ToolResult(error="URL is required for 'open_tab' action") return ToolResult(error="URL is required for 'open_tab' action")
await context.create_new_tab(url) await context.create_new_tab(url)
return ToolResult(output=f"Opened new tab with URL {url}") return ToolResult(output=f"Opened new tab with {url}")
elif action == "close_tab": elif action == "close_tab":
await context.close_current_tab() await context.close_current_tab()
@ -489,47 +489,61 @@ class BrowserUseTool(BaseTool, Generic[Context]):
await asyncio.sleep(seconds_to_wait) await asyncio.sleep(seconds_to_wait)
return ToolResult(output=f"Waited for {seconds_to_wait} seconds") return ToolResult(output=f"Waited for {seconds_to_wait} seconds")
elif action == "execute_js":
if not script:
return ToolResult(
error="Script is required for 'execute_js' action"
)
result = await context.execute_javascript(script)
return ToolResult(output=str(result))
# Task completion
elif action == "done":
if not text:
return ToolResult(error="Text is required for 'done' action")
success_value = success if success is not None else True
return ToolResult(output=text, is_done=True, success=success_value)
else: else:
return ToolResult(error=f"Unknown action: {action}") return ToolResult(error=f"Unknown action: {action}")
except Exception as e: except Exception as e:
return ToolResult(error=f"Browser action '{action}' failed: {str(e)}") return ToolResult(error=f"Browser action '{action}' failed: {str(e)}")
async def get_current_state(
    self, context: Optional[BrowserContext] = None
) -> ToolResult:
    """Report the current browser state as a JSON-formatted ToolResult.

    Args:
        context: Browser context to inspect. When it is omitted (or falsy),
            ``self.context`` is used instead.

    Returns:
        On success, a ToolResult whose ``output`` is a JSON document holding
        the page URL, title, tabs, interactive elements, scroll information
        and viewport height, and whose ``base64_image`` is the value returned
        by ``take_screenshot(full_page=True)``. On any failure, including a
        missing context, a ToolResult with ``error`` set; exceptions are
        caught here and never propagated to the caller.
    """
    try:
        active_context = context or self.context
        if not active_context:
            return ToolResult(error="Browser context not initialized")

        browser_state = await active_context.get_state()

        # Viewport height precedence: value reported by the state, then the
        # "height" entry of the context's configured window size, then 0.
        viewport_height = 0
        viewport_info = getattr(browser_state, "viewport_info", None)
        if viewport_info:
            viewport_height = viewport_info.height
        else:
            context_config = getattr(active_context, "config", None)
            if hasattr(context_config, "browser_window_size"):
                viewport_height = context_config.browser_window_size.get(
                    "height", 0
                )

        # The screenshot is captured before the state summary is assembled.
        page_image = await active_context.take_screenshot(full_page=True)

        # Keys are inserted in the order they must appear in the JSON output.
        summary = {
            "url": browser_state.url,
            "title": browser_state.title,
            "tabs": [tab.model_dump() for tab in browser_state.tabs],
            "help": "[0], [1], [2], etc., represent clickable indices corresponding to the elements listed. Clicking on these indices will navigate to or interact with the respective content behind them.",
        }

        # An absent/empty element tree yields an empty element listing.
        if browser_state.element_tree:
            summary["interactive_elements"] = (
                browser_state.element_tree.clickable_elements_to_string()
            )
        else:
            summary["interactive_elements"] = ""

        # Scroll offsets default to 0 when the state does not expose them.
        above = getattr(browser_state, "pixels_above", 0)
        below = getattr(browser_state, "pixels_below", 0)
        summary["scroll_info"] = {
            "pixels_above": above,
            "pixels_below": below,
            "total_height": above + below + viewport_height,
        }
        summary["viewport_height"] = viewport_height

        serialized = json.dumps(summary, indent=4, ensure_ascii=False)
        return ToolResult(output=serialized, base64_image=page_image)
    except Exception as e:
        # Deliberate catch-all: every failure is reported through the result.
        return ToolResult(error=f"Failed to get browser state: {str(e)}")