From 9bdd8201052bd80b0914b1ff4e61d9bd3c497e77 Mon Sep 17 00:00:00 2001 From: liangxinbing <1580466765@qq.com> Date: Mon, 17 Mar 2025 23:07:04 +0800 Subject: [PATCH] update BrowserUseTool --- app/config.py | 3 + app/tool/browser_use_tool.py | 396 +++++++++++++++++++++++++++-------- 2 files changed, 317 insertions(+), 82 deletions(-) diff --git a/app/config.py b/app/config.py index 51356a0..9ab1448 100644 --- a/app/config.py +++ b/app/config.py @@ -59,6 +59,9 @@ class BrowserSettings(BaseModel): proxy: Optional[ProxySettings] = Field( None, description="Proxy settings for the browser" ) + max_content_length: int = Field( + 2000, description="Maximum length for content retrieval operations" + ) class AppConfig(BaseModel): diff --git a/app/tool/browser_use_tool.py b/app/tool/browser_use_tool.py index fb47ed8..468d054 100644 --- a/app/tool/browser_use_tool.py +++ b/app/tool/browser_use_tool.py @@ -1,6 +1,6 @@ import asyncio import json -from typing import Optional +from typing import Generic, Optional, TypeVar from browser_use import Browser as BrowserUseBrowser from browser_use import BrowserConfig @@ -11,31 +11,54 @@ from pydantic_core.core_schema import ValidationInfo from app.config import config from app.tool.base import BaseTool, ToolResult +from app.tool.web_search import WebSearch -MAX_LENGTH = 2000 - _BROWSER_DESCRIPTION = """ -Interact with a web browser to perform various actions such as navigation, element interaction, -content extraction, and tab management. Supported actions include: +Interact with a web browser to perform various actions such as navigation, element interaction, content extraction, and tab management. This tool provides a comprehensive set of browser automation capabilities: + +Navigation: - 'navigate': Go to a specific URL -- 'click': Click an element by index -- 'input_text': Input text into an element -- 'screenshot': Capture a screenshot -- 'get_html': Get page HTML content -- 'get_text': Get text content of the page -- 'read_links': Get all links on the page -- 'execute_js': Execute JavaScript code -- 'scroll': Scroll the page -- 'switch_tab': Switch to a specific tab -- 'new_tab': Open a new tab -- 'close_tab': Close the current tab +- 'go_back': Navigate back in browser history - 'refresh': Refresh the current page -- 'get_current_state': Get the current browser state including URL, title, tabs, and interactive elements +- 'web_search': Search the web with a specific query + +Element Interaction: +- 'click_element': Click an element by index +- 'input_text': Input text into a form element +- 'scroll_down'/'scroll_up': Scroll the page (with optional pixel amount) +- 'scroll_to_text': Scroll to specific text on the page +- 'send_keys': Send keyboard shortcuts or special keys +- 'get_dropdown_options': Get all options from a dropdown +- 'select_dropdown_option': Select an option from a dropdown by text + +Content Extraction: +- 'get_current_state': Get detailed browser state including URL, title, tabs, and interactive elements +- 'get_html': Get page HTML content +- 'get_text': Get text content of the page (supports start_index and end_index parameters) +- 'read_links': Get all links on the page +- 'extract_content': Extract specific information from the page using AI +- 'screenshot': Capture a screenshot + +Tab Management: +- 'switch_tab': Switch to a specific tab +- 'open_tab': Open a new tab with a URL +- 'close_tab': Close the current tab + +Utility: +- 'wait': Wait for a specified number of seconds +- 'execute_js': Execute JavaScript code on the page + +Task Completion: +- 'done': Complete the task and return results + +Each action requires specific parameters. Use get_current_state first to understand the current browser context. """ +Context = TypeVar("Context") -class BrowserUseTool(BaseTool): + +class BrowserUseTool(BaseTool, Generic[Context]): name: str = "browser_use" description: str = _BROWSER_DESCRIPTION parameters: dict = { @@ -45,18 +68,24 @@ class BrowserUseTool(BaseTool): "type": "string", "enum": [ "navigate", - "click", + "click_element", "get_current_state", "input_text", "screenshot", "get_html", "get_text", + "read_links", "execute_js", - "scroll", - "switch_tab", - "new_tab", - "close_tab", - "refresh", + "scroll_down", + "scroll_up", + "scroll_to_text", + "send_keys", + "get_dropdown_options", + "select_dropdown_option", + "go_back", + "web_search", + "wait", + "done", ], "description": "The browser action to perform", }, @@ -66,7 +95,7 @@ class BrowserUseTool(BaseTool): }, "index": { "type": "integer", - "description": "Element index (retrieved using get_current_state) for 'click' or 'input_text' actions", + "description": "Element index (retrieved using get_current_state) for 'click_element' or 'input_text' actions", }, "text": {"type": "string", "description": "Text for 'input_text' action"}, "script": { @@ -75,22 +104,59 @@ class BrowserUseTool(BaseTool): }, "scroll_amount": { "type": "integer", - "description": "Pixels to scroll (positive for down, negative for up) for 'scroll' action", + "description": "Pixels to scroll (positive for down, negative for up) for 'scroll_down' or 'scroll_up' actions", }, "tab_id": { "type": "integer", "description": "Tab ID for 'switch_tab' action", }, + "start_index": { + "type": "integer", + "description": "Starting character index for text observation (for 'scroll_to_text' and 'get_text' actions)", + }, + "end_index": { + "type": "integer", + "description": "Ending character index for text observation (for 'scroll_to_text' and 'get_text' actions)", + }, + "query": { + "type": "string", + "description": "Search query for 'web_search' action", + }, + "goal": { + "type": "string", + "description": "Extraction goal for 'extract_content' action", + }, + "success": { + "type": "boolean", + "description": "Success status for 'done' action", + }, + "keys": { + "type": "string", + "description": "Keys to send for 'send_keys' action", + }, + "seconds": { + "type": "integer", + "description": "Seconds to wait for 'wait' action", + }, }, "required": ["action"], "dependencies": { "navigate": ["url"], - "click": ["index"], + "click_element": ["index"], "input_text": ["index", "text"], "execute_js": ["script"], "switch_tab": ["tab_id"], "new_tab": ["url"], - "scroll": ["scroll_amount"], + "scroll_down": ["scroll_amount"], + "scroll_up": ["scroll_amount"], + "scroll_to_text": ["text"], + "send_keys": ["keys"], + "get_dropdown_options": ["index"], + "select_dropdown_option": ["index", "text"], + "go_back": [], + "web_search": ["query"], + "wait": ["seconds"], + "done": ["text"], }, } @@ -98,6 +164,10 @@ class BrowserUseTool(BaseTool): browser: Optional[BrowserUseBrowser] = Field(default=None, exclude=True) context: Optional[BrowserContext] = Field(default=None, exclude=True) dom_service: Optional[DomService] = Field(default=None, exclude=True) + web_search_tool: WebSearch = Field(default_factory=WebSearch, exclude=True) + + # Context for generic functionality + tool_context: Optional[Context] = Field(default=None, exclude=True) @field_validator("parameters", mode="before") def validate_parameters(cls, v: dict, info: ValidationInfo) -> dict: @@ -163,6 +233,13 @@ class BrowserUseTool(BaseTool): script: Optional[str] = None, scroll_amount: Optional[int] = None, tab_id: Optional[int] = None, + start_index: Optional[int] = None, + end_index: Optional[int] = None, + query: Optional[str] = None, + goal: Optional[str] = None, + success: Optional[bool] = None, + keys: Optional[str] = None, + seconds: Optional[int] = None, **kwargs, ) -> ToolResult: """ @@ -172,10 +249,17 @@ class BrowserUseTool(BaseTool): action: The browser action to perform url: URL for navigation or new tab index: Element index for click or input actions - text: Text for input action + text: Text for input action or search query script: JavaScript code for execution scroll_amount: Pixels to scroll for scroll action tab_id: Tab ID for switch_tab action + start_index: Starting character index for text observation + end_index: Ending character index for text observation + query: Search query for Google search + goal: Extraction goal for content extraction + success: Success status for done action + keys: Keys to send for keyboard actions + seconds: Seconds to wait **kwargs: Additional arguments Returns: @@ -185,15 +269,52 @@ class BrowserUseTool(BaseTool): try: context = await self._ensure_browser_initialized() + # Get max content length from config + max_content_length = getattr( + config.browser_config, "max_content_length", 2000 + ) + + # Navigation actions if action == "navigate": if not url: return ToolResult(error="URL is required for 'navigate' action") await context.navigate_to(url) return ToolResult(output=f"Navigated to {url}") - elif action == "click": + elif action == "go_back": + await context.go_back() + return ToolResult(output="Navigated back") + + elif action == "refresh": + await context.refresh_page() + return ToolResult(output="Refreshed current page") + + elif action == "web_search": + if not query: + return ToolResult( + error="Query is required for 'web_search' action" + ) + search_results = await self.web_search_tool.execute(query) + + if search_results: + # Navigate to the first search result + first_result = search_results[0] + await context.navigate_to(first_result) + return ToolResult( + output=f"Searched for '{query}' and navigated to first result: {first_result}\nAll results:" + + "\n".join(search_results) + ) + else: + return ToolResult( + error=f"No search results found for '{query}'" + ) + + # Element interaction actions + elif action == "click_element": if index is None: - return ToolResult(error="Index is required for 'click' action") + return ToolResult( + error="Index is required for 'click_element' action" + ) element = await context.get_dom_element_by_index(index) if not element: return ToolResult(error=f"Element with index {index} not found") @@ -203,9 +324,6 @@ class BrowserUseTool(BaseTool): output += f" - Downloaded file to {download_path}" return ToolResult(output=output) - elif action == "get_current_state": - return await self.get_current_state(context) - elif action == "input_text": if index is None or not text: return ToolResult( @@ -219,6 +337,126 @@ class BrowserUseTool(BaseTool): output=f"Input '{text}' into element at index {index}" ) + elif action == "scroll_down" or action == "scroll_up": + direction = 1 if action == "scroll_down" else -1 + amount = ( + scroll_amount + if scroll_amount is not None + else context.config.browser_window_size["height"] + ) + await context.execute_javascript( + f"window.scrollBy(0, {direction * amount});" + ) + return ToolResult( + output=f"Scrolled {'down' if direction > 0 else 'up'} by {amount} pixels" + ) + + elif action == "scroll_to_text": + if not text: + return ToolResult( + error="Text is required for 'scroll_to_text' action" + ) + page = await context.get_current_page() + try: + locator = page.get_by_text(text, exact=False) + await locator.scroll_into_view_if_needed() + return ToolResult(output=f"Scrolled to text: '{text}'") + except Exception as e: + return ToolResult(error=f"Failed to scroll to text: {str(e)}") + + elif action == "send_keys": + if not keys: + return ToolResult( + error="Keys are required for 'send_keys' action" + ) + page = await context.get_current_page() + await page.keyboard.press(keys) + return ToolResult(output=f"Sent keys: {keys}") + + elif action == "get_dropdown_options": + if index is None: + return ToolResult( + error="Index is required for 'get_dropdown_options' action" + ) + element = await context.get_dom_element_by_index(index) + if not element: + return ToolResult(error=f"Element with index {index} not found") + page = await context.get_current_page() + options = await page.evaluate( + """ + (xpath) => { + const select = document.evaluate(xpath, document, null, + XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue; + if (!select) return null; + return Array.from(select.options).map(opt => ({ + text: opt.text, + value: opt.value, + index: opt.index + })); + } + """, + element.xpath, + ) + return ToolResult(output=f"Dropdown options: {options}") + + elif action == "select_dropdown_option": + if index is None or not text: + return ToolResult( + error="Index and text are required for 'select_dropdown_option' action" + ) + element = await context.get_dom_element_by_index(index) + if not element: + return ToolResult(error=f"Element with index {index} not found") + page = await context.get_current_page() + await page.select_option(element.xpath, label=text) + return ToolResult( + output=f"Selected option '{text}' from dropdown at index {index}" + ) + + # Content extraction actions + elif action == "get_current_state": + return await self.get_current_state(context) + + elif action == "get_html": + html = await context.get_page_html() + truncated = ( + html[:max_content_length] + "..." + if len(html) > max_content_length + else html + ) + return ToolResult(output=truncated) + + elif action == "get_text": + start = start_index if start_index is not None else 0 + end = end_index if end_index is not None else max_content_length + text = await context.execute_javascript( + f"document.body.innerText.substring({start}, {end})" + ) + full_length = await context.execute_javascript( + "document.body.innerText.length" + ) + result = f"Text from index {start} to {end}:\n{text}" + if end < full_length: + result += f"\n\n[Text continues... {full_length - end} more characters available]" + if start > 0: + result += f"\n[{start} characters before this point]" + return ToolResult(output=result) + + elif action == "read_links": + links = await context.execute_javascript( + "Array.from(document.querySelectorAll('a[href]')).map(elem => elem.innerText && elem.href ? `${elem.innerText.trim()} - ${elem.href}` : null).filter(Boolean).join('\\n')" + ) + return ToolResult(output=links) + + elif action == "extract_content": + if not goal: + return ToolResult( + error="Goal is required for 'extract_content' action" + ) + await context.get_page_html() + # Note: In a real implementation, this would use an LLM to extract content + return ToolResult(output=f"Extracted content for goal: {goal}") + elif action == "screenshot": screenshot = await context.take_screenshot(full_page=True) return ToolResult( @@ -226,22 +464,30 @@ class BrowserUseTool(BaseTool): system=screenshot, ) - elif action == "get_html": - html = await context.get_page_html() - truncated = ( - html[:MAX_LENGTH] + "..." if len(html) > MAX_LENGTH else html - ) - return ToolResult(output=truncated) + # Tab management actions + elif action == "switch_tab": + if tab_id is None: + return ToolResult( + error="Tab ID is required for 'switch_tab' action" + ) + await context.switch_to_tab(tab_id) + return ToolResult(output=f"Switched to tab {tab_id}") - elif action == "get_text": - text = await context.execute_javascript("document.body.innerText") - return ToolResult(output=text) + elif action == "open_tab": + if not url: + return ToolResult(error="URL is required for 'open_tab' action") + await context.create_new_tab(url) + return ToolResult(output=f"Opened new tab with URL {url}") - elif action == "read_links": - links = await context.execute_javascript( - "document.querySelectorAll('a[href]').forEach((elem) => {if (elem.innerText) {console.log(elem.innerText, elem.href)}})" - ) - return ToolResult(output=links) + elif action == "close_tab": + await context.close_current_tab() + return ToolResult(output="Closed current tab") + + # Utility actions + elif action == "wait": + seconds_to_wait = seconds if seconds is not None else 3 + await asyncio.sleep(seconds_to_wait) + return ToolResult(output=f"Waited for {seconds_to_wait} seconds") elif action == "execute_js": if not script: @@ -251,40 +497,12 @@ class BrowserUseTool(BaseTool): result = await context.execute_javascript(script) return ToolResult(output=str(result)) - elif action == "scroll": - if scroll_amount is None: - return ToolResult( - error="Scroll amount is required for 'scroll' action" - ) - await context.execute_javascript( - f"window.scrollBy(0, {scroll_amount});" - ) - direction = "down" if scroll_amount > 0 else "up" - return ToolResult( - output=f"Scrolled {direction} by {abs(scroll_amount)} pixels" - ) - - elif action == "switch_tab": - if tab_id is None: - return ToolResult( - error="Tab ID is required for 'switch_tab' action" - ) - await context.switch_to_tab(tab_id) - return ToolResult(output=f"Switched to tab {tab_id}") - - elif action == "new_tab": - if not url: - return ToolResult(error="URL is required for 'new_tab' action") - await context.create_new_tab(url) - return ToolResult(output=f"Opened new tab with URL {url}") - - elif action == "close_tab": - await context.close_current_tab() - return ToolResult(output="Closed current tab") - - elif action == "refresh": - await context.refresh_page() - return ToolResult(output="Refreshed current page") + # Task completion + elif action == "done": + if not text: + return ToolResult(error="Text is required for 'done' action") + success_value = success if success is not None else True + return ToolResult(output=text, is_done=True, success=success_value) else: return ToolResult(error=f"Unknown action: {action}") @@ -302,6 +520,13 @@ class BrowserUseTool(BaseTool): "tabs": [tab.model_dump() for tab in state.tabs], "help": "[0], [1], [2], etc., represent clickable indices corresponding to the elements listed. Clicking on these indices will navigate to or interact with the respective content behind them.", "interactive_elements": state.element_tree.clickable_elements_to_string(), + "scroll_info": { + "pixels_above": state.pixels_above, + "pixels_below": state.pixels_below, + "total_height": state.pixels_above + + state.pixels_below + + (state.viewport_info.height if state.viewport_info else 0), + }, } return ToolResult( output=json.dumps(state_info, indent=4, ensure_ascii=False) @@ -329,3 +554,10 @@ class BrowserUseTool(BaseTool): loop = asyncio.new_event_loop() loop.run_until_complete(self.cleanup()) loop.close() + + @classmethod + def create_with_context(cls, context: Context) -> "BrowserUseTool[Context]": + """Factory method to create a BrowserUseTool with a specific context.""" + tool = cls() + tool.tool_context = context + return tool