From c3203e7fa3c49345c92ec8bd36897a710c1b1b40 Mon Sep 17 00:00:00 2001
From: liangxinbing <1580466765@qq.com>
Date: Tue, 18 Mar 2025 02:38:56 +0800
Subject: [PATCH] update BrowserUseTool

---
 app/tool/browser_use_tool.py | 274 ++++++++++++++++++-----------------
 1 file changed, 144 insertions(+), 130 deletions(-)

diff --git a/app/tool/browser_use_tool.py b/app/tool/browser_use_tool.py
index 468d054..7817aef 100644
--- a/app/tool/browser_use_tool.py
+++ b/app/tool/browser_use_tool.py
@@ -10,6 +10,7 @@ from pydantic import Field, field_validator
 from pydantic_core.core_schema import ValidationInfo
 
 from app.config import config
+from app.llm import LLM
 from app.tool.base import BaseTool, ToolResult
 from app.tool.web_search import WebSearch
 
@@ -18,27 +19,22 @@ _BROWSER_DESCRIPTION = """
 Interact with a web browser to perform various actions such as navigation, element interaction, content extraction, and tab management. This tool provides a comprehensive set of browser automation capabilities:
 
 Navigation:
-- 'navigate': Go to a specific URL
-- 'go_back': Navigate back in browser history
+- 'go_to_url': Go to a specific URL in the current tab
+- 'go_back': Go back
 - 'refresh': Refresh the current page
-- 'web_search': Search the web with a specific query
+- 'web_search': Search the query in the current tab, the query should be a search query like humans search in web, concrete and not vague or super long. More the single most important items.
 
 Element Interaction:
 - 'click_element': Click an element by index
 - 'input_text': Input text into a form element
 - 'scroll_down'/'scroll_up': Scroll the page (with optional pixel amount)
-- 'scroll_to_text': Scroll to specific text on the page
-- 'send_keys': Send keyboard shortcuts or special keys
+- 'scroll_to_text': If you dont find something which you want to interact with, scroll to it
+- 'send_keys': Send strings of special keys like Escape,Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press.
 - 'get_dropdown_options': Get all options from a dropdown
-- 'select_dropdown_option': Select an option from a dropdown by text
+- 'select_dropdown_option': Select dropdown option for interactive element index by the text of the option you want to select
 
 Content Extraction:
-- 'get_current_state': Get detailed browser state including URL, title, tabs, and interactive elements
-- 'get_html': Get page HTML content
-- 'get_text': Get text content of the page (supports start_index and end_index parameters)
-- 'read_links': Get all links on the page
-- 'extract_content': Extract specific information from the page using AI
-- 'screenshot': Capture a screenshot
+- 'extract_content': Extract page content to retrieve specific information from the page, e.g. all company names, a specifc description, all information about, links with companies in structured format or simply links
 
 Tab Management:
 - 'switch_tab': Switch to a specific tab
@@ -47,12 +43,6 @@ Tab Management:
 
 Utility:
 - 'wait': Wait for a specified number of seconds
-- 'execute_js': Execute JavaScript code on the page
-
-Task Completion:
-- 'done': Complete the task and return results
-
-Each action requires specific parameters. Use get_current_state first to understand the current browser context.
 """
 
 Context = TypeVar("Context")
@@ -67,15 +57,9 @@ class BrowserUseTool(BaseTool, Generic[Context]):
             "action": {
                 "type": "string",
                 "enum": [
-                    "navigate",
+                    "go_to_url",
                     "click_element",
-                    "get_current_state",
                     "input_text",
-                    "screenshot",
-                    "get_html",
-                    "get_text",
-                    "read_links",
-                    "execute_js",
                     "scroll_down",
                     "scroll_up",
                     "scroll_to_text",
@@ -85,22 +69,24 @@ class BrowserUseTool(BaseTool, Generic[Context]):
                     "go_back",
                     "web_search",
                     "wait",
-                    "done",
+                    "extract_content",
+                    "switch_tab",
+                    "open_tab",
+                    "close_tab",
                 ],
                 "description": "The browser action to perform",
             },
             "url": {
                 "type": "string",
-                "description": "URL for 'navigate' or 'new_tab' actions",
+                "description": "URL for 'go_to_url' or 'open_tab' actions",
             },
             "index": {
                 "type": "integer",
-                "description": "Element index (retrieved using get_current_state) for 'click_element' or 'input_text' actions",
+                "description": "Element index for 'click_element', 'input_text', 'get_dropdown_options', or 'select_dropdown_option' actions",
             },
-            "text": {"type": "string", "description": "Text for 'input_text' action"},
-            "script": {
+            "text": {
                 "type": "string",
-                "description": "JavaScript code for 'execute_js' action",
+                "description": "Text for 'input_text', 'scroll_to_text', or 'select_dropdown_option' actions",
             },
             "scroll_amount": {
                 "type": "integer",
@@ -110,14 +96,6 @@ class BrowserUseTool(BaseTool, Generic[Context]):
                 "type": "integer",
                 "description": "Tab ID for 'switch_tab' action",
             },
-            "start_index": {
-                "type": "integer",
-                "description": "Starting character index for text observation (for 'scroll_to_text' and 'get_text' actions)",
-            },
-            "end_index": {
-                "type": "integer",
-                "description": "Ending character index for text observation (for 'scroll_to_text' and 'get_text' actions)",
-            },
             "query": {
                 "type": "string",
                 "description": "Search query for 'web_search' action",
@@ -126,10 +104,6 @@ class BrowserUseTool(BaseTool, Generic[Context]):
                 "type": "string",
                 "description": "Extraction goal for 'extract_content' action",
             },
-            "success": {
-                "type": "boolean",
-                "description": "Success status for 'done' action",
-            },
             "keys": {
                 "type": "string",
                 "description": "Keys to send for 'send_keys' action",
@@ -141,12 +115,11 @@ class BrowserUseTool(BaseTool, Generic[Context]):
         },
         "required": ["action"],
         "dependencies": {
-            "navigate": ["url"],
+            "go_to_url": ["url"],
             "click_element": ["index"],
             "input_text": ["index", "text"],
-            "execute_js": ["script"],
             "switch_tab": ["tab_id"],
-            "new_tab": ["url"],
+            "open_tab": ["url"],
             "scroll_down": ["scroll_amount"],
             "scroll_up": ["scroll_amount"],
             "scroll_to_text": ["text"],
@@ -156,7 +129,7 @@ class BrowserUseTool(BaseTool, Generic[Context]):
             "go_back": [],
             "web_search": ["query"],
             "wait": ["seconds"],
-            "done": ["text"],
+            "extract_content": ["goal"],
         },
     }
 
@@ -169,6 +142,8 @@ class BrowserUseTool(BaseTool, Generic[Context]):
     # Context for generic functionality
     tool_context: Optional[Context] = Field(default=None, exclude=True)
 
+    llm: Optional[LLM] = Field(default_factory=LLM)
+
     @field_validator("parameters", mode="before")
     def validate_parameters(cls, v: dict, info: ValidationInfo) -> dict:
         if not v:
@@ -230,14 +205,10 @@ class BrowserUseTool(BaseTool, Generic[Context]):
         url: Optional[str] = None,
         index: Optional[int] = None,
         text: Optional[str] = None,
-        script: Optional[str] = None,
         scroll_amount: Optional[int] = None,
         tab_id: Optional[int] = None,
-        start_index: Optional[int] = None,
-        end_index: Optional[int] = None,
         query: Optional[str] = None,
         goal: Optional[str] = None,
-        success: Optional[bool] = None,
         keys: Optional[str] = None,
         seconds: Optional[int] = None,
         **kwargs,
@@ -250,14 +221,10 @@ class BrowserUseTool(BaseTool, Generic[Context]):
             url: URL for navigation or new tab
             index: Element index for click or input actions
             text: Text for input action or search query
-            script: JavaScript code for execution
             scroll_amount: Pixels to scroll for scroll action
             tab_id: Tab ID for switch_tab action
-            start_index: Starting character index for text observation
-            end_index: Ending character index for text observation
             query: Search query for Google search
             goal: Extraction goal for content extraction
-            success: Success status for done action
             keys: Keys to send for keyboard actions
             seconds: Seconds to wait
             **kwargs: Additional arguments
@@ -275,10 +242,14 @@ class BrowserUseTool(BaseTool, Generic[Context]):
                 )
 
                 # Navigation actions
-                if action == "navigate":
+                if action == "go_to_url":
                     if not url:
-                        return ToolResult(error="URL is required for 'navigate' action")
-                    await context.navigate_to(url)
+                        return ToolResult(
+                            error="URL is required for 'go_to_url' action"
+                        )
+                    page = await context.get_current_page()
+                    await page.goto(url)
+                    await page.wait_for_load_state()
                     return ToolResult(output=f"Navigated to {url}")
 
                 elif action == "go_back":
@@ -299,10 +270,22 @@ class BrowserUseTool(BaseTool, Generic[Context]):
                     if search_results:
                         # Navigate to the first search result
                         first_result = search_results[0]
-                        await context.navigate_to(first_result)
+                        if isinstance(first_result, dict) and "url" in first_result:
+                            url_to_navigate = first_result["url"]
+                        elif isinstance(first_result, str):
+                            url_to_navigate = first_result
+                        else:
+                            return ToolResult(
+                                error=f"Invalid search result format: {first_result}"
+                            )
+
+                        page = await context.get_current_page()
+                        await page.goto(url_to_navigate)
+                        await page.wait_for_load_state()
+
                         return ToolResult(
-                            output=f"Searched for '{query}' and navigated to first result: {first_result}\nAll results:"
-                            + "\n".join(search_results)
+                            output=f"Searched for '{query}' and navigated to first result: {url_to_navigate}\nAll results:"
+                            + "\n".join([str(r) for r in search_results])
                         )
                     else:
                         return ToolResult(
@@ -414,55 +397,70 @@ class BrowserUseTool(BaseTool, Generic[Context]):
                     )
 
                 # Content extraction actions
-                elif action == "get_current_state":
-                    return await self.get_current_state(context)
-
-                elif action == "get_html":
-                    html = await context.get_page_html()
-                    truncated = (
-                        html[:max_content_length] + "..."
-                        if len(html) > max_content_length
-                        else html
-                    )
-                    return ToolResult(output=truncated)
-
-                elif action == "get_text":
-                    start = start_index if start_index is not None else 0
-                    end = end_index if end_index is not None else max_content_length
-                    text = await context.execute_javascript(
-                        f"document.body.innerText.substring({start}, {end})"
-                    )
-                    full_length = await context.execute_javascript(
-                        "document.body.innerText.length"
-                    )
-                    result = f"Text from index {start} to {end}:\n{text}"
-                    if end < full_length:
-                        result += f"\n\n[Text continues... {full_length - end} more characters available]"
-                    if start > 0:
-                        result += f"\n[{start} characters before this point]"
-                    return ToolResult(output=result)
-
-                elif action == "read_links":
-                    links = await context.execute_javascript(
-                        "Array.from(document.querySelectorAll('a[href]')).map(elem => elem.innerText && elem.href ? `${elem.innerText.trim()} - ${elem.href}` : null).filter(Boolean).join('\\n')"
-                    )
-                    return ToolResult(output=links)
-
                 elif action == "extract_content":
                     if not goal:
                         return ToolResult(
                             error="Goal is required for 'extract_content' action"
                         )
-                    await context.get_page_html()
-                    # Note: In a real implementation, this would use an LLM to extract content
-                    return ToolResult(output=f"Extracted content for goal: {goal}")
+                    page = await context.get_current_page()
+                    try:
+                        # Get page content and convert to markdown for better processing
+                        html_content = await page.content()
 
-                elif action == "screenshot":
-                    screenshot = await context.take_screenshot(full_page=True)
-                    return ToolResult(
-                        output=f"Screenshot captured (base64 length: {len(screenshot)})",
-                        system=screenshot,
-                    )
+                        # Import markdownify here to avoid global import
+                        try:
+                            import markdownify
+
+                            content = markdownify.markdownify(html_content)
+                        except ImportError:
+                            # Fallback if markdownify is not available
+                            content = html_content
+
+                        # Create prompt for LLM
+                        prompt_text = """
+Your task is to extract the content of the page. You will be given a page and a goal, and you should extract all relevant information around this goal from the page.
+
+Examples of extraction goals:
+- Extract all company names
+- Extract specific descriptions
+- Extract all information about a topic
+- Extract links with companies in structured format
+- Extract all links
+
+If the goal is vague, summarize the page. Respond in JSON format.
+
+Extraction goal: {goal}
+
+Page content:
+{page}
+"""
+                        # Format the prompt with the goal and content
+                        max_content_length = min(50000, len(content))
+                        formatted_prompt = prompt_text.format(
+                            goal=goal, page=content[:max_content_length]
+                        )
+
+                        # Create a proper message list for the LLM
+                        from app.schema import Message
+
+                        messages = [Message.user_message(formatted_prompt)]
+
+                        # Use LLM to extract content based on the goal
+                        response = await self.llm.ask(messages)
+
+                        msg = f"Extracted from page:\n{response}\n"
+                        return ToolResult(output=msg)
+                    except Exception as e:
+                        # Provide a more helpful error message
+                        error_msg = f"Failed to extract content: {str(e)}"
+                        try:
+                            # Try to return a portion of the page content as fallback
+                            return ToolResult(
+                                output=f"{error_msg}\nHere's a portion of the page content:\n{content[:2000]}..."
+                            )
+                        except:
+                            # If all else fails, just return the error
+                            return ToolResult(error=error_msg)
 
                 # Tab management actions
                 elif action == "switch_tab":
@@ -471,13 +469,15 @@ class BrowserUseTool(BaseTool, Generic[Context]):
                             error="Tab ID is required for 'switch_tab' action"
                         )
                     await context.switch_to_tab(tab_id)
+                    page = await context.get_current_page()
+                    await page.wait_for_load_state()
                     return ToolResult(output=f"Switched to tab {tab_id}")
 
                 elif action == "open_tab":
                     if not url:
                         return ToolResult(error="URL is required for 'open_tab' action")
                     await context.create_new_tab(url)
-                    return ToolResult(output=f"Opened new tab with URL {url}")
+                    return ToolResult(output=f"Opened new tab with {url}")
 
                 elif action == "close_tab":
                     await context.close_current_tab()
@@ -489,47 +489,61 @@ class BrowserUseTool(BaseTool, Generic[Context]):
                     await asyncio.sleep(seconds_to_wait)
                     return ToolResult(output=f"Waited for {seconds_to_wait} seconds")
 
-                elif action == "execute_js":
-                    if not script:
-                        return ToolResult(
-                            error="Script is required for 'execute_js' action"
-                        )
-                    result = await context.execute_javascript(script)
-                    return ToolResult(output=str(result))
-
-                # Task completion
-                elif action == "done":
-                    if not text:
-                        return ToolResult(error="Text is required for 'done' action")
-                    success_value = success if success is not None else True
-                    return ToolResult(output=text, is_done=True, success=success_value)
-
                 else:
                     return ToolResult(error=f"Unknown action: {action}")
 
             except Exception as e:
                 return ToolResult(error=f"Browser action '{action}' failed: {str(e)}")
 
-    async def get_current_state(self, context: BrowserContext) -> ToolResult:
-        """Get the current browser state as a ToolResult."""
+    async def get_current_state(
+        self, context: Optional[BrowserContext] = None
+    ) -> ToolResult:
+        """
+        Get the current browser state as a ToolResult.
+        If context is not provided, uses self.context.
+        """
         try:
-            state = await context.get_state()
+            # Use provided context or fall back to self.context
+            ctx = context or self.context
+            if not ctx:
+                return ToolResult(error="Browser context not initialized")
+
+            state = await ctx.get_state()
+
+            # Create a viewport_info dictionary if it doesn't exist
+            viewport_height = 0
+            if hasattr(state, "viewport_info") and state.viewport_info:
+                viewport_height = state.viewport_info.height
+            elif hasattr(ctx, "config") and hasattr(ctx.config, "browser_window_size"):
+                viewport_height = ctx.config.browser_window_size.get("height", 0)
+
+            # Take a screenshot for the state
+            screenshot = await ctx.take_screenshot(full_page=True)
+
+            # Build the state info with all required fields
             state_info = {
                 "url": state.url,
                 "title": state.title,
                 "tabs": [tab.model_dump() for tab in state.tabs],
                 "help": "[0], [1], [2], etc., represent clickable indices corresponding to the elements listed. Clicking on these indices will navigate to or interact with the respective content behind them.",
-                "interactive_elements": state.element_tree.clickable_elements_to_string(),
+                "interactive_elements": (
+                    state.element_tree.clickable_elements_to_string()
+                    if state.element_tree
+                    else ""
+                ),
                 "scroll_info": {
-                    "pixels_above": state.pixels_above,
-                    "pixels_below": state.pixels_below,
-                    "total_height": state.pixels_above
-                    + state.pixels_below
-                    + (state.viewport_info.height if state.viewport_info else 0),
+                    "pixels_above": getattr(state, "pixels_above", 0),
+                    "pixels_below": getattr(state, "pixels_below", 0),
+                    "total_height": getattr(state, "pixels_above", 0)
+                    + getattr(state, "pixels_below", 0)
+                    + viewport_height,
                 },
+                "viewport_height": viewport_height,
             }
+
             return ToolResult(
-                output=json.dumps(state_info, indent=4, ensure_ascii=False)
+                output=json.dumps(state_info, indent=4, ensure_ascii=False),
+                base64_image=screenshot,
             )
         except Exception as e:
             return ToolResult(error=f"Failed to get browser state: {str(e)}")