From 6b77a9944822fd9a4b3b500bc04cb0afd2ba4957 Mon Sep 17 00:00:00 2001
From: Sheng Fan <fredtools999@gmail.com>
Date: Sat, 8 Mar 2025 16:45:18 +0800
Subject: [PATCH] feat(browser_use_tool): add 'get_text' action to browser use
 tool

---
 app/tool/browser_use_tool.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/app/tool/browser_use_tool.py b/app/tool/browser_use_tool.py
index 62f12a5..bd4afcc 100644
--- a/app/tool/browser_use_tool.py
+++ b/app/tool/browser_use_tool.py
@@ -11,7 +11,6 @@ from pydantic_core.core_schema import ValidationInfo
 
 from app.tool.base import BaseTool, ToolResult
 
-
 _BROWSER_DESCRIPTION = """
 Interact with a web browser to perform various actions such as navigation, element interaction,
 content extraction, and tab management. Supported actions include:
@@ -20,6 +19,7 @@ content extraction, and tab management. Supported actions include:
 - 'input_text': Input text into an element
 - 'screenshot': Capture a screenshot
 - 'get_html': Get page HTML content
+- 'get_text': Get text content of the page
 - 'execute_js': Execute JavaScript code
 - 'scroll': Scroll the page
 - 'switch_tab': Switch to a specific tab
@@ -43,6 +43,7 @@ class BrowserUseTool(BaseTool):
                     "input_text",
                     "screenshot",
                     "get_html",
+                    "get_text",
                     "execute_js",
                     "scroll",
                     "switch_tab",
@@ -171,15 +172,21 @@ class BrowserUseTool(BaseTool):
                 elif action == "screenshot":
                     screenshot = await context.take_screenshot(full_page=True)
                     return ToolResult(
-                        output=f"Screenshot captured (base64 length: {len(screenshot)})",
+                        output=
+                        f"Screenshot captured (base64 length: {len(screenshot)})",
                         system=screenshot,
                     )
 
                 elif action == "get_html":
                     html = await context.get_page_html()
-                    truncated = html[:2000] + "..." if len(html) > 2000 else html
+                    truncated = html[:2000] + "..." if len(
+                        html) > 2000 else html
                     return ToolResult(output=truncated)
 
+                elif action == "get_text":
+                    text = await context.execute_javascript('document.body.innerText')
+                    return ToolResult(output=text)
+
                 elif action == "execute_js":
                     if not script:
                         return ToolResult(