feat(browser_use_tool): add 'get_text' action to browser use tool

This commit is contained in:
Sheng Fan 2025-03-08 16:45:18 +08:00
parent e76055b436
commit 6b77a99448

View File

@ -11,7 +11,6 @@ from pydantic_core.core_schema import ValidationInfo
from app.tool.base import BaseTool, ToolResult from app.tool.base import BaseTool, ToolResult
_BROWSER_DESCRIPTION = """ _BROWSER_DESCRIPTION = """
Interact with a web browser to perform various actions such as navigation, element interaction, Interact with a web browser to perform various actions such as navigation, element interaction,
content extraction, and tab management. Supported actions include: content extraction, and tab management. Supported actions include:
@ -20,6 +19,7 @@ content extraction, and tab management. Supported actions include:
- 'input_text': Input text into an element - 'input_text': Input text into an element
- 'screenshot': Capture a screenshot - 'screenshot': Capture a screenshot
- 'get_html': Get page HTML content - 'get_html': Get page HTML content
- 'get_text': Get text content of the page
- 'execute_js': Execute JavaScript code - 'execute_js': Execute JavaScript code
- 'scroll': Scroll the page - 'scroll': Scroll the page
- 'switch_tab': Switch to a specific tab - 'switch_tab': Switch to a specific tab
@ -43,6 +43,7 @@ class BrowserUseTool(BaseTool):
"input_text", "input_text",
"screenshot", "screenshot",
"get_html", "get_html",
"get_text",
"execute_js", "execute_js",
"scroll", "scroll",
"switch_tab", "switch_tab",
@ -171,15 +172,21 @@ class BrowserUseTool(BaseTool):
elif action == "screenshot": elif action == "screenshot":
screenshot = await context.take_screenshot(full_page=True) screenshot = await context.take_screenshot(full_page=True)
return ToolResult( return ToolResult(
output=f"Screenshot captured (base64 length: {len(screenshot)})", output=
f"Screenshot captured (base64 length: {len(screenshot)})",
system=screenshot, system=screenshot,
) )
elif action == "get_html": elif action == "get_html":
html = await context.get_page_html() html = await context.get_page_html()
truncated = html[:2000] + "..." if len(html) > 2000 else html truncated = html[:2000] + "..." if len(
html) > 2000 else html
return ToolResult(output=truncated) return ToolResult(output=truncated)
elif action == "get_text":
text = await context.execute_javascript('document.body.innerText')
return ToolResult(output=text)
elif action == "execute_js": elif action == "execute_js":
if not script: if not script:
return ToolResult( return ToolResult(