From 6b77a9944822fd9a4b3b500bc04cb0afd2ba4957 Mon Sep 17 00:00:00 2001 From: Sheng Fan Date: Sat, 8 Mar 2025 16:45:18 +0800 Subject: [PATCH 1/3] feat(browser_use_tool): add 'get_text' action to browser use tool --- app/tool/browser_use_tool.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/app/tool/browser_use_tool.py b/app/tool/browser_use_tool.py index 62f12a5..bd4afcc 100644 --- a/app/tool/browser_use_tool.py +++ b/app/tool/browser_use_tool.py @@ -11,7 +11,6 @@ from pydantic_core.core_schema import ValidationInfo from app.tool.base import BaseTool, ToolResult - _BROWSER_DESCRIPTION = """ Interact with a web browser to perform various actions such as navigation, element interaction, content extraction, and tab management. Supported actions include: @@ -20,6 +19,7 @@ content extraction, and tab management. Supported actions include: - 'input_text': Input text into an element - 'screenshot': Capture a screenshot - 'get_html': Get page HTML content +- 'get_text': Get text content of the page - 'execute_js': Execute JavaScript code - 'scroll': Scroll the page - 'switch_tab': Switch to a specific tab @@ -43,6 +43,7 @@ class BrowserUseTool(BaseTool): "input_text", "screenshot", "get_html", + "get_text", "execute_js", "scroll", "switch_tab", @@ -171,15 +172,21 @@ class BrowserUseTool(BaseTool): elif action == "screenshot": screenshot = await context.take_screenshot(full_page=True) return ToolResult( - output=f"Screenshot captured (base64 length: {len(screenshot)})", + output= + f"Screenshot captured (base64 length: {len(screenshot)})", system=screenshot, ) elif action == "get_html": html = await context.get_page_html() - truncated = html[:2000] + "..." if len(html) > 2000 else html + truncated = html[:2000] + "..." if len( + html) > 2000 else html return ToolResult(output=truncated) + elif action == "get_text": + text = await context.execute_javascript('document.body.innerText') + return ToolResult(output=text) + elif action == "execute_js": if not script: return ToolResult( From 7090490f75f9671fade64585ce3bb7b954e1c104 Mon Sep 17 00:00:00 2001 From: Sheng Fan Date: Sat, 8 Mar 2025 18:23:40 +0800 Subject: [PATCH 2/3] feat(browser_use_tool): add 'read_links' action --- app/tool/browser_use_tool.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/app/tool/browser_use_tool.py b/app/tool/browser_use_tool.py index bd4afcc..981ef1c 100644 --- a/app/tool/browser_use_tool.py +++ b/app/tool/browser_use_tool.py @@ -20,6 +20,7 @@ content extraction, and tab management. Supported actions include: - 'screenshot': Capture a screenshot - 'get_html': Get page HTML content - 'get_text': Get text content of the page +- 'read_links': Get all links on the page - 'execute_js': Execute JavaScript code - 'scroll': Scroll the page - 'switch_tab': Switch to a specific tab @@ -187,6 +188,10 @@ class BrowserUseTool(BaseTool): text = await context.execute_javascript('document.body.innerText') return ToolResult(output=text) + elif action == "read_links": + links = await context.execute_javascript("document.querySelectorAll('a[href]').forEach((elem) => {if (elem.innerText) {console.log(elem.innerText, elem.href)}})") + return ToolResult(output=links) + elif action == "execute_js": if not script: return ToolResult( From 0d0f8ab2330d2329bc2c0a8a8b002ebb9d8fcf1e Mon Sep 17 00:00:00 2001 From: Sheng Fan Date: Sun, 9 Mar 2025 11:30:41 +0800 Subject: [PATCH 3/3] chore(browser_use_tool): fix code style according to pre-commit --- app/tool/browser_use_tool.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/app/tool/browser_use_tool.py b/app/tool/browser_use_tool.py index 981ef1c..d2cf2a5 100644 --- a/app/tool/browser_use_tool.py +++ b/app/tool/browser_use_tool.py @@ -11,6 +11,7 @@ from pydantic_core.core_schema import ValidationInfo from app.tool.base import BaseTool, ToolResult + _BROWSER_DESCRIPTION = """ Interact with a web browser to perform various actions such as navigation, element interaction, content extraction, and tab management. Supported actions include: @@ -173,23 +174,23 @@ class BrowserUseTool(BaseTool): elif action == "screenshot": screenshot = await context.take_screenshot(full_page=True) return ToolResult( - output= - f"Screenshot captured (base64 length: {len(screenshot)})", + output=f"Screenshot captured (base64 length: {len(screenshot)})", system=screenshot, ) elif action == "get_html": html = await context.get_page_html() - truncated = html[:2000] + "..." if len( - html) > 2000 else html + truncated = html[:2000] + "..." if len(html) > 2000 else html return ToolResult(output=truncated) elif action == "get_text": - text = await context.execute_javascript('document.body.innerText') + text = await context.execute_javascript("document.body.innerText") return ToolResult(output=text) elif action == "read_links": - links = await context.execute_javascript("document.querySelectorAll('a[href]').forEach((elem) => {if (elem.innerText) {console.log(elem.innerText, elem.href)}})") + links = await context.execute_javascript( + "document.querySelectorAll('a[href]').forEach((elem) => {if (elem.innerText) {console.log(elem.innerText, elem.href)}})" + ) return ToolResult(output=links) elif action == "execute_js":