Merge remote-tracking branch 'origin/main'
This commit is contained in:
commit
9e966e4b86
@ -62,7 +62,7 @@ api_key = "sk-..." # Replace with your actual API key
|
|||||||
```
|
```
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
Run OpenManus with a single command:
|
Run OpenManus with a single command:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python main.py
|
python main.py
|
||||||
@ -70,7 +70,7 @@ python main.py
|
|||||||
|
|
||||||
Then input your idea via terminal!
|
Then input your idea via terminal!
|
||||||
|
|
||||||
## How to contribute
|
## How to contribute
|
||||||
We welcome any friendly suggestions and helpful contributions! Just create issues or submit pull requests.
|
We welcome any friendly suggestions and helpful contributions! Just create issues or submit pull requests.
|
||||||
|
|
||||||
Or contact @mannaandpoem via 📧email: mannaandpoem@gmail.com
|
Or contact @mannaandpoem via 📧email: mannaandpoem@gmail.com
|
||||||
@ -84,6 +84,6 @@ Or contact @mannaandpoem via 📧email: mannaandpoem@gmail.com
|
|||||||
|
|
||||||
## Acknowledgement
|
## Acknowledgement
|
||||||
|
|
||||||
Thanks to [browser-use](https://github.com/browser-use/browser-use) for providing basic support for this project!
|
Thanks to [anthropic-computer-use](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) and [browser-use](https://github.com/browser-use/browser-use) for providing basic support for this project!
|
||||||
|
|
||||||
OpenManus is built by contributors from MetaGPT. Huge thanks to this agent community!
|
OpenManus is built by contributors from MetaGPT. Huge thanks to this agent community!
|
||||||
|
@ -1,15 +1,12 @@
|
|||||||
from pydantic import Field, model_validator
|
from pydantic import Field
|
||||||
|
|
||||||
from app.agent.planning import PlanningAgent
|
|
||||||
from app.agent.toolcall_en import ToolCallAgent
|
from app.agent.toolcall_en import ToolCallAgent
|
||||||
from app.tool import ToolCollection, Bash, Terminate
|
from app.prompt.manus import NEXT_STEP_PROMPT, SYSTEM_PROMPT
|
||||||
from app.tool.planning import PlanningTool
|
from app.tool import Terminate, ToolCollection
|
||||||
from app.tool.browser_use_tool import BrowserUseTool
|
from app.tool.browser_use_tool import BrowserUseTool
|
||||||
|
from app.tool.file_saver import FileSaver
|
||||||
from app.tool.google_search import GoogleSearch
|
from app.tool.google_search import GoogleSearch
|
||||||
from app.tool.python_execute import PythonExecute
|
from app.tool.python_execute import PythonExecute
|
||||||
from app.tool.file_saver import FileSaver
|
|
||||||
|
|
||||||
from app.prompt.manus import SYSTEM_PROMPT, NEXT_STEP_PROMPT
|
|
||||||
|
|
||||||
|
|
||||||
class Manus(ToolCallAgent):
|
class Manus(ToolCallAgent):
|
||||||
@ -22,7 +19,9 @@ class Manus(ToolCallAgent):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
name: str = "Manus"
|
name: str = "Manus"
|
||||||
description: str = "A versatile agent that can solve various tasks using multiple tools"
|
description: str = (
|
||||||
|
"A versatile agent that can solve various tasks using multiple tools"
|
||||||
|
)
|
||||||
|
|
||||||
system_prompt: str = SYSTEM_PROMPT
|
system_prompt: str = SYSTEM_PROMPT
|
||||||
next_step_prompt: str = NEXT_STEP_PROMPT
|
next_step_prompt: str = NEXT_STEP_PROMPT
|
||||||
@ -33,4 +32,3 @@ class Manus(ToolCallAgent):
|
|||||||
PythonExecute(), GoogleSearch(), BrowserUseTool(), FileSaver(), Terminate()
|
PythonExecute(), GoogleSearch(), BrowserUseTool(), FileSaver(), Terminate()
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -5,37 +5,11 @@ from pydantic import Field, model_validator
|
|||||||
|
|
||||||
from app.agent.toolcall import ToolCallAgent
|
from app.agent.toolcall import ToolCallAgent
|
||||||
from app.logger import logger
|
from app.logger import logger
|
||||||
|
from app.prompt.planning import NEXT_STEP_PROMPT, PLANNING_SYSTEM_PROMPT
|
||||||
from app.schema import Message, ToolCall
|
from app.schema import Message, ToolCall
|
||||||
from app.tool import PlanningTool, Terminate, ToolCollection
|
from app.tool import PlanningTool, Terminate, ToolCollection
|
||||||
|
|
||||||
|
|
||||||
PLANNING_SYSTEM_PROMPT = """
|
|
||||||
You are an expert Planning Agent tasked with solving complex problems by creating and managing structured plans.
|
|
||||||
Your job is:
|
|
||||||
1. Analyze requests to understand the task scope
|
|
||||||
2. Create clear, actionable plans with the `planning` tool
|
|
||||||
3. Execute steps using available tools as needed
|
|
||||||
4. Track progress and adapt plans dynamically
|
|
||||||
5. Use `finish` to conclude when the task is complete
|
|
||||||
|
|
||||||
Available tools will vary by task but may include:
|
|
||||||
- `planning`: Create, update, and track plans (commands: create, update, mark_step, etc.)
|
|
||||||
- `finish`: End the task when complete
|
|
||||||
|
|
||||||
Break tasks into logical, sequential steps. Think about dependencies and verification methods.
|
|
||||||
"""
|
|
||||||
|
|
||||||
NEXT_STEP_PROMPT = """
|
|
||||||
Based on the current state, what's your next step?
|
|
||||||
Consider:
|
|
||||||
1. Do you need to create or refine a plan?
|
|
||||||
2. Are you ready to execute a specific step?
|
|
||||||
3. Have you completed the task?
|
|
||||||
|
|
||||||
Provide reasoning, then select the appropriate tool or action.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
class PlanningAgent(ToolCallAgent):
|
class PlanningAgent(ToolCallAgent):
|
||||||
"""
|
"""
|
||||||
An agent that creates and manages plans to solve tasks.
|
An agent that creates and manages plans to solve tasks.
|
||||||
|
@ -9,6 +9,7 @@ from app.prompt.toolcall import NEXT_STEP_PROMPT, SYSTEM_PROMPT
|
|||||||
from app.schema import AgentState, Message, ToolCall
|
from app.schema import AgentState, Message, ToolCall
|
||||||
from app.tool import CreateChatCompletion, Terminate, ToolCollection
|
from app.tool import CreateChatCompletion, Terminate, ToolCollection
|
||||||
|
|
||||||
|
|
||||||
TOOL_CALL_REQUIRED = "Tool calls required but none provided"
|
TOOL_CALL_REQUIRED = "Tool calls required but none provided"
|
||||||
|
|
||||||
|
|
||||||
@ -40,7 +41,9 @@ class ToolCallAgent(ReActAgent):
|
|||||||
# Get response with tool options
|
# Get response with tool options
|
||||||
response = await self.llm.ask_tool(
|
response = await self.llm.ask_tool(
|
||||||
messages=self.messages,
|
messages=self.messages,
|
||||||
system_msgs=[Message.system_message(self.system_prompt)] if self.system_prompt else None,
|
system_msgs=[Message.system_message(self.system_prompt)]
|
||||||
|
if self.system_prompt
|
||||||
|
else None,
|
||||||
tools=self.available_tools.to_params(),
|
tools=self.available_tools.to_params(),
|
||||||
tool_choice=self.tool_choices,
|
tool_choice=self.tool_choices,
|
||||||
)
|
)
|
||||||
@ -48,15 +51,21 @@ class ToolCallAgent(ReActAgent):
|
|||||||
|
|
||||||
# Log response info in a more engaging way
|
# Log response info in a more engaging way
|
||||||
logger.info(f"✨ AI's thoughts: {response.content}")
|
logger.info(f"✨ AI's thoughts: {response.content}")
|
||||||
logger.info(f"🛠️ AI selected {len(response.tool_calls) if response.tool_calls else 0} tools to use")
|
logger.info(
|
||||||
|
f"🛠️ AI selected {len(response.tool_calls) if response.tool_calls else 0} tools to use"
|
||||||
|
)
|
||||||
if response.tool_calls:
|
if response.tool_calls:
|
||||||
logger.info(f"🧰 Tools being prepared: {[call.function.name for call in response.tool_calls]}")
|
logger.info(
|
||||||
|
f"🧰 Tools being prepared: {[call.function.name for call in response.tool_calls]}"
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Handle different tool_choices modes
|
# Handle different tool_choices modes
|
||||||
if self.tool_choices == "none":
|
if self.tool_choices == "none":
|
||||||
if response.tool_calls:
|
if response.tool_calls:
|
||||||
logger.warning("🤔 Hmm, AI tried to use tools when they weren't available!")
|
logger.warning(
|
||||||
|
"🤔 Hmm, AI tried to use tools when they weren't available!"
|
||||||
|
)
|
||||||
if response.content:
|
if response.content:
|
||||||
self.memory.add_message(Message.assistant_message(response.content))
|
self.memory.add_message(Message.assistant_message(response.content))
|
||||||
return True
|
return True
|
||||||
@ -82,9 +91,11 @@ class ToolCallAgent(ReActAgent):
|
|||||||
return bool(self.tool_calls)
|
return bool(self.tool_calls)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"🚨 Oops! The AI's thinking process hit a snag: {e}")
|
logger.error(f"🚨 Oops! The AI's thinking process hit a snag: {e}")
|
||||||
self.memory.add_message(Message.assistant_message(
|
self.memory.add_message(
|
||||||
f"Error encountered while processing: {str(e)}"
|
Message.assistant_message(
|
||||||
))
|
f"Error encountered while processing: {str(e)}"
|
||||||
|
)
|
||||||
|
)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
async def act(self) -> str:
|
async def act(self) -> str:
|
||||||
@ -94,9 +105,7 @@ class ToolCallAgent(ReActAgent):
|
|||||||
raise ValueError(TOOL_CALL_REQUIRED)
|
raise ValueError(TOOL_CALL_REQUIRED)
|
||||||
|
|
||||||
# Return last message content if no tool calls
|
# Return last message content if no tool calls
|
||||||
return (
|
return self.messages[-1].content or "No content or commands to execute"
|
||||||
self.messages[-1].content or "No content or commands to execute"
|
|
||||||
)
|
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for command in self.tool_calls:
|
for command in self.tool_calls:
|
||||||
@ -144,7 +153,9 @@ class ToolCallAgent(ReActAgent):
|
|||||||
return observation
|
return observation
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
error_msg = f"Error parsing arguments for {name}: Invalid JSON format"
|
error_msg = f"Error parsing arguments for {name}: Invalid JSON format"
|
||||||
logger.error(f"📝 Oops! The arguments for '{name}' don't make sense - invalid JSON")
|
logger.error(
|
||||||
|
f"📝 Oops! The arguments for '{name}' don't make sense - invalid JSON"
|
||||||
|
)
|
||||||
return f"Error: {error_msg}"
|
return f"Error: {error_msg}"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"Error executing tool {name}: {str(e)}"
|
error_msg = f"Error executing tool {name}: {str(e)}"
|
||||||
|
@ -9,6 +9,7 @@ from app.prompt.toolcall import NEXT_STEP_PROMPT, SYSTEM_PROMPT
|
|||||||
from app.schema import AgentState, Message, ToolCall
|
from app.schema import AgentState, Message, ToolCall
|
||||||
from app.tool import CreateChatCompletion, Terminate, ToolCollection
|
from app.tool import CreateChatCompletion, Terminate, ToolCollection
|
||||||
|
|
||||||
|
|
||||||
TOOL_CALL_REQUIRED = "Tool calls required but none provided"
|
TOOL_CALL_REQUIRED = "Tool calls required but none provided"
|
||||||
|
|
||||||
|
|
||||||
@ -40,7 +41,9 @@ class ToolCallAgent(ReActAgent):
|
|||||||
# Get response with tool options
|
# Get response with tool options
|
||||||
response = await self.llm.ask_tool(
|
response = await self.llm.ask_tool(
|
||||||
messages=self.messages,
|
messages=self.messages,
|
||||||
system_msgs=[Message.system_message(self.system_prompt)] if self.system_prompt else None,
|
system_msgs=[Message.system_message(self.system_prompt)]
|
||||||
|
if self.system_prompt
|
||||||
|
else None,
|
||||||
tools=self.available_tools.to_params(),
|
tools=self.available_tools.to_params(),
|
||||||
tool_choice=self.tool_choices,
|
tool_choice=self.tool_choices,
|
||||||
)
|
)
|
||||||
@ -48,9 +51,13 @@ class ToolCallAgent(ReActAgent):
|
|||||||
|
|
||||||
# Log response info in a more engaging way
|
# Log response info in a more engaging way
|
||||||
logger.info(f"✨ AI的思考过程:{response.content}")
|
logger.info(f"✨ AI的思考过程:{response.content}")
|
||||||
logger.info(f"🛠️ AI选择了 {len(response.tool_calls) if response.tool_calls else 0} 个工具来解决问题")
|
logger.info(
|
||||||
|
f"🛠️ AI选择了 {len(response.tool_calls) if response.tool_calls else 0} 个工具来解决问题"
|
||||||
|
)
|
||||||
if response.tool_calls:
|
if response.tool_calls:
|
||||||
logger.info(f"🧰 准备使用的工具箱:{[call.function.name for call in response.tool_calls]}")
|
logger.info(
|
||||||
|
f"🧰 准备使用的工具箱:{[call.function.name for call in response.tool_calls]}"
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Handle different tool_choices modes
|
# Handle different tool_choices modes
|
||||||
@ -82,9 +89,11 @@ class ToolCallAgent(ReActAgent):
|
|||||||
return bool(self.tool_calls)
|
return bool(self.tool_calls)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"🚨 糟糕!AI思考时遇到了一点小问题:{e}")
|
logger.error(f"🚨 糟糕!AI思考时遇到了一点小问题:{e}")
|
||||||
self.memory.add_message(Message.assistant_message(
|
self.memory.add_message(
|
||||||
f"Error encountered while processing: {str(e)}"
|
Message.assistant_message(
|
||||||
))
|
f"Error encountered while processing: {str(e)}"
|
||||||
|
)
|
||||||
|
)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
async def act(self) -> str:
|
async def act(self) -> str:
|
||||||
@ -94,9 +103,7 @@ class ToolCallAgent(ReActAgent):
|
|||||||
raise ValueError(TOOL_CALL_REQUIRED)
|
raise ValueError(TOOL_CALL_REQUIRED)
|
||||||
|
|
||||||
# Return last message content if no tool calls
|
# Return last message content if no tool calls
|
||||||
return (
|
return self.messages[-1].content or "No content or commands to execute"
|
||||||
self.messages[-1].content or "No content or commands to execute"
|
|
||||||
)
|
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for command in self.tool_calls:
|
for command in self.tool_calls:
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import threading
|
import threading
|
||||||
import tomllib
|
import tomllib
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Optional
|
from typing import Dict
|
||||||
|
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
@ -23,14 +23,8 @@ class LLMSettings(BaseModel):
|
|||||||
temperature: float = Field(1.0, description="Sampling temperature")
|
temperature: float = Field(1.0, description="Sampling temperature")
|
||||||
|
|
||||||
|
|
||||||
class ScreenshotSettings(BaseModel):
|
|
||||||
api_key: Optional[str] = Field(None, description="Screenshot API key")
|
|
||||||
base_url: Optional[str] = Field(None, description="Screenshot service URL")
|
|
||||||
|
|
||||||
|
|
||||||
class AppConfig(BaseModel):
|
class AppConfig(BaseModel):
|
||||||
llm: Dict[str, LLMSettings]
|
llm: Dict[str, LLMSettings]
|
||||||
screenshot: Optional[ScreenshotSettings] = None
|
|
||||||
|
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
@ -94,16 +88,8 @@ class Config:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# Add screenshot config if present
|
|
||||||
if screenshot_config := raw_config.get("screenshot"):
|
|
||||||
config_dict["screenshot"] = screenshot_config
|
|
||||||
|
|
||||||
self._config = AppConfig(**config_dict)
|
self._config = AppConfig(**config_dict)
|
||||||
|
|
||||||
@property
|
|
||||||
def screenshot(self) -> Optional[ScreenshotSettings]:
|
|
||||||
return self._config.screenshot
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def llm(self) -> Dict[str, LLMSettings]:
|
def llm(self) -> Dict[str, LLMSettings]:
|
||||||
return self._config.llm
|
return self._config.llm
|
||||||
|
@ -3,10 +3,3 @@ class ToolError(Exception):
|
|||||||
|
|
||||||
def __init__(self, message):
|
def __init__(self, message):
|
||||||
self.message = message
|
self.message = message
|
||||||
|
|
||||||
|
|
||||||
class BrowserException(Exception):
|
|
||||||
"""Base exception for browser-related errors."""
|
|
||||||
|
|
||||||
def __init__(self, message):
|
|
||||||
super().__init__(message)
|
|
||||||
|
@ -22,7 +22,7 @@ def define_log_level(print_level="INFO", logfile_level="DEBUG", name: str = None
|
|||||||
|
|
||||||
_logger.remove()
|
_logger.remove()
|
||||||
_logger.add(sys.stderr, level=print_level)
|
_logger.add(sys.stderr, level=print_level)
|
||||||
_logger.add(PROJECT_ROOT / f"logs/{log_name}.txt", level=logfile_level)
|
_logger.add(PROJECT_ROOT / f"logs/{log_name}.log", level=logfile_level)
|
||||||
return _logger
|
return _logger
|
||||||
|
|
||||||
|
|
||||||
|
21
app/loop.py
21
app/loop.py
@ -1,21 +0,0 @@
|
|||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
from app.agent.base import BaseAgent
|
|
||||||
from app.flow.base import FlowType
|
|
||||||
from app.flow.flow_factory import FlowFactory
|
|
||||||
from app.tool import BaseTool, ToolCollection
|
|
||||||
|
|
||||||
|
|
||||||
async def loop(
|
|
||||||
agent: BaseAgent,
|
|
||||||
tools: Optional[List[BaseTool]] = None,
|
|
||||||
flow_type: FlowType = FlowType.PLANNING,
|
|
||||||
input_text: str = "",
|
|
||||||
**loop_kwargs,
|
|
||||||
) -> str:
|
|
||||||
"""Main entry point for running an agent with specified flow type"""
|
|
||||||
tool_collection = ToolCollection(*tools) if tools else None
|
|
||||||
flow = FlowFactory.create_flow(
|
|
||||||
flow_type, agent, tool_collection=tool_collection, **loop_kwargs
|
|
||||||
)
|
|
||||||
return await flow.execute(input_text)
|
|
@ -11,4 +11,4 @@ BrowserUseTool: Open and control web browsers
|
|||||||
GoogleSearch: Perform web information retrieval
|
GoogleSearch: Perform web information retrieval
|
||||||
|
|
||||||
Based on user needs, proactively select the most appropriate tool or combination of tools. For complex tasks, you can break down the problem and use different tools step by step to solve it. After using each tool, clearly explain the execution results and suggest the next steps.
|
Based on user needs, proactively select the most appropriate tool or combination of tools. For complex tasks, you can break down the problem and use different tools step by step to solve it. After using each tool, clearly explain the execution results and suggest the next steps.
|
||||||
"""
|
"""
|
||||||
|
25
app/prompt/planning.py
Normal file
25
app/prompt/planning.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
PLANNING_SYSTEM_PROMPT = """
|
||||||
|
You are an expert Planning Agent tasked with solving complex problems by creating and managing structured plans.
|
||||||
|
Your job is:
|
||||||
|
1. Analyze requests to understand the task scope
|
||||||
|
2. Create clear, actionable plans with the `planning` tool
|
||||||
|
3. Execute steps using available tools as needed
|
||||||
|
4. Track progress and adapt plans dynamically
|
||||||
|
5. Use `finish` to conclude when the task is complete
|
||||||
|
|
||||||
|
Available tools will vary by task but may include:
|
||||||
|
- `planning`: Create, update, and track plans (commands: create, update, mark_step, etc.)
|
||||||
|
- `finish`: End the task when complete
|
||||||
|
|
||||||
|
Break tasks into logical, sequential steps. Think about dependencies and verification methods.
|
||||||
|
"""
|
||||||
|
|
||||||
|
NEXT_STEP_PROMPT = """
|
||||||
|
Based on the current state, what's your next step?
|
||||||
|
Consider:
|
||||||
|
1. Do you need to create or refine a plan?
|
||||||
|
2. Are you ready to execute a specific step?
|
||||||
|
3. Have you completed the task?
|
||||||
|
|
||||||
|
Provide reasoning, then select the appropriate tool or action.
|
||||||
|
"""
|
@ -26,47 +26,3 @@ NEXT_STEP_TEMPLATE = """{{observation}}
|
|||||||
(Current directory: {{working_dir}})
|
(Current directory: {{working_dir}})
|
||||||
bash-$
|
bash-$
|
||||||
"""
|
"""
|
||||||
|
|
||||||
NEXT_STEP_NO_OUTPUT_TEMPLATE = """Your command ran successfully and did not produce any output.
|
|
||||||
(Open file: {{open_file}})
|
|
||||||
(Current directory: {{working_dir}})
|
|
||||||
bash-$
|
|
||||||
"""
|
|
||||||
|
|
||||||
INSTANCE_TEMPLATE = """We're currently solving the following issue within our repository. Here's the issue text:
|
|
||||||
ISSUE:
|
|
||||||
{{problem_statement}}
|
|
||||||
|
|
||||||
INSTRUCTIONS:
|
|
||||||
Now, you're going to solve this issue on your own. Your terminal session has started and you're in the repository's root directory. You can use any bash commands or the special interface to help you. Edit all the files you need to and run any checks or tests that you want.
|
|
||||||
Remember, YOU SHOULD ALWAYS INCLUDE EXACTLY ONE TOOL CALL/FUNCTION CALL PER RESPONSE.
|
|
||||||
When you're satisfied with all of the changes you've made, you can submit your changes to the code base by simply running the submit command.
|
|
||||||
Note however that you cannot use any interactive session commands (e.g. python, vim) in this environment, but you can write scripts and run them. E.g. you can write a python script and then run it with the python <script_name>.py`.
|
|
||||||
|
|
||||||
NOTE ABOUT THE EDIT COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line!
|
|
||||||
|
|
||||||
IMPORTANT TIPS:
|
|
||||||
1. Always start by trying to replicate the bug that the issues discusses.
|
|
||||||
If the issue includes code for reproducing the bug, we recommend that you re-implement that in your environment, and run it to make sure you can reproduce the bug.
|
|
||||||
Then start trying to fix it.
|
|
||||||
When you think you've fixed the bug, re-run the bug reproduction script to make sure that the bug has indeed been fixed.
|
|
||||||
|
|
||||||
If the bug reproduction script does not print anything when it successfully runs, we recommend adding a print("Script completed successfully, no errors.") command at the end of the file,
|
|
||||||
so that you can be sure that the script indeed ran fine all the way through.
|
|
||||||
|
|
||||||
2. If you run a command and it doesn't work, try running a different command. A command that did not work once will not work the second time unless you modify it!
|
|
||||||
|
|
||||||
3. If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, don't just use the scroll_down command multiple times. Instead, use the goto 583 command. It's much quicker.
|
|
||||||
|
|
||||||
4. If the bug reproduction script requires inputting/reading a specific file, such as buggy-input.png, and you'd like to understand how to input that file, conduct a search in the existing repo code, to see whether someone else has already done that. Do this by running the command: find_file "buggy-input.png" If that doesn't work, use the linux 'find' command.
|
|
||||||
|
|
||||||
5. Always make sure to look at the currently open file and the current working directory (which appears right after the currently open file). The currently open file might be in a different directory than the working directory! Note that some commands, such as 'create', open files, so they might change the current open file.
|
|
||||||
|
|
||||||
6. When editing files, it is easy to accidentally specify a wrong line number or to write code with incorrect indentation. Always check the code after you issue an edit to make sure that it reflects what you wanted to accomplish. If it didn't, issue another command to fix it.
|
|
||||||
|
|
||||||
7. Do not try to install any packages with `pip`, `conda`, or any other way. This will usually not work. If the environment is not set up correctly, try to fix the issue without executing python code or running any tests that require the package installed.
|
|
||||||
|
|
||||||
|
|
||||||
(Open file: {{open_file}})
|
|
||||||
(Current directory: {{working_dir}})
|
|
||||||
bash-$"""
|
|
||||||
|
@ -1,259 +0,0 @@
|
|||||||
import atexit
|
|
||||||
import base64
|
|
||||||
import io
|
|
||||||
import json
|
|
||||||
import multiprocessing
|
|
||||||
import platform
|
|
||||||
import time
|
|
||||||
import uuid
|
|
||||||
|
|
||||||
import browsergym.core # noqa F401 (we register the openended task as a gym environment)
|
|
||||||
import gymnasium as gym
|
|
||||||
import html2text
|
|
||||||
import numpy as np
|
|
||||||
import tenacity
|
|
||||||
from browsergym.utils.obs import flatten_dom_to_str
|
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
from app.exceptions import BrowserException
|
|
||||||
from app.logger import logger
|
|
||||||
from app.utils.shutdown_listener import should_continue, should_exit
|
|
||||||
|
|
||||||
|
|
||||||
BROWSER_EVAL_GET_GOAL_ACTION = "GET_EVAL_GOAL"
|
|
||||||
BROWSER_EVAL_GET_REWARDS_ACTION = "GET_EVAL_REWARDS"
|
|
||||||
|
|
||||||
|
|
||||||
class BrowserEnv:
|
|
||||||
def __init__(self, browsergym_eval_env: str | None = None, headless: bool = False):
|
|
||||||
"""
|
|
||||||
Initialize the browser environment.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
browsergym_eval_env: Optional evaluation environment name
|
|
||||||
headless: Whether to run the browser in headless mode (no UI)
|
|
||||||
"""
|
|
||||||
self.html_text_converter = self.get_html_text_converter()
|
|
||||||
self.eval_mode = False
|
|
||||||
self.eval_dir = ""
|
|
||||||
self.browsergym_eval_env = browsergym_eval_env
|
|
||||||
self.eval_mode = bool(browsergym_eval_env)
|
|
||||||
self.headless = headless
|
|
||||||
|
|
||||||
# Set multiprocessing start method
|
|
||||||
if platform.system() == "Windows":
|
|
||||||
multiprocessing.set_start_method("spawn", force=True)
|
|
||||||
else:
|
|
||||||
multiprocessing.set_start_method("fork", force=True)
|
|
||||||
|
|
||||||
self.browser_side, self.agent_side = multiprocessing.Pipe()
|
|
||||||
self.process = None # Initialize process as None
|
|
||||||
self.init_browser()
|
|
||||||
atexit.register(self.close)
|
|
||||||
|
|
||||||
def get_html_text_converter(self):
|
|
||||||
html_text_converter = html2text.HTML2Text()
|
|
||||||
# ignore links and images
|
|
||||||
html_text_converter.ignore_links = False
|
|
||||||
html_text_converter.ignore_images = True
|
|
||||||
# use alt text for images
|
|
||||||
html_text_converter.images_to_alt = True
|
|
||||||
# disable auto text wrapping
|
|
||||||
html_text_converter.body_width = 0
|
|
||||||
return html_text_converter
|
|
||||||
|
|
||||||
@tenacity.retry(
|
|
||||||
wait=tenacity.wait_fixed(1),
|
|
||||||
stop=tenacity.stop_after_attempt(5),
|
|
||||||
retry=tenacity.retry_if_exception_type(BrowserException),
|
|
||||||
)
|
|
||||||
def init_browser(self):
|
|
||||||
logger.debug(f"Starting browser env (headless: {self.headless})...")
|
|
||||||
try:
|
|
||||||
self.process = multiprocessing.Process(
|
|
||||||
target=self.browser_process, args=(self.headless,)
|
|
||||||
)
|
|
||||||
self.process.start()
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to start browser process: {e}")
|
|
||||||
if self.process is not None:
|
|
||||||
self.process.terminate()
|
|
||||||
raise BrowserException("Failed to start browser environment.")
|
|
||||||
|
|
||||||
if not self.check_alive():
|
|
||||||
self.close()
|
|
||||||
raise BrowserException("Failed to start browser environment.")
|
|
||||||
|
|
||||||
def browser_process(self, headless: bool):
|
|
||||||
if self.eval_mode:
|
|
||||||
assert self.browsergym_eval_env is not None
|
|
||||||
logger.debug("Initializing browser env for web browsing evaluation.")
|
|
||||||
if "webarena" in self.browsergym_eval_env:
|
|
||||||
import browsergym.webarena # noqa F401 register webarena tasks as gym environments
|
|
||||||
elif "miniwob" in self.browsergym_eval_env:
|
|
||||||
import browsergym.miniwob # noqa F401 register miniwob tasks as gym environments
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
f"Unsupported browsergym eval env: {self.browsergym_eval_env}"
|
|
||||||
)
|
|
||||||
env = gym.make(
|
|
||||||
self.browsergym_eval_env,
|
|
||||||
tags_to_mark="all",
|
|
||||||
headless=headless,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
env = gym.make(
|
|
||||||
"browsergym/openended",
|
|
||||||
task_kwargs={"start_url": "about:blank", "goal": "PLACEHOLDER_GOAL"},
|
|
||||||
wait_for_user_message=False,
|
|
||||||
headless=headless,
|
|
||||||
disable_env_checker=True,
|
|
||||||
tags_to_mark="all",
|
|
||||||
)
|
|
||||||
|
|
||||||
obs, info = env.reset()
|
|
||||||
|
|
||||||
# EVAL ONLY: save the goal into file for evaluation
|
|
||||||
self.eval_goal = None
|
|
||||||
self.eval_rewards: list[float] = [0]
|
|
||||||
if self.eval_mode:
|
|
||||||
logger.debug(f"Browsing goal: {obs['goal']}")
|
|
||||||
self.eval_goal = obs["goal"]
|
|
||||||
|
|
||||||
logger.debug(
|
|
||||||
f"Browser env started in {'headless' if headless else 'visible'} mode."
|
|
||||||
)
|
|
||||||
while should_continue():
|
|
||||||
try:
|
|
||||||
if self.browser_side.poll(timeout=0.01):
|
|
||||||
unique_request_id, action_data = self.browser_side.recv()
|
|
||||||
|
|
||||||
# shutdown the browser environment
|
|
||||||
if unique_request_id == "SHUTDOWN":
|
|
||||||
logger.debug("SHUTDOWN recv, shutting down browser env...")
|
|
||||||
env.close()
|
|
||||||
return
|
|
||||||
elif unique_request_id == "IS_ALIVE":
|
|
||||||
self.browser_side.send(("ALIVE", None))
|
|
||||||
continue
|
|
||||||
|
|
||||||
# EVAL ONLY: Get evaluation info
|
|
||||||
if action_data["action"] == BROWSER_EVAL_GET_GOAL_ACTION:
|
|
||||||
self.browser_side.send(
|
|
||||||
(unique_request_id, {"text_content": self.eval_goal})
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
elif action_data["action"] == BROWSER_EVAL_GET_REWARDS_ACTION:
|
|
||||||
self.browser_side.send(
|
|
||||||
(
|
|
||||||
unique_request_id,
|
|
||||||
{"text_content": json.dumps(self.eval_rewards)},
|
|
||||||
)
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
action = action_data["action"]
|
|
||||||
obs, reward, terminated, truncated, info = env.step(action)
|
|
||||||
|
|
||||||
# EVAL ONLY: Save the rewards into file for evaluation
|
|
||||||
if self.eval_mode:
|
|
||||||
self.eval_rewards.append(reward)
|
|
||||||
|
|
||||||
# add text content of the page
|
|
||||||
html_str = flatten_dom_to_str(obs["dom_object"])
|
|
||||||
obs["text_content"] = self.html_text_converter.handle(html_str)
|
|
||||||
# make observation serializable
|
|
||||||
obs["screenshot"] = self.image_to_png_base64_url(obs["screenshot"])
|
|
||||||
obs["active_page_index"] = obs["active_page_index"].item()
|
|
||||||
obs["elapsed_time"] = obs["elapsed_time"].item()
|
|
||||||
self.browser_side.send((unique_request_id, obs))
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
logger.debug("Browser env process interrupted by user.")
|
|
||||||
try:
|
|
||||||
env.close()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return
|
|
||||||
|
|
||||||
def step(self, action_str: str, timeout: float = 30) -> dict:
|
|
||||||
"""Execute an action in the browser environment and return the observation."""
|
|
||||||
unique_request_id = str(uuid.uuid4())
|
|
||||||
self.agent_side.send((unique_request_id, {"action": action_str}))
|
|
||||||
start_time = time.time()
|
|
||||||
while True:
|
|
||||||
if should_exit() or time.time() - start_time > timeout:
|
|
||||||
raise TimeoutError("Browser environment took too long to respond.")
|
|
||||||
if self.agent_side.poll(timeout=0.01):
|
|
||||||
response_id, obs = self.agent_side.recv()
|
|
||||||
if response_id == unique_request_id:
|
|
||||||
return obs
|
|
||||||
|
|
||||||
def check_alive(self, timeout: float = 60):
|
|
||||||
self.agent_side.send(("IS_ALIVE", None))
|
|
||||||
if self.agent_side.poll(timeout=timeout):
|
|
||||||
response_id, _ = self.agent_side.recv()
|
|
||||||
if response_id == "ALIVE":
|
|
||||||
return True
|
|
||||||
logger.debug(f"Browser env is not alive. Response ID: {response_id}")
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
if (
|
|
||||||
not hasattr(self, "process")
|
|
||||||
or self.process is None
|
|
||||||
or not self.process.is_alive()
|
|
||||||
):
|
|
||||||
return
|
|
||||||
try:
|
|
||||||
self.agent_side.send(("SHUTDOWN", None))
|
|
||||||
self.process.join(5) # Wait for the process to terminate
|
|
||||||
if self.process.is_alive():
|
|
||||||
logger.error(
|
|
||||||
"Browser process did not terminate, forcefully terminating..."
|
|
||||||
)
|
|
||||||
self.process.terminate()
|
|
||||||
self.process.join(5) # Wait for the process to terminate
|
|
||||||
if self.process.is_alive():
|
|
||||||
self.process.kill()
|
|
||||||
self.process.join(5) # Wait for the process to terminate
|
|
||||||
self.agent_side.close()
|
|
||||||
self.browser_side.close()
|
|
||||||
except Exception:
|
|
||||||
logger.error("Encountered an error when closing browser env", exc_info=True)
|
|
||||||
|
|
||||||
@staticmethod
def image_to_png_base64_url(
    image: np.ndarray | Image.Image, add_data_prefix: bool = False
):
    """Encode an image as base64 PNG text, optionally as a ``data:`` URL.

    Args:
        image: A numpy array (converted with ``Image.fromarray``) or an
            ``Image.Image`` instance.
        add_data_prefix: When true, prepend ``data:image/png;base64,``.

    Returns:
        The base64 text of the PNG bytes, with the prefix if requested.
    """
    picture = Image.fromarray(image) if isinstance(image, np.ndarray) else image
    # Images in RGBA / LA mode are converted to RGB before saving.
    if picture.mode in ("RGBA", "LA"):
        picture = picture.convert("RGB")

    sink = io.BytesIO()
    picture.save(sink, format="PNG")
    encoded = base64.b64encode(sink.getvalue()).decode()

    if add_data_prefix:
        return f"data:image/png;base64,{encoded}"
    return encoded
|
|
||||||
|
|
||||||
@staticmethod
def image_to_jpg_base64_url(
    image: np.ndarray | Image.Image, add_data_prefix: bool = False
):
    """Encode an image as base64 JPEG text, optionally as a ``data:`` URL.

    Args:
        image: A numpy array (converted with ``Image.fromarray``) or an
            ``Image.Image`` instance.
        add_data_prefix: When true, prepend ``data:image/jpeg;base64,``.

    Returns:
        The base64 text of the JPEG bytes, with the prefix if requested.
    """
    picture = Image.fromarray(image) if isinstance(image, np.ndarray) else image
    # Images in RGBA / LA mode are converted to RGB before saving.
    if picture.mode in ("RGBA", "LA"):
        picture = picture.convert("RGB")

    sink = io.BytesIO()
    picture.save(sink, format="JPEG")
    encoded = base64.b64encode(sink.getvalue()).decode()

    if add_data_prefix:
        return f"data:image/jpeg;base64,{encoded}"
    return encoded
|
|
@ -2,7 +2,8 @@ import asyncio
|
|||||||
import json
|
import json
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from browser_use import Browser as BrowserUseBrowser, BrowserConfig
|
from browser_use import Browser as BrowserUseBrowser
|
||||||
|
from browser_use import BrowserConfig
|
||||||
from browser_use.browser.context import BrowserContext
|
from browser_use.browser.context import BrowserContext
|
||||||
from browser_use.dom.service import DomService
|
from browser_use.dom.service import DomService
|
||||||
from pydantic import Field, field_validator
|
from pydantic import Field, field_validator
|
||||||
@ -10,8 +11,9 @@ from pydantic_core.core_schema import ValidationInfo
|
|||||||
|
|
||||||
from app.tool.base import BaseTool, ToolResult
|
from app.tool.base import BaseTool, ToolResult
|
||||||
|
|
||||||
|
|
||||||
_BROWSER_DESCRIPTION = """
|
_BROWSER_DESCRIPTION = """
|
||||||
Interact with a web browser to perform various actions such as navigation, element interaction,
|
Interact with a web browser to perform various actions such as navigation, element interaction,
|
||||||
content extraction, and tab management. Supported actions include:
|
content extraction, and tab management. Supported actions include:
|
||||||
- 'navigate': Go to a specific URL
|
- 'navigate': Go to a specific URL
|
||||||
- 'click': Click an element by index
|
- 'click': Click an element by index
|
||||||
@ -36,35 +38,41 @@ class BrowserUseTool(BaseTool):
|
|||||||
"action": {
|
"action": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"enum": [
|
"enum": [
|
||||||
"navigate", "click", "input_text", "screenshot", "get_html", "execute_js",
|
"navigate",
|
||||||
"scroll", "switch_tab", "new_tab", "close_tab", "refresh"
|
"click",
|
||||||
|
"input_text",
|
||||||
|
"screenshot",
|
||||||
|
"get_html",
|
||||||
|
"execute_js",
|
||||||
|
"scroll",
|
||||||
|
"switch_tab",
|
||||||
|
"new_tab",
|
||||||
|
"close_tab",
|
||||||
|
"refresh",
|
||||||
],
|
],
|
||||||
"description": "The browser action to perform"
|
"description": "The browser action to perform",
|
||||||
},
|
},
|
||||||
"url": {
|
"url": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "URL for 'navigate' or 'new_tab' actions"
|
"description": "URL for 'navigate' or 'new_tab' actions",
|
||||||
},
|
},
|
||||||
"index": {
|
"index": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "Element index for 'click' or 'input_text' actions"
|
"description": "Element index for 'click' or 'input_text' actions",
|
||||||
},
|
|
||||||
"text": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Text for 'input_text' action"
|
|
||||||
},
|
},
|
||||||
|
"text": {"type": "string", "description": "Text for 'input_text' action"},
|
||||||
"script": {
|
"script": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "JavaScript code for 'execute_js' action"
|
"description": "JavaScript code for 'execute_js' action",
|
||||||
},
|
},
|
||||||
"scroll_amount": {
|
"scroll_amount": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "Pixels to scroll (positive for down, negative for up) for 'scroll' action"
|
"description": "Pixels to scroll (positive for down, negative for up) for 'scroll' action",
|
||||||
},
|
},
|
||||||
"tab_id": {
|
"tab_id": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "Tab ID for 'switch_tab' action"
|
"description": "Tab ID for 'switch_tab' action",
|
||||||
}
|
},
|
||||||
},
|
},
|
||||||
"required": ["action"],
|
"required": ["action"],
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
@ -74,8 +82,8 @@ class BrowserUseTool(BaseTool):
|
|||||||
"execute_js": ["script"],
|
"execute_js": ["script"],
|
||||||
"switch_tab": ["tab_id"],
|
"switch_tab": ["tab_id"],
|
||||||
"new_tab": ["url"],
|
"new_tab": ["url"],
|
||||||
"scroll": ["scroll_amount"]
|
"scroll": ["scroll_amount"],
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
lock: asyncio.Lock = Field(default_factory=asyncio.Lock)
|
lock: asyncio.Lock = Field(default_factory=asyncio.Lock)
|
||||||
@ -83,7 +91,7 @@ class BrowserUseTool(BaseTool):
|
|||||||
context: Optional[BrowserContext] = Field(default=None, exclude=True)
|
context: Optional[BrowserContext] = Field(default=None, exclude=True)
|
||||||
dom_service: Optional[DomService] = Field(default=None, exclude=True)
|
dom_service: Optional[DomService] = Field(default=None, exclude=True)
|
||||||
|
|
||||||
@field_validator('parameters', mode='before')
|
@field_validator("parameters", mode="before")
|
||||||
def validate_parameters(cls, v: dict, info: ValidationInfo) -> dict:
|
def validate_parameters(cls, v: dict, info: ValidationInfo) -> dict:
|
||||||
if not v:
|
if not v:
|
||||||
raise ValueError("Parameters cannot be empty")
|
raise ValueError("Parameters cannot be empty")
|
||||||
@ -98,10 +106,17 @@ class BrowserUseTool(BaseTool):
|
|||||||
self.dom_service = DomService(await self.context.get_current_page())
|
self.dom_service = DomService(await self.context.get_current_page())
|
||||||
return self.context
|
return self.context
|
||||||
|
|
||||||
async def execute(self, action: str, url: Optional[str] = None, index: Optional[int] = None,
|
async def execute(
|
||||||
text: Optional[str] = None, script: Optional[str] = None,
|
self,
|
||||||
scroll_amount: Optional[int] = None, tab_id: Optional[int] = None,
|
action: str,
|
||||||
**kwargs) -> ToolResult:
|
url: Optional[str] = None,
|
||||||
|
index: Optional[int] = None,
|
||||||
|
text: Optional[str] = None,
|
||||||
|
script: Optional[str] = None,
|
||||||
|
scroll_amount: Optional[int] = None,
|
||||||
|
tab_id: Optional[int] = None,
|
||||||
|
**kwargs,
|
||||||
|
) -> ToolResult:
|
||||||
"""
|
"""
|
||||||
Execute a specified browser action.
|
Execute a specified browser action.
|
||||||
|
|
||||||
@ -142,18 +157,22 @@ class BrowserUseTool(BaseTool):
|
|||||||
|
|
||||||
elif action == "input_text":
|
elif action == "input_text":
|
||||||
if index is None or not text:
|
if index is None or not text:
|
||||||
return ToolResult(error="Index and text are required for 'input_text' action")
|
return ToolResult(
|
||||||
|
error="Index and text are required for 'input_text' action"
|
||||||
|
)
|
||||||
element = await context.get_dom_element_by_index(index)
|
element = await context.get_dom_element_by_index(index)
|
||||||
if not element:
|
if not element:
|
||||||
return ToolResult(error=f"Element with index {index} not found")
|
return ToolResult(error=f"Element with index {index} not found")
|
||||||
await context._input_text_element_node(element, text)
|
await context._input_text_element_node(element, text)
|
||||||
return ToolResult(output=f"Input '{text}' into element at index {index}")
|
return ToolResult(
|
||||||
|
output=f"Input '{text}' into element at index {index}"
|
||||||
|
)
|
||||||
|
|
||||||
elif action == "screenshot":
|
elif action == "screenshot":
|
||||||
screenshot = await context.take_screenshot(full_page=True)
|
screenshot = await context.take_screenshot(full_page=True)
|
||||||
return ToolResult(
|
return ToolResult(
|
||||||
output=f"Screenshot captured (base64 length: {len(screenshot)})",
|
output=f"Screenshot captured (base64 length: {len(screenshot)})",
|
||||||
system=screenshot
|
system=screenshot,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif action == "get_html":
|
elif action == "get_html":
|
||||||
@ -163,20 +182,30 @@ class BrowserUseTool(BaseTool):
|
|||||||
|
|
||||||
elif action == "execute_js":
|
elif action == "execute_js":
|
||||||
if not script:
|
if not script:
|
||||||
return ToolResult(error="Script is required for 'execute_js' action")
|
return ToolResult(
|
||||||
|
error="Script is required for 'execute_js' action"
|
||||||
|
)
|
||||||
result = await context.execute_javascript(script)
|
result = await context.execute_javascript(script)
|
||||||
return ToolResult(output=str(result))
|
return ToolResult(output=str(result))
|
||||||
|
|
||||||
elif action == "scroll":
|
elif action == "scroll":
|
||||||
if scroll_amount is None:
|
if scroll_amount is None:
|
||||||
return ToolResult(error="Scroll amount is required for 'scroll' action")
|
return ToolResult(
|
||||||
await context.execute_javascript(f"window.scrollBy(0, {scroll_amount});")
|
error="Scroll amount is required for 'scroll' action"
|
||||||
|
)
|
||||||
|
await context.execute_javascript(
|
||||||
|
f"window.scrollBy(0, {scroll_amount});"
|
||||||
|
)
|
||||||
direction = "down" if scroll_amount > 0 else "up"
|
direction = "down" if scroll_amount > 0 else "up"
|
||||||
return ToolResult(output=f"Scrolled {direction} by {abs(scroll_amount)} pixels")
|
return ToolResult(
|
||||||
|
output=f"Scrolled {direction} by {abs(scroll_amount)} pixels"
|
||||||
|
)
|
||||||
|
|
||||||
elif action == "switch_tab":
|
elif action == "switch_tab":
|
||||||
if tab_id is None:
|
if tab_id is None:
|
||||||
return ToolResult(error="Tab ID is required for 'switch_tab' action")
|
return ToolResult(
|
||||||
|
error="Tab ID is required for 'switch_tab' action"
|
||||||
|
)
|
||||||
await context.switch_to_tab(tab_id)
|
await context.switch_to_tab(tab_id)
|
||||||
return ToolResult(output=f"Switched to tab {tab_id}")
|
return ToolResult(output=f"Switched to tab {tab_id}")
|
||||||
|
|
||||||
@ -210,7 +239,7 @@ class BrowserUseTool(BaseTool):
|
|||||||
"url": state.url,
|
"url": state.url,
|
||||||
"title": state.title,
|
"title": state.title,
|
||||||
"tabs": [tab.model_dump() for tab in state.tabs],
|
"tabs": [tab.model_dump() for tab in state.tabs],
|
||||||
"interactive_elements": state.element_tree.clickable_elements_to_string()
|
"interactive_elements": state.element_tree.clickable_elements_to_string(),
|
||||||
}
|
}
|
||||||
return ToolResult(output=json.dumps(state_info))
|
return ToolResult(output=json.dumps(state_info))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -1,6 +1,4 @@
|
|||||||
import os
|
import os
|
||||||
from typing import Optional
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from app.tool.base import BaseTool
|
from app.tool.base import BaseTool
|
||||||
|
|
||||||
@ -16,20 +14,20 @@ The tool accepts content and a file path, and saves the content to that location
|
|||||||
"properties": {
|
"properties": {
|
||||||
"content": {
|
"content": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "(required) The content to save to the file."
|
"description": "(required) The content to save to the file.",
|
||||||
},
|
},
|
||||||
"file_path": {
|
"file_path": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "(required) The path where the file should be saved, including filename and extension."
|
"description": "(required) The path where the file should be saved, including filename and extension.",
|
||||||
},
|
},
|
||||||
"mode": {
|
"mode": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "(optional) The file opening mode. Default is 'w' for write. Use 'a' for append.",
|
"description": "(optional) The file opening mode. Default is 'w' for write. Use 'a' for append.",
|
||||||
"enum": ["w", "a"],
|
"enum": ["w", "a"],
|
||||||
"default": "w"
|
"default": "w",
|
||||||
}
|
},
|
||||||
},
|
},
|
||||||
"required": ["content", "file_path"]
|
"required": ["content", "file_path"],
|
||||||
}
|
}
|
||||||
|
|
||||||
async def execute(self, content: str, file_path: str, mode: str = "w") -> str:
|
async def execute(self, content: str, file_path: str, mode: str = "w") -> str:
|
||||||
@ -51,7 +49,7 @@ The tool accepts content and a file path, and saves the content to that location
|
|||||||
os.makedirs(directory)
|
os.makedirs(directory)
|
||||||
|
|
||||||
# Write directly to the file
|
# Write directly to the file
|
||||||
with open(file_path, mode, encoding='utf-8') as file:
|
with open(file_path, mode, encoding="utf-8") as file:
|
||||||
file.write(content)
|
file.write(content)
|
||||||
|
|
||||||
return f"Content successfully saved to {file_path}"
|
return f"Content successfully saved to {file_path}"
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
from typing import Optional, List
|
from typing import List
|
||||||
|
|
||||||
from googlesearch import search
|
from googlesearch import search
|
||||||
|
|
||||||
from app.tool.base import BaseTool
|
from app.tool.base import BaseTool
|
||||||
@ -16,15 +17,15 @@ The tool returns a list of URLs that match the search query.
|
|||||||
"properties": {
|
"properties": {
|
||||||
"query": {
|
"query": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "(required) The search query to submit to Google."
|
"description": "(required) The search query to submit to Google.",
|
||||||
},
|
},
|
||||||
"num_results": {
|
"num_results": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "(optional) The number of search results to return. Default is 10.",
|
"description": "(optional) The number of search results to return. Default is 10.",
|
||||||
"default": 10
|
"default": 10,
|
||||||
}
|
},
|
||||||
},
|
},
|
||||||
"required": ["query"]
|
"required": ["query"],
|
||||||
}
|
}
|
||||||
|
|
||||||
async def execute(self, query: str, num_results: int = 10) -> List[str]:
|
async def execute(self, query: str, num_results: int = 10) -> List[str]:
|
||||||
@ -41,8 +42,7 @@ The tool returns a list of URLs that match the search query.
|
|||||||
# Run the search in a thread pool to prevent blocking
|
# Run the search in a thread pool to prevent blocking
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
links = await loop.run_in_executor(
|
links = await loop.run_in_executor(
|
||||||
None,
|
None, lambda: list(search(query, num_results=num_results))
|
||||||
lambda: list(search(query, num_results=num_results))
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return links
|
return links
|
||||||
|
@ -1,112 +0,0 @@
|
|||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
def extract_html_content(text: str, stack: str = "react-tailwind") -> str:
    """Extract code content from an LLM response based on technology stack.

    Markdown code fences are stripped first. A stack-specific pattern is then
    tried (``svg`` or ``react-tailwind``); when it does not match, or for any
    other stack, the function falls back to ``<html>``, ``<body>`` and
    ``<div>`` extraction, in that order. ``<body>`` and ``<div>`` matches are
    wrapped in the missing outer tags.

    Args:
        text: The raw text response from the LLM.
        stack: Technology stack ("react-tailwind", "html-tailwind", "svg").

    Returns:
        The extracted code, or the stripped input text when nothing matched
        (in which case a diagnostic line is also printed).
    """
    # Remove markdown code fences if present.
    text = re.sub(r"```[\w]*\n|```", "", text)

    if stack == "svg":
        svg_match = re.search(r"(<svg.*?>.*?</svg>)", text, re.DOTALL)
        if svg_match:
            return svg_match.group(1)
    elif stack == "react-tailwind":
        # Preferred form: a default-exported function running to the end.
        react_match = re.search(r"(export default function.*?})\s*$", text, re.DOTALL)
        if react_match:
            return react_match.group(1)
        # Alternative: a const/function component definition. The arrow is an
        # optional group so that plain `function Name() { ... }` matches as
        # well as `const Name = () => { ... }`; the previous `=>?` demanded a
        # literal `=` and therefore rejected function declarations.
        alt_match = re.search(
            r"((?:const|function)\s+\w+\s*=?\s*(?:\([^)]*\))?\s*(?:=>)?\s*{.*?})\s*$",
            text,
            re.DOTALL,
        )
        if alt_match:
            return alt_match.group(1)

    # Default: try to extract content within <html> tags.
    html_match = re.search(r"(<html.*?>.*?</html>)", text, re.DOTALL)
    if html_match:
        return html_match.group(1)

    # Otherwise accept progressively smaller HTML fragments and re-wrap them.
    body_match = re.search(r"(<body.*?>.*?</body>)", text, re.DOTALL)
    if body_match:
        return f"<html>\n{body_match.group(1)}\n</html>"

    div_match = re.search(r"(<div.*?>.*?</div>)", text, re.DOTALL)
    if div_match:
        return f"<html>\n<body>\n{div_match.group(1)}\n</body>\n</html>"

    # Nothing matched: return the cleaned-up text unchanged.
    cleaned_text = text.strip()
    print(
        f"[Code Extraction] No specific pattern found for stack '{stack}'. Raw content:\n{cleaned_text}"
    )
    return cleaned_text
|
|
||||||
|
|
||||||
|
|
||||||
def clean_code_content(code: str, *, indent: str = " ") -> str:
    """Clean and re-indent extracted code based on its braces.

    Leading/trailing whitespace is removed, runs of blank lines are collapsed
    to a single blank line, and every line is re-indented by its brace depth:
    a line starting with ``}`` is dedented before it is emitted, and a line
    ending with ``{`` indents the lines that follow. A line that does both
    (``} else {``) is dedented and then re-opens the block. Blank lines are
    emitted empty, without trailing indentation.

    Args:
        code: Raw code content.
        indent: Text repeated once per nesting level.

    Returns:
        The cleaned and formatted code.
    """
    # Remove leading/trailing whitespace, then collapse extra blank lines.
    code = code.strip()
    code = re.sub(r"\n\s*\n", "\n\n", code)

    depth = 0
    formatted_lines = []
    for raw_line in code.split("\n"):
        stripped_line = raw_line.strip()
        # Close the block first so the closing brace aligns with its opener.
        if stripped_line.startswith("}"):
            depth = max(0, depth - 1)
        formatted_lines.append(indent * depth + stripped_line if stripped_line else "")
        # Open a block only after emitting the line that introduces it.
        if stripped_line.endswith("{"):
            depth += 1

    return "\n".join(formatted_lines)
|
|
||||||
|
|
||||||
|
|
||||||
def extract_code_content(text: str, stack: str = "react-tailwind") -> str:
    """Extract code from an LLM response and tidy it up.

    Runs ``extract_html_content`` on the text and passes the result through
    ``clean_code_content``.

    Args:
        text: Raw text from the LLM response.
        stack: Technology stack being used.

    Returns:
        The final cleaned and formatted code.
    """
    return clean_code_content(extract_html_content(text, stack))
|
|
@ -1,74 +0,0 @@
|
|||||||
"""
|
|
||||||
This module monitors the app for shutdown signals
|
|
||||||
"""
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import signal
|
|
||||||
import threading
|
|
||||||
import time
|
|
||||||
from types import FrameType
|
|
||||||
|
|
||||||
from uvicorn.server import HANDLED_SIGNALS
|
|
||||||
|
|
||||||
from app.logger import logger
|
|
||||||
|
|
||||||
|
|
||||||
_should_exit = None
|
|
||||||
|
|
||||||
|
|
||||||
def _register_signal_handler(sig: signal.Signals):
    """Install a handler for ``sig`` that records the shutdown request.

    The installed handler sets the module-level ``_should_exit`` flag and then
    forwards the signal to the handler that was registered before, when that
    previous handler is a callable.

    Args:
        sig: The signal to intercept.
    """
    original_handler = None

    def handler(sig_: int, frame: FrameType | None):
        global _should_exit
        logger.debug(f"shutdown_signal:{sig_}")
        _should_exit = True
        # signal.signal() can hand back SIG_DFL, SIG_IGN or None instead of a
        # function. SIG_IGN is truthy but not callable, so test callability
        # rather than truthiness before chaining.
        if callable(original_handler):
            original_handler(sig_, frame)

    # Assigned after `handler` is defined; the closure reads it at call time.
    original_handler = signal.signal(sig, handler)
|
|
||||||
|
|
||||||
|
|
||||||
def _register_signal_handlers():
    """Initialise the shutdown flag and install handlers, once.

    The first call sets ``_should_exit`` to ``False`` and, when running on the
    main thread, registers a handler for every signal in ``HANDLED_SIGNALS``.
    Later calls return immediately because the flag is no longer ``None``.
    """
    global _should_exit
    if _should_exit is not None:
        # Already initialised by an earlier call.
        return
    _should_exit = False

    logger.debug("_register_signal_handlers")

    # Check if we're in the main thread of the main interpreter
    on_main_thread = threading.current_thread() is threading.main_thread()
    if not on_main_thread:
        logger.debug("_register_signal_handlers:not_main_thread")
        return

    logger.debug("_register_signal_handlers:main_thread")
    for handled_signal in HANDLED_SIGNALS:
        _register_signal_handler(handled_signal)
|
|
||||||
|
|
||||||
|
|
||||||
def should_exit() -> bool:
    """Return True once a shutdown signal has been recorded.

    Ensures the signal handlers are registered before reading the flag.
    """
    _register_signal_handlers()
    exit_requested = bool(_should_exit)
    return exit_requested
|
|
||||||
|
|
||||||
|
|
||||||
def should_continue() -> bool:
    """Return True while no shutdown signal has been recorded.

    Ensures the signal handlers are registered before reading the flag.
    """
    _register_signal_handlers()
    keep_running = not _should_exit
    return keep_running
|
|
||||||
|
|
||||||
|
|
||||||
def sleep_if_should_continue(timeout: float):
    """Sleep for up to ``timeout`` seconds, waking early on shutdown.

    Timeouts of one second or less are slept in a single call; negative
    values are treated as zero instead of raising, matching the async
    variant. Longer timeouts are slept in steps of at most one second, with
    ``should_continue()`` checked between steps; the final step is shortened
    so the total does not run past ``timeout``.

    Args:
        timeout: Maximum number of seconds to sleep.
    """
    if timeout <= 1:
        time.sleep(max(timeout, 0))
        return
    # Monotonic deadline: unaffected by wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while (remaining := deadline - time.monotonic()) > 0 and should_continue():
        time.sleep(min(1, remaining))
|
|
||||||
|
|
||||||
|
|
||||||
async def async_sleep_if_should_continue(timeout: float):
    """Asynchronously sleep for up to ``timeout`` seconds, waking early on shutdown.

    Timeouts of one second or less are awaited in a single ``asyncio.sleep``.
    Longer timeouts are awaited in steps of at most one second, with
    ``should_continue()`` checked between steps; the final step is shortened
    so the total does not run past ``timeout``.

    Args:
        timeout: Maximum number of seconds to sleep.
    """
    if timeout <= 1:
        await asyncio.sleep(timeout)
        return
    # Monotonic deadline: unaffected by wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while (remaining := deadline - time.monotonic()) > 0 and should_continue():
        await asyncio.sleep(min(1, remaining))
|
|
@ -14,4 +14,3 @@ uvicorn~=0.34.0
|
|||||||
unidiff~=0.7.5
|
unidiff~=0.7.5
|
||||||
browser-use~=0.1.40
|
browser-use~=0.1.40
|
||||||
googlesearch-python~=1.3.0
|
googlesearch-python~=1.3.0
|
||||||
|
|
||||||
|
@ -5,7 +5,7 @@ from app.flow.base import FlowType
|
|||||||
from app.flow.flow_factory import FlowFactory
|
from app.flow.flow_factory import FlowFactory
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
async def run_flow():
|
||||||
agent = ToolCallAgent()
|
agent = ToolCallAgent()
|
||||||
|
|
||||||
flow = FlowFactory.create_flow(
|
flow = FlowFactory.create_flow(
|
||||||
@ -13,7 +13,9 @@ if __name__ == "__main__":
|
|||||||
agents=agent,
|
agents=agent,
|
||||||
)
|
)
|
||||||
|
|
||||||
result = asyncio.run(
|
result = await flow.execute("Create a web app that shows Japan travel destinations")
|
||||||
flow.execute("Create a web app that shows Japan travel destinations")
|
|
||||||
)
|
|
||||||
print(result)
|
print(result)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(run_flow())
|
Loading…
x
Reference in New Issue
Block a user