Merge remote-tracking branch 'origin/main'

This commit is contained in:
xiangjinyu 2025-03-07 01:13:52 +08:00
commit 9e966e4b86
22 changed files with 156 additions and 644 deletions

View File

@ -62,7 +62,7 @@ api_key = "sk-..." # Replace with your actual API key
``` ```
## Quick Start ## Quick Start
One line for run OpenManus: One line for run OpenManus:
```bash ```bash
python main.py python main.py
@ -70,7 +70,7 @@ python main.py
Then input your idea via terminal! Then input your idea via terminal!
## How to contribute ## How to contribute
We welcome any friendly suggestions and helpful contributions! Just create issues or submit pull requests. We welcome any friendly suggestions and helpful contributions! Just create issues or submit pull requests.
Or contact @mannaandpoem via 📧email: mannaandpoem@gmail.com Or contact @mannaandpoem via 📧email: mannaandpoem@gmail.com
@ -84,6 +84,6 @@ Or contact @mannaandpoem via 📧email: mannaandpoem@gmail.com
## Acknowledgement ## Acknowledgement
Thanks to [broswer use](https://github.com/browser-use/browser-use) for providing basic support for this project! Thanks to [anthropic-computer-use](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) and [broswer-use](https://github.com/browser-use/browser-use) for providing basic support for this project!
OpenManus is built by contributors from MetaGPT. Huge thanks to this agent community! OpenManus is built by contributors from MetaGPT. Huge thanks to this agent community!

View File

@ -1,15 +1,12 @@
from pydantic import Field, model_validator from pydantic import Field
from app.agent.planning import PlanningAgent
from app.agent.toolcall_en import ToolCallAgent from app.agent.toolcall_en import ToolCallAgent
from app.tool import ToolCollection, Bash, Terminate from app.prompt.manus import NEXT_STEP_PROMPT, SYSTEM_PROMPT
from app.tool.planning import PlanningTool from app.tool import Terminate, ToolCollection
from app.tool.browser_use_tool import BrowserUseTool from app.tool.browser_use_tool import BrowserUseTool
from app.tool.file_saver import FileSaver
from app.tool.google_search import GoogleSearch from app.tool.google_search import GoogleSearch
from app.tool.python_execute import PythonExecute from app.tool.python_execute import PythonExecute
from app.tool.file_saver import FileSaver
from app.prompt.manus import SYSTEM_PROMPT, NEXT_STEP_PROMPT
class Manus(ToolCallAgent): class Manus(ToolCallAgent):
@ -22,7 +19,9 @@ class Manus(ToolCallAgent):
""" """
name: str = "Manus" name: str = "Manus"
description: str = "A versatile agent that can solve various tasks using multiple tools" description: str = (
"A versatile agent that can solve various tasks using multiple tools"
)
system_prompt: str = SYSTEM_PROMPT system_prompt: str = SYSTEM_PROMPT
next_step_prompt: str = NEXT_STEP_PROMPT next_step_prompt: str = NEXT_STEP_PROMPT
@ -33,4 +32,3 @@ class Manus(ToolCallAgent):
PythonExecute(), GoogleSearch(), BrowserUseTool(), FileSaver(), Terminate() PythonExecute(), GoogleSearch(), BrowserUseTool(), FileSaver(), Terminate()
) )
) )

View File

@ -5,37 +5,11 @@ from pydantic import Field, model_validator
from app.agent.toolcall import ToolCallAgent from app.agent.toolcall import ToolCallAgent
from app.logger import logger from app.logger import logger
from app.prompt.planning import NEXT_STEP_PROMPT, PLANNING_SYSTEM_PROMPT
from app.schema import Message, ToolCall from app.schema import Message, ToolCall
from app.tool import PlanningTool, Terminate, ToolCollection from app.tool import PlanningTool, Terminate, ToolCollection
PLANNING_SYSTEM_PROMPT = """
You are an expert Planning Agent tasked with solving complex problems by creating and managing structured plans.
Your job is:
1. Analyze requests to understand the task scope
2. Create clear, actionable plans with the `planning` tool
3. Execute steps using available tools as needed
4. Track progress and adapt plans dynamically
5. Use `finish` to conclude when the task is complete
Available tools will vary by task but may include:
- `planning`: Create, update, and track plans (commands: create, update, mark_step, etc.)
- `finish`: End the task when complete
Break tasks into logical, sequential steps. Think about dependencies and verification methods.
"""
NEXT_STEP_PROMPT = """
Based on the current state, what's your next step?
Consider:
1. Do you need to create or refine a plan?
2. Are you ready to execute a specific step?
3. Have you completed the task?
Provide reasoning, then select the appropriate tool or action.
"""
class PlanningAgent(ToolCallAgent): class PlanningAgent(ToolCallAgent):
""" """
An agent that creates and manages plans to solve tasks. An agent that creates and manages plans to solve tasks.

View File

@ -9,6 +9,7 @@ from app.prompt.toolcall import NEXT_STEP_PROMPT, SYSTEM_PROMPT
from app.schema import AgentState, Message, ToolCall from app.schema import AgentState, Message, ToolCall
from app.tool import CreateChatCompletion, Terminate, ToolCollection from app.tool import CreateChatCompletion, Terminate, ToolCollection
TOOL_CALL_REQUIRED = "Tool calls required but none provided" TOOL_CALL_REQUIRED = "Tool calls required but none provided"
@ -40,7 +41,9 @@ class ToolCallAgent(ReActAgent):
# Get response with tool options # Get response with tool options
response = await self.llm.ask_tool( response = await self.llm.ask_tool(
messages=self.messages, messages=self.messages,
system_msgs=[Message.system_message(self.system_prompt)] if self.system_prompt else None, system_msgs=[Message.system_message(self.system_prompt)]
if self.system_prompt
else None,
tools=self.available_tools.to_params(), tools=self.available_tools.to_params(),
tool_choice=self.tool_choices, tool_choice=self.tool_choices,
) )
@ -48,15 +51,21 @@ class ToolCallAgent(ReActAgent):
# Log response info in a more engaging way # Log response info in a more engaging way
logger.info(f"✨ AI's thoughts: {response.content}") logger.info(f"✨ AI's thoughts: {response.content}")
logger.info(f"🛠️ AI selected {len(response.tool_calls) if response.tool_calls else 0} tools to use") logger.info(
f"🛠️ AI selected {len(response.tool_calls) if response.tool_calls else 0} tools to use"
)
if response.tool_calls: if response.tool_calls:
logger.info(f"🧰 Tools being prepared: {[call.function.name for call in response.tool_calls]}") logger.info(
f"🧰 Tools being prepared: {[call.function.name for call in response.tool_calls]}"
)
try: try:
# Handle different tool_choices modes # Handle different tool_choices modes
if self.tool_choices == "none": if self.tool_choices == "none":
if response.tool_calls: if response.tool_calls:
logger.warning("🤔 Hmm, AI tried to use tools when they weren't available!") logger.warning(
"🤔 Hmm, AI tried to use tools when they weren't available!"
)
if response.content: if response.content:
self.memory.add_message(Message.assistant_message(response.content)) self.memory.add_message(Message.assistant_message(response.content))
return True return True
@ -82,9 +91,11 @@ class ToolCallAgent(ReActAgent):
return bool(self.tool_calls) return bool(self.tool_calls)
except Exception as e: except Exception as e:
logger.error(f"🚨 Oops! The AI's thinking process hit a snag: {e}") logger.error(f"🚨 Oops! The AI's thinking process hit a snag: {e}")
self.memory.add_message(Message.assistant_message( self.memory.add_message(
f"Error encountered while processing: {str(e)}" Message.assistant_message(
)) f"Error encountered while processing: {str(e)}"
)
)
return False return False
async def act(self) -> str: async def act(self) -> str:
@ -94,9 +105,7 @@ class ToolCallAgent(ReActAgent):
raise ValueError(TOOL_CALL_REQUIRED) raise ValueError(TOOL_CALL_REQUIRED)
# Return last message content if no tool calls # Return last message content if no tool calls
return ( return self.messages[-1].content or "No content or commands to execute"
self.messages[-1].content or "No content or commands to execute"
)
results = [] results = []
for command in self.tool_calls: for command in self.tool_calls:
@ -144,7 +153,9 @@ class ToolCallAgent(ReActAgent):
return observation return observation
except json.JSONDecodeError: except json.JSONDecodeError:
error_msg = f"Error parsing arguments for {name}: Invalid JSON format" error_msg = f"Error parsing arguments for {name}: Invalid JSON format"
logger.error(f"📝 Oops! The arguments for '{name}' don't make sense - invalid JSON") logger.error(
f"📝 Oops! The arguments for '{name}' don't make sense - invalid JSON"
)
return f"Error: {error_msg}" return f"Error: {error_msg}"
except Exception as e: except Exception as e:
error_msg = f"Error executing tool {name}: {str(e)}" error_msg = f"Error executing tool {name}: {str(e)}"

View File

@ -9,6 +9,7 @@ from app.prompt.toolcall import NEXT_STEP_PROMPT, SYSTEM_PROMPT
from app.schema import AgentState, Message, ToolCall from app.schema import AgentState, Message, ToolCall
from app.tool import CreateChatCompletion, Terminate, ToolCollection from app.tool import CreateChatCompletion, Terminate, ToolCollection
TOOL_CALL_REQUIRED = "Tool calls required but none provided" TOOL_CALL_REQUIRED = "Tool calls required but none provided"
@ -40,7 +41,9 @@ class ToolCallAgent(ReActAgent):
# Get response with tool options # Get response with tool options
response = await self.llm.ask_tool( response = await self.llm.ask_tool(
messages=self.messages, messages=self.messages,
system_msgs=[Message.system_message(self.system_prompt)] if self.system_prompt else None, system_msgs=[Message.system_message(self.system_prompt)]
if self.system_prompt
else None,
tools=self.available_tools.to_params(), tools=self.available_tools.to_params(),
tool_choice=self.tool_choices, tool_choice=self.tool_choices,
) )
@ -48,9 +51,13 @@ class ToolCallAgent(ReActAgent):
# Log response info in a more engaging way # Log response info in a more engaging way
logger.info(f"✨ AI的思考过程{response.content}") logger.info(f"✨ AI的思考过程{response.content}")
logger.info(f"🛠️ AI选择了 {len(response.tool_calls) if response.tool_calls else 0} 个工具来解决问题") logger.info(
f"🛠️ AI选择了 {len(response.tool_calls) if response.tool_calls else 0} 个工具来解决问题"
)
if response.tool_calls: if response.tool_calls:
logger.info(f"🧰 准备使用的工具箱:{[call.function.name for call in response.tool_calls]}") logger.info(
f"🧰 准备使用的工具箱:{[call.function.name for call in response.tool_calls]}"
)
try: try:
# Handle different tool_choices modes # Handle different tool_choices modes
@ -82,9 +89,11 @@ class ToolCallAgent(ReActAgent):
return bool(self.tool_calls) return bool(self.tool_calls)
except Exception as e: except Exception as e:
logger.error(f"🚨 糟糕AI思考时遇到了一点小问题{e}") logger.error(f"🚨 糟糕AI思考时遇到了一点小问题{e}")
self.memory.add_message(Message.assistant_message( self.memory.add_message(
f"Error encountered while processing: {str(e)}" Message.assistant_message(
)) f"Error encountered while processing: {str(e)}"
)
)
return False return False
async def act(self) -> str: async def act(self) -> str:
@ -94,9 +103,7 @@ class ToolCallAgent(ReActAgent):
raise ValueError(TOOL_CALL_REQUIRED) raise ValueError(TOOL_CALL_REQUIRED)
# Return last message content if no tool calls # Return last message content if no tool calls
return ( return self.messages[-1].content or "No content or commands to execute"
self.messages[-1].content or "No content or commands to execute"
)
results = [] results = []
for command in self.tool_calls: for command in self.tool_calls:

View File

@ -1,7 +1,7 @@
import threading import threading
import tomllib import tomllib
from pathlib import Path from pathlib import Path
from typing import Dict, Optional from typing import Dict
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
@ -23,14 +23,8 @@ class LLMSettings(BaseModel):
temperature: float = Field(1.0, description="Sampling temperature") temperature: float = Field(1.0, description="Sampling temperature")
class ScreenshotSettings(BaseModel):
api_key: Optional[str] = Field(None, description="Screenshot API key")
base_url: Optional[str] = Field(None, description="Screenshot service URL")
class AppConfig(BaseModel): class AppConfig(BaseModel):
llm: Dict[str, LLMSettings] llm: Dict[str, LLMSettings]
screenshot: Optional[ScreenshotSettings] = None
class Config: class Config:
@ -94,16 +88,8 @@ class Config:
} }
} }
# Add screenshot config if present
if screenshot_config := raw_config.get("screenshot"):
config_dict["screenshot"] = screenshot_config
self._config = AppConfig(**config_dict) self._config = AppConfig(**config_dict)
@property
def screenshot(self) -> Optional[ScreenshotSettings]:
return self._config.screenshot
@property @property
def llm(self) -> Dict[str, LLMSettings]: def llm(self) -> Dict[str, LLMSettings]:
return self._config.llm return self._config.llm

View File

@ -3,10 +3,3 @@ class ToolError(Exception):
def __init__(self, message): def __init__(self, message):
self.message = message self.message = message
class BrowserException(Exception):
"""Base exception for browser-related errors."""
def __init__(self, message):
super().__init__(message)

View File

@ -22,7 +22,7 @@ def define_log_level(print_level="INFO", logfile_level="DEBUG", name: str = None
_logger.remove() _logger.remove()
_logger.add(sys.stderr, level=print_level) _logger.add(sys.stderr, level=print_level)
_logger.add(PROJECT_ROOT / f"logs/{log_name}.txt", level=logfile_level) _logger.add(PROJECT_ROOT / f"logs/{log_name}.log", level=logfile_level)
return _logger return _logger

View File

@ -1,21 +0,0 @@
from typing import List, Optional
from app.agent.base import BaseAgent
from app.flow.base import FlowType
from app.flow.flow_factory import FlowFactory
from app.tool import BaseTool, ToolCollection
async def loop(
    agent: BaseAgent,
    tools: Optional[List[BaseTool]] = None,
    flow_type: FlowType = FlowType.PLANNING,
    input_text: str = "",
    **loop_kwargs,
) -> str:
    """Run ``agent`` through a flow of the requested type and return its output.

    Args:
        agent: The agent handed to the flow factory.
        tools: Optional tools to bundle into a ``ToolCollection`` for the flow.
            ``None`` or an empty list means no collection is passed.
        flow_type: Which flow ``FlowFactory.create_flow`` should build.
        input_text: Text forwarded to the flow's ``execute`` call.
        **loop_kwargs: Extra keyword arguments forwarded to the flow factory.

    Returns:
        Whatever the created flow's ``execute`` coroutine returns.
    """
    # Only build a collection when at least one tool was supplied.
    if tools:
        collection = ToolCollection(*tools)
    else:
        collection = None

    runner = FlowFactory.create_flow(
        flow_type, agent, tool_collection=collection, **loop_kwargs
    )
    outcome = await runner.execute(input_text)
    return outcome

View File

@ -11,4 +11,4 @@ BrowserUseTool: Open and control web browsers
GoogleSearch: Perform web information retrieval GoogleSearch: Perform web information retrieval
Based on user needs, proactively select the most appropriate tool or combination of tools. For complex tasks, you can break down the problem and use different tools step by step to solve it. After using each tool, clearly explain the execution results and suggest the next steps. Based on user needs, proactively select the most appropriate tool or combination of tools. For complex tasks, you can break down the problem and use different tools step by step to solve it. After using each tool, clearly explain the execution results and suggest the next steps.
""" """

25
app/prompt/planning.py Normal file
View File

@ -0,0 +1,25 @@
# System prompt text describing the planning agent's role, its numbered duties,
# and the `planning` / `finish` tools it may be given.
PLANNING_SYSTEM_PROMPT = """
You are an expert Planning Agent tasked with solving complex problems by creating and managing structured plans.
Your job is:
1. Analyze requests to understand the task scope
2. Create clear, actionable plans with the `planning` tool
3. Execute steps using available tools as needed
4. Track progress and adapt plans dynamically
5. Use `finish` to conclude when the task is complete
Available tools will vary by task but may include:
- `planning`: Create, update, and track plans (commands: create, update, mark_step, etc.)
- `finish`: End the task when complete
Break tasks into logical, sequential steps. Think about dependencies and verification methods.
"""
# Prompt text asking the model to decide its next action: plan, execute a step,
# or conclude.
NEXT_STEP_PROMPT = """
Based on the current state, what's your next step?
Consider:
1. Do you need to create or refine a plan?
2. Are you ready to execute a specific step?
3. Have you completed the task?
Provide reasoning, then select the appropriate tool or action.
"""

View File

@ -26,47 +26,3 @@ NEXT_STEP_TEMPLATE = """{{observation}}
(Current directory: {{working_dir}}) (Current directory: {{working_dir}})
bash-$ bash-$
""" """
NEXT_STEP_NO_OUTPUT_TEMPLATE = """Your command ran successfully and did not produce any output.
(Open file: {{open_file}})
(Current directory: {{working_dir}})
bash-$
"""
INSTANCE_TEMPLATE = """We're currently solving the following issue within our repository. Here's the issue text:
ISSUE:
{{problem_statement}}
INSTRUCTIONS:
Now, you're going to solve this issue on your own. Your terminal session has started and you're in the repository's root directory. You can use any bash commands or the special interface to help you. Edit all the files you need to and run any checks or tests that you want.
Remember, YOU SHOULD ALWAYS INCLUDE EXACTLY ONE TOOL CALL/FUNCTION CALL PER RESPONSE.
When you're satisfied with all of the changes you've made, you can submit your changes to the code base by simply running the submit command.
Note however that you cannot use any interactive session commands (e.g. python, vim) in this environment, but you can write scripts and run them. E.g. you can write a python script and then run it with the python <script_name>.py`.
NOTE ABOUT THE EDIT COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line!
IMPORTANT TIPS:
1. Always start by trying to replicate the bug that the issues discusses.
If the issue includes code for reproducing the bug, we recommend that you re-implement that in your environment, and run it to make sure you can reproduce the bug.
Then start trying to fix it.
When you think you've fixed the bug, re-run the bug reproduction script to make sure that the bug has indeed been fixed.
If the bug reproduction script does not print anything when it successfully runs, we recommend adding a print("Script completed successfully, no errors.") command at the end of the file,
so that you can be sure that the script indeed ran fine all the way through.
2. If you run a command and it doesn't work, try running a different command. A command that did not work once will not work the second time unless you modify it!
3. If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, don't just use the scroll_down command multiple times. Instead, use the goto 583 command. It's much quicker.
4. If the bug reproduction script requires inputting/reading a specific file, such as buggy-input.png, and you'd like to understand how to input that file, conduct a search in the existing repo code, to see whether someone else has already done that. Do this by running the command: find_file "buggy-input.png" If that doesn't work, use the linux 'find' command.
5. Always make sure to look at the currently open file and the current working directory (which appears right after the currently open file). The currently open file might be in a different directory than the working directory! Note that some commands, such as 'create', open files, so they might change the current open file.
6. When editing files, it is easy to accidentally specify a wrong line number or to write code with incorrect indentation. Always check the code after you issue an edit to make sure that it reflects what you wanted to accomplish. If it didn't, issue another command to fix it.
7. Do not try to install any packages with `pip`, `conda`, or any other way. This will usually not work. If the environment is not set up correctly, try to fix the issue without executing python code or running any tests that require the package installed.
(Open file: {{open_file}})
(Current directory: {{working_dir}})
bash-$"""

View File

@ -1,259 +0,0 @@
import atexit
import base64
import io
import json
import multiprocessing
import platform
import time
import uuid
import browsergym.core # noqa F401 (we register the openended task as a gym environment)
import gymnasium as gym
import html2text
import numpy as np
import tenacity
from browsergym.utils.obs import flatten_dom_to_str
from PIL import Image
from app.exceptions import BrowserException
from app.logger import logger
from app.utils.shutdown_listener import should_continue, should_exit
# Sentinel action strings. When the browser process receives one of these as
# the "action" of a request it answers with the stored evaluation goal or the
# JSON-encoded reward list instead of stepping the environment.
BROWSER_EVAL_GET_GOAL_ACTION = "GET_EVAL_GOAL"
BROWSER_EVAL_GET_REWARDS_ACTION = "GET_EVAL_REWARDS"
class BrowserEnv:
    """Drive a BrowserGym environment that lives in a separate process.

    The calling ("agent") side and the browser process exchange
    ``(request_id, payload)`` tuples over a ``multiprocessing.Pipe``.
    ``step`` sends an action and waits for the matching observation;
    ``check_alive`` and ``close`` use the reserved request ids ``"IS_ALIVE"``
    and ``"SHUTDOWN"``.
    """

    def __init__(self, browsergym_eval_env: str | None = None, headless: bool = False):
        """
        Initialize the browser environment.

        Args:
            browsergym_eval_env: Optional evaluation environment name. A
                non-empty value switches the instance into evaluation mode.
            headless: Whether to run the browser in headless mode (no UI)
        """
        self.html_text_converter = self.get_html_text_converter()
        self.eval_dir = ""
        self.browsergym_eval_env = browsergym_eval_env
        # Evaluation mode is on exactly when an eval environment was named.
        self.eval_mode = bool(browsergym_eval_env)
        self.headless = headless

        # Set multiprocessing start method
        if platform.system() == "Windows":
            multiprocessing.set_start_method("spawn", force=True)
        else:
            multiprocessing.set_start_method("fork", force=True)

        self.browser_side, self.agent_side = multiprocessing.Pipe()
        self.process = None  # Initialize process as None

        self.init_browser()
        # Make sure the browser process is shut down at interpreter exit.
        atexit.register(self.close)

    def get_html_text_converter(self):
        """Build the ``html2text`` converter used to render page DOM as text."""
        html_text_converter = html2text.HTML2Text()
        # keep links in the output, but drop images
        html_text_converter.ignore_links = False
        html_text_converter.ignore_images = True
        # use alt text for images
        html_text_converter.images_to_alt = True
        # disable auto text wrapping
        html_text_converter.body_width = 0
        return html_text_converter

    @tenacity.retry(
        wait=tenacity.wait_fixed(1),
        stop=tenacity.stop_after_attempt(5),
        retry=tenacity.retry_if_exception_type(BrowserException),
    )
    def init_browser(self):
        """Start the browser process and verify that it responds.

        Retried up to five times, one second apart, whenever a
        ``BrowserException`` is raised.

        Raises:
            BrowserException: If the process cannot be started or does not
                answer the liveness check.
        """
        logger.debug(f"Starting browser env (headless: {self.headless})...")
        try:
            self.process = multiprocessing.Process(
                target=self.browser_process, args=(self.headless,)
            )
            self.process.start()
        except Exception as e:
            logger.error(f"Failed to start browser process: {e}")
            # terminate() raises on a process that never started, which would
            # hide the BrowserException the retry policy depends on; only
            # terminate a process that is actually running.
            if self.process is not None and self.process.is_alive():
                self.process.terminate()
            raise BrowserException("Failed to start browser environment.") from e

        if not self.check_alive():
            self.close()
            raise BrowserException("Failed to start browser environment.")

    def browser_process(self, headless: bool):
        """Entry point of the browser process: create the env and serve requests.

        Loops while ``should_continue()`` is true, polling the pipe for
        ``(request_id, payload)`` messages:

        * ``"SHUTDOWN"`` closes the environment and returns.
        * ``"IS_ALIVE"`` is answered with ``("ALIVE", None)``.
        * The two ``BROWSER_EVAL_*`` actions return the stored goal / rewards.
        * Any other action is passed to ``env.step`` and the observation,
          made serializable, is sent back under the same request id.

        Args:
            headless: Whether the gym environment is created headless.

        Raises:
            ValueError: In evaluation mode, if the environment name is neither
                a webarena nor a miniwob task.
        """
        if self.eval_mode:
            assert self.browsergym_eval_env is not None
            logger.debug("Initializing browser env for web browsing evaluation.")
            if "webarena" in self.browsergym_eval_env:
                import browsergym.webarena  # noqa F401 register webarena tasks as gym environments
            elif "miniwob" in self.browsergym_eval_env:
                import browsergym.miniwob  # noqa F401 register miniwob tasks as gym environments
            else:
                raise ValueError(
                    f"Unsupported browsergym eval env: {self.browsergym_eval_env}"
                )
            env = gym.make(
                self.browsergym_eval_env,
                tags_to_mark="all",
                headless=headless,
            )
        else:
            env = gym.make(
                "browsergym/openended",
                task_kwargs={"start_url": "about:blank", "goal": "PLACEHOLDER_GOAL"},
                wait_for_user_message=False,
                headless=headless,
                disable_env_checker=True,
                tags_to_mark="all",
            )

        obs, info = env.reset()

        # EVAL ONLY: save the goal into file for evaluation
        self.eval_goal = None
        self.eval_rewards: list[float] = [0]
        if self.eval_mode:
            logger.debug(f"Browsing goal: {obs['goal']}")
            self.eval_goal = obs["goal"]

        logger.debug(
            f"Browser env started in {'headless' if headless else 'visible'} mode."
        )

        while should_continue():
            try:
                if self.browser_side.poll(timeout=0.01):
                    unique_request_id, action_data = self.browser_side.recv()

                    # shutdown the browser environment
                    if unique_request_id == "SHUTDOWN":
                        logger.debug("SHUTDOWN recv, shutting down browser env...")
                        env.close()
                        return
                    elif unique_request_id == "IS_ALIVE":
                        self.browser_side.send(("ALIVE", None))
                        continue

                    # EVAL ONLY: Get evaluation info
                    if action_data["action"] == BROWSER_EVAL_GET_GOAL_ACTION:
                        self.browser_side.send(
                            (unique_request_id, {"text_content": self.eval_goal})
                        )
                        continue
                    elif action_data["action"] == BROWSER_EVAL_GET_REWARDS_ACTION:
                        self.browser_side.send(
                            (
                                unique_request_id,
                                {"text_content": json.dumps(self.eval_rewards)},
                            )
                        )
                        continue

                    action = action_data["action"]
                    obs, reward, terminated, truncated, info = env.step(action)

                    # EVAL ONLY: Save the rewards into file for evaluation
                    if self.eval_mode:
                        self.eval_rewards.append(reward)

                    # add text content of the page
                    html_str = flatten_dom_to_str(obs["dom_object"])
                    obs["text_content"] = self.html_text_converter.handle(html_str)
                    # make observation serializable
                    obs["screenshot"] = self.image_to_png_base64_url(obs["screenshot"])
                    obs["active_page_index"] = obs["active_page_index"].item()
                    obs["elapsed_time"] = obs["elapsed_time"].item()
                    self.browser_side.send((unique_request_id, obs))
            except KeyboardInterrupt:
                logger.debug("Browser env process interrupted by user.")
                # Best-effort cleanup: the env may already be half torn down.
                try:
                    env.close()
                except Exception:
                    pass
                return

    def step(self, action_str: str, timeout: float = 30) -> dict:
        """Execute an action in the browser environment and return the observation.

        Args:
            action_str: Action sent to the browser process.
            timeout: Maximum number of seconds to wait for the reply.

        Returns:
            The observation sent back under this request's id.

        Raises:
            TimeoutError: If no matching reply arrives within ``timeout``
                seconds, or ``should_exit()`` becomes true while waiting.
        """
        unique_request_id = str(uuid.uuid4())
        self.agent_side.send((unique_request_id, {"action": action_str}))
        start_time = time.time()
        while True:
            if should_exit() or time.time() - start_time > timeout:
                raise TimeoutError("Browser environment took too long to respond.")
            if self.agent_side.poll(timeout=0.01):
                response_id, obs = self.agent_side.recv()
                # Replies carrying a different request id are discarded.
                if response_id == unique_request_id:
                    return obs

    def check_alive(self, timeout: float = 60) -> bool:
        """Ping the browser process and report whether it answered ``"ALIVE"``.

        Args:
            timeout: Maximum number of seconds to wait for the reply.

        Returns:
            ``True`` if the process replied ``"ALIVE"``; ``False`` if it did
            not reply in time or replied with anything else.
        """
        self.agent_side.send(("IS_ALIVE", None))
        if not self.agent_side.poll(timeout=timeout):
            logger.debug(f"Browser env did not respond within {timeout} seconds.")
            return False
        response_id, _ = self.agent_side.recv()
        if response_id == "ALIVE":
            return True
        logger.debug(f"Browser env is not alive. Response ID: {response_id}")
        return False

    def close(self):
        """Shut the browser process down, escalating if it does not exit.

        Sends ``"SHUTDOWN"``, waits up to five seconds, then falls back to
        ``terminate()`` and finally ``kill()``. Does nothing when no process
        is running. Errors during shutdown are logged, not raised.
        """
        if (
            not hasattr(self, "process")
            or self.process is None
            or not self.process.is_alive()
        ):
            return
        try:
            self.agent_side.send(("SHUTDOWN", None))
            self.process.join(5)  # Wait for the process to terminate
            if self.process.is_alive():
                logger.error(
                    "Browser process did not terminate, forcefully terminating..."
                )
                self.process.terminate()
                self.process.join(5)  # Wait for the process to terminate
                if self.process.is_alive():
                    self.process.kill()
                    self.process.join(5)  # Wait for the process to terminate
            self.agent_side.close()
            self.browser_side.close()
        except Exception:
            # logger.exception attaches the active traceback regardless of the
            # logging backend behind `logger`.
            logger.exception("Encountered an error when closing browser env")

    @staticmethod
    def _encode_image(
        image: np.ndarray | Image.Image, image_format: str, add_data_prefix: bool
    ) -> str:
        """Encode ``image`` in ``image_format`` and return it as base64 text."""
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        # Drop the alpha channel before saving.
        if image.mode in ("RGBA", "LA"):
            image = image.convert("RGB")
        buffered = io.BytesIO()
        image.save(buffered, format=image_format)
        image_base64 = base64.b64encode(buffered.getvalue()).decode()
        if add_data_prefix:
            return f"data:image/{image_format.lower()};base64,{image_base64}"
        return image_base64

    @staticmethod
    def image_to_png_base64_url(
        image: np.ndarray | Image.Image, add_data_prefix: bool = False
    ):
        """Convert a numpy array or PIL image to a base64 encoded png image url.

        Args:
            image: Image to encode.
            add_data_prefix: When true, prepend ``data:image/png;base64,``.

        Returns:
            The base64 string, with or without the data-URL prefix.
        """
        return BrowserEnv._encode_image(image, "PNG", add_data_prefix)

    @staticmethod
    def image_to_jpg_base64_url(
        image: np.ndarray | Image.Image, add_data_prefix: bool = False
    ):
        """Convert a numpy array or PIL image to a base64 encoded jpeg image url.

        Args:
            image: Image to encode.
            add_data_prefix: When true, prepend ``data:image/jpeg;base64,``.

        Returns:
            The base64 string, with or without the data-URL prefix.
        """
        return BrowserEnv._encode_image(image, "JPEG", add_data_prefix)

View File

@ -2,7 +2,8 @@ import asyncio
import json import json
from typing import Optional from typing import Optional
from browser_use import Browser as BrowserUseBrowser, BrowserConfig from browser_use import Browser as BrowserUseBrowser
from browser_use import BrowserConfig
from browser_use.browser.context import BrowserContext from browser_use.browser.context import BrowserContext
from browser_use.dom.service import DomService from browser_use.dom.service import DomService
from pydantic import Field, field_validator from pydantic import Field, field_validator
@ -10,8 +11,9 @@ from pydantic_core.core_schema import ValidationInfo
from app.tool.base import BaseTool, ToolResult from app.tool.base import BaseTool, ToolResult
_BROWSER_DESCRIPTION = """ _BROWSER_DESCRIPTION = """
Interact with a web browser to perform various actions such as navigation, element interaction, Interact with a web browser to perform various actions such as navigation, element interaction,
content extraction, and tab management. Supported actions include: content extraction, and tab management. Supported actions include:
- 'navigate': Go to a specific URL - 'navigate': Go to a specific URL
- 'click': Click an element by index - 'click': Click an element by index
@ -36,35 +38,41 @@ class BrowserUseTool(BaseTool):
"action": { "action": {
"type": "string", "type": "string",
"enum": [ "enum": [
"navigate", "click", "input_text", "screenshot", "get_html", "execute_js", "navigate",
"scroll", "switch_tab", "new_tab", "close_tab", "refresh" "click",
"input_text",
"screenshot",
"get_html",
"execute_js",
"scroll",
"switch_tab",
"new_tab",
"close_tab",
"refresh",
], ],
"description": "The browser action to perform" "description": "The browser action to perform",
}, },
"url": { "url": {
"type": "string", "type": "string",
"description": "URL for 'navigate' or 'new_tab' actions" "description": "URL for 'navigate' or 'new_tab' actions",
}, },
"index": { "index": {
"type": "integer", "type": "integer",
"description": "Element index for 'click' or 'input_text' actions" "description": "Element index for 'click' or 'input_text' actions",
},
"text": {
"type": "string",
"description": "Text for 'input_text' action"
}, },
"text": {"type": "string", "description": "Text for 'input_text' action"},
"script": { "script": {
"type": "string", "type": "string",
"description": "JavaScript code for 'execute_js' action" "description": "JavaScript code for 'execute_js' action",
}, },
"scroll_amount": { "scroll_amount": {
"type": "integer", "type": "integer",
"description": "Pixels to scroll (positive for down, negative for up) for 'scroll' action" "description": "Pixels to scroll (positive for down, negative for up) for 'scroll' action",
}, },
"tab_id": { "tab_id": {
"type": "integer", "type": "integer",
"description": "Tab ID for 'switch_tab' action" "description": "Tab ID for 'switch_tab' action",
} },
}, },
"required": ["action"], "required": ["action"],
"dependencies": { "dependencies": {
@ -74,8 +82,8 @@ class BrowserUseTool(BaseTool):
"execute_js": ["script"], "execute_js": ["script"],
"switch_tab": ["tab_id"], "switch_tab": ["tab_id"],
"new_tab": ["url"], "new_tab": ["url"],
"scroll": ["scroll_amount"] "scroll": ["scroll_amount"],
} },
} }
lock: asyncio.Lock = Field(default_factory=asyncio.Lock) lock: asyncio.Lock = Field(default_factory=asyncio.Lock)
@ -83,7 +91,7 @@ class BrowserUseTool(BaseTool):
context: Optional[BrowserContext] = Field(default=None, exclude=True) context: Optional[BrowserContext] = Field(default=None, exclude=True)
dom_service: Optional[DomService] = Field(default=None, exclude=True) dom_service: Optional[DomService] = Field(default=None, exclude=True)
@field_validator('parameters', mode='before') @field_validator("parameters", mode="before")
def validate_parameters(cls, v: dict, info: ValidationInfo) -> dict: def validate_parameters(cls, v: dict, info: ValidationInfo) -> dict:
if not v: if not v:
raise ValueError("Parameters cannot be empty") raise ValueError("Parameters cannot be empty")
@ -98,10 +106,17 @@ class BrowserUseTool(BaseTool):
self.dom_service = DomService(await self.context.get_current_page()) self.dom_service = DomService(await self.context.get_current_page())
return self.context return self.context
async def execute(self, action: str, url: Optional[str] = None, index: Optional[int] = None, async def execute(
text: Optional[str] = None, script: Optional[str] = None, self,
scroll_amount: Optional[int] = None, tab_id: Optional[int] = None, action: str,
**kwargs) -> ToolResult: url: Optional[str] = None,
index: Optional[int] = None,
text: Optional[str] = None,
script: Optional[str] = None,
scroll_amount: Optional[int] = None,
tab_id: Optional[int] = None,
**kwargs,
) -> ToolResult:
""" """
Execute a specified browser action. Execute a specified browser action.
@ -142,18 +157,22 @@ class BrowserUseTool(BaseTool):
elif action == "input_text": elif action == "input_text":
if index is None or not text: if index is None or not text:
return ToolResult(error="Index and text are required for 'input_text' action") return ToolResult(
error="Index and text are required for 'input_text' action"
)
element = await context.get_dom_element_by_index(index) element = await context.get_dom_element_by_index(index)
if not element: if not element:
return ToolResult(error=f"Element with index {index} not found") return ToolResult(error=f"Element with index {index} not found")
await context._input_text_element_node(element, text) await context._input_text_element_node(element, text)
return ToolResult(output=f"Input '{text}' into element at index {index}") return ToolResult(
output=f"Input '{text}' into element at index {index}"
)
elif action == "screenshot": elif action == "screenshot":
screenshot = await context.take_screenshot(full_page=True) screenshot = await context.take_screenshot(full_page=True)
return ToolResult( return ToolResult(
output=f"Screenshot captured (base64 length: {len(screenshot)})", output=f"Screenshot captured (base64 length: {len(screenshot)})",
system=screenshot system=screenshot,
) )
elif action == "get_html": elif action == "get_html":
@ -163,20 +182,30 @@ class BrowserUseTool(BaseTool):
elif action == "execute_js": elif action == "execute_js":
if not script: if not script:
return ToolResult(error="Script is required for 'execute_js' action") return ToolResult(
error="Script is required for 'execute_js' action"
)
result = await context.execute_javascript(script) result = await context.execute_javascript(script)
return ToolResult(output=str(result)) return ToolResult(output=str(result))
elif action == "scroll": elif action == "scroll":
if scroll_amount is None: if scroll_amount is None:
return ToolResult(error="Scroll amount is required for 'scroll' action") return ToolResult(
await context.execute_javascript(f"window.scrollBy(0, {scroll_amount});") error="Scroll amount is required for 'scroll' action"
)
await context.execute_javascript(
f"window.scrollBy(0, {scroll_amount});"
)
direction = "down" if scroll_amount > 0 else "up" direction = "down" if scroll_amount > 0 else "up"
return ToolResult(output=f"Scrolled {direction} by {abs(scroll_amount)} pixels") return ToolResult(
output=f"Scrolled {direction} by {abs(scroll_amount)} pixels"
)
elif action == "switch_tab": elif action == "switch_tab":
if tab_id is None: if tab_id is None:
return ToolResult(error="Tab ID is required for 'switch_tab' action") return ToolResult(
error="Tab ID is required for 'switch_tab' action"
)
await context.switch_to_tab(tab_id) await context.switch_to_tab(tab_id)
return ToolResult(output=f"Switched to tab {tab_id}") return ToolResult(output=f"Switched to tab {tab_id}")
@ -210,7 +239,7 @@ class BrowserUseTool(BaseTool):
"url": state.url, "url": state.url,
"title": state.title, "title": state.title,
"tabs": [tab.model_dump() for tab in state.tabs], "tabs": [tab.model_dump() for tab in state.tabs],
"interactive_elements": state.element_tree.clickable_elements_to_string() "interactive_elements": state.element_tree.clickable_elements_to_string(),
} }
return ToolResult(output=json.dumps(state_info)) return ToolResult(output=json.dumps(state_info))
except Exception as e: except Exception as e:

View File

@ -1,6 +1,4 @@
import os import os
from typing import Optional
from pathlib import Path
from app.tool.base import BaseTool from app.tool.base import BaseTool
@ -16,20 +14,20 @@ The tool accepts content and a file path, and saves the content to that location
"properties": { "properties": {
"content": { "content": {
"type": "string", "type": "string",
"description": "(required) The content to save to the file." "description": "(required) The content to save to the file.",
}, },
"file_path": { "file_path": {
"type": "string", "type": "string",
"description": "(required) The path where the file should be saved, including filename and extension." "description": "(required) The path where the file should be saved, including filename and extension.",
}, },
"mode": { "mode": {
"type": "string", "type": "string",
"description": "(optional) The file opening mode. Default is 'w' for write. Use 'a' for append.", "description": "(optional) The file opening mode. Default is 'w' for write. Use 'a' for append.",
"enum": ["w", "a"], "enum": ["w", "a"],
"default": "w" "default": "w",
} },
}, },
"required": ["content", "file_path"] "required": ["content", "file_path"],
} }
async def execute(self, content: str, file_path: str, mode: str = "w") -> str: async def execute(self, content: str, file_path: str, mode: str = "w") -> str:
@ -51,7 +49,7 @@ The tool accepts content and a file path, and saves the content to that location
os.makedirs(directory) os.makedirs(directory)
# Write directly to the file # Write directly to the file
with open(file_path, mode, encoding='utf-8') as file: with open(file_path, mode, encoding="utf-8") as file:
file.write(content) file.write(content)
return f"Content successfully saved to {file_path}" return f"Content successfully saved to {file_path}"

View File

@ -1,5 +1,6 @@
import asyncio import asyncio
from typing import Optional, List from typing import List
from googlesearch import search from googlesearch import search
from app.tool.base import BaseTool from app.tool.base import BaseTool
@ -16,15 +17,15 @@ The tool returns a list of URLs that match the search query.
"properties": { "properties": {
"query": { "query": {
"type": "string", "type": "string",
"description": "(required) The search query to submit to Google." "description": "(required) The search query to submit to Google.",
}, },
"num_results": { "num_results": {
"type": "integer", "type": "integer",
"description": "(optional) The number of search results to return. Default is 10.", "description": "(optional) The number of search results to return. Default is 10.",
"default": 10 "default": 10,
} },
}, },
"required": ["query"] "required": ["query"],
} }
async def execute(self, query: str, num_results: int = 10) -> List[str]: async def execute(self, query: str, num_results: int = 10) -> List[str]:
@ -41,8 +42,7 @@ The tool returns a list of URLs that match the search query.
# Run the search in a thread pool to prevent blocking # Run the search in a thread pool to prevent blocking
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
links = await loop.run_in_executor( links = await loop.run_in_executor(
None, None, lambda: list(search(query, num_results=num_results))
lambda: list(search(query, num_results=num_results))
) )
return links return links

View File

View File

@ -1,112 +0,0 @@
import re
def extract_html_content(text: str, stack: str = "react-tailwind") -> str:
    """
    Extract code content from LLM response based on technology stack.

    Markdown code fences are stripped first. Stack-specific patterns are then
    tried, followed by generic HTML fallbacks (``<html>``, ``<body>``,
    ``<div>``). If nothing matches, the stripped text is returned unchanged
    and a diagnostic message is printed.

    Args:
        text: The raw text response from LLM
        stack: Technology stack ("react-tailwind", "html-tailwind", "svg")

    Returns:
        str: Extracted code content
    """
    # Remove markdown code blocks if present
    text = re.sub(r"```[\w]*\n|```", "", text)

    if stack == "svg":
        # Extract SVG content
        svg_match = re.search(r"(<svg.*?>.*?</svg>)", text, re.DOTALL)
        if svg_match:
            return svg_match.group(1)
    elif stack == "react-tailwind":
        # Extract React component content
        react_match = re.search(r"(export default function.*?})\s*$", text, re.DOTALL)
        if react_match:
            return react_match.group(1)

        # Alternative: look for const/function component definition.
        # The arrow is an optional *group* "(?:=>)?" so that both
        # "const App = () => {...}" and a plain "function App() {...}"
        # declaration (which contains no "=" at all) are recognised.
        alt_match = re.search(
            r"((?:const|function)\s+\w+\s*=?\s*(?:\([^)]*\))?\s*(?:=>)?\s*{.*?})\s*$",
            text,
            re.DOTALL,
        )
        if alt_match:
            return alt_match.group(1)

    # Default: try to extract content within <html> tags
    html_match = re.search(r"(<html.*?>.*?</html>)", text, re.DOTALL)
    if html_match:
        return html_match.group(1)

    # If no specific patterns match, try to extract any HTML-like content
    body_match = re.search(r"(<body.*?>.*?</body>)", text, re.DOTALL)
    if body_match:
        return f"<html>\n{body_match.group(1)}\n</html>"

    # Greedy inner match: <div> elements nest, so stopping at the first
    # "</div>" would truncate the markup. Capture through the last one.
    div_match = re.search(r"(<div.*?>.*</div>)", text, re.DOTALL)
    if div_match:
        return f"<html>\n<body>\n{div_match.group(1)}\n</body>\n</html>"

    # If no patterns match, clean up the text and return it
    cleaned_text = text.strip()
    print(
        f"[Code Extraction] No specific pattern found for stack '{stack}'. Raw content:\n{cleaned_text}"
    )
    return cleaned_text
def clean_code_content(code: str, indent: str = " ") -> str:
    """
    Clean and format the extracted code content.

    Surrounding whitespace is stripped, runs of blank or whitespace-only
    lines are collapsed to a single blank line, and every line is
    re-indented according to brace nesting.

    Args:
        code: Raw code content
        indent: String repeated once per nesting level to build each line's
            leading indentation

    Returns:
        str: Cleaned and formatted code
    """
    # Remove leading/trailing whitespace
    code = code.strip()

    # Remove extra blank lines
    code = re.sub(r"\n\s*\n", "\n\n", code)

    # Ensure proper indentation
    indent_level = 0
    formatted_lines = []

    for line in code.split("\n"):
        stripped_line = line.strip()

        # A leading "}" closes a scope, so dedent before emitting the line.
        # This is checked independently of the trailing "{" test so that
        # lines like "} else {" are both dedented and re-open a scope.
        if stripped_line.startswith("}"):
            indent_level = max(0, indent_level - 1)

        formatted_lines.append(indent * indent_level + stripped_line)

        # A trailing "{" opens a scope for the lines that follow.
        if stripped_line.endswith("{"):
            indent_level += 1

    return "\n".join(formatted_lines)
def extract_code_content(text: str, stack: str = "react-tailwind") -> str:
    """
    Extract code from an LLM response and return it cleaned and formatted.

    This is the module's entry point: it runs extraction followed by cleanup.

    Args:
        text: Raw text from LLM response
        stack: Technology stack being used

    Returns:
        str: Final cleaned and formatted code
    """
    # Extraction first, then formatting of whatever was extracted.
    return clean_code_content(extract_html_content(text, stack))

View File

@ -1,74 +0,0 @@
"""
This module monitors the app for shutdown signals
"""
import asyncio
import signal
import threading
import time
from types import FrameType
from uvicorn.server import HANDLED_SIGNALS
from app.logger import logger
# Module-wide shutdown flag. It stays None until _register_signal_handlers()
# has run once, is then set to False, and becomes True when an installed
# signal handler fires.
_should_exit = None
def _register_signal_handler(sig: signal.Signals):
    """Install a handler for ``sig`` that records a shutdown request.

    The new handler sets the module-level ``_should_exit`` flag and then
    forwards the signal to the handler that was installed before it, if that
    previous handler is a callable.

    Args:
        sig: The signal to intercept.
    """
    original_handler = None

    def handler(sig_: int, frame: FrameType | None):
        global _should_exit
        logger.debug(f"shutdown_signal:{sig_}")
        _should_exit = True
        # signal.signal() can return SIG_DFL, SIG_IGN or None instead of a
        # function. SIG_IGN is truthy but not callable, so test callability
        # rather than truthiness before chaining to the previous handler.
        if callable(original_handler):
            original_handler(sig_, frame)  # type: ignore[unreachable]

    # Assigned after `handler` is defined; the closure reads it at call time.
    original_handler = signal.signal(sig, handler)
def _register_signal_handlers():
    """Initialise the shutdown flag and install signal handlers, once.

    The first call sets ``_should_exit`` to False; later calls return
    immediately. Handlers for every signal in ``HANDLED_SIGNALS`` are
    installed only when the call is made from the main thread.
    """
    global _should_exit

    # A non-None flag means initialisation has already happened.
    if _should_exit is not None:
        return
    _should_exit = False

    logger.debug("_register_signal_handlers")

    # Check if we're in the main thread of the main interpreter
    on_main_thread = threading.current_thread() is threading.main_thread()
    if not on_main_thread:
        logger.debug("_register_signal_handlers:not_main_thread")
        return

    logger.debug("_register_signal_handlers:main_thread")
    for handled_signal in HANDLED_SIGNALS:
        _register_signal_handler(handled_signal)
def should_exit() -> bool:
    """Return True once a shutdown signal has been recorded, else False."""
    # Lazily make sure the flag is initialised before reading it.
    _register_signal_handlers()
    exit_requested = bool(_should_exit)
    return exit_requested
def should_continue() -> bool:
    """Return True while no shutdown signal has been recorded."""
    # Lazily make sure the flag is initialised before reading it.
    _register_signal_handlers()
    if _should_exit:
        return False
    return True
def sleep_if_should_continue(timeout: float):
    """Block for up to ``timeout`` seconds, waking early on shutdown.

    Timeouts of one second or less are slept in a single call without
    consulting the shutdown flag. Longer timeouts are slept in slices of at
    most one second, and ``should_continue()`` is checked between slices.

    Args:
        timeout: Maximum number of seconds to sleep.
    """
    if timeout <= 1:
        time.sleep(timeout)
        return

    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while (remaining := deadline - time.monotonic()) > 0 and should_continue():
        # Clamp the last slice so the total sleep does not overshoot timeout.
        time.sleep(min(1.0, remaining))
async def async_sleep_if_should_continue(timeout: float):
    """Asynchronously wait for up to ``timeout`` seconds, waking early on shutdown.

    Timeouts of one second or less are awaited in a single ``asyncio.sleep``
    without consulting the shutdown flag. Longer timeouts are awaited in
    slices of at most one second, and ``should_continue()`` is checked
    between slices.

    Args:
        timeout: Maximum number of seconds to wait.
    """
    if timeout <= 1:
        await asyncio.sleep(timeout)
        return

    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while (remaining := deadline - time.monotonic()) > 0 and should_continue():
        # Clamp the last slice so the total wait does not overshoot timeout.
        await asyncio.sleep(min(1.0, remaining))

View File

@ -14,4 +14,3 @@ uvicorn~=0.34.0
unidiff~=0.7.5 unidiff~=0.7.5
browser-use~=0.1.40 browser-use~=0.1.40
googlesearch-python~=1.3.0 googlesearch-python~=1.3.0

View File

@ -5,7 +5,7 @@ from app.flow.base import FlowType
from app.flow.flow_factory import FlowFactory from app.flow.flow_factory import FlowFactory
if __name__ == "__main__": async def run_flow():
agent = ToolCallAgent() agent = ToolCallAgent()
flow = FlowFactory.create_flow( flow = FlowFactory.create_flow(
@ -13,7 +13,9 @@ if __name__ == "__main__":
agents=agent, agents=agent,
) )
result = asyncio.run( result = await flow.execute("Create a web app that shows Japan travel destinations")
flow.execute("Create a web app that shows Japan travel destinations")
)
print(result) print(result)
if __name__ == "__main__":
asyncio.run(run_flow())