Merge remote-tracking branch 'origin/main'

2025-03-07 01:13:52 +08:00 · 2025-03-07 01:13:52 +08:00 · 9e966e4b86
commit 9e966e4b86
parent f841c50108 07b24752b3
22 changed files with 156 additions and 644 deletions
--- a/README.md
+++ b/README.md
@ -62,7 +62,7 @@ api_key = "sk-..."  # Replace with your actual API key
 ```
 ## Quick Start
-One line for run OpenManus:  
+Run OpenManus with a single command:
 ```bash
 python main.py
@ -70,7 +70,7 @@ python main.py
 Then input your idea via terminal!
-## How to contribute 
+## How to contribute
 We welcome any friendly suggestions and helpful contributions! Just create issues or submit pull requests.
 Or contact @mannaandpoem via 📧email: mannaandpoem@gmail.com
@ -84,6 +84,6 @@ Or contact @mannaandpoem via 📧email: mannaandpoem@gmail.com
 ## Acknowledgement
-Thanks to [broswer use](https://github.com/browser-use/browser-use) for providing basic support for this project!
+Thanks to [anthropic-computer-use](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) and [browser-use](https://github.com/browser-use/browser-use) for providing basic support for this project!
 OpenManus is built by contributors from MetaGPT. Huge thanks to this agent community!
--- a/app/agent/manus.py
+++ b/app/agent/manus.py
@ -1,15 +1,12 @@
-from pydantic import Field, model_validator
+from pydantic import Field
 from app.agent.planning import PlanningAgent
 from app.agent.toolcall_en import ToolCallAgent
-from app.tool import ToolCollection, Bash, Terminate
+from app.prompt.manus import NEXT_STEP_PROMPT, SYSTEM_PROMPT
-from app.tool.planning import PlanningTool
+from app.tool import Terminate, ToolCollection
 from app.tool.browser_use_tool import BrowserUseTool
 from app.tool.file_saver import FileSaver
 from app.tool.google_search import GoogleSearch
 from app.tool.python_execute import PythonExecute
 from app.tool.file_saver import FileSaver
 from app.prompt.manus import SYSTEM_PROMPT, NEXT_STEP_PROMPT
 class Manus(ToolCallAgent):
@ -22,7 +19,9 @@ class Manus(ToolCallAgent):
    """
    name: str = "Manus"
-    description: str = "A versatile agent that can solve various tasks using multiple tools"
+    description: str = (
        "A versatile agent that can solve various tasks using multiple tools"
    )
    system_prompt: str = SYSTEM_PROMPT
    next_step_prompt: str = NEXT_STEP_PROMPT
@ -33,4 +32,3 @@ class Manus(ToolCallAgent):
            PythonExecute(), GoogleSearch(), BrowserUseTool(), FileSaver(), Terminate()
        )
    )
--- a/app/agent/planning.py
+++ b/app/agent/planning.py
@ -5,37 +5,11 @@ from pydantic import Field, model_validator
 from app.agent.toolcall import ToolCallAgent
 from app.logger import logger
 from app.prompt.planning import NEXT_STEP_PROMPT, PLANNING_SYSTEM_PROMPT
 from app.schema import Message, ToolCall
 from app.tool import PlanningTool, Terminate, ToolCollection
 PLANNING_SYSTEM_PROMPT = """
 You are an expert Planning Agent tasked with solving complex problems by creating and managing structured plans.
 Your job is:
 1. Analyze requests to understand the task scope
 2. Create clear, actionable plans with the `planning` tool
 3. Execute steps using available tools as needed
 4. Track progress and adapt plans dynamically
 5. Use `finish` to conclude when the task is complete
 Available tools will vary by task but may include:
 - `planning`: Create, update, and track plans (commands: create, update, mark_step, etc.)
 - `finish`: End the task when complete
 Break tasks into logical, sequential steps. Think about dependencies and verification methods.
 """
 NEXT_STEP_PROMPT = """
 Based on the current state, what's your next step?
 Consider:
 1. Do you need to create or refine a plan?
 2. Are you ready to execute a specific step?
 3. Have you completed the task?
 Provide reasoning, then select the appropriate tool or action.
 """
 class PlanningAgent(ToolCallAgent):
    """
    An agent that creates and manages plans to solve tasks.
--- a/app/agent/toolcall_en.py
+++ b/app/agent/toolcall_en.py
@ -9,6 +9,7 @@ from app.prompt.toolcall import NEXT_STEP_PROMPT, SYSTEM_PROMPT
 from app.schema import AgentState, Message, ToolCall
 from app.tool import CreateChatCompletion, Terminate, ToolCollection
 TOOL_CALL_REQUIRED = "Tool calls required but none provided"
@ -40,7 +41,9 @@ class ToolCallAgent(ReActAgent):
        # Get response with tool options
        response = await self.llm.ask_tool(
            messages=self.messages,
-            system_msgs=[Message.system_message(self.system_prompt)] if self.system_prompt else None,
+            system_msgs=[Message.system_message(self.system_prompt)]
            if self.system_prompt
            else None,
            tools=self.available_tools.to_params(),
            tool_choice=self.tool_choices,
        )
@ -48,15 +51,21 @@ class ToolCallAgent(ReActAgent):
        # Log response info in a more engaging way
        logger.info(f"✨ AI's thoughts: {response.content}")
-        logger.info(f"🛠️ AI selected {len(response.tool_calls) if response.tool_calls else 0} tools to use")
+        logger.info(
            f"🛠️ AI selected {len(response.tool_calls) if response.tool_calls else 0} tools to use"
        )
        if response.tool_calls:
-            logger.info(f"🧰 Tools being prepared: {[call.function.name for call in response.tool_calls]}")
+            logger.info(
                f"🧰 Tools being prepared: {[call.function.name for call in response.tool_calls]}"
            )
        try:
            # Handle different tool_choices modes
            if self.tool_choices == "none":
                if response.tool_calls:
-                    logger.warning("🤔 Hmm, AI tried to use tools when they weren't available!")
+                    logger.warning(
                        "🤔 Hmm, AI tried to use tools when they weren't available!"
                    )
                if response.content:
                    self.memory.add_message(Message.assistant_message(response.content))
                    return True
@ -82,9 +91,11 @@ class ToolCallAgent(ReActAgent):
            return bool(self.tool_calls)
        except Exception as e:
            logger.error(f"🚨 Oops! The AI's thinking process hit a snag: {e}")
-            self.memory.add_message(Message.assistant_message(
+            self.memory.add_message(
-                f"Error encountered while processing: {str(e)}"
+                Message.assistant_message(
-            ))
+                    f"Error encountered while processing: {str(e)}"
                )
            )
            return False
    async def act(self) -> str:
@ -94,9 +105,7 @@ class ToolCallAgent(ReActAgent):
                raise ValueError(TOOL_CALL_REQUIRED)
            # Return last message content if no tool calls
-            return (
+            return self.messages[-1].content or "No content or commands to execute"
                self.messages[-1].content or "No content or commands to execute"
            )
        results = []
        for command in self.tool_calls:
@ -144,7 +153,9 @@ class ToolCallAgent(ReActAgent):
            return observation
        except json.JSONDecodeError:
            error_msg = f"Error parsing arguments for {name}: Invalid JSON format"
-            logger.error(f"📝 Oops! The arguments for '{name}' don't make sense - invalid JSON")
+            logger.error(
                f"📝 Oops! The arguments for '{name}' don't make sense - invalid JSON"
            )
            return f"Error: {error_msg}"
        except Exception as e:
            error_msg = f"Error executing tool {name}: {str(e)}"
--- a/app/agent/toolcall_zh.py
+++ b/app/agent/toolcall_zh.py
@ -9,6 +9,7 @@ from app.prompt.toolcall import NEXT_STEP_PROMPT, SYSTEM_PROMPT
 from app.schema import AgentState, Message, ToolCall
 from app.tool import CreateChatCompletion, Terminate, ToolCollection
 TOOL_CALL_REQUIRED = "Tool calls required but none provided"
@ -40,7 +41,9 @@ class ToolCallAgent(ReActAgent):
        # Get response with tool options
        response = await self.llm.ask_tool(
            messages=self.messages,
-            system_msgs=[Message.system_message(self.system_prompt)] if self.system_prompt else None,
+            system_msgs=[Message.system_message(self.system_prompt)]
            if self.system_prompt
            else None,
            tools=self.available_tools.to_params(),
            tool_choice=self.tool_choices,
        )
@ -48,9 +51,13 @@ class ToolCallAgent(ReActAgent):
        # Log response info in a more engaging way
        logger.info(f"✨ AI的思考过程：{response.content}")
-        logger.info(f"🛠️ AI选择了 {len(response.tool_calls) if response.tool_calls else 0} 个工具来解决问题")
+        logger.info(
            f"🛠️ AI选择了 {len(response.tool_calls) if response.tool_calls else 0} 个工具来解决问题"
        )
        if response.tool_calls:
-            logger.info(f"🧰 准备使用的工具箱：{[call.function.name for call in response.tool_calls]}")
+            logger.info(
                f"🧰 准备使用的工具箱：{[call.function.name for call in response.tool_calls]}"
            )
        try:
            # Handle different tool_choices modes
@ -82,9 +89,11 @@ class ToolCallAgent(ReActAgent):
            return bool(self.tool_calls)
        except Exception as e:
            logger.error(f"🚨 糟糕！AI思考时遇到了一点小问题：{e}")
-            self.memory.add_message(Message.assistant_message(
+            self.memory.add_message(
-                f"Error encountered while processing: {str(e)}"
+                Message.assistant_message(
-            ))
+                    f"Error encountered while processing: {str(e)}"
                )
            )
            return False
    async def act(self) -> str:
@ -94,9 +103,7 @@ class ToolCallAgent(ReActAgent):
                raise ValueError(TOOL_CALL_REQUIRED)
            # Return last message content if no tool calls
-            return (
+            return self.messages[-1].content or "No content or commands to execute"
                self.messages[-1].content or "No content or commands to execute"
            )
        results = []
        for command in self.tool_calls:
--- a/app/config.py
+++ b/app/config.py
@ -1,7 +1,7 @@
 import threading
 import tomllib
 from pathlib import Path
-from typing import Dict, Optional
+from typing import Dict
 from pydantic import BaseModel, Field
@ -23,14 +23,8 @@ class LLMSettings(BaseModel):
    temperature: float = Field(1.0, description="Sampling temperature")
 # Settings model for the screenshot service. Both fields are optional and
 # default to None.
 class ScreenshotSettings(BaseModel):
    # API key for the screenshot service.
    api_key: Optional[str] = Field(default=None, description="Screenshot API key")
    # Base URL of the screenshot service.
    base_url: Optional[str] = Field(
        default=None, description="Screenshot service URL"
    )
 # Top-level application configuration model.
 class AppConfig(BaseModel):
    # Mapping from a string key to the LLMSettings stored under that key.
    llm: Dict[str, LLMSettings]
    # Optional screenshot settings; defaults to None when not supplied.
    screenshot: Optional[ScreenshotSettings] = None
 class Config:
@ -94,16 +88,8 @@ class Config:
            }
        }
        # Add screenshot config if present
        if screenshot_config := raw_config.get("screenshot"):
            config_dict["screenshot"] = screenshot_config
        self._config = AppConfig(**config_dict)
    @property
    def screenshot(self) -> Optional[ScreenshotSettings]:
        return self._config.screenshot
    @property
    def llm(self) -> Dict[str, LLMSettings]:
        return self._config.llm
--- a/app/exceptions.py
+++ b/app/exceptions.py
@ -3,10 +3,3 @@ class ToolError(Exception):
    def __init__(self, message):
        self.message = message
 class BrowserException(Exception):
    """Root exception type for browser-related failures."""

    def __init__(self, message):
        # Forward the message so it is available as args[0] and via str(exc).
        Exception.__init__(self, message)
--- a/app/logger.py
+++ b/app/logger.py
@ -22,7 +22,7 @@ def define_log_level(print_level="INFO", logfile_level="DEBUG", name: str = None
    _logger.remove()
    _logger.add(sys.stderr, level=print_level)
-    _logger.add(PROJECT_ROOT / f"logs/{log_name}.txt", level=logfile_level)
+    _logger.add(PROJECT_ROOT / f"logs/{log_name}.log", level=logfile_level)
    return _logger
--- a/app/loop.py
+++ b/app/loop.py
@ -1,21 +0,0 @@
 from typing import List, Optional
 from app.agent.base import BaseAgent
 from app.flow.base import FlowType
 from app.flow.flow_factory import FlowFactory
 from app.tool import BaseTool, ToolCollection
 async def loop(
    agent: BaseAgent,
    tools: Optional[List[BaseTool]] = None,
    flow_type: FlowType = FlowType.PLANNING,
    input_text: str = "",
    **loop_kwargs,
 ) -> str:
    """Create a flow for ``agent`` and run it on ``input_text``.

    Args:
        agent: Agent handed to ``FlowFactory.create_flow``.
        tools: Optional tools; when non-empty they are wrapped in a
            ``ToolCollection``, otherwise ``None`` is passed to the factory.
        flow_type: Kind of flow to create (defaults to ``FlowType.PLANNING``).
        input_text: Text passed to the flow's ``execute`` method.
        **loop_kwargs: Extra keyword arguments forwarded to the factory.

    Returns:
        Whatever the flow's ``execute`` coroutine returns.
    """
    # An empty or missing tool list means "no collection" rather than an empty one.
    if tools:
        toolset = ToolCollection(*tools)
    else:
        toolset = None

    runner = FlowFactory.create_flow(
        flow_type,
        agent,
        tool_collection=toolset,
        **loop_kwargs,
    )
    outcome = await runner.execute(input_text)
    return outcome
--- a/app/prompt/manus.py
+++ b/app/prompt/manus.py
@ -11,4 +11,4 @@ BrowserUseTool: Open and control web browsers
 GoogleSearch: Perform web information retrieval
 Based on user needs, proactively select the most appropriate tool or combination of tools. For complex tasks, you can break down the problem and use different tools step by step to solve it. After using each tool, clearly explain the execution results and suggest the next steps.
-"""
+"""
--- a/app/prompt/planning.py
+++ b/app/prompt/planning.py
@ -0,0 +1,25 @@
 # System prompt for the planning agent: states its role, the numbered list of
 # responsibilities, and the tools it may be given.
 # NOTE(review): the prompt tells the model to call a `finish` tool; confirm
 # that this matches the name of the termination tool actually registered
 # with the agent.
 PLANNING_SYSTEM_PROMPT = """
 You are an expert Planning Agent tasked with solving complex problems by creating and managing structured plans.
 Your job is:
 1. Analyze requests to understand the task scope
 2. Create clear, actionable plans with the `planning` tool
 3. Execute steps using available tools as needed
 4. Track progress and adapt plans dynamically
 5. Use `finish` to conclude when the task is complete
 Available tools will vary by task but may include:
 - `planning`: Create, update, and track plans (commands: create, update, mark_step, etc.)
 - `finish`: End the task when complete
 Break tasks into logical, sequential steps. Think about dependencies and verification methods.
 """
 # Follow-up prompt asking the model to choose its next action (plan, execute
 # a step, or conclude) and to give its reasoning first.
 NEXT_STEP_PROMPT = """
 Based on the current state, what's your next step?
 Consider:
 1. Do you need to create or refine a plan?
 2. Are you ready to execute a specific step?
 3. Have you completed the task?
 Provide reasoning, then select the appropriate tool or action.
 """
--- a/app/prompt/swe.py
+++ b/app/prompt/swe.py
@ -26,47 +26,3 @@ NEXT_STEP_TEMPLATE = """{{observation}}
 (Current directory: {{working_dir}})
 bash-$
 """
 NEXT_STEP_NO_OUTPUT_TEMPLATE = """Your command ran successfully and did not produce any output.
 (Open file: {{open_file}})
 (Current directory: {{working_dir}})
 bash-$
 """
 INSTANCE_TEMPLATE = """We're currently solving the following issue within our repository. Here's the issue text:
 ISSUE:
 {{problem_statement}}
 INSTRUCTIONS:
 Now, you're going to solve this issue on your own. Your terminal session has started and you're in the repository's root directory. You can use any bash commands or the special interface to help you. Edit all the files you need to and run any checks or tests that you want.
 Remember, YOU SHOULD ALWAYS INCLUDE EXACTLY ONE TOOL CALL/FUNCTION CALL PER RESPONSE.
 When you're satisfied with all of the changes you've made, you can submit your changes to the code base by simply running the submit command.
 Note however that you cannot use any interactive session commands (e.g. python, vim) in this environment, but you can write scripts and run them. E.g. you can write a python script and then run it with the python <script_name>.py`.
 NOTE ABOUT THE EDIT COMMAND: Indentation really matters! When editing a file, make sure to insert appropriate indentation before each line!
 IMPORTANT TIPS:
 1. Always start by trying to replicate the bug that the issues discusses.
 If the issue includes code for reproducing the bug, we recommend that you re-implement that in your environment, and run it to make sure you can reproduce the bug.
 Then start trying to fix it.
 When you think you've fixed the bug, re-run the bug reproduction script to make sure that the bug has indeed been fixed.
 If the bug reproduction script does not print anything when it successfully runs, we recommend adding a print("Script completed successfully, no errors.") command at the end of the file,
 so that you can be sure that the script indeed ran fine all the way through.
 2. If you run a command and it doesn't work, try running a different command. A command that did not work once will not work the second time unless you modify it!
 3. If you open a file and need to get to an area around a specific line that is not in the first 100 lines, say line 583, don't just use the scroll_down command multiple times. Instead, use the goto 583 command. It's much quicker.
 4. If the bug reproduction script requires inputting/reading a specific file, such as buggy-input.png, and you'd like to understand how to input that file, conduct a search in the existing repo code, to see whether someone else has already done that. Do this by running the command: find_file "buggy-input.png" If that doesn't work, use the linux 'find' command.
 5. Always make sure to look at the currently open file and the current working directory (which appears right after the currently open file). The currently open file might be in a different directory than the working directory! Note that some commands, such as 'create', open files, so they might change the current  open file.
 6. When editing files, it is easy to accidentally specify a wrong line number or to write code with incorrect indentation. Always check the code after you issue an edit to make sure that it reflects what you wanted to accomplish. If it didn't, issue another command to fix it.
 7. Do not try to install any packages with `pip`, `conda`, or any other way. This will usually not work. If the environment is not set up correctly, try to fix the issue without executing python code or running any tests that require the package installed.
 (Open file: {{open_file}})
 (Current directory: {{working_dir}})
 bash-$"""
--- a/app/runtime/init.py
+++ b/app/runtime/init.py
--- a/app/runtime/browser_env.py
+++ b/app/runtime/browser_env.py
@ -1,259 +0,0 @@
 import atexit
 import base64
 import io
 import json
 import multiprocessing
 import platform
 import time
 import uuid
 import browsergym.core  # noqa F401 (we register the openended task as a gym environment)
 import gymnasium as gym
 import html2text
 import numpy as np
 import tenacity
 from browsergym.utils.obs import flatten_dom_to_str
 from PIL import Image
 from app.exceptions import BrowserException
 from app.logger import logger
 from app.utils.shutdown_listener import should_continue, should_exit
 BROWSER_EVAL_GET_GOAL_ACTION = "GET_EVAL_GOAL"
 BROWSER_EVAL_GET_REWARDS_ACTION = "GET_EVAL_REWARDS"
 class BrowserEnv:
    """Run a BrowserGym environment in a child process and talk to it over a pipe.

    The parent ("agent") side sends ``(request_id, payload)`` tuples through a
    ``multiprocessing.Pipe``; the child ("browser") side runs
    ``browser_process`` and answers with ``(request_id, observation)`` tuples.
    """

    def __init__(self, browsergym_eval_env: str | None = None, headless: bool = False):
        """
        Initialize the browser environment.
        Args:
            browsergym_eval_env: Optional evaluation environment name
            headless: Whether to run the browser in headless mode (no UI)
        """
        self.html_text_converter = self.get_html_text_converter()
        self.eval_dir = ""
        self.browsergym_eval_env = browsergym_eval_env
        # Evaluation mode is on exactly when an eval environment was named.
        self.eval_mode = bool(browsergym_eval_env)
        self.headless = headless
        # Set multiprocessing start method.
        # NOTE: force=True overrides the process-wide default, so this affects
        # every other multiprocessing user in the same interpreter.
        start_method = "spawn" if platform.system() == "Windows" else "fork"
        multiprocessing.set_start_method(start_method, force=True)
        self.process = None  # Populated by init_browser()
        # init_browser() creates the pipe pair and starts the child process.
        self.init_browser()
        atexit.register(self.close)

    def get_html_text_converter(self):
        """Build the html2text converter used to turn page DOM into text."""
        html_text_converter = html2text.HTML2Text()
        # keep links in the output, drop images
        html_text_converter.ignore_links = False
        html_text_converter.ignore_images = True
        # use alt text for images
        # NOTE(review): with ignore_images=True images appear to be skipped
        # entirely, so this setting likely has no effect; confirm against
        # the html2text documentation.
        html_text_converter.images_to_alt = True
        # disable auto text wrapping
        html_text_converter.body_width = 0
        return html_text_converter

    def _reset_pipe(self):
        """Close any existing pipe ends and create a fresh pipe pair."""
        for attr in ("agent_side", "browser_side"):
            conn = getattr(self, attr, None)
            if conn is None:
                continue
            try:
                conn.close()
            except OSError:
                # Best effort: a broken old connection must not block a restart.
                pass
        self.browser_side, self.agent_side = multiprocessing.Pipe()

    @tenacity.retry(
        wait=tenacity.wait_fixed(1),
        stop=tenacity.stop_after_attempt(5),
        retry=tenacity.retry_if_exception_type(BrowserException),
    )
    def init_browser(self):
        """Start the browser child process and wait until it reports alive.

        Retried up to 5 times, one second apart, whenever a
        ``BrowserException`` is raised.
        NOTE(review): without ``reraise=True`` tenacity is expected to raise
        its own ``RetryError`` after the last attempt; confirm callers
        handle that.

        Raises:
            BrowserException: If the process cannot be started or does not
                answer the liveness check.
        """
        logger.debug(f"Starting browser env (headless: {self.headless})...")
        # A failed earlier attempt ends in close(), which closes both pipe
        # ends; every attempt therefore starts from a fresh, empty pipe.
        self._reset_pipe()
        try:
            self.process = multiprocessing.Process(
                target=self.browser_process, args=(self.headless,)
            )
            self.process.start()
        except Exception as e:
            logger.error(f"Failed to start browser process: {e}")
            if self.process is not None:
                self.process.terminate()
            # Chain the cause so the original failure stays in the traceback.
            raise BrowserException("Failed to start browser environment.") from e
        if not self.check_alive():
            self.close()
            raise BrowserException("Failed to start browser environment.")

    def browser_process(self, headless: bool):
        """Entry point of the child process: create the env and serve requests.

        Args:
            headless: Whether the browser is started without a UI.

        Raises:
            ValueError: In eval mode, if the eval environment name contains
                neither "webarena" nor "miniwob".
        """
        if self.eval_mode:
            assert self.browsergym_eval_env is not None
            logger.debug("Initializing browser env for web browsing evaluation.")
            if "webarena" in self.browsergym_eval_env:
                import browsergym.webarena  # noqa F401 register webarena tasks as gym environments
            elif "miniwob" in self.browsergym_eval_env:
                import browsergym.miniwob  # noqa F401 register miniwob tasks as gym environments
            else:
                raise ValueError(
                    f"Unsupported browsergym eval env: {self.browsergym_eval_env}"
                )
            env = gym.make(
                self.browsergym_eval_env,
                tags_to_mark="all",
                headless=headless,
            )
        else:
            env = gym.make(
                "browsergym/openended",
                task_kwargs={"start_url": "about:blank", "goal": "PLACEHOLDER_GOAL"},
                wait_for_user_message=False,
                headless=headless,
                disable_env_checker=True,
                tags_to_mark="all",
            )
        try:
            obs, info = env.reset()
            # EVAL ONLY: remember the goal and rewards for evaluation queries
            self.eval_goal = None
            self.eval_rewards: list[float] = [0]
            if self.eval_mode:
                logger.debug(f"Browsing goal: {obs['goal']}")
                self.eval_goal = obs["goal"]
            logger.debug(
                f"Browser env started in {'headless' if headless else 'visible'} mode."
            )
            self._serve_requests(env)
        finally:
            # Close the env on every exit path (SHUTDOWN, interrupt, the
            # should_continue() flag, or an unexpected error) so the browser
            # is not leaked.
            try:
                env.close()
            except Exception as e:
                logger.debug(f"Error while closing browser env: {e}")

    def _serve_requests(self, env):
        """Answer requests from the agent side until told to stop."""
        try:
            while should_continue():
                if not self.browser_side.poll(timeout=0.01):
                    continue
                unique_request_id, action_data = self.browser_side.recv()
                # shutdown the browser environment (the caller closes env)
                if unique_request_id == "SHUTDOWN":
                    logger.debug("SHUTDOWN recv, shutting down browser env...")
                    return
                if unique_request_id == "IS_ALIVE":
                    self.browser_side.send(("ALIVE", None))
                    continue
                action = action_data["action"]
                # EVAL ONLY: Get evaluation info
                if action == BROWSER_EVAL_GET_GOAL_ACTION:
                    self.browser_side.send(
                        (unique_request_id, {"text_content": self.eval_goal})
                    )
                    continue
                if action == BROWSER_EVAL_GET_REWARDS_ACTION:
                    self.browser_side.send(
                        (
                            unique_request_id,
                            {"text_content": json.dumps(self.eval_rewards)},
                        )
                    )
                    continue
                obs, reward, terminated, truncated, info = env.step(action)
                # EVAL ONLY: record the reward for later evaluation queries
                if self.eval_mode:
                    self.eval_rewards.append(reward)
                # add text content of the page
                html_str = flatten_dom_to_str(obs["dom_object"])
                obs["text_content"] = self.html_text_converter.handle(html_str)
                # make observation serializable
                obs["screenshot"] = self.image_to_png_base64_url(obs["screenshot"])
                obs["active_page_index"] = obs["active_page_index"].item()
                obs["elapsed_time"] = obs["elapsed_time"].item()
                self.browser_side.send((unique_request_id, obs))
        except KeyboardInterrupt:
            logger.debug("Browser env process interrupted by user.")

    def step(self, action_str: str, timeout: float = 30) -> dict:
        """Execute an action in the browser environment and return the observation.

        Args:
            action_str: Action sent to the browser process.
            timeout: Seconds to wait for the matching response.

        Raises:
            TimeoutError: If no matching response arrives in time, or if
                ``should_exit()`` becomes true while waiting.
        """
        unique_request_id = str(uuid.uuid4())
        self.agent_side.send((unique_request_id, {"action": action_str}))
        # Monotonic clock: the deadline is immune to wall-clock adjustments.
        deadline = time.monotonic() + timeout
        while True:
            if should_exit() or time.monotonic() > deadline:
                raise TimeoutError("Browser environment took too long to respond.")
            if self.agent_side.poll(timeout=0.01):
                response_id, obs = self.agent_side.recv()
                # Responses to other (e.g. timed-out) requests are discarded.
                if response_id == unique_request_id:
                    return obs

    def check_alive(self, timeout: float = 60) -> bool:
        """Ask the browser process whether it is alive.

        Args:
            timeout: Seconds to wait for the reply.

        Returns:
            True if the process answered "ALIVE" in time, otherwise False.
        """
        self.agent_side.send(("IS_ALIVE", None))
        if self.agent_side.poll(timeout=timeout):
            response_id, _ = self.agent_side.recv()
            if response_id == "ALIVE":
                return True
            logger.debug(f"Browser env is not alive. Response ID: {response_id}")
        else:
            logger.debug(
                f"Browser env did not answer the liveness check within {timeout}s."
            )
        return False

    def close(self):
        """Shut the browser process down, escalating from SHUTDOWN to kill."""
        if (
            not hasattr(self, "process")
            or self.process is None
            or not self.process.is_alive()
        ):
            return
        try:
            self.agent_side.send(("SHUTDOWN", None))
            self.process.join(5)  # Wait for the process to terminate
            if self.process.is_alive():
                logger.error(
                    "Browser process did not terminate, forcefully terminating..."
                )
                self.process.terminate()
                self.process.join(5)  # Wait for the process to terminate
                if self.process.is_alive():
                    self.process.kill()
                    self.process.join(5)  # Wait for the process to terminate
            self.agent_side.close()
            self.browser_side.close()
        except Exception:
            # logger.exception records the active traceback along with the message.
            logger.exception("Encountered an error when closing browser env")

    @staticmethod
    def _image_to_base64(
        image: np.ndarray | Image.Image,
        image_format: str,
        mime_type: str,
        add_data_prefix: bool,
    ) -> str:
        """Encode ``image`` as ``image_format`` and return it base64-encoded."""
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        # Images with an alpha channel are converted to RGB before saving.
        if image.mode in ("RGBA", "LA"):
            image = image.convert("RGB")
        buffered = io.BytesIO()
        image.save(buffered, format=image_format)
        image_base64 = base64.b64encode(buffered.getvalue()).decode()
        if add_data_prefix:
            return f"data:{mime_type};base64,{image_base64}"
        return image_base64

    @staticmethod
    def image_to_png_base64_url(
        image: np.ndarray | Image.Image, add_data_prefix: bool = False
    ):
        """Convert a numpy array or PIL image to a base64 encoded PNG string.

        With ``add_data_prefix`` the result is a ``data:image/png;base64,``
        URL; otherwise it is the bare base64 payload.
        """
        return BrowserEnv._image_to_base64(
            image, "PNG", "image/png", add_data_prefix
        )

    @staticmethod
    def image_to_jpg_base64_url(
        image: np.ndarray | Image.Image, add_data_prefix: bool = False
    ):
        """Convert a numpy array or PIL image to a base64 encoded JPEG string.

        With ``add_data_prefix`` the result is a ``data:image/jpeg;base64,``
        URL; otherwise it is the bare base64 payload.
        """
        return BrowserEnv._image_to_base64(
            image, "JPEG", "image/jpeg", add_data_prefix
        )
--- a/app/tool/browser_use_tool.py
+++ b/app/tool/browser_use_tool.py
@ -2,7 +2,8 @@ import asyncio
 import json
 from typing import Optional
-from browser_use import Browser as BrowserUseBrowser, BrowserConfig
+from browser_use import Browser as BrowserUseBrowser
 from browser_use import BrowserConfig
 from browser_use.browser.context import BrowserContext
 from browser_use.dom.service import DomService
 from pydantic import Field, field_validator
@ -10,8 +11,9 @@ from pydantic_core.core_schema import ValidationInfo
 from app.tool.base import BaseTool, ToolResult
 _BROWSER_DESCRIPTION = """
-Interact with a web browser to perform various actions such as navigation, element interaction, 
+Interact with a web browser to perform various actions such as navigation, element interaction,
 content extraction, and tab management. Supported actions include:
 - 'navigate': Go to a specific URL
 - 'click': Click an element by index
@ -36,35 +38,41 @@ class BrowserUseTool(BaseTool):
            "action": {
                "type": "string",
                "enum": [
-                    "navigate", "click", "input_text", "screenshot", "get_html", "execute_js",
+                    "navigate",
-                    "scroll", "switch_tab", "new_tab", "close_tab", "refresh"
+                    "click",
                    "input_text",
                    "screenshot",
                    "get_html",
                    "execute_js",
                    "scroll",
                    "switch_tab",
                    "new_tab",
                    "close_tab",
                    "refresh",
                ],
-                "description": "The browser action to perform"
+                "description": "The browser action to perform",
            },
            "url": {
                "type": "string",
-                "description": "URL for 'navigate' or 'new_tab' actions"
+                "description": "URL for 'navigate' or 'new_tab' actions",
            },
            "index": {
                "type": "integer",
-                "description": "Element index for 'click' or 'input_text' actions"
+                "description": "Element index for 'click' or 'input_text' actions",
            },
            "text": {
                "type": "string",
                "description": "Text for 'input_text' action"
            },
            "text": {"type": "string", "description": "Text for 'input_text' action"},
            "script": {
                "type": "string",
-                "description": "JavaScript code for 'execute_js' action"
+                "description": "JavaScript code for 'execute_js' action",
            },
            "scroll_amount": {
                "type": "integer",
-                "description": "Pixels to scroll (positive for down, negative for up) for 'scroll' action"
+                "description": "Pixels to scroll (positive for down, negative for up) for 'scroll' action",
            },
            "tab_id": {
                "type": "integer",
-                "description": "Tab ID for 'switch_tab' action"
+                "description": "Tab ID for 'switch_tab' action",
-            }
+            },
        },
        "required": ["action"],
        "dependencies": {
@ -74,8 +82,8 @@ class BrowserUseTool(BaseTool):
            "execute_js": ["script"],
            "switch_tab": ["tab_id"],
            "new_tab": ["url"],
-            "scroll": ["scroll_amount"]
+            "scroll": ["scroll_amount"],
-        }
+        },
    }
    lock: asyncio.Lock = Field(default_factory=asyncio.Lock)
@ -83,7 +91,7 @@ class BrowserUseTool(BaseTool):
    context: Optional[BrowserContext] = Field(default=None, exclude=True)
    dom_service: Optional[DomService] = Field(default=None, exclude=True)
-    @field_validator('parameters', mode='before')
+    @field_validator("parameters", mode="before")
    def validate_parameters(cls, v: dict, info: ValidationInfo) -> dict:
        if not v:
            raise ValueError("Parameters cannot be empty")
@ -98,10 +106,17 @@ class BrowserUseTool(BaseTool):
            self.dom_service = DomService(await self.context.get_current_page())
        return self.context
-    async def execute(self, action: str, url: Optional[str] = None, index: Optional[int] = None,
+    async def execute(
-                      text: Optional[str] = None, script: Optional[str] = None,
+        self,
-                      scroll_amount: Optional[int] = None, tab_id: Optional[int] = None,
+        action: str,
-                      **kwargs) -> ToolResult:
+        url: Optional[str] = None,
        index: Optional[int] = None,
        text: Optional[str] = None,
        script: Optional[str] = None,
        scroll_amount: Optional[int] = None,
        tab_id: Optional[int] = None,
        **kwargs,
    ) -> ToolResult:
        """
        Execute a specified browser action.
@ -142,18 +157,22 @@ class BrowserUseTool(BaseTool):
                elif action == "input_text":
                    if index is None or not text:
-                        return ToolResult(error="Index and text are required for 'input_text' action")
+                        return ToolResult(
                            error="Index and text are required for 'input_text' action"
                        )
                    element = await context.get_dom_element_by_index(index)
                    if not element:
                        return ToolResult(error=f"Element with index {index} not found")
                    await context._input_text_element_node(element, text)
-                    return ToolResult(output=f"Input '{text}' into element at index {index}")
+                    return ToolResult(
                        output=f"Input '{text}' into element at index {index}"
                    )
                elif action == "screenshot":
                    screenshot = await context.take_screenshot(full_page=True)
                    return ToolResult(
                        output=f"Screenshot captured (base64 length: {len(screenshot)})",
-                        system=screenshot
+                        system=screenshot,
                    )
                elif action == "get_html":
@ -163,20 +182,30 @@ class BrowserUseTool(BaseTool):
                elif action == "execute_js":
                    if not script:
-                        return ToolResult(error="Script is required for 'execute_js' action")
+                        return ToolResult(
                            error="Script is required for 'execute_js' action"
                        )
                    result = await context.execute_javascript(script)
                    return ToolResult(output=str(result))
                elif action == "scroll":
                    if scroll_amount is None:
-                        return ToolResult(error="Scroll amount is required for 'scroll' action")
+                        return ToolResult(
-                    await context.execute_javascript(f"window.scrollBy(0, {scroll_amount});")
+                            error="Scroll amount is required for 'scroll' action"
                        )
                    await context.execute_javascript(
                        f"window.scrollBy(0, {scroll_amount});"
                    )
                    direction = "down" if scroll_amount > 0 else "up"
-                    return ToolResult(output=f"Scrolled {direction} by {abs(scroll_amount)} pixels")
+                    return ToolResult(
                        output=f"Scrolled {direction} by {abs(scroll_amount)} pixels"
                    )
                elif action == "switch_tab":
                    if tab_id is None:
-                        return ToolResult(error="Tab ID is required for 'switch_tab' action")
+                        return ToolResult(
                            error="Tab ID is required for 'switch_tab' action"
                        )
                    await context.switch_to_tab(tab_id)
                    return ToolResult(output=f"Switched to tab {tab_id}")
@ -210,7 +239,7 @@ class BrowserUseTool(BaseTool):
                    "url": state.url,
                    "title": state.title,
                    "tabs": [tab.model_dump() for tab in state.tabs],
-                    "interactive_elements": state.element_tree.clickable_elements_to_string()
+                    "interactive_elements": state.element_tree.clickable_elements_to_string(),
                }
                return ToolResult(output=json.dumps(state_info))
            except Exception as e:
--- a/app/tool/file_saver.py
+++ b/app/tool/file_saver.py
@ -1,6 +1,4 @@
 import os
 from typing import Optional
 from pathlib import Path
 from app.tool.base import BaseTool
@ -16,20 +14,20 @@ The tool accepts content and a file path, and saves the content to that location
        "properties": {
            "content": {
                "type": "string",
-                "description": "(required) The content to save to the file."
+                "description": "(required) The content to save to the file.",
            },
            "file_path": {
                "type": "string",
-                "description": "(required) The path where the file should be saved, including filename and extension."
+                "description": "(required) The path where the file should be saved, including filename and extension.",
            },
            "mode": {
                "type": "string",
                "description": "(optional) The file opening mode. Default is 'w' for write. Use 'a' for append.",
                "enum": ["w", "a"],
-                "default": "w"
+                "default": "w",
-            }
+            },
        },
-        "required": ["content", "file_path"]
+        "required": ["content", "file_path"],
    }
    async def execute(self, content: str, file_path: str, mode: str = "w") -> str:
@ -51,7 +49,7 @@ The tool accepts content and a file path, and saves the content to that location
                os.makedirs(directory)
            # Write directly to the file
-            with open(file_path, mode, encoding='utf-8') as file:
+            with open(file_path, mode, encoding="utf-8") as file:
                file.write(content)
            return f"Content successfully saved to {file_path}"
--- a/app/tool/google_search.py
+++ b/app/tool/google_search.py
@ -1,5 +1,6 @@
 import asyncio
-from typing import Optional, List
+from typing import List
 from googlesearch import search
 from app.tool.base import BaseTool
@ -16,15 +17,15 @@ The tool returns a list of URLs that match the search query.
        "properties": {
            "query": {
                "type": "string",
-                "description": "(required) The search query to submit to Google."
+                "description": "(required) The search query to submit to Google.",
            },
            "num_results": {
                "type": "integer",
                "description": "(optional) The number of search results to return. Default is 10.",
-                "default": 10
+                "default": 10,
-            }
+            },
        },
-        "required": ["query"]
+        "required": ["query"],
    }
    async def execute(self, query: str, num_results: int = 10) -> List[str]:
@ -41,8 +42,7 @@ The tool returns a list of URLs that match the search query.
        # Run the search in a thread pool to prevent blocking
        loop = asyncio.get_event_loop()
        links = await loop.run_in_executor(
-            None,
+            None, lambda: list(search(query, num_results=num_results))
            lambda: list(search(query, num_results=num_results))
        )
        return links
--- a/app/utils/init.py
+++ b/app/utils/init.py
--- a/app/utils/extract_html_content.py
+++ b/app/utils/extract_html_content.py
@ -1,112 +0,0 @@
 import re
 def extract_html_content(text: str, stack: str = "react-tailwind") -> str:
    """
    Extract code content from LLM response based on technology stack.
    Markdown code fences are stripped first. Patterns are then tried in order:
    the stack-specific one ("svg" or "react-tailwind"), a full <html> document,
    a <body> element (wrapped in <html>), and finally a <div> region (wrapped
    in <html><body>). If nothing matches, the stripped text is printed and
    returned unchanged.
    Args:
        text: The raw text response from LLM
        stack: Technology stack ("react-tailwind", "html-tailwind", "svg")
    Returns:
        str: Extracted code content
    """
    # Remove markdown code fences: an opening fence with optional language tag, or a bare fence
    text = re.sub(r"```[\w]*\n|```", "", text)
    if stack == "svg":
        # Extract SVG content
        svg_match = re.search(r"(<svg.*?>.*?</svg>)", text, re.DOTALL)
        if svg_match:
            return svg_match.group(1)
    elif stack == "react-tailwind":
        # Extract React component content; the trailing `$` anchors the match to
        # the last closing brace of the text
        react_match = re.search(r"(export default function.*?})\s*$", text, re.DOTALL)
        if react_match:
            return react_match.group(1)
        # Alternative: look for const/function component definition
        alt_match = re.search(
            r"((?:const|function)\s+\w+\s*=?\s*(?:\([^)]*\))?\s*=>?\s*{.*?})\s*$",
            text,
            re.DOTALL,
        )
        if alt_match:
            return alt_match.group(1)
    # Default: try to extract content within <html> tags
    html_match = re.search(r"(<html.*?>.*?</html>)", text, re.DOTALL)
    if html_match:
        return html_match.group(1)
    # If no specific patterns match, try to extract any HTML-like content
    body_match = re.search(r"(<body.*?>.*?</body>)", text, re.DOTALL)
    if body_match:
        return f"<html>\n{body_match.group(1)}\n</html>"
    # Divs nest, so match greedily up to the LAST closing tag; a non-greedy match
    # would stop at the first </div> and truncate nested markup. `\b` keeps the
    # pattern from matching longer tag names that merely start with "div".
    div_match = re.search(r"(<div\b[^>]*>.*</div>)", text, re.DOTALL)
    if div_match:
        return f"<html>\n<body>\n{div_match.group(1)}\n</body>\n</html>"
    # If no patterns match, clean up the text and return it
    cleaned_text = text.strip()
    print(
        f"[Code Extraction] No specific pattern found for stack '{stack}'. Raw content:\n{cleaned_text}"
    )
    return cleaned_text
 def clean_code_content(code: str) -> str:
    """
    Clean and format the extracted code content.
    Surrounding whitespace is removed, runs of blank lines are collapsed to a
    single blank line, and every line is re-indented with two spaces per
    brace-nesting level.
    Args:
        code: Raw code content
    Returns:
        str: Cleaned and formatted code
    """
    # Remove leading/trailing whitespace
    code = code.strip()
    # Remove extra blank lines
    code = re.sub(r"\n\s*\n", "\n\n", code)
    indent_level = 0
    formatted_lines = []
    for line in code.split("\n"):
        stripped_line = line.strip()
        if not stripped_line:
            # Keep blank lines truly empty rather than emitting trailing indent
            formatted_lines.append("")
            continue
        # A line that starts with a closer ("}", "} else {", "});") belongs to
        # the outer level, so dedent BEFORE emitting it
        if stripped_line.startswith("}"):
            indent_level = max(0, indent_level - 1)
        formatted_lines.append("  " * indent_level + stripped_line)
        # A line that ends with an opener indents everything after it; checked
        # independently so "} else {" both dedents and re-indents
        if stripped_line.endswith("{"):
            indent_level += 1
    return "\n".join(formatted_lines)
 def extract_code_content(text: str, stack: str = "react-tailwind") -> str:
    """
    Extract code from an LLM response and return it cleaned and re-indented.
    Args:
        text: Raw text from LLM response
        stack: Technology stack being used
    Returns:
        str: Final cleaned and formatted code
    """
    # Extraction feeds straight into formatting; no intermediate state is kept
    return clean_code_content(extract_html_content(text, stack))
--- a/app/utils/shutdown_listener.py
+++ b/app/utils/shutdown_listener.py
@ -1,74 +0,0 @@
 """
 This module monitors the app for shutdown signals
 """
 import asyncio
 import signal
 import threading
 import time
 from types import FrameType
 from uvicorn.server import HANDLED_SIGNALS
 from app.logger import logger
 # Tri-state shutdown flag: None until _register_signal_handlers() has run once,
 # False once it has run and no signal was seen, True after a handled signal fires.
 _should_exit = None
 def _register_signal_handler(sig: signal.Signals):
    """
    Install a handler for `sig` that records the shutdown request.
    The handler sets the module-level `_should_exit` flag and then chains to
    the handler that was installed before it, if that one is callable.
    Args:
        sig: The signal to intercept
    """
    original_handler = None
    def handler(sig_: int, frame: FrameType | None):
        logger.debug(f"shutdown_signal:{sig_}")
        global _should_exit
        _should_exit = True
        # signal.signal() may return SIG_IGN / SIG_DFL / None instead of a
        # function; SIG_IGN is truthy but not callable, so a plain truthiness
        # check would raise TypeError inside the signal handler
        if callable(original_handler):
            original_handler(sig_, frame)  # type: ignore[unreachable]
    # Assigned after `handler` is defined: the closure reads it at signal time
    original_handler = signal.signal(sig, handler)
 def _register_signal_handlers():
    """
    Initialise the shutdown flag once and, when called on the main thread,
    install a handler for every signal in HANDLED_SIGNALS.
    Later calls return immediately because the flag is no longer None.
    """
    global _should_exit
    already_initialised = _should_exit is not None
    if already_initialised:
        return
    _should_exit = False
    logger.debug("_register_signal_handlers")
    # Handlers are only installed when running on the main thread
    on_main_thread = threading.current_thread() is threading.main_thread()
    if not on_main_thread:
        logger.debug("_register_signal_handlers:not_main_thread")
        return
    logger.debug("_register_signal_handlers:main_thread")
    for handled_signal in HANDLED_SIGNALS:
        _register_signal_handler(handled_signal)
 def should_exit() -> bool:
    """Return True once a handled shutdown signal has been received."""
    # Lazily make sure the handlers are installed before reading the flag
    _register_signal_handlers()
    if _should_exit:
        return True
    return False
 def should_continue() -> bool:
    """Return True while no handled shutdown signal has been received."""
    # Lazily make sure the handlers are installed before reading the flag
    _register_signal_handlers()
    shutdown_requested = bool(_should_exit)
    return not shutdown_requested
 def sleep_if_should_continue(timeout: float):
    """
    Sleep for up to `timeout` seconds, waking early if shutdown is requested.
    Timeouts of one second or less are slept in a single call. Longer timeouts
    are slept in slices of at most one second, checking should_continue()
    between slices.
    Args:
        timeout: Maximum number of seconds to sleep
    """
    if timeout <= 1:
        time.sleep(timeout)
        return
    # Monotonic clock: unaffected by wall-clock adjustments during the wait
    deadline = time.monotonic() + timeout
    while (remaining := deadline - time.monotonic()) > 0 and should_continue():
        # Cap the last slice at the time left so the total never overshoots
        time.sleep(min(1.0, remaining))
 async def async_sleep_if_should_continue(timeout: float):
    """
    Asynchronously sleep for up to `timeout` seconds, waking early if shutdown
    is requested.
    Timeouts of one second or less are awaited in a single call. Longer
    timeouts are awaited in slices of at most one second, checking
    should_continue() between slices.
    Args:
        timeout: Maximum number of seconds to sleep
    """
    if timeout <= 1:
        await asyncio.sleep(timeout)
        return
    # Monotonic clock: unaffected by wall-clock adjustments during the wait
    deadline = time.monotonic() + timeout
    while (remaining := deadline - time.monotonic()) > 0 and should_continue():
        # Cap the last slice at the time left so the total never overshoots
        await asyncio.sleep(min(1.0, remaining))
--- a/requirements.txt
+++ b/requirements.txt
@ -14,4 +14,3 @@ uvicorn~=0.34.0
 unidiff~=0.7.5
 browser-use~=0.1.40
 googlesearch-python~=1.3.0
--- a/run_flow.py
+++ b/run_flow.py
@ -5,7 +5,7 @@ from app.flow.base import FlowType
 from app.flow.flow_factory import FlowFactory
-if __name__ == "__main__":
+async def run_flow():
    agent = ToolCallAgent()
    flow = FlowFactory.create_flow(
@ -13,7 +13,9 @@ if __name__ == "__main__":
        agents=agent,
    )
-    result = asyncio.run(
+    result = await flow.execute("Create a web app that shows Japan travel destinations")
        flow.execute("Create a web app that shows Japan travel destinations")
    )
    print(result)
 if __name__ == "__main__":
    asyncio.run(run_flow())