From 2509bc30c49e08b9de616022932656fd7ded6dd2 Mon Sep 17 00:00:00 2001
From: liangxinbing <1580466765@qq.com>
Date: Tue, 18 Mar 2025 02:39:11 +0800
Subject: [PATCH] update ToolCallAgent and Manus

---
NOTE(review): line breaks and indentation were restored from a
whitespace-collapsed copy of this patch. Blank lines are placed so that every
hunk matches its @@ line counts and the diffstat below. The indentation depth
of context lines is inferred from Python block structure; confirm against
commit 2509bc3 before applying, or use `git apply --ignore-whitespace`.

 app/agent/manus.py    | 46 ++++++++++++++++++++++++++++++++++++++++++-
 app/agent/toolcall.py | 41 ++++++++++++++++++++++++++++++--------
 2 files changed, 78 insertions(+), 9 deletions(-)

diff --git a/app/agent/manus.py b/app/agent/manus.py
index df784ed..06101aa 100644
--- a/app/agent/manus.py
+++ b/app/agent/manus.py
@@ -1,8 +1,10 @@
-from typing import Any
+import json
+from typing import Any, Optional
 
 from pydantic import Field
 
 from app.agent.toolcall import ToolCallAgent
+from app.logger import logger
 from app.prompt.manus import NEXT_STEP_PROMPT, SYSTEM_PROMPT
 from app.tool import Terminate, ToolCollection
 from app.tool.browser_use_tool import BrowserUseTool
@@ -43,3 +45,45 @@ class Manus(ToolCallAgent):
         else:
             await self.available_tools.get_tool(BrowserUseTool().name).cleanup()
             await super()._handle_special_tool(name, result, **kwargs)
+
+    async def get_browser_state(self) -> Optional[dict]:
+        """Get the current browser state for context in next steps."""
+        browser_tool = self.available_tools.get_tool(BrowserUseTool().name)
+        if not browser_tool:
+            return None
+
+        try:
+            # Get browser state directly from the tool with no context parameter
+            result = await browser_tool.get_current_state()
+
+            if result.error:
+                logger.debug(f"Browser state error: {result.error}")
+                return None
+
+            # Store screenshot if available
+            if hasattr(result, "base64_image") and result.base64_image:
+                self._current_base64_image = result.base64_image
+
+            # Parse the state info
+            return json.loads(result.output)
+
+        except Exception as e:
+            logger.debug(f"Failed to get browser state: {str(e)}")
+            return None
+
+    async def think(self) -> bool:
+        # Add your custom pre-processing here
+        browser_state = await self.get_browser_state()
+
+        # Modify the next_step_prompt temporarily
+        original_prompt = self.next_step_prompt
+        if browser_state and not browser_state.get("error"):
+            self.next_step_prompt += f"\nCurrent browser state:\nURL: {browser_state.get('url', 'N/A')}\nTitle: {browser_state.get('title', 'N/A')}\n"
+
+        # Call parent implementation
+        result = await super().think()
+
+        # Restore original prompt
+        self.next_step_prompt = original_prompt
+
+        return result
diff --git a/app/agent/toolcall.py b/app/agent/toolcall.py
index 29e5af4..131fd91 100644
--- a/app/agent/toolcall.py
+++ b/app/agent/toolcall.py
@@ -30,6 +30,7 @@ class ToolCallAgent(ReActAgent):
     special_tool_names: List[str] = Field(default_factory=lambda: [Terminate().name])
 
     tool_calls: List[ToolCall] = Field(default_factory=list)
+    _current_base64_image: Optional[str] = None
 
     max_steps: int = 30
     max_observe: Optional[Union[int, bool]] = None
@@ -44,9 +45,11 @@ class ToolCallAgent(ReActAgent):
             # Get response with tool options
             response = await self.llm.ask_tool(
                 messages=self.messages,
-                system_msgs=[Message.system_message(self.system_prompt)]
-                if self.system_prompt
-                else None,
+                system_msgs=(
+                    [Message.system_message(self.system_prompt)]
+                    if self.system_prompt
+                    else None
+                ),
                 tools=self.available_tools.to_params(),
                 tool_choice=self.tool_choices,
             )
@@ -79,6 +82,9 @@ class ToolCallAgent(ReActAgent):
             logger.info(
                 f"🧰 Tools being prepared: {[call.function.name for call in response.tool_calls]}"
             )
+            logger.info(
+                f"🔧 Tool arguments: {response.tool_calls[0].function.arguments}"
+            )
 
         try:
             # Handle different tool_choices modes
@@ -130,6 +136,9 @@ class ToolCallAgent(ReActAgent):
 
         results = []
         for command in self.tool_calls:
+            # Reset base64_image for each tool call
+            self._current_base64_image = None
+
             result = await self.execute_tool(command)
 
             if self.max_observe:
@@ -141,7 +150,10 @@ class ToolCallAgent(ReActAgent):
 
             # Add tool response to memory
             tool_msg = Message.tool_message(
-                content=result, tool_call_id=command.id, name=command.function.name
+                content=result,
+                tool_call_id=command.id,
+                name=command.function.name,
+                base64_image=self._current_base64_image,
             )
             self.memory.add_message(tool_msg)
             results.append(result)
@@ -165,16 +177,29 @@ class ToolCallAgent(ReActAgent):
             logger.info(f"🔧 Activating tool: '{name}'...")
             result = await self.available_tools.execute(name=name, tool_input=args)
 
-            # Format result for display
+            # Handle special tools
+            await self._handle_special_tool(name=name, result=result)
+
+            # Check if result is a ToolResult with base64_image
+            if hasattr(result, "base64_image") and result.base64_image:
+                # Store the base64_image for later use in tool_message
+                self._current_base64_image = result.base64_image
+
+                # Format result for display
+                observation = (
+                    f"Observed output of cmd `{name}` executed:\n{str(result)}"
+                    if result
+                    else f"Cmd `{name}` completed with no output"
+                )
+                return observation
+
+            # Format result for display (standard case)
             observation = (
                 f"Observed output of cmd `{name}` executed:\n{str(result)}"
                 if result
                 else f"Cmd `{name}` completed with no output"
             )
 
-            # Handle special tools like `finish`
-            await self._handle_special_tool(name=name, result=result)
-
             return observation
         except json.JSONDecodeError:
             error_msg = f"Error parsing arguments for {name}: Invalid JSON format"