From 2509bc30c49e08b9de616022932656fd7ded6dd2 Mon Sep 17 00:00:00 2001
From: liangxinbing <1580466765@qq.com>
Date: Tue, 18 Mar 2025 02:39:11 +0800
Subject: [PATCH] update ToolCallAgent and Manus

---
 app/agent/manus.py    | 46 ++++++++++++++++++++++++++++++++++++++++++-
 app/agent/toolcall.py | 41 ++++++++++++++++++++++++++++++--------
 2 files changed, 78 insertions(+), 9 deletions(-)

diff --git a/app/agent/manus.py b/app/agent/manus.py
index df784ed..06101aa 100644
--- a/app/agent/manus.py
+++ b/app/agent/manus.py
@@ -1,8 +1,10 @@
-from typing import Any
+import json
+from typing import Any, Optional
 
 from pydantic import Field
 
 from app.agent.toolcall import ToolCallAgent
+from app.logger import logger
 from app.prompt.manus import NEXT_STEP_PROMPT, SYSTEM_PROMPT
 from app.tool import Terminate, ToolCollection
 from app.tool.browser_use_tool import BrowserUseTool
@@ -43,3 +45,45 @@ class Manus(ToolCallAgent):
         else:
             await self.available_tools.get_tool(BrowserUseTool().name).cleanup()
             await super()._handle_special_tool(name, result, **kwargs)
+
+    async def get_browser_state(self) -> Optional[dict]:
+        """Fetch the browser tool's current state as parsed JSON.
+
+        Returns None when the tool is missing, reports an error, or fails.
+        """
+        tool = self.available_tools.get_tool(BrowserUseTool().name)
+        if not tool:
+            return None
+
+        try:
+            # Query the tool directly; no context argument is passed
+            state = await tool.get_current_state()
+            if state.error:
+                logger.debug(f"Browser state error: {state.error}")
+                return None
+
+            # Keep the screenshot on the agent when one came back
+            screenshot = getattr(state, "base64_image", None)
+            if screenshot:
+                self._current_base64_image = screenshot
+            return json.loads(state.output)
+        except Exception as exc:
+            logger.debug(f"Failed to get browser state: {str(exc)}")
+            return None
+
+    async def think(self) -> bool:
+        """Run the parent think step with browser URL/title added to the prompt."""
+        browser_state = await self.get_browser_state()
+
+        # Extend next_step_prompt only for this call; restored in finally below
+        original_prompt = self.next_step_prompt
+        if browser_state and not browser_state.get("error"):
+            self.next_step_prompt += f"\nCurrent browser state:\nURL: {browser_state.get('url', 'N/A')}\nTitle: {browser_state.get('title', 'N/A')}\n"
+
+        try:
+            # Call parent implementation
+            return await super().think()
+        finally:
+            # Restore the prompt even when the parent raises or is cancelled;
+            # otherwise the browser-state suffix would pile up across steps.
+            self.next_step_prompt = original_prompt
diff --git a/app/agent/toolcall.py b/app/agent/toolcall.py
index 29e5af4..131fd91 100644
--- a/app/agent/toolcall.py
+++ b/app/agent/toolcall.py
@@ -30,6 +30,7 @@ class ToolCallAgent(ReActAgent):
     special_tool_names: List[str] = Field(default_factory=lambda: [Terminate().name])
 
     tool_calls: List[ToolCall] = Field(default_factory=list)
+    _current_base64_image: Optional[str] = None
 
     max_steps: int = 30
     max_observe: Optional[Union[int, bool]] = None
@@ -44,9 +45,11 @@ class ToolCallAgent(ReActAgent):
             # Get response with tool options
             response = await self.llm.ask_tool(
                 messages=self.messages,
-                system_msgs=[Message.system_message(self.system_prompt)]
-                if self.system_prompt
-                else None,
+                system_msgs=(
+                    [Message.system_message(self.system_prompt)]
+                    if self.system_prompt
+                    else None
+                ),
                 tools=self.available_tools.to_params(),
                 tool_choice=self.tool_choices,
             )
@@ -79,6 +82,9 @@ class ToolCallAgent(ReActAgent):
             logger.info(
                 f"🧰 Tools being prepared: {[call.function.name for call in response.tool_calls]}"
             )
+            logger.info(
+                f"🔧 Tool arguments: {response.tool_calls[0].function.arguments}"
+            )
 
         try:
             # Handle different tool_choices modes
@@ -130,6 +136,9 @@ class ToolCallAgent(ReActAgent):
 
         results = []
         for command in self.tool_calls:
+            # Reset base64_image for each tool call
+            self._current_base64_image = None
+
             result = await self.execute_tool(command)
 
             if self.max_observe:
@@ -141,7 +150,10 @@ class ToolCallAgent(ReActAgent):
 
             # Add tool response to memory
             tool_msg = Message.tool_message(
-                content=result, tool_call_id=command.id, name=command.function.name
+                content=result,
+                tool_call_id=command.id,
+                name=command.function.name,
+                base64_image=self._current_base64_image,
             )
             self.memory.add_message(tool_msg)
             results.append(result)
@@ -165,16 +177,29 @@ class ToolCallAgent(ReActAgent):
             logger.info(f"🔧 Activating tool: '{name}'...")
             result = await self.available_tools.execute(name=name, tool_input=args)
 
-            # Format result for display
+            # Handle special tools
+            await self._handle_special_tool(name=name, result=result)
+
+            # Check if result is a ToolResult with base64_image
+            if hasattr(result, "base64_image") and result.base64_image:
+                # Store the base64_image for later use in tool_message
+                self._current_base64_image = result.base64_image
+
+                # Format result for display
+                observation = (
+                    f"Observed output of cmd `{name}` executed:\n{str(result)}"
+                    if result
+                    else f"Cmd `{name}` completed with no output"
+                )
+                return observation
+
+            # Format result for display (standard case)
             observation = (
                 f"Observed output of cmd `{name}` executed:\n{str(result)}"
                 if result
                 else f"Cmd `{name}` completed with no output"
             )
 
-            # Handle special tools like `finish`
-            await self._handle_special_tool(name=name, result=result)
-
             return observation
         except json.JSONDecodeError:
             error_msg = f"Error parsing arguments for {name}: Invalid JSON format"