update ToolCallAgent and Manus

2025-03-18 02:39:11 +08:00 · 2025-03-18 02:39:11 +08:00 · 2509bc30c4
commit 2509bc30c4
parent c3203e7fa3
2 changed files with 78 additions and 9 deletions
--- a/app/agent/manus.py
+++ b/app/agent/manus.py
@ -1,8 +1,10 @@
-from typing import Any
+import json
+from typing import Any, Optional

 from pydantic import Field

 from app.agent.toolcall import ToolCallAgent
+from app.logger import logger
 from app.prompt.manus import NEXT_STEP_PROMPT, SYSTEM_PROMPT
 from app.tool import Terminate, ToolCollection
 from app.tool.browser_use_tool import BrowserUseTool
@ -43,3 +45,45 @@ class Manus(ToolCallAgent):
        else:
            await self.available_tools.get_tool(BrowserUseTool().name).cleanup()
            await super()._handle_special_tool(name, result, **kwargs)
+
+    async def get_browser_state(self) -> Optional[dict]:
+        """Return the browser tool's current state as a dict, or None if unavailable.
+
+        Stores any screenshot in ``self._current_base64_image``. Errors while
+        querying or parsing are logged at debug level and reported as None.
+        """
+        browser_tool = self.available_tools.get_tool(BrowserUseTool().name)
+        if not browser_tool:
+            return None
+
+        try:
+            result = await browser_tool.get_current_state()
+            if result.error:
+                logger.debug(f"Browser state error: {result.error}")
+                return None
+            # Store screenshot if available
+            if getattr(result, "base64_image", None):
+                self._current_base64_image = result.base64_image
+            # Callers use dict methods on the state; reject other JSON values.
+            state = json.loads(result.output)
+            return state if isinstance(state, dict) else None
+        except Exception as e:
+            logger.debug(f"Failed to get browser state: {str(e)}")
+            return None
+
+    async def think(self) -> bool:
+        """Run the parent think step with the browser URL and title in the prompt."""
+        browser_state = await self.get_browser_state()
+
+        # The browser context only applies to this step, so the prompt is
+        # extended temporarily and put back once the parent call finishes.
+        original_prompt = self.next_step_prompt
+        if browser_state and not browser_state.get("error"):
+            self.next_step_prompt += f"\nCurrent browser state:\nURL: {browser_state.get('url', 'N/A')}\nTitle: {browser_state.get('title', 'N/A')}\n"
+
+        try:
+            return await super().think()
+        finally:
+            # Restore even if the parent raises; otherwise the browser lines
+            # would stay in the prompt and pile up on every later step.
+            self.next_step_prompt = original_prompt
--- a/app/agent/toolcall.py
+++ b/app/agent/toolcall.py
@ -30,6 +30,7 @@ class ToolCallAgent(ReActAgent):
    special_tool_names: List[str] = Field(default_factory=lambda: [Terminate().name])

    tool_calls: List[ToolCall] = Field(default_factory=list)
+    _current_base64_image: Optional[str] = None

    max_steps: int = 30
    max_observe: Optional[Union[int, bool]] = None
@ -44,9 +45,11 @@ class ToolCallAgent(ReActAgent):
            # Get response with tool options
            response = await self.llm.ask_tool(
                messages=self.messages,
-                system_msgs=[Message.system_message(self.system_prompt)]
-                if self.system_prompt
-                else None,
+                system_msgs=(
+                    [Message.system_message(self.system_prompt)]
+                    if self.system_prompt
+                    else None
+                ),
                tools=self.available_tools.to_params(),
                tool_choice=self.tool_choices,
            )
@ -79,6 +82,9 @@ class ToolCallAgent(ReActAgent):
            logger.info(
                f"🧰 Tools being prepared: {[call.function.name for call in response.tool_calls]}"
            )
+            logger.info(
+                f"🔧 Tool arguments: {response.tool_calls[0].function.arguments}"
+            )

        try:
            # Handle different tool_choices modes
@ -130,6 +136,9 @@ class ToolCallAgent(ReActAgent):

        results = []
        for command in self.tool_calls:
+            # Reset base64_image for each tool call
+            self._current_base64_image = None
+
            result = await self.execute_tool(command)

            if self.max_observe:
@ -141,7 +150,10 @@ class ToolCallAgent(ReActAgent):

            # Add tool response to memory
            tool_msg = Message.tool_message(
-                content=result, tool_call_id=command.id, name=command.function.name
+                content=result,
+                tool_call_id=command.id,
+                name=command.function.name,
+                base64_image=self._current_base64_image,
            )
            self.memory.add_message(tool_msg)
            results.append(result)
@ -165,16 +177,29 @@ class ToolCallAgent(ReActAgent):
            logger.info(f"🔧 Activating tool: '{name}'...")
            result = await self.available_tools.execute(name=name, tool_input=args)

-            # Format result for display
+            # Handle special tools
+            await self._handle_special_tool(name=name, result=result)
+
+            # Check if result is a ToolResult with base64_image
+            if hasattr(result, "base64_image") and result.base64_image:
+                # Store the base64_image for later use in tool_message
+                self._current_base64_image = result.base64_image
+
+                # Format result for display
+                observation = (
+                    f"Observed output of cmd `{name}` executed:\n{str(result)}"
+                    if result
+                    else f"Cmd `{name}` completed with no output"
+                )
+                return observation
+
+            # Format result for display (standard case)
            observation = (
                f"Observed output of cmd `{name}` executed:\n{str(result)}"
                if result
                else f"Cmd `{name}` completed with no output"
            )

-            # Handle special tools like `finish`
-            await self._handle_special_tool(name=name, result=result)
-
            return observation
        except json.JSONDecodeError:
            error_msg = f"Error parsing arguments for {name}: Invalid JSON format"