diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..1ef0e94 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,58 @@ +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + open-pull-requests-limit: 4 + groups: + # Group critical packages that might need careful review + core-dependencies: + patterns: + - "pydantic*" + - "openai" + - "fastapi" + - "tiktoken" + browsergym-related: + patterns: + - "browsergym*" + - "browser-use" + - "playwright" + search-tools: + patterns: + - "googlesearch-python" + - "baidusearch" + - "duckduckgo_search" + pre-commit: + patterns: + - "pre-commit" + security-all: + applies-to: "security-updates" + patterns: + - "*" + version-all: + applies-to: "version-updates" + patterns: + - "*" + exclude-patterns: + - "pydantic*" + - "openai" + - "fastapi" + - "tiktoken" + - "browsergym*" + - "browser-use" + - "playwright" + - "googlesearch-python" + - "baidusearch" + - "duckduckgo_search" + - "pre-commit" + + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + open-pull-requests-limit: 4 + groups: + actions: + patterns: + - "*" diff --git a/.github/workflows/environment-corrupt-check.yaml b/.github/workflows/environment-corrupt-check.yaml new file mode 100644 index 0000000..dc66fe0 --- /dev/null +++ b/.github/workflows/environment-corrupt-check.yaml @@ -0,0 +1,33 @@ +name: Environment Corruption Check +on: + push: + branches: ["main"] + paths: + - requirements.txt + pull_request: + branches: ["main"] + paths: + - requirements.txt +concurrency: + group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} + cancel-in-progress: true +jobs: + test-python-versions: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11.11", "3.12.8", "3.13.2"] + fail-fast: false + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: 
actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip + run: | + python -m pip install --upgrade pip + - name: Install dependencies + run: | + pip install -r requirements.txt diff --git a/.github/workflows/pr-autodiff.yaml b/.github/workflows/pr-autodiff.yaml new file mode 100644 index 0000000..ed218dc --- /dev/null +++ b/.github/workflows/pr-autodiff.yaml @@ -0,0 +1,127 @@ +name: PR Diff Summarization +on: + # pull_request: + # branches: [main] + # types: [opened, ready_for_review, reopened] + issue_comment: + types: [created] +permissions: + contents: read + pull-requests: write +jobs: + pr-diff-summarization: + runs-on: ubuntu-latest + if: | + (github.event_name == 'pull_request') || + (github.event_name == 'issue_comment' && + contains(github.event.comment.body, '!pr-diff') && + (github.event.comment.author_association == 'COLLABORATOR' || github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && + github.event.issue.pull_request) + steps: + - name: Get PR head SHA + id: get-pr-sha + run: | + if [ "${{ github.event_name }}" == "pull_request" ]; then + echo "pr_sha=${{ github.event.pull_request.head.sha }}" >> $GITHUB_OUTPUT + echo "Retrieved PR head SHA: ${{ github.event.pull_request.head.sha }}" + else + PR_URL="${{ github.event.issue.pull_request.url }}" + SHA=$(curl -s -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" $PR_URL | jq -r '.head.sha') + echo "pr_sha=$SHA" >> $GITHUB_OUTPUT + echo "Retrieved PR head SHA from API: $SHA" + fi + - name: Check out code + uses: actions/checkout@v4 + with: + ref: ${{ steps.get-pr-sha.outputs.pr_sha }} + fetch-depth: 0 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install openai requests + - name: Create and run Python script + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + 
OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }} + GH_TOKEN: ${{ github.token }} + PR_NUMBER: ${{ github.event.pull_request.number || github.event.issue.number }} + run: |- + cat << 'EOF' > /tmp/_workflow_core.py + import os + import subprocess + import json + import requests + from openai import OpenAI + + def get_diff(): + result = subprocess.run( + ['git', 'diff', 'origin/main...HEAD'], + capture_output=True, text=True, check=True) + return '\n'.join( + line for line in result.stdout.split('\n') + if any(line.startswith(c) for c in ('+', '-')) + and not line.startswith(('---', '+++')) + )[:round(200000 * 0.4)] # Truncate to prevent overflow + + def generate_comment(diff_content): + client = OpenAI( + base_url=os.getenv("OPENAI_BASE_URL"), + api_key=os.getenv("OPENAI_API_KEY") + ) + + guidelines = ''' + 1. English version first, Chinese Simplified version after + 2. Example format: + # Diff Report + ## English + - Added `ABC` class + - Fixed `f()` behavior in `foo` module + + ### Comments Highlight + - `config.toml` needs to be configured properly to make sure new features work as expected. + + ### Spelling/Offensive Content Check + - No spelling mistakes or offensive content found in the code or comments. + 3. Highlight non-English comments + 4. Check for spelling/offensive content''' + + response = client.chat.completions.create( + model="o3-mini", + messages=[{ + "role": "system", + "content": "Generate bilingual code review feedback." 
+ }, { + "role": "user", + "content": f"Review these changes per guidelines:\n{guidelines}\n\nDIFF:\n{diff_content}" + }] + ) + return response.choices[0].message.content + + def post_comment(comment): + repo = os.getenv("GITHUB_REPOSITORY") + pr_number = os.getenv("PR_NUMBER") + + headers = { + "Authorization": f"Bearer {os.getenv('GH_TOKEN')}", + "Accept": "application/vnd.github.v3+json" + } + url = f"https://api.github.com/repos/{repo}/issues/{pr_number}/comments" + + requests.post(url, json={"body": comment}, headers=headers, timeout=30).raise_for_status()  # fail the step if GitHub rejects the comment + + if __name__ == "__main__": + diff_content = get_diff() + if not diff_content.strip(): + print("No meaningful diff detected.") + exit(0) + + comment = generate_comment(diff_content) + post_comment(comment) + print("Comment posted successfully.") + EOF + + python /tmp/_workflow_core.py diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml index 70d8458..ea52562 100644 --- a/.github/workflows/stale.yaml +++ b/.github/workflows/stale.yaml @@ -11,7 +11,7 @@ jobs: issues: write pull-requests: write steps: - - uses: actions/stale@v5 + - uses: actions/stale@v9 with: days-before-issue-stale: 30 days-before-issue-close: 14 diff --git a/.github/workflows/top-issues.yaml b/.github/workflows/top-issues.yaml new file mode 100644 index 0000000..9ad9f59 --- /dev/null +++ b/.github/workflows/top-issues.yaml @@ -0,0 +1,29 @@ +name: Top issues +on: + schedule: + - cron: '0 0/2 * * *' + workflow_dispatch: +jobs: + ShowAndLabelTopIssues: + permissions: + issues: write + pull-requests: write + actions: read + contents: read + name: Display and label top issues + runs-on: ubuntu-latest + if: github.repository == 'mannaandpoem/OpenManus' + steps: + - name: Run top issues action + uses: rickstaa/top-issues-action@7e8dda5d5ae3087670f9094b9724a9a091fc3ba1 # v1.3.101 + env: + github_token: ${{ secrets.GITHUB_TOKEN }} + with: + label: true + dashboard: true + dashboard_show_total_reactions: true + top_issues: true + top_features: true + 
top_bugs: true + top_pull_requests: true + top_list_size: 14 diff --git a/.gitignore b/.gitignore index 653fd83..857ec7e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,14 @@ +### Project-specific ### +# Logs +logs/ + +# Data +data/ + +# Workspace +workspace/ + +### Python ### # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -170,11 +181,19 @@ cython_debug/ # PyPI configuration file .pypirc -# Logs -logs/ +### Visual Studio Code ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets -# Data -data/ +# Local History for Visual Studio Code +.history/ -# Workspace -workspace/ +# Built Visual Studio Code Extensions +*.vsix + +# OSX +.DS_Store diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..f2c6cd0 --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,8 @@ +{ + "recommendations": [ + "tamasfe.even-better-toml", + "ms-python.black-formatter", + "ms-python.isort" + ], + "unwantedRecommendations": [] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..d3aa302 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,20 @@ +{ + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.codeActionsOnSave": { + "source.organizeImports": "always" + } + }, + "[toml]": { + "editor.defaultFormatter": "tamasfe.even-better-toml", + }, + "pre-commit-helper.runOnSave": "none", + "pre-commit-helper.config": ".pre-commit-config.yaml", + "evenBetterToml.schema.enabled": true, + "evenBetterToml.schema.associations": { + "^.+config[/\\\\].+\\.toml$": "../config/schema.config.json" + }, + "files.insertFinalNewline": true, + "files.trimTrailingWhitespace": true, + "editor.formatOnSave": true +} diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..9f7a190 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim + +WORKDIR /app/OpenManus + +RUN 
apt-get update && apt-get install -y --no-install-recommends git curl \ + && rm -rf /var/lib/apt/lists/* \ + && (command -v uv >/dev/null 2>&1 || pip install --no-cache-dir uv) + +COPY . . + +RUN uv pip install --system -r requirements.txt + +CMD ["bash"] diff --git a/README.md b/README.md index ee33f75..d8e5bb0 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ +

+ +

+ English | [中文](README_zh.md) | [한국어](README_ko.md) | [日本語](README_ja.md) [![GitHub stars](https://img.shields.io/github/stars/mannaandpoem/OpenManus?style=social)](https://github.com/mannaandpoem/OpenManus/stargazers) @@ -65,7 +69,7 @@ cd OpenManus 3. Create a new virtual environment and activate it: ```bash -uv venv +uv venv --python 3.12 source .venv/bin/activate # On Unix/macOS # Or on Windows: # .venv\Scripts\activate @@ -127,6 +131,8 @@ We welcome any friendly suggestions and helpful contributions! Just create issue Or contact @mannaandpoem via 📧email: mannaandpoem@gmail.com +**Note**: Before submitting a pull request, please use the pre-commit tool to check your changes. Run `pre-commit run --all-files` to execute the checks. + ## Community Group Join our networking group on Feishu and share your experience with other developers! @@ -143,7 +149,7 @@ Join our networking group on Feishu and share your experience with other develop Thanks to [anthropic-computer-use](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) and [browser-use](https://github.com/browser-use/browser-use) for providing basic support for this project! -Additionally, we are grateful to [AAAJ](https://github.com/metauto-ai/agent-as-a-judge), [MetaGPT](https://github.com/geekan/MetaGPT) and [OpenHands](https://github.com/All-Hands-AI/OpenHands). +Additionally, we are grateful to [AAAJ](https://github.com/metauto-ai/agent-as-a-judge), [MetaGPT](https://github.com/geekan/MetaGPT), [OpenHands](https://github.com/All-Hands-AI/OpenHands) and [SWE-agent](https://github.com/SWE-agent/SWE-agent). OpenManus is built by contributors from MetaGPT. Huge thanks to this agent community! diff --git a/README_ja.md b/README_ja.md index 668d9e3..71e5b68 100644 --- a/README_ja.md +++ b/README_ja.md @@ -1,5 +1,8 @@ -[English](README.md) | [中文](README_zh.md) | [한국어](README_ko.md) | 日本語 +

+ +

+[English](README.md) | [中文](README_zh.md) | [한국어](README_ko.md) | 日本語 [![GitHub stars](https://img.shields.io/github/stars/mannaandpoem/OpenManus?style=social)](https://github.com/mannaandpoem/OpenManus/stargazers)   @@ -66,7 +69,7 @@ cd OpenManus 3. 新しい仮想環境を作成してアクティベートします: ```bash -uv venv +uv venv --python 3.12 source .venv/bin/activate # Unix/macOSの場合 # Windowsの場合: # .venv\Scripts\activate @@ -128,6 +131,8 @@ python run_flow.py または @mannaandpoem に📧メールでご連絡ください:mannaandpoem@gmail.com +**注意**: プルリクエストを送信する前に、pre-commitツールを使用して変更を確認してください。`pre-commit run --all-files`を実行してチェックを実行します。 + ## コミュニティグループ Feishuのネットワーキンググループに参加して、他の開発者と経験を共有しましょう! @@ -144,7 +149,7 @@ Feishuのネットワーキンググループに参加して、他の開発者 このプロジェクトの基本的なサポートを提供してくれた[anthropic-computer-use](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) と[browser-use](https://github.com/browser-use/browser-use)に感謝します! -さらに、[AAAJ](https://github.com/metauto-ai/agent-as-a-judge)、[MetaGPT](https://github.com/geekan/MetaGPT)、[OpenHands](https://github.com/All-Hands-AI/OpenHands)にも感謝します。 +さらに、[AAAJ](https://github.com/metauto-ai/agent-as-a-judge)、[MetaGPT](https://github.com/geekan/MetaGPT)、[OpenHands](https://github.com/All-Hands-AI/OpenHands)、[SWE-agent](https://github.com/SWE-agent/SWE-agent)にも感謝します。 OpenManusはMetaGPTのコントリビューターによって構築されました。このエージェントコミュニティに大きな感謝を! diff --git a/README_ko.md b/README_ko.md index 5cefd84..1a00afb 100644 --- a/README_ko.md +++ b/README_ko.md @@ -1,5 +1,8 @@ -[English](README.md) | [中文](README_zh.md) | 한국어 | [日本語](README_ja.md) +

+ +

+[English](README.md) | [中文](README_zh.md) | 한국어 | [日本語](README_ja.md) [![GitHub stars](https://img.shields.io/github/stars/mannaandpoem/OpenManus?style=social)](https://github.com/mannaandpoem/OpenManus/stargazers)   @@ -66,7 +69,7 @@ cd OpenManus 3. 새로운 가상 환경을 생성하고 활성화합니다: ```bash -uv venv +uv venv --python 3.12 source .venv/bin/activate # Unix/macOS의 경우 # Windows의 경우: # .venv\Scripts\activate @@ -128,6 +131,8 @@ python run_flow.py 또는 📧 메일로 연락주세요. @mannaandpoem : mannaandpoem@gmail.com +**참고**: pull request를 제출하기 전에 pre-commit 도구를 사용하여 변경 사항을 확인하십시오. `pre-commit run --all-files`를 실행하여 검사를 실행합니다. + ## 커뮤니티 그룹 Feishu 네트워킹 그룹에 참여하여 다른 개발자들과 경험을 공유하세요! @@ -144,7 +149,7 @@ Feishu 네트워킹 그룹에 참여하여 다른 개발자들과 경험을 공 이 프로젝트에 기본적인 지원을 제공해 주신 [anthropic-computer-use](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo)와 [browser-use](https://github.com/browser-use/browser-use)에게 감사드립니다! -또한, [AAAJ](https://github.com/metauto-ai/agent-as-a-judge), [MetaGPT](https://github.com/geekan/MetaGPT), [OpenHands](https://github.com/All-Hands-AI/OpenHands)에 깊은 감사를 드립니다. +또한, [AAAJ](https://github.com/metauto-ai/agent-as-a-judge), [MetaGPT](https://github.com/geekan/MetaGPT), [OpenHands](https://github.com/All-Hands-AI/OpenHands), [SWE-agent](https://github.com/SWE-agent/SWE-agent)에 깊은 감사를 드립니다. OpenManus는 MetaGPT 기여자들에 의해 개발되었습니다. 이 에이전트 커뮤니티에 깊은 감사를 전합니다! diff --git a/README_zh.md b/README_zh.md index 28f6749..15e010b 100644 --- a/README_zh.md +++ b/README_zh.md @@ -1,8 +1,9 @@ +

+ +

[English](README.md) | 中文 | [한국어](README_ko.md) | [日本語](README_ja.md) - - [![GitHub stars](https://img.shields.io/github/stars/mannaandpoem/OpenManus?style=social)](https://github.com/mannaandpoem/OpenManus/stargazers)   [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)   @@ -69,7 +70,7 @@ cd OpenManus 3. 创建并激活虚拟环境: ```bash -uv venv +uv venv --python 3.12 source .venv/bin/activate # Unix/macOS 系统 # Windows 系统使用: # .venv\Scripts\activate @@ -119,7 +120,7 @@ python main.py 然后通过终端输入你的创意! -如需体验开发中版本,可运行: +如需体验不稳定的开发版本,可运行: ```bash python run_flow.py @@ -131,6 +132,8 @@ python run_flow.py 或通过 📧 邮件联系 @mannaandpoem:mannaandpoem@gmail.com +**注意**: 在提交 pull request 之前,请使用 pre-commit 工具检查您的更改。运行 `pre-commit run --all-files` 来执行检查。 + ## 交流群 加入我们的飞书交流群,与其他开发者分享经验! @@ -148,7 +151,7 @@ python run_flow.py 特别感谢 [anthropic-computer-use](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) 和 [browser-use](https://github.com/browser-use/browser-use) 为本项目提供的基础支持! -此外,我们感谢 [AAAJ](https://github.com/metauto-ai/agent-as-a-judge),[MetaGPT](https://github.com/geekan/MetaGPT) 和 [OpenHands](https://github.com/All-Hands-AI/OpenHands). +此外,我们感谢 [AAAJ](https://github.com/metauto-ai/agent-as-a-judge),[MetaGPT](https://github.com/geekan/MetaGPT),[OpenHands](https://github.com/All-Hands-AI/OpenHands) 和 [SWE-agent](https://github.com/SWE-agent/SWE-agent). OpenManus 由 MetaGPT 社区的贡献者共同构建,感谢这个充满活力的智能体开发者社区! 
diff --git a/app/__init__.py b/app/__init__.py index e69de29..0749c6d 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -0,0 +1,10 @@ +# Python version check: warn unless the interpreter's major.minor is 3.11-3.13 +import sys + + +if not (3, 11) <= sys.version_info[:2] <= (3, 13): + print( + "Warning: Unsupported Python version {ver}, please use 3.11-3.13".format( + ver=".".join(map(str, sys.version_info[:3])) + ) + ) diff --git a/app/agent/__init__.py b/app/agent/__init__.py index a7b69c7..4b8fb9d 100644 --- a/app/agent/__init__.py +++ b/app/agent/__init__.py @@ -1,4 +1,5 @@ from app.agent.base import BaseAgent +from app.agent.browser import BrowserAgent from app.agent.planning import PlanningAgent from app.agent.react import ReActAgent from app.agent.swe import SWEAgent @@ -7,6 +8,7 @@ from app.agent.toolcall import ToolCallAgent __all__ = [ "BaseAgent", + "BrowserAgent", "PlanningAgent", "ReActAgent", "SWEAgent", diff --git a/app/agent/base.py b/app/agent/base.py index 3830365..65f6600 100644 --- a/app/agent/base.py +++ b/app/agent/base.py @@ -6,7 +6,8 @@ from pydantic import BaseModel, Field, model_validator from app.llm import LLM from app.logger import logger -from app.schema import AgentState, Memory, Message, ROLE_TYPE +from app.sandbox.client import SANDBOX_CLIENT +from app.schema import ROLE_TYPE, AgentState, Memory, Message class BaseAgent(BaseModel, ABC): @@ -82,8 +83,9 @@ class BaseAgent(BaseModel, ABC): def update_memory( self, - role: ROLE_TYPE, # type: ignore + role: ROLE_TYPE, # type: ignore content: str, + base64_image: Optional[str] = None, **kwargs, ) -> None: """Add a message to the agent's memory. @@ -91,6 +93,7 @@ class BaseAgent(BaseModel, ABC): Args: role: The role of the message sender (user, system, assistant, tool). content: The message content. + base64_image: Optional base64 encoded image. **kwargs: Additional arguments (e.g., tool_call_id for tool messages). 
Raises: @@ -106,9 +109,9 @@ class BaseAgent(BaseModel, ABC): if role not in message_map: raise ValueError(f"Unsupported message role: {role}") - msg_factory = message_map[role] - msg = msg_factory(content, **kwargs) if role == "tool" else msg_factory(content) - self.memory.add_message(msg) + # Create message with appropriate parameters based on role + kwargs = {"base64_image": base64_image, **(kwargs if role == "tool" else {})} + self.memory.add_message(message_map[role](content, **kwargs)) async def run(self, request: Optional[str] = None) -> str: """Execute the agent's main loop asynchronously. @@ -147,7 +150,7 @@ class BaseAgent(BaseModel, ABC): self.current_step = 0 self.state = AgentState.IDLE results.append(f"Terminated: Reached max steps ({self.max_steps})") - + await SANDBOX_CLIENT.cleanup() return "\n".join(results) if results else "No steps executed" @abstractmethod diff --git a/app/agent/browser.py b/app/agent/browser.py new file mode 100644 index 0000000..ae0ce2f --- /dev/null +++ b/app/agent/browser.py @@ -0,0 +1,129 @@ +import json +from typing import Any, Optional + +from pydantic import Field + +from app.agent.toolcall import ToolCallAgent +from app.logger import logger +from app.prompt.browser import NEXT_STEP_PROMPT, SYSTEM_PROMPT +from app.schema import Message, ToolChoice +from app.tool import BrowserUseTool, Terminate, ToolCollection + + +class BrowserAgent(ToolCallAgent): + """ + A browser agent that uses the browser_use library to control a browser. + + This agent can navigate web pages, interact with elements, fill forms, + extract content, and perform other browser-based actions to accomplish tasks. 
+ """ + + name: str = "browser" + description: str = "A browser agent that can control a browser to accomplish tasks" + + system_prompt: str = SYSTEM_PROMPT + next_step_prompt: str = NEXT_STEP_PROMPT + + max_observe: int = 10000 + max_steps: int = 20 + + # Configure the available tools + available_tools: ToolCollection = Field( + default_factory=lambda: ToolCollection(BrowserUseTool(), Terminate()) + ) + + # Use Auto for tool choice to allow both tool usage and free-form responses + tool_choices: ToolChoice = ToolChoice.AUTO + special_tool_names: list[str] = Field(default_factory=lambda: [Terminate().name]) + + _current_base64_image: Optional[str] = None + + async def _handle_special_tool(self, name: str, result: Any, **kwargs): + if not self._is_special_tool(name): + return + else: + await self.available_tools.get_tool(BrowserUseTool().name).cleanup() + await super()._handle_special_tool(name, result, **kwargs) + + async def get_browser_state(self) -> Optional[dict]: + """Get the current browser state for context in next steps.""" + browser_tool = self.available_tools.get_tool(BrowserUseTool().name) + if not browser_tool: + return None + + try: + # Get browser state directly from the tool + result = await browser_tool.get_current_state() + + if result.error: + logger.debug(f"Browser state error: {result.error}") + return None + + # Store screenshot if available + if hasattr(result, "base64_image") and result.base64_image: + self._current_base64_image = result.base64_image + + # Parse the state info + return json.loads(result.output) + + except Exception as e: + logger.debug(f"Failed to get browser state: {str(e)}") + return None + + async def think(self) -> bool: + """Process current state and decide next actions using tools, with browser state info added""" + # Add browser state to the context + browser_state = await self.get_browser_state() + + # Initialize placeholder values + url_info = "" + tabs_info = "" + content_above_info = "" + content_below_info = "" + 
results_info = "" + + if browser_state and not browser_state.get("error"): + # URL and title info + url_info = f"\n URL: {browser_state.get('url', 'N/A')}\n Title: {browser_state.get('title', 'N/A')}" + + # Tab information + if "tabs" in browser_state: + tabs = browser_state.get("tabs", []) + if tabs: + tabs_info = f"\n {len(tabs)} tab(s) available" + + # Content above/below viewport + pixels_above = browser_state.get("pixels_above", 0) + pixels_below = browser_state.get("pixels_below", 0) + + if pixels_above > 0: + content_above_info = f" ({pixels_above} pixels)" + + if pixels_below > 0: + content_below_info = f" ({pixels_below} pixels)" + + # Add screenshot as base64 if available + if self._current_base64_image: + # Create a message with image attachment + image_message = Message.user_message( + content="Current browser screenshot:", + base64_image=self._current_base64_image, + ) + self.memory.add_message(image_message) + + # Replace placeholders with actual browser state info + self.next_step_prompt = NEXT_STEP_PROMPT.format( + url_placeholder=url_info, + tabs_placeholder=tabs_info, + content_above_placeholder=content_above_info, + content_below_placeholder=content_below_info, + results_placeholder=results_info, + ) + + # Call parent implementation + result = await super().think() + + # Reset the next_step_prompt to its original state + self.next_step_prompt = NEXT_STEP_PROMPT + + return result diff --git a/app/agent/manus.py b/app/agent/manus.py index e11ca45..d7ec2f9 100644 --- a/app/agent/manus.py +++ b/app/agent/manus.py @@ -1,21 +1,20 @@ -from typing import Any - from pydantic import Field -from app.agent.toolcall import ToolCallAgent +from app.agent.browser import BrowserAgent +from app.config import config +from app.prompt.browser import NEXT_STEP_PROMPT as BROWSER_NEXT_STEP_PROMPT from app.prompt.manus import NEXT_STEP_PROMPT, SYSTEM_PROMPT from app.tool import Terminate, ToolCollection from app.tool.browser_use_tool import BrowserUseTool -from 
app.tool.file_saver import FileSaver -from app.tool.google_search import GoogleSearch from app.tool.python_execute import PythonExecute +from app.tool.str_replace_editor import StrReplaceEditor -class Manus(ToolCallAgent): +class Manus(BrowserAgent): """ A versatile general-purpose agent that uses planning to solve various tasks. - This agent extends PlanningAgent with a comprehensive set of tools and capabilities, + This agent extends BrowserAgent with a comprehensive set of tools and capabilities, including Python execution, web browsing, file operations, and information retrieval to handle a wide range of user requests. """ @@ -25,19 +24,40 @@ class Manus(ToolCallAgent): "A versatile agent that can solve various tasks using multiple tools" ) - system_prompt: str = SYSTEM_PROMPT + system_prompt: str = SYSTEM_PROMPT.format(directory=config.workspace_root) next_step_prompt: str = NEXT_STEP_PROMPT - max_observe: int = 2000 + max_observe: int = 10000 max_steps: int = 20 # Add general-purpose tools to the tool collection available_tools: ToolCollection = Field( default_factory=lambda: ToolCollection( - PythonExecute(), GoogleSearch(), BrowserUseTool(), FileSaver(), Terminate() + PythonExecute(), BrowserUseTool(), StrReplaceEditor(), Terminate() ) ) - async def _handle_special_tool(self, name: str, result: Any, **kwargs): - await self.available_tools.get_tool(BrowserUseTool().name).cleanup() - await super()._handle_special_tool(name, result, **kwargs) + async def think(self) -> bool: + """Process current state and decide next actions with appropriate context.""" + # Store original prompt + original_prompt = self.next_step_prompt + + # Only check recent messages (last 3) for browser activity + recent_messages = self.memory.messages[-3:] if self.memory.messages else [] + browser_in_use = any( + "browser_use" in msg.content.lower() + for msg in recent_messages + if hasattr(msg, "content") and isinstance(msg.content, str) + ) + + if browser_in_use: + # Override with 
browser-specific prompt temporarily to get browser context + self.next_step_prompt = BROWSER_NEXT_STEP_PROMPT + + # Call parent's think method + result = await super().think() + + # Restore original prompt + self.next_step_prompt = original_prompt + + return result diff --git a/app/agent/planning.py b/app/agent/planning.py index cbd15a0..7e98912 100644 --- a/app/agent/planning.py +++ b/app/agent/planning.py @@ -6,7 +6,7 @@ from pydantic import Field, model_validator from app.agent.toolcall import ToolCallAgent from app.logger import logger from app.prompt.planning import NEXT_STEP_PROMPT, PLANNING_SYSTEM_PROMPT -from app.schema import Message, TOOL_CHOICE_TYPE, ToolCall, ToolChoice +from app.schema import TOOL_CHOICE_TYPE, Message, ToolCall, ToolChoice from app.tool import PlanningTool, Terminate, ToolCollection @@ -27,7 +27,7 @@ class PlanningAgent(ToolCallAgent): available_tools: ToolCollection = Field( default_factory=lambda: ToolCollection(PlanningTool(), Terminate()) ) - tool_choices: TOOL_CHOICE_TYPE = ToolChoice.AUTO # type: ignore + tool_choices: TOOL_CHOICE_TYPE = ToolChoice.AUTO # type: ignore special_tool_names: List[str] = Field(default_factory=lambda: [Terminate().name]) tool_calls: List[ToolCall] = Field(default_factory=list) @@ -212,7 +212,7 @@ class PlanningAgent(ToolCallAgent): messages=messages, system_msgs=[Message.system_message(self.system_prompt)], tools=self.available_tools.to_params(), - tool_choice=ToolChoice.REQUIRED, + tool_choice=ToolChoice.AUTO, ) assistant_msg = Message.from_tool_calls( content=response.content, tool_calls=response.tool_calls diff --git a/app/agent/toolcall.py b/app/agent/toolcall.py index 1f04784..131fd91 100644 --- a/app/agent/toolcall.py +++ b/app/agent/toolcall.py @@ -1,13 +1,13 @@ import json - -from typing import Any, List, Literal, Optional, Union +from typing import Any, List, Optional, Union from pydantic import Field from app.agent.react import ReActAgent +from app.exceptions import TokenLimitExceeded from 
app.logger import logger from app.prompt.toolcall import NEXT_STEP_PROMPT, SYSTEM_PROMPT -from app.schema import AgentState, Message, ToolCall, TOOL_CHOICE_TYPE, ToolChoice +from app.schema import TOOL_CHOICE_TYPE, AgentState, Message, ToolCall, ToolChoice from app.tool import CreateChatCompletion, Terminate, ToolCollection @@ -26,10 +26,11 @@ class ToolCallAgent(ReActAgent): available_tools: ToolCollection = ToolCollection( CreateChatCompletion(), Terminate() ) - tool_choices: TOOL_CHOICE_TYPE = ToolChoice.AUTO # type: ignore + tool_choices: TOOL_CHOICE_TYPE = ToolChoice.AUTO # type: ignore special_tool_names: List[str] = Field(default_factory=lambda: [Terminate().name]) tool_calls: List[ToolCall] = Field(default_factory=list) + _current_base64_image: Optional[str] = None max_steps: int = 30 max_observe: Optional[Union[int, bool]] = None @@ -40,15 +41,36 @@ class ToolCallAgent(ReActAgent): user_msg = Message.user_message(self.next_step_prompt) self.messages += [user_msg] - # Get response with tool options - response = await self.llm.ask_tool( - messages=self.messages, - system_msgs=[Message.system_message(self.system_prompt)] - if self.system_prompt - else None, - tools=self.available_tools.to_params(), - tool_choice=self.tool_choices, - ) + try: + # Get response with tool options + response = await self.llm.ask_tool( + messages=self.messages, + system_msgs=( + [Message.system_message(self.system_prompt)] + if self.system_prompt + else None + ), + tools=self.available_tools.to_params(), + tool_choice=self.tool_choices, + ) + except ValueError: + raise + except Exception as e: + # Check if this is a RetryError containing TokenLimitExceeded + if hasattr(e, "__cause__") and isinstance(e.__cause__, TokenLimitExceeded): + token_limit_error = e.__cause__ + logger.error( + f"🚨 Token limit error (from RetryError): {token_limit_error}" + ) + self.memory.add_message( + Message.assistant_message( + f"Maximum token limit reached, cannot continue execution: 
{str(token_limit_error)}" + ) + ) + self.state = AgentState.FINISHED + return False + raise + self.tool_calls = response.tool_calls # Log response info @@ -60,6 +82,9 @@ class ToolCallAgent(ReActAgent): logger.info( f"🧰 Tools being prepared: {[call.function.name for call in response.tool_calls]}" ) + logger.info( + f"🔧 Tool arguments: {response.tool_calls[0].function.arguments}" + ) try: # Handle different tool_choices modes @@ -111,6 +136,9 @@ class ToolCallAgent(ReActAgent): results = [] for command in self.tool_calls: + # Reset base64_image for each tool call + self._current_base64_image = None + result = await self.execute_tool(command) if self.max_observe: @@ -122,7 +150,10 @@ class ToolCallAgent(ReActAgent): # Add tool response to memory tool_msg = Message.tool_message( - content=result, tool_call_id=command.id, name=command.function.name + content=result, + tool_call_id=command.id, + name=command.function.name, + base64_image=self._current_base64_image, ) self.memory.add_message(tool_msg) results.append(result) @@ -146,16 +177,29 @@ class ToolCallAgent(ReActAgent): logger.info(f"🔧 Activating tool: '{name}'...") result = await self.available_tools.execute(name=name, tool_input=args) - # Format result for display + # Handle special tools + await self._handle_special_tool(name=name, result=result) + + # Check if result is a ToolResult with base64_image + if hasattr(result, "base64_image") and result.base64_image: + # Store the base64_image for later use in tool_message + self._current_base64_image = result.base64_image + + # Format result for display + observation = ( + f"Observed output of cmd `{name}` executed:\n{str(result)}" + if result + else f"Cmd `{name}` completed with no output" + ) + return observation + + # Format result for display (standard case) observation = ( f"Observed output of cmd `{name}` executed:\n{str(result)}" if result else f"Cmd `{name}` completed with no output" ) - # Handle special tools like `finish` - await 
self._handle_special_tool(name=name, result=result) - return observation except json.JSONDecodeError: error_msg = f"Error parsing arguments for {name}: Invalid JSON format" diff --git a/app/config.py b/app/config.py index 64f478d..0be771b 100644 --- a/app/config.py +++ b/app/config.py @@ -20,6 +20,10 @@ class LLMSettings(BaseModel): base_url: str = Field(..., description="API base URL") api_key: str = Field(..., description="API key") max_tokens: int = Field(4096, description="Maximum number of tokens per request") + max_input_tokens: Optional[int] = Field( + None, + description="Maximum input tokens to use across all requests (None for unlimited)", + ) temperature: float = Field(1.0, description="Sampling temperature") api_type: str = Field(..., description="AzureOpenai or Openai") api_version: str = Field(..., description="Azure Openai version if AzureOpenai") @@ -31,6 +35,10 @@ class ProxySettings(BaseModel): password: Optional[str] = Field(None, description="Proxy password") +class SearchSettings(BaseModel): + engine: str = Field(default="Google", description="Search engine the llm to use") + + class BrowserSettings(BaseModel): headless: bool = Field(False, description="Whether to run browser in headless mode") disable_security: bool = Field( @@ -51,13 +59,36 @@ class BrowserSettings(BaseModel): proxy: Optional[ProxySettings] = Field( None, description="Proxy settings for the browser" ) + max_content_length: int = Field( + 2000, description="Maximum length for content retrieval operations" + ) + + +class SandboxSettings(BaseModel): + """Configuration for the execution sandbox""" + + use_sandbox: bool = Field(False, description="Whether to use the sandbox") + image: str = Field("python:3.12-slim", description="Base image") + work_dir: str = Field("/workspace", description="Container working directory") + memory_limit: str = Field("512m", description="Memory limit") + cpu_limit: float = Field(1.0, description="CPU limit") + timeout: int = Field(300, 
description="Default command timeout (seconds)") + network_enabled: bool = Field( + False, description="Whether network access is allowed" + ) class AppConfig(BaseModel): llm: Dict[str, LLMSettings] + sandbox: Optional[SandboxSettings] = Field( + None, description="Sandbox configuration" + ) browser_config: Optional[BrowserSettings] = Field( None, description="Browser configuration" ) + search_config: Optional[SearchSettings] = Field( + None, description="Search configuration" + ) class Config: arbitrary_types_allowed = True @@ -111,6 +142,7 @@ class Config: "base_url": base_llm.get("base_url"), "api_key": base_llm.get("api_key"), "max_tokens": base_llm.get("max_tokens", 4096), + "max_input_tokens": base_llm.get("max_input_tokens"), "temperature": base_llm.get("temperature", 1.0), "api_type": base_llm.get("api_type", ""), "api_version": base_llm.get("api_version", ""), @@ -149,6 +181,16 @@ class Config: if valid_browser_params: browser_settings = BrowserSettings(**valid_browser_params) + search_config = raw_config.get("search", {}) + search_settings = None + if search_config: + search_settings = SearchSettings(**search_config) + sandbox_config = raw_config.get("sandbox", {}) + if sandbox_config: + sandbox_settings = SandboxSettings(**sandbox_config) + else: + sandbox_settings = SandboxSettings() + config_dict = { "llm": { "default": default_settings, @@ -157,7 +199,9 @@ class Config: for name, override_config in llm_overrides.items() }, }, + "sandbox": sandbox_settings, "browser_config": browser_settings, + "search_config": search_settings, } self._config = AppConfig(**config_dict) @@ -166,9 +210,22 @@ class Config: def llm(self) -> Dict[str, LLMSettings]: return self._config.llm + @property + def sandbox(self) -> SandboxSettings: + return self._config.sandbox + @property def browser_config(self) -> Optional[BrowserSettings]: return self._config.browser_config + @property + def search_config(self) -> Optional[SearchSettings]: + return self._config.search_config + + 
@property + def workspace_root(self) -> Path: + """Get the workspace root directory""" + return WORKSPACE_ROOT + config = Config() diff --git a/app/exceptions.py b/app/exceptions.py index 57a0148..fc90087 100644 --- a/app/exceptions.py +++ b/app/exceptions.py @@ -3,3 +3,11 @@ class ToolError(Exception): def __init__(self, message): self.message = message + + +class OpenManusError(Exception): + """Base exception for all OpenManus errors""" + + +class TokenLimitExceeded(OpenManusError): + """Exception raised when the token limit is exceeded""" diff --git a/app/flow/planning.py b/app/flow/planning.py index a12bbe4..55ec5c9 100644 --- a/app/flow/planning.py +++ b/app/flow/planning.py @@ -124,7 +124,7 @@ class PlanningFlow(BaseFlow): messages=[user_message], system_msgs=[system_message], tools=[self.planning_tool.to_param()], - tool_choice=ToolChoice.REQUIRED, + tool_choice=ToolChoice.AUTO, ) # Process tool calls if present diff --git a/app/llm.py b/app/llm.py index 5b599e8..37de566 100644 --- a/app/llm.py +++ b/app/llm.py @@ -1,5 +1,7 @@ +import math from typing import Dict, List, Optional, Union +import tiktoken from openai import ( APIError, AsyncAzureOpenAI, @@ -8,11 +10,162 @@ from openai import ( OpenAIError, RateLimitError, ) -from tenacity import retry, stop_after_attempt, wait_random_exponential +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_random_exponential, +) from app.config import LLMSettings, config +from app.exceptions import TokenLimitExceeded from app.logger import logger # Assuming a logger is set up in your app -from app.schema import Message, TOOL_CHOICE_TYPE, ROLE_VALUES, TOOL_CHOICE_VALUES, ToolChoice +from app.schema import ( + ROLE_VALUES, + TOOL_CHOICE_TYPE, + TOOL_CHOICE_VALUES, + Message, + ToolChoice, +) + + +REASONING_MODELS = ["o1", "o3-mini"] + + +class TokenCounter: + # Token constants + BASE_MESSAGE_TOKENS = 4 + FORMAT_TOKENS = 2 + LOW_DETAIL_IMAGE_TOKENS = 85 + HIGH_DETAIL_TILE_TOKENS = 170 + 
+ # Image processing constants + MAX_SIZE = 2048 + HIGH_DETAIL_TARGET_SHORT_SIDE = 768 + TILE_SIZE = 512 + + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def count_text(self, text: str) -> int: + """Calculate tokens for a text string""" + return 0 if not text else len(self.tokenizer.encode(text)) + + def count_image(self, image_item: dict) -> int: + """ + Calculate tokens for an image based on detail level and dimensions + + For "low" detail: fixed 85 tokens + For "high" detail: + 1. Scale to fit in 2048x2048 square + 2. Scale shortest side to 768px + 3. Count 512px tiles (170 tokens each) + 4. Add 85 tokens + + The detail level is read from the item itself or, failing that, from + its nested "image_url" object; it defaults to "medium" when absent. + """ + # OpenAI-style content parts carry "detail" inside the nested image_url + # object, so look there when the item has no top-level "detail" key + image_url = image_item.get("image_url") + nested_detail = image_url.get("detail") if isinstance(image_url, dict) else None + detail = image_item.get("detail") or nested_detail or "medium" + + # For low detail, always return fixed token count + if detail == "low": + return self.LOW_DETAIL_IMAGE_TOKENS + + # For medium detail (default in OpenAI), use high detail calculation + # OpenAI doesn't specify a separate calculation for medium + + # For high detail, calculate based on dimensions if available + if detail == "high" or detail == "medium": + # If dimensions are provided in the image_item + if "dimensions" in image_item: + width, height = image_item["dimensions"] + return self._calculate_high_detail_tokens(width, height) + + # Default values when dimensions aren't available or detail level is unknown + if detail == "high": + # Default to a 1024x1024 image calculation for high detail + return self._calculate_high_detail_tokens(1024, 1024) # 765 tokens + elif detail == "medium": + # Default to a medium-sized image for medium detail + return 1024 # This matches the original default + else: + # For unknown detail levels, use medium as default + return 1024 + + def _calculate_high_detail_tokens(self, width: int, height: int) -> int: + """Calculate tokens for high detail images based on dimensions""" + # Step 1: Scale to fit in MAX_SIZE x MAX_SIZE square + if width > self.MAX_SIZE or height > self.MAX_SIZE: + scale = self.MAX_SIZE / max(width, height) + width = int(width * scale) +
height = int(height * scale) + + # Step 2: Scale so shortest side is HIGH_DETAIL_TARGET_SHORT_SIDE + # (the divisor is clamped to 1: a zero-length side, e.g. after the + # int() truncation in Step 1, would otherwise raise ZeroDivisionError) + scale = self.HIGH_DETAIL_TARGET_SHORT_SIDE / max(1, min(width, height)) + scaled_width = int(width * scale) + scaled_height = int(height * scale) + + # Step 3: Count number of 512px tiles + tiles_x = math.ceil(scaled_width / self.TILE_SIZE) + tiles_y = math.ceil(scaled_height / self.TILE_SIZE) + total_tiles = tiles_x * tiles_y + + # Step 4: Calculate final token count + return ( + total_tiles * self.HIGH_DETAIL_TILE_TOKENS + ) + self.LOW_DETAIL_IMAGE_TOKENS + + def count_content(self, content: Union[str, List[Union[str, dict]]]) -> int: + """Calculate tokens for message content""" + if not content: + return 0 + + if isinstance(content, str): + return self.count_text(content) + + token_count = 0 + for item in content: + if isinstance(item, str): + token_count += self.count_text(item) + elif isinstance(item, dict): + if "text" in item: + token_count += self.count_text(item["text"]) + elif "image_url" in item: + token_count += self.count_image(item) + return token_count + + def count_tool_calls(self, tool_calls: List[dict]) -> int: + """Calculate tokens for tool calls""" + token_count = 0 + for tool_call in tool_calls: + if "function" in tool_call: + function = tool_call["function"] + token_count += self.count_text(function.get("name", "")) + token_count += self.count_text(function.get("arguments", "")) + return token_count + + def count_message_tokens(self, messages: List[dict]) -> int: + """Calculate the total number of tokens in a message list""" + total_tokens = self.FORMAT_TOKENS # Base format tokens + + for message in messages: + tokens = self.BASE_MESSAGE_TOKENS # Base tokens per message + + # Add role tokens + tokens += self.count_text(message.get("role", "")) + + # Add content tokens + if "content" in message: + tokens += self.count_content(message["content"]) + + # Add tool calls tokens + if "tool_calls" in message: + tokens +=
self.count_tool_calls(message["tool_calls"]) + + # Add name and tool_call_id tokens + tokens += self.count_text(message.get("name", "")) + tokens += self.count_text(message.get("tool_call_id", "")) + + total_tokens += tokens + + return total_tokens class LLM: @@ -40,6 +193,23 @@ class LLM: self.api_key = llm_config.api_key self.api_version = llm_config.api_version self.base_url = llm_config.base_url + + # Add token counting related attributes + self.total_input_tokens = 0 + self.total_completion_tokens = 0 + self.max_input_tokens = ( + llm_config.max_input_tokens + if hasattr(llm_config, "max_input_tokens") + else None + ) + + # Initialize tokenizer + try: + self.tokenizer = tiktoken.encoding_for_model(self.model) + except KeyError: + # If the model is not in tiktoken's presets, use cl100k_base as default + self.tokenizer = tiktoken.get_encoding("cl100k_base") + if self.api_type == "azure": self.client = AsyncAzureOpenAI( base_url=self.base_url, @@ -49,6 +219,45 @@ class LLM: else: self.client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url) + self.token_counter = TokenCounter(self.tokenizer) + + def count_tokens(self, text: str) -> int: + """Calculate the number of tokens in a text""" + if not text: + return 0 + return len(self.tokenizer.encode(text)) + + def count_message_tokens(self, messages: List[dict]) -> int: + return self.token_counter.count_message_tokens(messages) + + def update_token_count(self, input_tokens: int, completion_tokens: int = 0) -> None: + """Add this request's token usage to the running totals and log both""" + # Totals are accumulated unconditionally, whether or not max_input_tokens is set + self.total_input_tokens += input_tokens + self.total_completion_tokens += completion_tokens + logger.info( + f"Token usage: Input={input_tokens}, Completion={completion_tokens}, " + f"Cumulative Input={self.total_input_tokens}, Cumulative Completion={self.total_completion_tokens}, " + f"Total={input_tokens + completion_tokens}, Cumulative Total={self.total_input_tokens + self.total_completion_tokens}" + ) + + def
check_token_limit(self, input_tokens: int) -> bool: + """Check if token limits are exceeded""" + if self.max_input_tokens is not None: + return (self.total_input_tokens + input_tokens) <= self.max_input_tokens + # If max_input_tokens is not set, always return True + return True + + def get_limit_error_message(self, input_tokens: int) -> str: + """Generate error message for token limit exceeded""" + if ( + self.max_input_tokens is not None + and (self.total_input_tokens + input_tokens) > self.max_input_tokens + ): + return f"Request may exceed input token limit (Current: {self.total_input_tokens}, Needed: {input_tokens}, Max: {self.max_input_tokens})" + + return "Token limit exceeded" + @staticmethod def format_messages(messages: List[Union[dict, Message]]) -> List[dict]: """ @@ -75,31 +284,67 @@ class LLM: formatted_messages = [] for message in messages: - if isinstance(message, dict): - # If message is already a dict, ensure it has required fields - if "role" not in message: - raise ValueError("Message dict must contain 'role' field") - formatted_messages.append(message) - elif isinstance(message, Message): - # If message is a Message object, convert it to dict - formatted_messages.append(message.to_dict()) - else: + # Convert Message objects to dictionaries + if isinstance(message, Message): + message = message.to_dict() + + if not isinstance(message, dict): raise TypeError(f"Unsupported message type: {type(message)}") - # Validate all messages have required fields - for msg in formatted_messages: - if msg["role"] not in ROLE_VALUES: - raise ValueError(f"Invalid role: {msg['role']}") - if "content" not in msg and "tool_calls" not in msg: - raise ValueError( - "Message must contain either 'content' or 'tool_calls'" + # Validate required fields + if "role" not in message: + raise ValueError("Message dict must contain 'role' field") + + # Process base64 images if present + if message.get("base64_image"): + # Initialize or convert content to appropriate format + if 
not message.get("content"): + message["content"] = [] + elif isinstance(message["content"], str): + message["content"] = [{"type": "text", "text": message["content"]}] + elif isinstance(message["content"], list): + # Convert string items to proper text objects + message["content"] = [ + ( + {"type": "text", "text": item} + if isinstance(item, str) + else item + ) + for item in message["content"] + ] + + # Add the image to content + message["content"].append( + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{message['base64_image']}" + }, + } ) + # Remove the base64_image field + del message["base64_image"] + + # Only include messages with content or tool_calls + if "content" in message or "tool_calls" in message: + formatted_messages.append(message) + + # Validate all roles + invalid_roles = [ + msg for msg in formatted_messages if msg["role"] not in ROLE_VALUES + ] + if invalid_roles: + raise ValueError(f"Invalid role: {invalid_roles[0]['role']}") + return formatted_messages @retry( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), + retry=retry_if_exception_type( + (OpenAIError, ValueError) + ), # Exception is intentionally absent: it would also match TokenLimitExceeded and retry it ) async def ask( self, @@ -121,6 +366,7 @@ class LLM: str: The generated response Raises: + TokenLimitExceeded: If token limits are exceeded ValueError: If messages are invalid or response is empty OpenAIError: If API call fails after retries Exception: For unexpected errors @@ -133,27 +379,204 @@ class LLM: else: messages = self.format_messages(messages) + # Calculate input token count + input_tokens = self.count_message_tokens(messages) + + # Check if token limits are exceeded + if not self.check_token_limit(input_tokens): + error_message = self.get_limit_error_message(input_tokens) + # Raise a special exception that won't be retried + raise TokenLimitExceeded(error_message) + + params = { + "model": self.model, + "messages": messages, + } + + if self.model in
REASONING_MODELS: + params["max_completion_tokens"] = self.max_tokens + else: + params["max_tokens"] = self.max_tokens + params["temperature"] = ( + temperature if temperature is not None else self.temperature + ) + if not stream: # Non-streaming request - response = await self.client.chat.completions.create( - model=self.model, - messages=messages, - max_tokens=self.max_tokens, - temperature=temperature or self.temperature, - stream=False, - ) + params["stream"] = False + + response = await self.client.chat.completions.create(**params) + if not response.choices or not response.choices[0].message.content: raise ValueError("Empty or invalid response from LLM") + + # Update token counts + self.update_token_count( + response.usage.prompt_tokens, response.usage.completion_tokens + ) + return response.choices[0].message.content - # Streaming request - response = await self.client.chat.completions.create( - model=self.model, - messages=messages, - max_tokens=self.max_tokens, - temperature=temperature or self.temperature, - stream=True, + # Streaming request, For streaming, update estimated token count before making the request + self.update_token_count(input_tokens) + + params["stream"] = True + response = await self.client.chat.completions.create(**params) + + collected_messages = [] + completion_text = "" + async for chunk in response: + chunk_message = chunk.choices[0].delta.content or "" + collected_messages.append(chunk_message) + completion_text += chunk_message + print(chunk_message, end="", flush=True) + + print() # Newline after streaming + full_response = "".join(collected_messages).strip() + if not full_response: + raise ValueError("Empty response from streaming LLM") + + # estimate completion tokens for streaming response + completion_tokens = self.count_tokens(completion_text) + logger.info( + f"Estimated completion tokens for streaming response: {completion_tokens}" ) + self.total_completion_tokens += completion_tokens + + return full_response + + except 
TokenLimitExceeded: + # Re-raise token limit errors without logging + raise + except ValueError as ve: + logger.error(f"Validation error: {ve}") + raise + except OpenAIError as oe: + logger.error(f"OpenAI API error: {oe}") + if isinstance(oe, AuthenticationError): + logger.error("Authentication failed. Check API key.") + elif isinstance(oe, RateLimitError): + logger.error("Rate limit exceeded. Consider increasing retry attempts.") + elif isinstance(oe, APIError): + logger.error(f"API error: {oe}") + raise + except Exception as e: + logger.error(f"Unexpected error in ask: {e}") + raise + + @retry( + wait=wait_random_exponential(min=1, max=60), + stop=stop_after_attempt(6), + retry=retry_if_exception_type( + (OpenAIError, ValueError) + ), # Exception is intentionally absent: it would also match TokenLimitExceeded and retry it + ) + async def ask_with_images( + self, + messages: List[Union[dict, Message]], + images: List[Union[str, dict]], + system_msgs: Optional[List[Union[dict, Message]]] = None, + stream: bool = False, + temperature: Optional[float] = None, + ) -> str: + """ + Send a prompt with images to the LLM and get the response.
+ + Args: + messages: List of conversation messages + images: List of image URLs or image data dictionaries + system_msgs: Optional system messages to prepend + stream (bool): Whether to stream the response + temperature (float): Sampling temperature for the response + + Returns: + str: The generated response + + Raises: + TokenLimitExceeded: If token limits are exceeded + ValueError: If messages are invalid or response is empty + OpenAIError: If API call fails after retries + Exception: For unexpected errors + """ + try: + # Format messages + formatted_messages = self.format_messages(messages) + + # Ensure the last message is from the user to attach images + if not formatted_messages or formatted_messages[-1]["role"] != "user": + raise ValueError( + "The last message must be from the user to attach images" + ) + + # Process the last user message to include images + last_message = formatted_messages[-1] + + # Convert content to multimodal format if needed + content = last_message["content"] + multimodal_content = ( + [{"type": "text", "text": content}] + if isinstance(content, str) + else content + if isinstance(content, list) + else [] + ) + + # Add images to content + for image in images: + if isinstance(image, str): + multimodal_content.append( + {"type": "image_url", "image_url": {"url": image}} + ) + elif isinstance(image, dict) and "url" in image: + multimodal_content.append({"type": "image_url", "image_url": image}) + elif isinstance(image, dict) and "image_url" in image: + multimodal_content.append(image) + else: + raise ValueError(f"Unsupported image format: {image}") + + # Update the message with multimodal content + last_message["content"] = multimodal_content + + # Add system messages if provided + if system_msgs: + all_messages = self.format_messages(system_msgs) + formatted_messages + else: + all_messages = formatted_messages + + # Calculate tokens and check limits + input_tokens = self.count_message_tokens(all_messages) + if not 
self.check_token_limit(input_tokens): + raise TokenLimitExceeded(self.get_limit_error_message(input_tokens)) + + # Set up API parameters + params = { + "model": self.model, + "messages": all_messages, + "stream": stream, + } + + # Add model-specific parameters + if self.model in REASONING_MODELS: + params["max_completion_tokens"] = self.max_tokens + else: + params["max_tokens"] = self.max_tokens + params["temperature"] = ( + temperature if temperature is not None else self.temperature + ) + + # Handle non-streaming request + if not stream: + response = await self.client.chat.completions.create(**params) + + if not response.choices or not response.choices[0].message.content: + raise ValueError("Empty or invalid response from LLM") + + # Record both prompt and completion usage, as ask() and ask_tool() do + self.update_token_count( + response.usage.prompt_tokens, response.usage.completion_tokens + ) + return response.choices[0].message.content + + # Handle streaming request + self.update_token_count(input_tokens) + response = await self.client.chat.completions.create(**params) collected_messages = [] async for chunk in response: @@ -163,23 +586,36 @@ class LLM: print() # Newline after streaming full_response = "".join(collected_messages).strip() + if not full_response: raise ValueError("Empty response from streaming LLM") + return full_response + except TokenLimitExceeded: + raise except ValueError as ve: - logger.error(f"Validation error: {ve}") + logger.error(f"Validation error in ask_with_images: {ve}") raise except OpenAIError as oe: logger.error(f"OpenAI API error: {oe}") + if isinstance(oe, AuthenticationError): + logger.error("Authentication failed. Check API key.") + elif isinstance(oe, RateLimitError): + logger.error("Rate limit exceeded.
Consider increasing retry attempts.") + elif isinstance(oe, APIError): + logger.error(f"API error: {oe}") raise except Exception as e: - logger.error(f"Unexpected error in ask: {e}") + logger.error(f"Unexpected error in ask_with_images: {e}") raise @retry( wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6), + retry=retry_if_exception_type( + (OpenAIError, ValueError) + ), # Exception is intentionally absent: it would also match TokenLimitExceeded and retry it ) async def ask_tool( self, @@ -187,7 +623,7 @@ class LLM: system_msgs: Optional[List[Union[dict, Message]]] = None, timeout: int = 300, tools: Optional[List[dict]] = None, - tool_choice: TOOL_CHOICE_TYPE = ToolChoice.AUTO, # type: ignore + tool_choice: TOOL_CHOICE_TYPE = ToolChoice.AUTO, # type: ignore temperature: Optional[float] = None, **kwargs, ): @@ -207,6 +643,7 @@ class LLM: ChatCompletionMessage: The model's response Raises: + TokenLimitExceeded: If token limits are exceeded ValueError: If tools, tool_choice, or messages are invalid OpenAIError: If API call fails after retries Exception: For unexpected errors @@ -223,6 +660,23 @@ class LLM: else: messages = self.format_messages(messages) + # Calculate input token count + input_tokens = self.count_message_tokens(messages) + + # If there are tools, calculate token count for tool descriptions + tools_tokens = 0 + if tools: + for tool in tools: + tools_tokens += self.count_tokens(str(tool)) + + input_tokens += tools_tokens + + # Check if token limits are exceeded + if not self.check_token_limit(input_tokens): + error_message = self.get_limit_error_message(input_tokens) + # Raise a special exception that won't be retried + raise TokenLimitExceeded(error_message) + # Validate tools if provided if tools: for tool in tools: @@ -230,28 +684,45 @@ class LLM: raise ValueError("Each tool must be a dict with 'type' field") # Set up the completion request - response = await self.client.chat.completions.create( - model=self.model, - messages=messages, - temperature=temperature or
self.temperature, - max_tokens=self.max_tokens, - tools=tools, - tool_choice=tool_choice, - timeout=timeout, + params = { + "model": self.model, + "messages": messages, + "tools": tools, + "tool_choice": tool_choice, + "timeout": timeout, **kwargs, - ) + } + + if self.model in REASONING_MODELS: + params["max_completion_tokens"] = self.max_tokens + else: + params["max_tokens"] = self.max_tokens + params["temperature"] = ( + temperature if temperature is not None else self.temperature + ) + + response = await self.client.chat.completions.create(**params) # Check if response is valid if not response.choices or not response.choices[0].message: print(response) raise ValueError("Invalid or empty response from LLM") + # Update token counts + self.update_token_count( + response.usage.prompt_tokens, response.usage.completion_tokens + ) + return response.choices[0].message + except TokenLimitExceeded: + # Re-raise token limit errors without logging + raise except ValueError as ve: logger.error(f"Validation error in ask_tool: {ve}") raise except OpenAIError as oe: + logger.error(f"OpenAI API error: {oe}") if isinstance(oe, AuthenticationError): logger.error("Authentication failed. Check API key.") elif isinstance(oe, RateLimitError): diff --git a/app/prompt/browser.py b/app/prompt/browser.py new file mode 100644 index 0000000..70fed30 --- /dev/null +++ b/app/prompt/browser.py @@ -0,0 +1,92 @@ +SYSTEM_PROMPT = """\ +You are an AI agent designed to automate browser tasks. Your goal is to accomplish the ultimate task following the rules. + +# Input Format +Task +Previous steps +Current URL +Open Tabs +Interactive Elements +[index]text +- index: Numeric identifier for interaction +- type: HTML element type (button, input, etc.) +- text: Element description +Example: +[33] + +- Only elements with numeric indexes in [] are interactive +- elements without [] provide only context + +# Response Rules +1. 
RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format: +{{"current_state": {{"evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Mention if something unexpected happened. Shortly state why/why not", +"memory": "Description of what has been done and what you need to remember. Be very specific. Count here ALWAYS how many times you have done something and how many remain. E.g. 0 out of 10 websites analyzed. Continue with abc and xyz", +"next_goal": "What needs to be done with the next immediate action"}}, +"action":[{{"one_action_name": {{// action-specific parameter}}}}, // ... more actions in sequence]}} + +2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {{max_actions}} actions per sequence. +Common action sequences: +- Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}] +- Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}] +- Actions are executed in the given order +- If the page changes after an action, the sequence is interrupted and you get the new state. +- Only provide the action sequence until an action which changes the page state significantly. +- Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page +- only use multiple actions if it makes sense. + +3. ELEMENT INTERACTION: +- Only use indexes of the interactive elements +- Elements marked with "[]Non-interactive text" are non-interactive + +4. 
NAVIGATION & ERROR HANDLING: +- If no suitable elements exist, use other functions to complete the task +- If stuck, try alternative approaches - like going back to a previous page, new search, new tab etc. +- Handle popups/cookies by accepting or closing them +- Use scroll to find elements you are looking for +- If you want to research something, open a new tab instead of using the current tab +- If captcha pops up, try to solve it - else try a different approach +- If the page is not fully loaded, use wait action + +5. TASK COMPLETION: +- Use the done action as the last action as soon as the ultimate task is complete +- Dont use "done" before you are done with everything the user asked you, except you reach the last step of max_steps. +- If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completly finished set success to true. If not everything the user asked for is completed set success in done to false! +- If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step. +- Don't hallucinate actions +- Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task. + +6. VISUAL CONTEXT: +- When an image is provided, use it to understand the page layout +- Bounding boxes with labels on their top right corner correspond to element indexes + +7. Form filling: +- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field. + +8. Long tasks: +- Keep track of the status and subresults in the memory. + +9. 
Extraction: +- If your task is to find information - call extract_content on the specific pages to get and store the information. +Your responses must be always JSON with the specified format. +""" + +NEXT_STEP_PROMPT = """ +What should I do next to achieve my goal? + +When you see [Current state starts here], focus on the following: +- Current URL and page title{url_placeholder} +- Available tabs{tabs_placeholder} +- Interactive elements and their indices +- Content above{content_above_placeholder} or below{content_below_placeholder} the viewport (if indicated) +- Any action results or errors{results_placeholder} + +For browser interactions: +- To navigate: browser_use with action="go_to_url", url="..." +- To click: browser_use with action="click_element", index=N +- To type: browser_use with action="input_text", index=N, text="..." +- To extract: browser_use with action="extract_content", goal="..." +- To scroll: browser_use with action="scroll_down" or "scroll_up" + +Consider both what's visible and what might be beyond the current viewport. +Be methodical - remember your progress and what you've learned so far. +""" diff --git a/app/prompt/manus.py b/app/prompt/manus.py index e46c793..f080ba4 100644 --- a/app/prompt/manus.py +++ b/app/prompt/manus.py @@ -1,18 +1,8 @@ -SYSTEM_PROMPT = "You are OpenManus, an all-capable AI assistant, aimed at solving any task presented by the user. You have various tools at your disposal that you can call upon to efficiently complete complex requests. Whether it's programming, information retrieval, file processing, or web browsing, you can handle it all." - -NEXT_STEP_PROMPT = """You can interact with the computer using PythonExecute, save important content and information files through FileSaver, open browsers with BrowserUseTool, and retrieve information using GoogleSearch. - -PythonExecute: Execute Python code to interact with the computer system, data processing, automation tasks, etc. 
- -FileSaver: Save files locally, such as txt, py, html, etc. - -BrowserUseTool: Open, browse, and use web browsers.If you open a local HTML file, you must provide the absolute path to the file. - -GoogleSearch: Perform web information retrieval - -Terminate: End the current interaction when the task is complete or when you need additional information from the user. Use this tool to signal that you've finished addressing the user's request or need clarification before proceeding further. +# The second literal starts with a space: adjacent string literals are +# concatenated with no separator, which would otherwise fuse the sentences. +SYSTEM_PROMPT = ( + "You are OpenManus, an all-capable AI assistant, aimed at solving any task presented by the user. You have various tools at your disposal that you can call upon to efficiently complete complex requests. Whether it's programming, information retrieval, file processing, or web browsing, you can handle it all." + " The initial directory is: {directory}" +) +NEXT_STEP_PROMPT = """ Based on user needs, proactively select the most appropriate tool or combination of tools. For complex tasks, you can break down the problem and use different tools step by step to solve it. After using each tool, clearly explain the execution results and suggest the next steps. - -Always maintain a helpful, informative tone throughout the interaction. If you encounter any limitations or need more details, clearly communicate this to the user before terminating. """ diff --git a/app/sandbox/__init__.py b/app/sandbox/__init__.py new file mode 100644 index 0000000..ccf0df6 --- /dev/null +++ b/app/sandbox/__init__.py @@ -0,0 +1,30 @@ +""" +Docker Sandbox Module + +Provides secure containerized execution environment with resource limits +and isolation for running untrusted code.
+""" +from app.sandbox.client import ( + BaseSandboxClient, + LocalSandboxClient, + create_sandbox_client, +) +from app.sandbox.core.exceptions import ( + SandboxError, + SandboxResourceError, + SandboxTimeoutError, +) +from app.sandbox.core.manager import SandboxManager +from app.sandbox.core.sandbox import DockerSandbox + + +__all__ = [ + "DockerSandbox", + "SandboxManager", + "BaseSandboxClient", + "LocalSandboxClient", + "create_sandbox_client", + "SandboxError", + "SandboxTimeoutError", + "SandboxResourceError", +] diff --git a/app/sandbox/client.py b/app/sandbox/client.py new file mode 100644 index 0000000..09a8f2e --- /dev/null +++ b/app/sandbox/client.py @@ -0,0 +1,201 @@ +from abc import ABC, abstractmethod +from typing import Dict, Optional, Protocol + +from app.config import SandboxSettings +from app.sandbox.core.sandbox import DockerSandbox + + +class SandboxFileOperations(Protocol): + """Protocol for sandbox file operations.""" + + async def copy_from(self, container_path: str, local_path: str) -> None: + """Copies file from container to local. + + Args: + container_path: File path in container. + local_path: Local destination path. + """ + ... + + async def copy_to(self, local_path: str, container_path: str) -> None: + """Copies file from local to container. + + Args: + local_path: Local source file path. + container_path: Destination path in container. + """ + ... + + async def read_file(self, path: str) -> str: + """Reads file content from container. + + Args: + path: File path in container. + + Returns: + str: File content. + """ + ... + + async def write_file(self, path: str, content: str) -> None: + """Writes content to file in container. + + Args: + path: File path in container. + content: Content to write. + """ + ... 
+ + +class BaseSandboxClient(ABC): + """Base sandbox client interface.""" + + @abstractmethod + async def create( + self, + config: Optional[SandboxSettings] = None, + volume_bindings: Optional[Dict[str, str]] = None, + ) -> None: + """Creates sandbox.""" + + @abstractmethod + async def run_command(self, command: str, timeout: Optional[int] = None) -> str: + """Executes command.""" + + @abstractmethod + async def copy_from(self, container_path: str, local_path: str) -> None: + """Copies file from container.""" + + @abstractmethod + async def copy_to(self, local_path: str, container_path: str) -> None: + """Copies file to container.""" + + @abstractmethod + async def read_file(self, path: str) -> str: + """Reads file.""" + + @abstractmethod + async def write_file(self, path: str, content: str) -> None: + """Writes file.""" + + @abstractmethod + async def cleanup(self) -> None: + """Cleans up resources.""" + + +class LocalSandboxClient(BaseSandboxClient): + """Local sandbox client implementation.""" + + def __init__(self): + """Initializes local sandbox client.""" + self.sandbox: Optional[DockerSandbox] = None + + async def create( + self, + config: Optional[SandboxSettings] = None, + volume_bindings: Optional[Dict[str, str]] = None, + ) -> None: + """Creates a sandbox. + + Args: + config: Sandbox configuration. + volume_bindings: Volume mappings. + + Raises: + RuntimeError: If sandbox creation fails. + """ + self.sandbox = DockerSandbox(config, volume_bindings) + await self.sandbox.create() + + async def run_command(self, command: str, timeout: Optional[int] = None) -> str: + """Runs command in sandbox. + + Args: + command: Command to execute. + timeout: Execution timeout in seconds. + + Returns: + Command output. + + Raises: + RuntimeError: If sandbox not initialized. 
+ """ + if not self.sandbox: + raise RuntimeError("Sandbox not initialized") + return await self.sandbox.run_command(command, timeout) + + async def copy_from(self, container_path: str, local_path: str) -> None: + """Copies file from container to local. + + Args: + container_path: File path in container. + local_path: Local destination path. + + Raises: + RuntimeError: If sandbox not initialized. + """ + if not self.sandbox: + raise RuntimeError("Sandbox not initialized") + await self.sandbox.copy_from(container_path, local_path) + + async def copy_to(self, local_path: str, container_path: str) -> None: + """Copies file from local to container. + + Args: + local_path: Local source file path. + container_path: Destination path in container. + + Raises: + RuntimeError: If sandbox not initialized. + """ + if not self.sandbox: + raise RuntimeError("Sandbox not initialized") + await self.sandbox.copy_to(local_path, container_path) + + async def read_file(self, path: str) -> str: + """Reads file from container. + + Args: + path: File path in container. + + Returns: + File content. + + Raises: + RuntimeError: If sandbox not initialized. + """ + if not self.sandbox: + raise RuntimeError("Sandbox not initialized") + return await self.sandbox.read_file(path) + + async def write_file(self, path: str, content: str) -> None: + """Writes file to container. + + Args: + path: File path in container. + content: File content. + + Raises: + RuntimeError: If sandbox not initialized. + """ + if not self.sandbox: + raise RuntimeError("Sandbox not initialized") + await self.sandbox.write_file(path, content) + + async def cleanup(self) -> None: + """Cleans up resources.""" + if self.sandbox: + await self.sandbox.cleanup() + self.sandbox = None + + +def create_sandbox_client() -> LocalSandboxClient: + """Creates a sandbox client. + + Returns: + LocalSandboxClient: Sandbox client instance. 
+ """ + return LocalSandboxClient() + + +SANDBOX_CLIENT = create_sandbox_client() diff --git a/app/sandbox/core/exceptions.py b/app/sandbox/core/exceptions.py new file mode 100644 index 0000000..5c1f0e8 --- /dev/null +++ b/app/sandbox/core/exceptions.py @@ -0,0 +1,17 @@ +"""Exception classes for the sandbox system. + +This module defines custom exceptions used throughout the sandbox system to +handle various error conditions in a structured way. +""" + + +class SandboxError(Exception): + """Base exception for sandbox-related errors.""" + + +class SandboxTimeoutError(SandboxError): + """Exception raised when a sandbox operation times out.""" + + +class SandboxResourceError(SandboxError): + """Exception raised for resource-related errors.""" diff --git a/app/sandbox/core/manager.py b/app/sandbox/core/manager.py new file mode 100644 index 0000000..5814f12 --- /dev/null +++ b/app/sandbox/core/manager.py @@ -0,0 +1,313 @@ +import asyncio +import uuid +from contextlib import asynccontextmanager +from typing import Dict, Optional, Set + +import docker +from docker.errors import APIError, ImageNotFound + +from app.config import SandboxSettings +from app.logger import logger +from app.sandbox.core.sandbox import DockerSandbox + + +class SandboxManager: + """Docker sandbox manager. + + Manages multiple DockerSandbox instances lifecycle including creation, + monitoring, and cleanup. Provides concurrent access control and automatic + cleanup mechanisms for sandbox resources. + + Attributes: + max_sandboxes: Maximum allowed number of sandboxes. + idle_timeout: Sandbox idle timeout in seconds. + cleanup_interval: Cleanup check interval in seconds. + _sandboxes: Active sandbox instance mapping. + _last_used: Last used time record for sandboxes. + """ + + def __init__( + self, + max_sandboxes: int = 100, + idle_timeout: int = 3600, + cleanup_interval: int = 300, + ): + """Initializes sandbox manager. + + Args: + max_sandboxes: Maximum sandbox count limit. 
+ idle_timeout: Idle timeout in seconds. + cleanup_interval: Cleanup check interval in seconds. + """ + self.max_sandboxes = max_sandboxes + self.idle_timeout = idle_timeout + self.cleanup_interval = cleanup_interval + + # Docker client + self._client = docker.from_env() + + # Resource mappings + self._sandboxes: Dict[str, DockerSandbox] = {} + self._last_used: Dict[str, float] = {} + + # Concurrency control + self._locks: Dict[str, asyncio.Lock] = {} + self._global_lock = asyncio.Lock() + self._active_operations: Set[str] = set() + + # Cleanup task + self._cleanup_task: Optional[asyncio.Task] = None + self._is_shutting_down = False + + # Start automatic cleanup + self.start_cleanup_task() + + async def ensure_image(self, image: str) -> bool: + """Ensures Docker image is available. + + Args: + image: Image name. + + Returns: + bool: Whether image is available. + """ + try: + self._client.images.get(image) + return True + except ImageNotFound: + try: + logger.info(f"Pulling image {image}...") + await asyncio.get_event_loop().run_in_executor( + None, self._client.images.pull, image + ) + return True + except (APIError, Exception) as e: + logger.error(f"Failed to pull image {image}: {e}") + return False + + @asynccontextmanager + async def sandbox_operation(self, sandbox_id: str): + """Context manager for sandbox operations. + + Provides concurrency control and usage time updates. + + Args: + sandbox_id: Sandbox ID. + + Raises: + KeyError: If sandbox not found. 
+ """ + if sandbox_id not in self._locks: + self._locks[sandbox_id] = asyncio.Lock() + + async with self._locks[sandbox_id]: + if sandbox_id not in self._sandboxes: + raise KeyError(f"Sandbox {sandbox_id} not found") + + self._active_operations.add(sandbox_id) + try: + self._last_used[sandbox_id] = asyncio.get_event_loop().time() + yield self._sandboxes[sandbox_id] + finally: + self._active_operations.remove(sandbox_id) + + async def create_sandbox( + self, + config: Optional[SandboxSettings] = None, + volume_bindings: Optional[Dict[str, str]] = None, + ) -> str: + """Creates a new sandbox instance. + + Args: + config: Sandbox configuration. + volume_bindings: Volume mapping configuration. + + Returns: + str: Sandbox ID. + + Raises: + RuntimeError: If max sandbox count reached or creation fails. + """ + async with self._global_lock: + if len(self._sandboxes) >= self.max_sandboxes: + raise RuntimeError( + f"Maximum number of sandboxes ({self.max_sandboxes}) reached" + ) + + config = config or SandboxSettings() + if not await self.ensure_image(config.image): + raise RuntimeError(f"Failed to ensure Docker image: {config.image}") + + sandbox_id = str(uuid.uuid4()) + try: + sandbox = DockerSandbox(config, volume_bindings) + await sandbox.create() + + self._sandboxes[sandbox_id] = sandbox + self._last_used[sandbox_id] = asyncio.get_event_loop().time() + self._locks[sandbox_id] = asyncio.Lock() + + logger.info(f"Created sandbox {sandbox_id}") + return sandbox_id + + except Exception as e: + logger.error(f"Failed to create sandbox: {e}") + if sandbox_id in self._sandboxes: + await self.delete_sandbox(sandbox_id) + raise RuntimeError(f"Failed to create sandbox: {e}") + + async def get_sandbox(self, sandbox_id: str) -> DockerSandbox: + """Gets a sandbox instance. + + Args: + sandbox_id: Sandbox ID. + + Returns: + DockerSandbox: Sandbox instance. + + Raises: + KeyError: If sandbox does not exist. 
+ """ + async with self.sandbox_operation(sandbox_id) as sandbox: + return sandbox + + def start_cleanup_task(self) -> None: + """Starts automatic cleanup task.""" + + async def cleanup_loop(): + while not self._is_shutting_down: + try: + await self._cleanup_idle_sandboxes() + except Exception as e: + logger.error(f"Error in cleanup loop: {e}") + await asyncio.sleep(self.cleanup_interval) + + self._cleanup_task = asyncio.create_task(cleanup_loop()) + + async def _cleanup_idle_sandboxes(self) -> None: + """Cleans up idle sandboxes.""" + current_time = asyncio.get_event_loop().time() + to_cleanup = [] + + async with self._global_lock: + for sandbox_id, last_used in self._last_used.items(): + if ( + sandbox_id not in self._active_operations + and current_time - last_used > self.idle_timeout + ): + to_cleanup.append(sandbox_id) + + for sandbox_id in to_cleanup: + try: + await self.delete_sandbox(sandbox_id) + except Exception as e: + logger.error(f"Error cleaning up sandbox {sandbox_id}: {e}") + + async def cleanup(self) -> None: + """Cleans up all resources.""" + logger.info("Starting manager cleanup...") + self._is_shutting_down = True + + # Cancel cleanup task + if self._cleanup_task: + self._cleanup_task.cancel() + try: + await asyncio.wait_for(self._cleanup_task, timeout=1.0) + except (asyncio.CancelledError, asyncio.TimeoutError): + pass + + # Get all sandbox IDs to clean up + async with self._global_lock: + sandbox_ids = list(self._sandboxes.keys()) + + # Concurrently clean up all sandboxes + cleanup_tasks = [] + for sandbox_id in sandbox_ids: + task = asyncio.create_task(self._safe_delete_sandbox(sandbox_id)) + cleanup_tasks.append(task) + + if cleanup_tasks: + # Wait for all cleanup tasks to complete, with timeout to avoid infinite waiting + try: + await asyncio.wait(cleanup_tasks, timeout=30.0) + except asyncio.TimeoutError: + logger.error("Sandbox cleanup timed out") + + # Clean up remaining references + self._sandboxes.clear() + self._last_used.clear() + 
self._locks.clear() + self._active_operations.clear() + + logger.info("Manager cleanup completed") + + async def _safe_delete_sandbox(self, sandbox_id: str) -> None: + """Safely deletes a single sandbox. + + Args: + sandbox_id: Sandbox ID to delete. + """ + try: + if sandbox_id in self._active_operations: + logger.warning( + f"Sandbox {sandbox_id} has active operations, waiting for completion" + ) + for _ in range(10): # Wait at most 10 times + await asyncio.sleep(0.5) + if sandbox_id not in self._active_operations: + break + else: + logger.warning( + f"Timeout waiting for sandbox {sandbox_id} operations to complete" + ) + + # Get reference to sandbox object + sandbox = self._sandboxes.get(sandbox_id) + if sandbox: + await sandbox.cleanup() + + # Remove sandbox record from manager + async with self._global_lock: + self._sandboxes.pop(sandbox_id, None) + self._last_used.pop(sandbox_id, None) + self._locks.pop(sandbox_id, None) + logger.info(f"Deleted sandbox {sandbox_id}") + except Exception as e: + logger.error(f"Error during cleanup of sandbox {sandbox_id}: {e}") + + async def delete_sandbox(self, sandbox_id: str) -> None: + """Deletes specified sandbox. + + Args: + sandbox_id: Sandbox ID. + """ + if sandbox_id not in self._sandboxes: + return + + try: + await self._safe_delete_sandbox(sandbox_id) + except Exception as e: + logger.error(f"Failed to delete sandbox {sandbox_id}: {e}") + + async def __aenter__(self) -> "SandboxManager": + """Async context manager entry.""" + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: + """Async context manager exit.""" + await self.cleanup() + + def get_stats(self) -> Dict: + """Gets manager statistics. + + Returns: + Dict: Statistics information. 
+ """ + return { + "total_sandboxes": len(self._sandboxes), + "active_operations": len(self._active_operations), + "max_sandboxes": self.max_sandboxes, + "idle_timeout": self.idle_timeout, + "cleanup_interval": self.cleanup_interval, + "is_shutting_down": self._is_shutting_down, + } diff --git a/app/sandbox/core/sandbox.py b/app/sandbox/core/sandbox.py new file mode 100644 index 0000000..c57b3f2 --- /dev/null +++ b/app/sandbox/core/sandbox.py @@ -0,0 +1,462 @@ +import asyncio +import io +import os +import tarfile +import tempfile +import uuid +from typing import Dict, Optional + +import docker +from docker.errors import NotFound +from docker.models.containers import Container + +from app.config import SandboxSettings +from app.sandbox.core.exceptions import SandboxTimeoutError +from app.sandbox.core.terminal import AsyncDockerizedTerminal + + +class DockerSandbox: + """Docker sandbox environment. + + Provides a containerized execution environment with resource limits, + file operations, and command execution capabilities. + + Attributes: + config: Sandbox configuration. + volume_bindings: Volume mapping configuration. + client: Docker client. + container: Docker container instance. + terminal: Container terminal interface. + """ + + def __init__( + self, + config: Optional[SandboxSettings] = None, + volume_bindings: Optional[Dict[str, str]] = None, + ): + """Initializes a sandbox instance. + + Args: + config: Sandbox configuration. Default configuration used if None. + volume_bindings: Volume mappings in {host_path: container_path} format. + """ + self.config = config or SandboxSettings() + self.volume_bindings = volume_bindings or {} + self.client = docker.from_env() + self.container: Optional[Container] = None + self.terminal: Optional[AsyncDockerizedTerminal] = None + + async def create(self) -> "DockerSandbox": + """Creates and starts the sandbox container. + + Returns: + Current sandbox instance. + + Raises: + docker.errors.APIError: If Docker API call fails. 
+ RuntimeError: If container creation or startup fails. + """ + try: + # Prepare container config + host_config = self.client.api.create_host_config( + mem_limit=self.config.memory_limit, + cpu_period=100000, + cpu_quota=int(100000 * self.config.cpu_limit), + network_mode="none" if not self.config.network_enabled else "bridge", + binds=self._prepare_volume_bindings(), + ) + + # Generate unique container name with sandbox_ prefix + container_name = f"sandbox_{uuid.uuid4().hex[:8]}" + + # Create container + container = await asyncio.to_thread( + self.client.api.create_container, + image=self.config.image, + command="tail -f /dev/null", + hostname="sandbox", + working_dir=self.config.work_dir, + host_config=host_config, + name=container_name, + tty=True, + detach=True, + ) + + self.container = self.client.containers.get(container["Id"]) + + # Start container + await asyncio.to_thread(self.container.start) + + # Initialize terminal + self.terminal = AsyncDockerizedTerminal( + container["Id"], + self.config.work_dir, + env_vars={"PYTHONUNBUFFERED": "1"} + # Ensure Python output is not buffered + ) + await self.terminal.init() + + return self + + except Exception as e: + await self.cleanup() # Ensure resources are cleaned up + raise RuntimeError(f"Failed to create sandbox: {e}") from e + + def _prepare_volume_bindings(self) -> Dict[str, Dict[str, str]]: + """Prepares volume binding configuration. + + Returns: + Volume binding configuration dictionary. + """ + bindings = {} + + # Create and add working directory mapping + work_dir = self._ensure_host_dir(self.config.work_dir) + bindings[work_dir] = {"bind": self.config.work_dir, "mode": "rw"} + + # Add custom volume bindings + for host_path, container_path in self.volume_bindings.items(): + bindings[host_path] = {"bind": container_path, "mode": "rw"} + + return bindings + + @staticmethod + def _ensure_host_dir(path: str) -> str: + """Ensures directory exists on the host. + + Args: + path: Directory path. 
+ + Returns: + Actual path on the host. + """ + host_path = os.path.join( + tempfile.gettempdir(), + f"sandbox_{os.path.basename(path)}_{os.urandom(4).hex()}", + ) + os.makedirs(host_path, exist_ok=True) + return host_path + + async def run_command(self, cmd: str, timeout: Optional[int] = None) -> str: + """Runs a command in the sandbox. + + Args: + cmd: Command to execute. + timeout: Timeout in seconds. + + Returns: + Command output as string. + + Raises: + RuntimeError: If sandbox not initialized or command execution fails. + TimeoutError: If command execution times out. + """ + if not self.terminal: + raise RuntimeError("Sandbox not initialized") + + try: + return await self.terminal.run_command( + cmd, timeout=timeout or self.config.timeout + ) + except TimeoutError: + raise SandboxTimeoutError( + f"Command execution timed out after {timeout or self.config.timeout} seconds" + ) + + async def read_file(self, path: str) -> str: + """Reads a file from the container. + + Args: + path: File path. + + Returns: + File contents as string. + + Raises: + FileNotFoundError: If file does not exist. + RuntimeError: If read operation fails. + """ + if not self.container: + raise RuntimeError("Sandbox not initialized") + + try: + # Get file archive + resolved_path = self._safe_resolve_path(path) + tar_stream, _ = await asyncio.to_thread( + self.container.get_archive, resolved_path + ) + + # Read file content from tar stream + content = await self._read_from_tar(tar_stream) + return content.decode("utf-8") + + except NotFound: + raise FileNotFoundError(f"File not found: {path}") + except Exception as e: + raise RuntimeError(f"Failed to read file: {e}") + + async def write_file(self, path: str, content: str) -> None: + """Writes content to a file in the container. + + Args: + path: Target path. + content: File content. + + Raises: + RuntimeError: If write operation fails. 
+ """ + if not self.container: + raise RuntimeError("Sandbox not initialized") + + try: + resolved_path = self._safe_resolve_path(path) + parent_dir = os.path.dirname(resolved_path) + + # Create parent directory + if parent_dir: + await self.run_command(f"mkdir -p {parent_dir}") + + # Prepare file data + tar_stream = await self._create_tar_stream( + os.path.basename(path), content.encode("utf-8") + ) + + # Write file + await asyncio.to_thread( + self.container.put_archive, parent_dir or "/", tar_stream + ) + + except Exception as e: + raise RuntimeError(f"Failed to write file: {e}") + + def _safe_resolve_path(self, path: str) -> str: + """Safely resolves container path, preventing path traversal. + + Args: + path: Original path. + + Returns: + Resolved absolute path. + + Raises: + ValueError: If path contains potentially unsafe patterns. + """ + # Check for path traversal attempts + if ".." in path.split("/"): + raise ValueError("Path contains potentially unsafe patterns") + + resolved = ( + os.path.join(self.config.work_dir, path) + if not os.path.isabs(path) + else path + ) + return resolved + + async def copy_from(self, src_path: str, dst_path: str) -> None: + """Copies a file from the container. + + Args: + src_path: Source file path (container). + dst_path: Destination path (host). + + Raises: + FileNotFoundError: If source file does not exist. + RuntimeError: If copy operation fails. 
+ """ + try: + # Ensure destination file's parent directory exists + parent_dir = os.path.dirname(dst_path) + if parent_dir: + os.makedirs(parent_dir, exist_ok=True) + + # Get file stream + resolved_src = self._safe_resolve_path(src_path) + stream, stat = await asyncio.to_thread( + self.container.get_archive, resolved_src + ) + + # Create temporary directory to extract file + with tempfile.TemporaryDirectory() as tmp_dir: + # Write stream to temporary file + tar_path = os.path.join(tmp_dir, "temp.tar") + with open(tar_path, "wb") as f: + for chunk in stream: + f.write(chunk) + + # Extract file + with tarfile.open(tar_path) as tar: + members = tar.getmembers() + if not members: + raise FileNotFoundError(f"Source file is empty: {src_path}") + + # If destination is a directory, we should preserve relative path structure + if os.path.isdir(dst_path): + tar.extractall(dst_path) + else: + # If destination is a file, we only extract the source file's content + if len(members) > 1: + raise RuntimeError( + f"Source path is a directory but destination is a file: {src_path}" + ) + + with open(dst_path, "wb") as dst: + src_file = tar.extractfile(members[0]) + if src_file is None: + raise RuntimeError( + f"Failed to extract file: {src_path}" + ) + dst.write(src_file.read()) + + except docker.errors.NotFound: + raise FileNotFoundError(f"Source file not found: {src_path}") + except Exception as e: + raise RuntimeError(f"Failed to copy file: {e}") + + async def copy_to(self, src_path: str, dst_path: str) -> None: + """Copies a file to the container. + + Args: + src_path: Source file path (host). + dst_path: Destination path (container). + + Raises: + FileNotFoundError: If source file does not exist. + RuntimeError: If copy operation fails. 
+ """ + try: + if not os.path.exists(src_path): + raise FileNotFoundError(f"Source file not found: {src_path}") + + # Create destination directory in container + resolved_dst = self._safe_resolve_path(dst_path) + container_dir = os.path.dirname(resolved_dst) + if container_dir: + await self.run_command(f"mkdir -p {container_dir}") + + # Create tar file to upload + with tempfile.TemporaryDirectory() as tmp_dir: + tar_path = os.path.join(tmp_dir, "temp.tar") + with tarfile.open(tar_path, "w") as tar: + # Handle directory source path + if os.path.isdir(src_path): + os.path.basename(src_path.rstrip("/")) + for root, _, files in os.walk(src_path): + for file in files: + file_path = os.path.join(root, file) + arcname = os.path.join( + os.path.basename(dst_path), + os.path.relpath(file_path, src_path), + ) + tar.add(file_path, arcname=arcname) + else: + # Add single file to tar + tar.add(src_path, arcname=os.path.basename(dst_path)) + + # Read tar file content + with open(tar_path, "rb") as f: + data = f.read() + + # Upload to container + await asyncio.to_thread( + self.container.put_archive, + os.path.dirname(resolved_dst) or "/", + data, + ) + + # Verify file was created successfully + try: + await self.run_command(f"test -e {resolved_dst}") + except Exception: + raise RuntimeError(f"Failed to verify file creation: {dst_path}") + + except FileNotFoundError: + raise + except Exception as e: + raise RuntimeError(f"Failed to copy file: {e}") + + @staticmethod + async def _create_tar_stream(name: str, content: bytes) -> io.BytesIO: + """Creates a tar file stream. + + Args: + name: Filename. + content: File content. + + Returns: + Tar file stream. 
+ """ + tar_stream = io.BytesIO() + with tarfile.open(fileobj=tar_stream, mode="w") as tar: + tarinfo = tarfile.TarInfo(name=name) + tarinfo.size = len(content) + tar.addfile(tarinfo, io.BytesIO(content)) + tar_stream.seek(0) + return tar_stream + + @staticmethod + async def _read_from_tar(tar_stream) -> bytes: + """Reads file content from a tar stream. + + Args: + tar_stream: Tar file stream. + + Returns: + File content. + + Raises: + RuntimeError: If read operation fails. + """ + with tempfile.NamedTemporaryFile() as tmp: + for chunk in tar_stream: + tmp.write(chunk) + tmp.seek(0) + + with tarfile.open(fileobj=tmp) as tar: + member = tar.next() + if not member: + raise RuntimeError("Empty tar archive") + + file_content = tar.extractfile(member) + if not file_content: + raise RuntimeError("Failed to extract file content") + + return file_content.read() + + async def cleanup(self) -> None: + """Cleans up sandbox resources.""" + errors = [] + try: + if self.terminal: + try: + await self.terminal.close() + except Exception as e: + errors.append(f"Terminal cleanup error: {e}") + finally: + self.terminal = None + + if self.container: + try: + await asyncio.to_thread(self.container.stop, timeout=5) + except Exception as e: + errors.append(f"Container stop error: {e}") + + try: + await asyncio.to_thread(self.container.remove, force=True) + except Exception as e: + errors.append(f"Container remove error: {e}") + finally: + self.container = None + + except Exception as e: + errors.append(f"General cleanup error: {e}") + + if errors: + print(f"Warning: Errors during cleanup: {', '.join(errors)}") + + async def __aenter__(self) -> "DockerSandbox": + """Async context manager entry.""" + return await self.create() + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: + """Async context manager exit.""" + await self.cleanup() diff --git a/app/sandbox/core/terminal.py b/app/sandbox/core/terminal.py new file mode 100644 index 0000000..aee5184 --- /dev/null +++ 
b/app/sandbox/core/terminal.py @@ -0,0 +1,346 @@ +""" +Asynchronous Docker Terminal + +This module provides asynchronous terminal functionality for Docker containers, +allowing interactive command execution with timeout control. +""" + +import asyncio +import re +import socket +from typing import Dict, Optional, Tuple, Union + +import docker +from docker import APIClient +from docker.errors import APIError +from docker.models.containers import Container + + +class DockerSession: + def __init__(self, container_id: str) -> None: + """Initializes a Docker session. + + Args: + container_id: ID of the Docker container. + """ + self.api = APIClient() + self.container_id = container_id + self.exec_id = None + self.socket = None + + async def create(self, working_dir: str, env_vars: Dict[str, str]) -> None: + """Creates an interactive session with the container. + + Args: + working_dir: Working directory inside the container. + env_vars: Environment variables to set. + + Raises: + RuntimeError: If socket connection fails. + """ + startup_command = [ + "bash", + "-c", + f"cd {working_dir} && " + "PROMPT_COMMAND='' " + "PS1='$ ' " + "exec bash --norc --noprofile", + ] + + exec_data = self.api.exec_create( + self.container_id, + startup_command, + stdin=True, + tty=True, + stdout=True, + stderr=True, + privileged=True, + user="root", + environment={**env_vars, "TERM": "dumb", "PS1": "$ ", "PROMPT_COMMAND": ""}, + ) + self.exec_id = exec_data["Id"] + + socket_data = self.api.exec_start( + self.exec_id, socket=True, tty=True, stream=True, demux=True + ) + + if hasattr(socket_data, "_sock"): + self.socket = socket_data._sock + self.socket.setblocking(False) + else: + raise RuntimeError("Failed to get socket connection") + + await self._read_until_prompt() + + async def close(self) -> None: + """Cleans up session resources. + + 1. Sends exit command + 2. Closes socket connection + 3. 
Checks and cleans up exec instance + """ + try: + if self.socket: + # Send exit command to close bash session + try: + self.socket.sendall(b"exit\n") + # Allow time for command execution + await asyncio.sleep(0.1) + except: + pass # Ignore sending errors, continue cleanup + + # Close socket connection + try: + self.socket.shutdown(socket.SHUT_RDWR) + except: + pass # Some platforms may not support shutdown + + self.socket.close() + self.socket = None + + if self.exec_id: + try: + # Check exec instance status + exec_inspect = self.api.exec_inspect(self.exec_id) + if exec_inspect.get("Running", False): + # If still running, wait for it to complete + await asyncio.sleep(0.5) + except: + pass # Ignore inspection errors, continue cleanup + + self.exec_id = None + + except Exception as e: + # Log error but don't raise, ensure cleanup continues + print(f"Warning: Error during session cleanup: {e}") + + async def _read_until_prompt(self) -> str: + """Reads output until prompt is found. + + Returns: + String containing output up to the prompt. + + Raises: + socket.error: If socket communication fails. + """ + buffer = b"" + while b"$ " not in buffer: + try: + chunk = self.socket.recv(4096) + if chunk: + buffer += chunk + except socket.error as e: + if e.errno == socket.EWOULDBLOCK: + await asyncio.sleep(0.1) + continue + raise + return buffer.decode("utf-8") + + async def execute(self, command: str, timeout: Optional[int] = None) -> str: + """Executes a command and returns cleaned output. + + Args: + command: Shell command to execute. + timeout: Maximum execution time in seconds. + + Returns: + Command output as string with prompt markers removed. + + Raises: + RuntimeError: If session not initialized or execution fails. + TimeoutError: If command execution exceeds timeout. 
+ """ + if not self.socket: + raise RuntimeError("Session not initialized") + + try: + # Sanitize command to prevent shell injection + sanitized_command = self._sanitize_command(command) + full_command = f"{sanitized_command}\necho $?\n" + self.socket.sendall(full_command.encode()) + + async def read_output() -> str: + buffer = b"" + result_lines = [] + command_sent = False + + while True: + try: + chunk = self.socket.recv(4096) + if not chunk: + break + + buffer += chunk + lines = buffer.split(b"\n") + + buffer = lines[-1] + lines = lines[:-1] + + for line in lines: + line = line.rstrip(b"\r") + + if not command_sent: + command_sent = True + continue + + if line.strip() == b"echo $?" or line.strip().isdigit(): + continue + + if line.strip(): + result_lines.append(line) + + if buffer.endswith(b"$ "): + break + + except socket.error as e: + if e.errno == socket.EWOULDBLOCK: + await asyncio.sleep(0.1) + continue + raise + + output = b"\n".join(result_lines).decode("utf-8") + output = re.sub(r"\n\$ echo \$\$?.*$", "", output) + + return output + + if timeout: + result = await asyncio.wait_for(read_output(), timeout) + else: + result = await read_output() + + return result.strip() + + except asyncio.TimeoutError: + raise TimeoutError(f"Command execution timed out after {timeout} seconds") + except Exception as e: + raise RuntimeError(f"Failed to execute command: {e}") + + def _sanitize_command(self, command: str) -> str: + """Sanitizes the command string to prevent shell injection. + + Args: + command: Raw command string. + + Returns: + Sanitized command string. + + Raises: + ValueError: If command contains potentially dangerous patterns. 
+ """ + + # Additional checks for specific risky commands + risky_commands = [ + "rm -rf /", + "rm -rf /*", + "mkfs", + "dd if=/dev/zero", + ":(){:|:&};:", + "chmod -R 777 /", + "chown -R", + ] + + for risky in risky_commands: + if risky in command.lower(): + raise ValueError( + f"Command contains potentially dangerous operation: {risky}" + ) + + return command + + +class AsyncDockerizedTerminal: + def __init__( + self, + container: Union[str, Container], + working_dir: str = "/workspace", + env_vars: Optional[Dict[str, str]] = None, + default_timeout: int = 60, + ) -> None: + """Initializes an asynchronous terminal for Docker containers. + + Args: + container: Docker container ID or Container object. + working_dir: Working directory inside the container. + env_vars: Environment variables to set. + default_timeout: Default command execution timeout in seconds. + """ + self.client = docker.from_env() + self.container = ( + container + if isinstance(container, Container) + else self.client.containers.get(container) + ) + self.working_dir = working_dir + self.env_vars = env_vars or {} + self.default_timeout = default_timeout + self.session = None + + async def init(self) -> None: + """Initializes the terminal environment. + + Ensures working directory exists and creates an interactive session. + + Raises: + RuntimeError: If initialization fails. + """ + await self._ensure_workdir() + + self.session = DockerSession(self.container.id) + await self.session.create(self.working_dir, self.env_vars) + + async def _ensure_workdir(self) -> None: + """Ensures working directory exists in container. + + Raises: + RuntimeError: If directory creation fails. + """ + try: + await self._exec_simple(f"mkdir -p {self.working_dir}") + except APIError as e: + raise RuntimeError(f"Failed to create working directory: {e}") + + async def _exec_simple(self, cmd: str) -> Tuple[int, str]: + """Executes a simple command using Docker's exec_run. + + Args: + cmd: Command to execute. 
+ + Returns: + Tuple of (exit_code, output). + """ + result = await asyncio.to_thread( + self.container.exec_run, cmd, environment=self.env_vars + ) + return result.exit_code, result.output.decode("utf-8") + + async def run_command(self, cmd: str, timeout: Optional[int] = None) -> str: + """Runs a command in the container with timeout. + + Args: + cmd: Shell command to execute. + timeout: Maximum execution time in seconds. + + Returns: + Command output as string. + + Raises: + RuntimeError: If terminal not initialized. + """ + if not self.session: + raise RuntimeError("Terminal not initialized") + + return await self.session.execute(cmd, timeout=timeout or self.default_timeout) + + async def close(self) -> None: + """Closes the terminal session.""" + if self.session: + await self.session.close() + + async def __aenter__(self) -> "AsyncDockerizedTerminal": + """Async context manager entry.""" + await self.init() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: + """Async context manager exit.""" + await self.close() diff --git a/app/schema.py b/app/schema.py index 30ccf6c..de18c4f 100644 --- a/app/schema.py +++ b/app/schema.py @@ -3,25 +3,32 @@ from typing import Any, List, Literal, Optional, Union from pydantic import BaseModel, Field + class Role(str, Enum): """Message role options""" + SYSTEM = "system" USER = "user" - ASSISTANT = "assistant" + ASSISTANT = "assistant" TOOL = "tool" + ROLE_VALUES = tuple(role.value for role in Role) ROLE_TYPE = Literal[ROLE_VALUES] # type: ignore + class ToolChoice(str, Enum): """Tool choice options""" + NONE = "none" AUTO = "auto" REQUIRED = "required" + TOOL_CHOICE_VALUES = tuple(choice.value for choice in ToolChoice) TOOL_CHOICE_TYPE = Literal[TOOL_CHOICE_VALUES] # type: ignore + class AgentState(str, Enum): """Agent execution states""" @@ -47,11 +54,12 @@ class ToolCall(BaseModel): class Message(BaseModel): """Represents a chat message in the conversation""" - role: ROLE_TYPE = Field(...) 
# type: ignore + role: ROLE_TYPE = Field(...) # type: ignore content: Optional[str] = Field(default=None) tool_calls: Optional[List[ToolCall]] = Field(default=None) name: Optional[str] = Field(default=None) tool_call_id: Optional[str] = Field(default=None) + base64_image: Optional[str] = Field(default=None) def __add__(self, other) -> List["Message"]: """支持 Message + list 或 Message + Message 的操作""" @@ -84,12 +92,16 @@ class Message(BaseModel): message["name"] = self.name if self.tool_call_id is not None: message["tool_call_id"] = self.tool_call_id + if self.base64_image is not None: + message["base64_image"] = self.base64_image return message @classmethod - def user_message(cls, content: str) -> "Message": + def user_message( + cls, content: str, base64_image: Optional[str] = None + ) -> "Message": """Create a user message""" - return cls(role=Role.USER, content=content) + return cls(role=Role.USER, content=content, base64_image=base64_image) @classmethod def system_message(cls, content: str) -> "Message": @@ -97,31 +109,50 @@ class Message(BaseModel): return cls(role=Role.SYSTEM, content=content) @classmethod - def assistant_message(cls, content: Optional[str] = None) -> "Message": + def assistant_message( + cls, content: Optional[str] = None, base64_image: Optional[str] = None + ) -> "Message": """Create an assistant message""" - return cls(role=Role.ASSISTANT, content=content) + return cls(role=Role.ASSISTANT, content=content, base64_image=base64_image) @classmethod - def tool_message(cls, content: str, name, tool_call_id: str) -> "Message": + def tool_message( + cls, content: str, name, tool_call_id: str, base64_image: Optional[str] = None + ) -> "Message": """Create a tool message""" - return cls(role=Role.TOOL, content=content, name=name, tool_call_id=tool_call_id) + return cls( + role=Role.TOOL, + content=content, + name=name, + tool_call_id=tool_call_id, + base64_image=base64_image, + ) @classmethod def from_tool_calls( - cls, tool_calls: List[Any], 
content: Union[str, List[str]] = "", **kwargs + cls, + tool_calls: List[Any], + content: Union[str, List[str]] = "", + base64_image: Optional[str] = None, + **kwargs, ): """Create ToolCallsMessage from raw tool calls. Args: tool_calls: Raw tool calls from LLM content: Optional message content + base64_image: Optional base64 encoded image """ formatted_calls = [ {"id": call.id, "function": call.function.model_dump(), "type": "function"} for call in tool_calls ] return cls( - role=Role.ASSISTANT, content=content, tool_calls=formatted_calls, **kwargs + role=Role.ASSISTANT, + content=content, + tool_calls=formatted_calls, + base64_image=base64_image, + **kwargs, ) diff --git a/app/tool/__init__.py b/app/tool/__init__.py index 9ab569a..6fbd1bc 100644 --- a/app/tool/__init__.py +++ b/app/tool/__init__.py @@ -1,5 +1,6 @@ from app.tool.base import BaseTool from app.tool.bash import Bash +from app.tool.browser_use_tool import BrowserUseTool from app.tool.create_chat_completion import CreateChatCompletion from app.tool.planning import PlanningTool from app.tool.str_replace_editor import StrReplaceEditor @@ -10,6 +11,7 @@ from app.tool.tool_collection import ToolCollection __all__ = [ "BaseTool", "Bash", + "BrowserUseTool", "Terminate", "StrReplaceEditor", "ToolCollection", diff --git a/app/tool/base.py b/app/tool/base.py index ae3c9f5..ba4084d 100644 --- a/app/tool/base.py +++ b/app/tool/base.py @@ -37,6 +37,7 @@ class ToolResult(BaseModel): output: Any = Field(default=None) error: Optional[str] = Field(default=None) + base64_image: Optional[str] = Field(default=None) system: Optional[str] = Field(default=None) class Config: @@ -58,6 +59,7 @@ class ToolResult(BaseModel): return ToolResult( output=combine_fields(self.output, other.output), error=combine_fields(self.error, other.error), + base64_image=combine_fields(self.base64_image, other.base64_image, False), system=combine_fields(self.system, other.system), ) @@ -76,7 +78,3 @@ class CLIResult(ToolResult): class 
ToolFailure(ToolResult): """A ToolResult that represents a failure.""" - - -class AgentAwareTool: - agent: Optional = None diff --git a/app/tool/browser_use_tool.py b/app/tool/browser_use_tool.py index 57ad03c..7817aef 100644 --- a/app/tool/browser_use_tool.py +++ b/app/tool/browser_use_tool.py @@ -1,6 +1,6 @@ import asyncio import json -from typing import Optional +from typing import Generic, Optional, TypeVar from browser_use import Browser as BrowserUseBrowser from browser_use import BrowserConfig @@ -10,31 +10,45 @@ from pydantic import Field, field_validator from pydantic_core.core_schema import ValidationInfo from app.config import config +from app.llm import LLM from app.tool.base import BaseTool, ToolResult +from app.tool.web_search import WebSearch -MAX_LENGTH = 2000 - _BROWSER_DESCRIPTION = """ -Interact with a web browser to perform various actions such as navigation, element interaction, -content extraction, and tab management. Supported actions include: -- 'navigate': Go to a specific URL -- 'click': Click an element by index -- 'input_text': Input text into an element -- 'screenshot': Capture a screenshot -- 'get_html': Get page HTML content -- 'get_text': Get text content of the page -- 'read_links': Get all links on the page -- 'execute_js': Execute JavaScript code -- 'scroll': Scroll the page -- 'switch_tab': Switch to a specific tab -- 'new_tab': Open a new tab -- 'close_tab': Close the current tab +Interact with a web browser to perform various actions such as navigation, element interaction, content extraction, and tab management. This tool provides a comprehensive set of browser automation capabilities: + +Navigation: +- 'go_to_url': Go to a specific URL in the current tab +- 'go_back': Go back - 'refresh': Refresh the current page +- 'web_search': Search the query in the current tab, the query should be a search query like humans search in web, concrete and not vague or super long. More the single most important items. 
+ +Element Interaction: +- 'click_element': Click an element by index +- 'input_text': Input text into a form element +- 'scroll_down'/'scroll_up': Scroll the page (with optional pixel amount) +- 'scroll_to_text': If you dont find something which you want to interact with, scroll to it +- 'send_keys': Send strings of special keys like Escape,Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press. +- 'get_dropdown_options': Get all options from a dropdown +- 'select_dropdown_option': Select dropdown option for interactive element index by the text of the option you want to select + +Content Extraction: +- 'extract_content': Extract page content to retrieve specific information from the page, e.g. all company names, a specifc description, all information about, links with companies in structured format or simply links + +Tab Management: +- 'switch_tab': Switch to a specific tab +- 'open_tab': Open a new tab with a URL +- 'close_tab': Close the current tab + +Utility: +- 'wait': Wait for a specified number of seconds """ +Context = TypeVar("Context") -class BrowserUseTool(BaseTool): + +class BrowserUseTool(BaseTool, Generic[Context]): name: str = "browser_use" description: str = _BROWSER_DESCRIPTION parameters: dict = { @@ -43,52 +57,79 @@ class BrowserUseTool(BaseTool): "action": { "type": "string", "enum": [ - "navigate", - "click", + "go_to_url", + "click_element", "input_text", - "screenshot", - "get_html", - "get_text", - "execute_js", - "scroll", + "scroll_down", + "scroll_up", + "scroll_to_text", + "send_keys", + "get_dropdown_options", + "select_dropdown_option", + "go_back", + "web_search", + "wait", + "extract_content", "switch_tab", - "new_tab", + "open_tab", "close_tab", - "refresh", ], "description": "The browser action to perform", }, "url": { "type": "string", - "description": "URL for 'navigate' or 'new_tab' actions", + "description": "URL for 'go_to_url' or 
'open_tab' actions", }, "index": { "type": "integer", - "description": "Element index for 'click' or 'input_text' actions", + "description": "Element index for 'click_element', 'input_text', 'get_dropdown_options', or 'select_dropdown_option' actions", }, - "text": {"type": "string", "description": "Text for 'input_text' action"}, - "script": { + "text": { "type": "string", - "description": "JavaScript code for 'execute_js' action", + "description": "Text for 'input_text', 'scroll_to_text', or 'select_dropdown_option' actions", }, "scroll_amount": { "type": "integer", - "description": "Pixels to scroll (positive for down, negative for up) for 'scroll' action", + "description": "Pixels to scroll (positive for down, negative for up) for 'scroll_down' or 'scroll_up' actions", }, "tab_id": { "type": "integer", "description": "Tab ID for 'switch_tab' action", }, + "query": { + "type": "string", + "description": "Search query for 'web_search' action", + }, + "goal": { + "type": "string", + "description": "Extraction goal for 'extract_content' action", + }, + "keys": { + "type": "string", + "description": "Keys to send for 'send_keys' action", + }, + "seconds": { + "type": "integer", + "description": "Seconds to wait for 'wait' action", + }, }, "required": ["action"], "dependencies": { - "navigate": ["url"], - "click": ["index"], + "go_to_url": ["url"], + "click_element": ["index"], "input_text": ["index", "text"], - "execute_js": ["script"], "switch_tab": ["tab_id"], - "new_tab": ["url"], - "scroll": ["scroll_amount"], + "open_tab": ["url"], + "scroll_down": ["scroll_amount"], + "scroll_up": ["scroll_amount"], + "scroll_to_text": ["text"], + "send_keys": ["keys"], + "get_dropdown_options": ["index"], + "select_dropdown_option": ["index", "text"], + "go_back": [], + "web_search": ["query"], + "wait": ["seconds"], + "extract_content": ["goal"], }, } @@ -96,6 +137,12 @@ class BrowserUseTool(BaseTool): browser: Optional[BrowserUseBrowser] = Field(default=None, exclude=True) 
context: Optional[BrowserContext] = Field(default=None, exclude=True) dom_service: Optional[DomService] = Field(default=None, exclude=True) + web_search_tool: WebSearch = Field(default_factory=WebSearch, exclude=True) + + # Context for generic functionality + tool_context: Optional[Context] = Field(default=None, exclude=True) + + llm: Optional[LLM] = Field(default_factory=LLM) @field_validator("parameters", mode="before") def validate_parameters(cls, v: dict, info: ValidationInfo) -> dict: @@ -106,7 +153,7 @@ class BrowserUseTool(BaseTool): async def _ensure_browser_initialized(self) -> BrowserContext: """Ensure browser and context are initialized.""" if self.browser is None: - browser_config_kwargs = {"headless": False} + browser_config_kwargs = {"headless": False, "disable_security": True} if config.browser_config: from browser_use.browser.browser import ProxySettings @@ -158,9 +205,12 @@ class BrowserUseTool(BaseTool): url: Optional[str] = None, index: Optional[int] = None, text: Optional[str] = None, - script: Optional[str] = None, scroll_amount: Optional[int] = None, tab_id: Optional[int] = None, + query: Optional[str] = None, + goal: Optional[str] = None, + keys: Optional[str] = None, + seconds: Optional[int] = None, **kwargs, ) -> ToolResult: """ @@ -170,10 +220,13 @@ class BrowserUseTool(BaseTool): action: The browser action to perform url: URL for navigation or new tab index: Element index for click or input actions - text: Text for input action - script: JavaScript code for execution + text: Text for input action or search query scroll_amount: Pixels to scroll for scroll action tab_id: Tab ID for switch_tab action + query: Search query for Google search + goal: Extraction goal for content extraction + keys: Keys to send for keyboard actions + seconds: Seconds to wait **kwargs: Additional arguments Returns: @@ -183,15 +236,68 @@ class BrowserUseTool(BaseTool): try: context = await self._ensure_browser_initialized() - if action == "navigate": + # Get max 
content length from config + max_content_length = getattr( + config.browser_config, "max_content_length", 2000 + ) + + # Navigation actions + if action == "go_to_url": if not url: - return ToolResult(error="URL is required for 'navigate' action") - await context.navigate_to(url) + return ToolResult( + error="URL is required for 'go_to_url' action" + ) + page = await context.get_current_page() + await page.goto(url) + await page.wait_for_load_state() return ToolResult(output=f"Navigated to {url}") - elif action == "click": + elif action == "go_back": + await context.go_back() + return ToolResult(output="Navigated back") + + elif action == "refresh": + await context.refresh_page() + return ToolResult(output="Refreshed current page") + + elif action == "web_search": + if not query: + return ToolResult( + error="Query is required for 'web_search' action" + ) + search_results = await self.web_search_tool.execute(query) + + if search_results: + # Navigate to the first search result + first_result = search_results[0] + if isinstance(first_result, dict) and "url" in first_result: + url_to_navigate = first_result["url"] + elif isinstance(first_result, str): + url_to_navigate = first_result + else: + return ToolResult( + error=f"Invalid search result format: {first_result}" + ) + + page = await context.get_current_page() + await page.goto(url_to_navigate) + await page.wait_for_load_state() + + return ToolResult( + output=f"Searched for '{query}' and navigated to first result: {url_to_navigate}\nAll results:" + + "\n".join([str(r) for r in search_results]) + ) + else: + return ToolResult( + error=f"No search results found for '{query}'" + ) + + # Element interaction actions + elif action == "click_element": if index is None: - return ToolResult(error="Index is required for 'click' action") + return ToolResult( + error="Index is required for 'click_element' action" + ) element = await context.get_dom_element_by_index(index) if not element: return ToolResult(error=f"Element 
with index {index} not found") @@ -214,72 +320,174 @@ class BrowserUseTool(BaseTool): output=f"Input '{text}' into element at index {index}" ) - elif action == "screenshot": - screenshot = await context.take_screenshot(full_page=True) - return ToolResult( - output=f"Screenshot captured (base64 length: {len(screenshot)})", - system=screenshot, + elif action == "scroll_down" or action == "scroll_up": + direction = 1 if action == "scroll_down" else -1 + amount = ( + scroll_amount + if scroll_amount is not None + else context.config.browser_window_size["height"] ) - - elif action == "get_html": - html = await context.get_page_html() - truncated = ( - html[:MAX_LENGTH] + "..." if len(html) > MAX_LENGTH else html - ) - return ToolResult(output=truncated) - - elif action == "get_text": - text = await context.execute_javascript("document.body.innerText") - return ToolResult(output=text) - - elif action == "read_links": - links = await context.execute_javascript( - "document.querySelectorAll('a[href]').forEach((elem) => {if (elem.innerText) {console.log(elem.innerText, elem.href)}})" - ) - return ToolResult(output=links) - - elif action == "execute_js": - if not script: - return ToolResult( - error="Script is required for 'execute_js' action" - ) - result = await context.execute_javascript(script) - return ToolResult(output=str(result)) - - elif action == "scroll": - if scroll_amount is None: - return ToolResult( - error="Scroll amount is required for 'scroll' action" - ) await context.execute_javascript( - f"window.scrollBy(0, {scroll_amount});" + f"window.scrollBy(0, {direction * amount});" ) - direction = "down" if scroll_amount > 0 else "up" return ToolResult( - output=f"Scrolled {direction} by {abs(scroll_amount)} pixels" + output=f"Scrolled {'down' if direction > 0 else 'up'} by {amount} pixels" ) + elif action == "scroll_to_text": + if not text: + return ToolResult( + error="Text is required for 'scroll_to_text' action" + ) + page = await context.get_current_page() + 
try: + locator = page.get_by_text(text, exact=False) + await locator.scroll_into_view_if_needed() + return ToolResult(output=f"Scrolled to text: '{text}'") + except Exception as e: + return ToolResult(error=f"Failed to scroll to text: {str(e)}") + + elif action == "send_keys": + if not keys: + return ToolResult( + error="Keys are required for 'send_keys' action" + ) + page = await context.get_current_page() + await page.keyboard.press(keys) + return ToolResult(output=f"Sent keys: {keys}") + + elif action == "get_dropdown_options": + if index is None: + return ToolResult( + error="Index is required for 'get_dropdown_options' action" + ) + element = await context.get_dom_element_by_index(index) + if not element: + return ToolResult(error=f"Element with index {index} not found") + page = await context.get_current_page() + options = await page.evaluate( + """ + (xpath) => { + const select = document.evaluate(xpath, document, null, + XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue; + if (!select) return null; + return Array.from(select.options).map(opt => ({ + text: opt.text, + value: opt.value, + index: opt.index + })); + } + """, + element.xpath, + ) + return ToolResult(output=f"Dropdown options: {options}") + + elif action == "select_dropdown_option": + if index is None or not text: + return ToolResult( + error="Index and text are required for 'select_dropdown_option' action" + ) + element = await context.get_dom_element_by_index(index) + if not element: + return ToolResult(error=f"Element with index {index} not found") + page = await context.get_current_page() + await page.select_option(element.xpath, label=text) + return ToolResult( + output=f"Selected option '{text}' from dropdown at index {index}" + ) + + # Content extraction actions + elif action == "extract_content": + if not goal: + return ToolResult( + error="Goal is required for 'extract_content' action" + ) + page = await context.get_current_page() + try: + # Get page content and convert to 
markdown for better processing + html_content = await page.content() + + # Import markdownify here to avoid global import + try: + import markdownify + + content = markdownify.markdownify(html_content) + except ImportError: + # Fallback if markdownify is not available + content = html_content + + # Create prompt for LLM + prompt_text = """ +Your task is to extract the content of the page. You will be given a page and a goal, and you should extract all relevant information around this goal from the page. + +Examples of extraction goals: +- Extract all company names +- Extract specific descriptions +- Extract all information about a topic +- Extract links with companies in structured format +- Extract all links + +If the goal is vague, summarize the page. Respond in JSON format. + +Extraction goal: {goal} + +Page content: +{page} +""" + # Format the prompt with the goal and content + max_content_length = min(50000, len(content)) + formatted_prompt = prompt_text.format( + goal=goal, page=content[:max_content_length] + ) + + # Create a proper message list for the LLM + from app.schema import Message + + messages = [Message.user_message(formatted_prompt)] + + # Use LLM to extract content based on the goal + response = await self.llm.ask(messages) + + msg = f"Extracted from page:\n{response}\n" + return ToolResult(output=msg) + except Exception as e: + # Provide a more helpful error message + error_msg = f"Failed to extract content: {str(e)}" + try: + # Try to return a portion of the page content as fallback + return ToolResult( + output=f"{error_msg}\nHere's a portion of the page content:\n{content[:2000]}..." 
+ ) + except: + # If all else fails, just return the error + return ToolResult(error=error_msg) + + # Tab management actions elif action == "switch_tab": if tab_id is None: return ToolResult( error="Tab ID is required for 'switch_tab' action" ) await context.switch_to_tab(tab_id) + page = await context.get_current_page() + await page.wait_for_load_state() return ToolResult(output=f"Switched to tab {tab_id}") - elif action == "new_tab": + elif action == "open_tab": if not url: - return ToolResult(error="URL is required for 'new_tab' action") + return ToolResult(error="URL is required for 'open_tab' action") await context.create_new_tab(url) - return ToolResult(output=f"Opened new tab with URL {url}") + return ToolResult(output=f"Opened new tab with {url}") elif action == "close_tab": await context.close_current_tab() return ToolResult(output="Closed current tab") - elif action == "refresh": - await context.refresh_page() - return ToolResult(output="Refreshed current page") + # Utility actions + elif action == "wait": + seconds_to_wait = seconds if seconds is not None else 3 + await asyncio.sleep(seconds_to_wait) + return ToolResult(output=f"Waited for {seconds_to_wait} seconds") else: return ToolResult(error=f"Unknown action: {action}") @@ -287,21 +495,58 @@ class BrowserUseTool(BaseTool): except Exception as e: return ToolResult(error=f"Browser action '{action}' failed: {str(e)}") - async def get_current_state(self) -> ToolResult: - """Get the current browser state as a ToolResult.""" - async with self.lock: - try: - context = await self._ensure_browser_initialized() - state = await context.get_state() - state_info = { - "url": state.url, - "title": state.title, - "tabs": [tab.model_dump() for tab in state.tabs], - "interactive_elements": state.element_tree.clickable_elements_to_string(), - } - return ToolResult(output=json.dumps(state_info)) - except Exception as e: - return ToolResult(error=f"Failed to get browser state: {str(e)}") + async def get_current_state( 
+ self, context: Optional[BrowserContext] = None + ) -> ToolResult: + """ + Get the current browser state as a ToolResult. + If context is not provided, uses self.context. + """ + try: + # Use provided context or fall back to self.context + ctx = context or self.context + if not ctx: + return ToolResult(error="Browser context not initialized") + + state = await ctx.get_state() + + # Create a viewport_info dictionary if it doesn't exist + viewport_height = 0 + if hasattr(state, "viewport_info") and state.viewport_info: + viewport_height = state.viewport_info.height + elif hasattr(ctx, "config") and hasattr(ctx.config, "browser_window_size"): + viewport_height = ctx.config.browser_window_size.get("height", 0) + + # Take a screenshot for the state + screenshot = await ctx.take_screenshot(full_page=True) + + # Build the state info with all required fields + state_info = { + "url": state.url, + "title": state.title, + "tabs": [tab.model_dump() for tab in state.tabs], + "help": "[0], [1], [2], etc., represent clickable indices corresponding to the elements listed. 
Clicking on these indices will navigate to or interact with the respective content behind them.", + "interactive_elements": ( + state.element_tree.clickable_elements_to_string() + if state.element_tree + else "" + ), + "scroll_info": { + "pixels_above": getattr(state, "pixels_above", 0), + "pixels_below": getattr(state, "pixels_below", 0), + "total_height": getattr(state, "pixels_above", 0) + + getattr(state, "pixels_below", 0) + + viewport_height, + }, + "viewport_height": viewport_height, + } + + return ToolResult( + output=json.dumps(state_info, indent=4, ensure_ascii=False), + base64_image=screenshot, + ) + except Exception as e: + return ToolResult(error=f"Failed to get browser state: {str(e)}") async def cleanup(self): """Clean up browser resources.""" @@ -323,3 +568,10 @@ class BrowserUseTool(BaseTool): loop = asyncio.new_event_loop() loop.run_until_complete(self.cleanup()) loop.close() + + @classmethod + def create_with_context(cls, context: Context) -> "BrowserUseTool[Context]": + """Factory method to create a BrowserUseTool with a specific context.""" + tool = cls() + tool.tool_context = context + return tool diff --git a/app/tool/file_operators.py b/app/tool/file_operators.py new file mode 100644 index 0000000..61f8b16 --- /dev/null +++ b/app/tool/file_operators.py @@ -0,0 +1,156 @@ +"""File operation interfaces and implementations for local and sandbox environments.""" + +import asyncio +from pathlib import Path +from typing import Optional, Protocol, Tuple, Union, runtime_checkable + +from app.config import SandboxSettings +from app.exceptions import ToolError +from app.sandbox.client import SANDBOX_CLIENT + + +PathLike = Union[str, Path] + + +@runtime_checkable +class FileOperator(Protocol): + """Interface for file operations in different environments.""" + + async def read_file(self, path: PathLike) -> str: + """Read content from a file.""" + ... 
+ + async def write_file(self, path: PathLike, content: str) -> None: + """Write content to a file.""" + ... + + async def is_directory(self, path: PathLike) -> bool: + """Check if path points to a directory.""" + ... + + async def exists(self, path: PathLike) -> bool: + """Check if path exists.""" + ... + + async def run_command( + self, cmd: str, timeout: Optional[float] = 120.0 + ) -> Tuple[int, str, str]: + """Run a shell command and return (return_code, stdout, stderr).""" + ... + + +class LocalFileOperator(FileOperator): + """File operations implementation for local filesystem.""" + + async def read_file(self, path: PathLike) -> str: + """Read content from a local file.""" + try: + return Path(path).read_text() + except Exception as e: + raise ToolError(f"Failed to read {path}: {str(e)}") from None + + async def write_file(self, path: PathLike, content: str) -> None: + """Write content to a local file.""" + try: + Path(path).write_text(content) + except Exception as e: + raise ToolError(f"Failed to write to {path}: {str(e)}") from None + + async def is_directory(self, path: PathLike) -> bool: + """Check if path points to a directory.""" + return Path(path).is_dir() + + async def exists(self, path: PathLike) -> bool: + """Check if path exists.""" + return Path(path).exists() + + async def run_command( + self, cmd: str, timeout: Optional[float] = 120.0 + ) -> Tuple[int, str, str]: + """Run a shell command locally.""" + process = await asyncio.create_subprocess_shell( + cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE + ) + + try: + stdout, stderr = await asyncio.wait_for( + process.communicate(), timeout=timeout + ) + return ( + process.returncode or 0, + stdout.decode(), + stderr.decode(), + ) + except asyncio.TimeoutError as exc: + try: + process.kill() + except ProcessLookupError: + pass + raise TimeoutError( + f"Command '{cmd}' timed out after {timeout} seconds" + ) from exc + + +class SandboxFileOperator(FileOperator): + """File 
operations implementation for sandbox environment.""" + + def __init__(self): + self.sandbox_client = SANDBOX_CLIENT + + async def _ensure_sandbox_initialized(self): + """Ensure sandbox is initialized.""" + if not self.sandbox_client.sandbox: + await self.sandbox_client.create(config=SandboxSettings()) + + async def read_file(self, path: PathLike) -> str: + """Read content from a file in sandbox.""" + await self._ensure_sandbox_initialized() + try: + return await self.sandbox_client.read_file(str(path)) + except Exception as e: + raise ToolError(f"Failed to read {path} in sandbox: {str(e)}") from None + + async def write_file(self, path: PathLike, content: str) -> None: + """Write content to a file in sandbox.""" + await self._ensure_sandbox_initialized() + try: + await self.sandbox_client.write_file(str(path), content) + except Exception as e: + raise ToolError(f"Failed to write to {path} in sandbox: {str(e)}") from None + + async def is_directory(self, path: PathLike) -> bool: + """Check if path points to a directory in sandbox.""" + await self._ensure_sandbox_initialized() + result = await self.sandbox_client.run_command( + f"test -d {path} && echo 'true' || echo 'false'" + ) + return result.strip() == "true" + + async def exists(self, path: PathLike) -> bool: + """Check if path exists in sandbox.""" + await self._ensure_sandbox_initialized() + result = await self.sandbox_client.run_command( + f"test -e {path} && echo 'true' || echo 'false'" + ) + return result.strip() == "true" + + async def run_command( + self, cmd: str, timeout: Optional[float] = 120.0 + ) -> Tuple[int, str, str]: + """Run a command in sandbox environment.""" + await self._ensure_sandbox_initialized() + try: + stdout = await self.sandbox_client.run_command( + cmd, timeout=int(timeout) if timeout else None + ) + return ( + 0, # Always return 0 since we don't have explicit return code from sandbox + stdout, + "", # No stderr capture in the current sandbox implementation + ) + except 
TimeoutError as exc: + raise TimeoutError( + f"Command '{cmd}' timed out after {timeout} seconds in sandbox" + ) from exc + except Exception as exc: + return 1, "", f"Error executing command in sandbox: {str(exc)}" diff --git a/app/tool/file_saver.py b/app/tool/file_saver.py index d6a3766..7d92a02 100644 --- a/app/tool/file_saver.py +++ b/app/tool/file_saver.py @@ -2,6 +2,7 @@ import os import aiofiles +from app.config import WORKSPACE_ROOT from app.tool.base import BaseTool @@ -45,15 +46,22 @@ The tool accepts content and a file path, and saves the content to that location str: A message indicating the result of the operation. """ try: + # Place the generated file in the workspace directory + if os.path.isabs(file_path): + file_name = os.path.basename(file_path) + full_path = os.path.join(WORKSPACE_ROOT, file_name) + else: + full_path = os.path.join(WORKSPACE_ROOT, file_path) + # Ensure the directory exists - directory = os.path.dirname(file_path) + directory = os.path.dirname(full_path) if directory and not os.path.exists(directory): os.makedirs(directory) # Write directly to the file - async with aiofiles.open(file_path, mode, encoding="utf-8") as file: + async with aiofiles.open(full_path, mode, encoding="utf-8") as file: await file.write(content) - return f"Content successfully saved to {file_path}" + return f"Content successfully saved to {full_path}" except Exception as e: return f"Error saving file: {str(e)}" diff --git a/app/tool/google_search.py b/app/tool/google_search.py deleted file mode 100644 index ed5d7d5..0000000 --- a/app/tool/google_search.py +++ /dev/null @@ -1,48 +0,0 @@ -import asyncio -from typing import List - -from googlesearch import search - -from app.tool.base import BaseTool - - -class GoogleSearch(BaseTool): - name: str = "google_search" - description: str = """Perform a Google search and return a list of relevant links. -Use this tool when you need to find information on the web, get up-to-date data, or research specific topics. 
-The tool returns a list of URLs that match the search query. -""" - parameters: dict = { - "type": "object", - "properties": { - "query": { - "type": "string", - "description": "(required) The search query to submit to Google.", - }, - "num_results": { - "type": "integer", - "description": "(optional) The number of search results to return. Default is 10.", - "default": 10, - }, - }, - "required": ["query"], - } - - async def execute(self, query: str, num_results: int = 10) -> List[str]: - """ - Execute a Google search and return a list of URLs. - - Args: - query (str): The search query to submit to Google. - num_results (int, optional): The number of search results to return. Default is 10. - - Returns: - List[str]: A list of URLs matching the search query. - """ - # Run the search in a thread pool to prevent blocking - loop = asyncio.get_event_loop() - links = await loop.run_in_executor( - None, lambda: list(search(query, num_results=num_results)) - ) - - return links diff --git a/app/tool/python_execute.py b/app/tool/python_execute.py index 88e1aab..08ceffa 100644 --- a/app/tool/python_execute.py +++ b/app/tool/python_execute.py @@ -1,4 +1,6 @@ -import threading +import multiprocessing +import sys +from io import StringIO from typing import Dict from app.tool.base import BaseTool @@ -20,6 +22,20 @@ class PythonExecute(BaseTool): "required": ["code"], } + def _run_code(self, code: str, result_dict: dict, safe_globals: dict) -> None: + original_stdout = sys.stdout + try: + output_buffer = StringIO() + sys.stdout = output_buffer + exec(code, safe_globals, safe_globals) + result_dict["observation"] = output_buffer.getvalue() + result_dict["success"] = True + except Exception as e: + result_dict["observation"] = str(e) + result_dict["success"] = False + finally: + sys.stdout = original_stdout + async def execute( self, code: str, @@ -35,36 +51,25 @@ class PythonExecute(BaseTool): Returns: Dict: Contains 'output' with execution output or error message and 'success' 
status. """ - result = {"observation": ""} - def run_code(): - try: - safe_globals = {"__builtins__": dict(__builtins__)} + with multiprocessing.Manager() as manager: + result = manager.dict({"observation": "", "success": False}) + if isinstance(__builtins__, dict): + safe_globals = {"__builtins__": __builtins__} + else: + safe_globals = {"__builtins__": __builtins__.__dict__.copy()} + proc = multiprocessing.Process( + target=self._run_code, args=(code, result, safe_globals) + ) + proc.start() + proc.join(timeout) - import sys - from io import StringIO - - output_buffer = StringIO() - sys.stdout = output_buffer - - exec(code, safe_globals, {}) - - sys.stdout = sys.__stdout__ - - result["observation"] = output_buffer.getvalue() - - except Exception as e: - result["observation"] = str(e) - result["success"] = False - - thread = threading.Thread(target=run_code) - thread.start() - thread.join(timeout) - - if thread.is_alive(): - return { - "observation": f"Execution timeout after {timeout} seconds", - "success": False, - } - - return result + # timeout process + if proc.is_alive(): + proc.terminate() + proc.join(1) + return { + "observation": f"Execution timeout after {timeout} seconds", + "success": False, + } + return dict(result) diff --git a/app/tool/run.py b/app/tool/run.py deleted file mode 100644 index 8896c58..0000000 --- a/app/tool/run.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Utility to run shell commands asynchronously with a timeout.""" - -import asyncio - - -TRUNCATED_MESSAGE: str = "To save on context only part of this file has been shown to you. You should retry this tool after you have searched inside the file with `grep -n` in order to find the line numbers of what you are looking for." 
-MAX_RESPONSE_LEN: int = 16000 - - -def maybe_truncate(content: str, truncate_after: int | None = MAX_RESPONSE_LEN): - """Truncate content and append a notice if content exceeds the specified length.""" - return ( - content - if not truncate_after or len(content) <= truncate_after - else content[:truncate_after] + TRUNCATED_MESSAGE - ) - - -async def run( - cmd: str, - timeout: float | None = 120.0, # seconds - truncate_after: int | None = MAX_RESPONSE_LEN, -): - """Run a shell command asynchronously with a timeout.""" - process = await asyncio.create_subprocess_shell( - cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE - ) - - try: - stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout) - return ( - process.returncode or 0, - maybe_truncate(stdout.decode(), truncate_after=truncate_after), - maybe_truncate(stderr.decode(), truncate_after=truncate_after), - ) - except asyncio.TimeoutError as exc: - try: - process.kill() - except ProcessLookupError: - pass - raise TimeoutError( - f"Command '{cmd}' timed out after {timeout} seconds" - ) from exc diff --git a/app/tool/search/__init__.py b/app/tool/search/__init__.py new file mode 100644 index 0000000..4f486ac --- /dev/null +++ b/app/tool/search/__init__.py @@ -0,0 +1,12 @@ +from app.tool.search.baidu_search import BaiduSearchEngine +from app.tool.search.base import WebSearchEngine +from app.tool.search.duckduckgo_search import DuckDuckGoSearchEngine +from app.tool.search.google_search import GoogleSearchEngine + + +__all__ = [ + "WebSearchEngine", + "BaiduSearchEngine", + "DuckDuckGoSearchEngine", + "GoogleSearchEngine", +] diff --git a/app/tool/search/baidu_search.py b/app/tool/search/baidu_search.py new file mode 100644 index 0000000..d415ce8 --- /dev/null +++ b/app/tool/search/baidu_search.py @@ -0,0 +1,9 @@ +from baidusearch.baidusearch import search + +from app.tool.search.base import WebSearchEngine + + +class BaiduSearchEngine(WebSearchEngine): + def 
perform_search(self, query, num_results=10, *args, **kwargs): + """Baidu search engine.""" + return search(query, num_results=num_results) diff --git a/app/tool/search/base.py b/app/tool/search/base.py new file mode 100644 index 0000000..3132381 --- /dev/null +++ b/app/tool/search/base.py @@ -0,0 +1,17 @@ +class WebSearchEngine(object): + def perform_search( + self, query: str, num_results: int = 10, *args, **kwargs + ) -> list[dict]: + """ + Perform a web search and return a list of URLs. + + Args: + query (str): The search query to submit to the search engine. + num_results (int, optional): The number of search results to return. Default is 10. + args: Additional arguments. + kwargs: Additional keyword arguments. + + Returns: + List: A list of dict matching the search query. + """ + raise NotImplementedError diff --git a/app/tool/search/duckduckgo_search.py b/app/tool/search/duckduckgo_search.py new file mode 100644 index 0000000..3dd5c52 --- /dev/null +++ b/app/tool/search/duckduckgo_search.py @@ -0,0 +1,9 @@ +from duckduckgo_search import DDGS + +from app.tool.search.base import WebSearchEngine + + +class DuckDuckGoSearchEngine(WebSearchEngine): + def perform_search(self, query, num_results=10, *args, **kwargs): + """DuckDuckGo search engine; synchronous because WebSearch runs engines inside an executor.""" + return DDGS().text(query, max_results=num_results) diff --git a/app/tool/search/google_search.py b/app/tool/search/google_search.py new file mode 100644 index 0000000..425106d --- /dev/null +++ b/app/tool/search/google_search.py @@ -0,0 +1,9 @@ +from googlesearch import search + +from app.tool.search.base import WebSearchEngine + + +class GoogleSearchEngine(WebSearchEngine): + def perform_search(self, query, num_results=10, *args, **kwargs): + """Google search engine.""" + return search(query, num_results=num_results) diff --git a/app/tool/str_replace_editor.py b/app/tool/str_replace_editor.py index 4094565..a907f41 100644 --- a/app/tool/str_replace_editor.py +++ b/app/tool/str_replace_editor.py @@ -1,11 +1,19 @@ 
+"""File and directory manipulation tool with sandbox support.""" + from collections import defaultdict from pathlib import Path -from typing import Literal, get_args +from typing import Any, DefaultDict, List, Literal, Optional, get_args +from app.config import config from app.exceptions import ToolError from app.tool import BaseTool from app.tool.base import CLIResult, ToolResult -from app.tool.run import run +from app.tool.file_operators import ( + FileOperator, + LocalFileOperator, + PathLike, + SandboxFileOperator, +) Command = Literal[ @@ -15,12 +23,17 @@ Command = Literal[ "insert", "undo_edit", ] + +# Constants SNIPPET_LINES: int = 4 - MAX_RESPONSE_LEN: int = 16000 +TRUNCATED_MESSAGE: str = ( + "To save on context only part of this file has been shown to you. " + "You should retry this tool after you have searched inside the file with `grep -n` " + "in order to find the line numbers of what you are looking for." +) -TRUNCATED_MESSAGE: str = "To save on context only part of this file has been shown to you. You should retry this tool after you have searched inside the file with `grep -n` in order to find the line numbers of what you are looking for." - +# Tool description _STR_REPLACE_EDITOR_DESCRIPTION = """Custom editing tool for viewing, creating and editing files * State is persistent across command calls and discussions with the user * If `path` is a file, `view` displays the result of applying `cat -n`. 
If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep @@ -35,17 +48,17 @@ Notes for using the `str_replace` command: """ -def maybe_truncate(content: str, truncate_after: int | None = MAX_RESPONSE_LEN): +def maybe_truncate( + content: str, truncate_after: Optional[int] = MAX_RESPONSE_LEN +) -> str: """Truncate content and append a notice if content exceeds the specified length.""" - return ( - content - if not truncate_after or len(content) <= truncate_after - else content[:truncate_after] + TRUNCATED_MESSAGE - ) + if not truncate_after or len(content) <= truncate_after: + return content + return content[:truncate_after] + TRUNCATED_MESSAGE class StrReplaceEditor(BaseTool): - """A tool for executing bash commands""" + """A tool for viewing, creating, and editing files with sandbox support.""" name: str = "str_replace_editor" description: str = _STR_REPLACE_EDITOR_DESCRIPTION @@ -85,8 +98,18 @@ class StrReplaceEditor(BaseTool): }, "required": ["command", "path"], } + _file_history: DefaultDict[PathLike, List[str]] = defaultdict(list) + _local_operator: LocalFileOperator = LocalFileOperator() + _sandbox_operator: SandboxFileOperator = SandboxFileOperator() - _file_history: list = defaultdict(list) + # def _get_operator(self, use_sandbox: bool) -> FileOperator: + def _get_operator(self) -> FileOperator: + """Get the appropriate file operator based on execution mode.""" + return ( + self._sandbox_operator + if config.sandbox.use_sandbox + else self._local_operator + ) async def execute( self, @@ -98,24 +121,30 @@ class StrReplaceEditor(BaseTool): old_str: str | None = None, new_str: str | None = None, insert_line: int | None = None, - **kwargs, + **kwargs: Any, ) -> str: - _path = Path(path) - self.validate_path(command, _path) + """Execute a file operation command.""" + # Get the appropriate file operator + operator = self._get_operator() + + # Validate path and command combination + await self.validate_path(command, Path(path), 
operator) + + # Execute the appropriate command if command == "view": - result = await self.view(_path, view_range) + result = await self.view(path, view_range, operator) elif command == "create": if file_text is None: raise ToolError("Parameter `file_text` is required for command: create") - self.write_file(_path, file_text) - self._file_history[_path].append(file_text) - result = ToolResult(output=f"File created successfully at: {_path}") + await operator.write_file(path, file_text) + self._file_history[path].append(file_text) + result = ToolResult(output=f"File created successfully at: {path}") elif command == "str_replace": if old_str is None: raise ToolError( "Parameter `old_str` is required for command: str_replace" ) - result = self.str_replace(_path, old_str, new_str) + result = await self.str_replace(path, old_str, new_str, operator) elif command == "insert": if insert_line is None: raise ToolError( @@ -123,92 +152,145 @@ class StrReplaceEditor(BaseTool): ) if new_str is None: raise ToolError("Parameter `new_str` is required for command: insert") - result = self.insert(_path, insert_line, new_str) + result = await self.insert(path, insert_line, new_str, operator) elif command == "undo_edit": - result = self.undo_edit(_path) + result = await self.undo_edit(path, operator) else: + # This should be caught by type checking, but we include it for safety raise ToolError( f'Unrecognized command {command}. The allowed commands for the {self.name} tool are: {", ".join(get_args(Command))}' ) + return str(result) - def validate_path(self, command: str, path: Path): - """ - Check that the path/command combination is valid. 
- """ - # Check if its an absolute path + async def validate_path( + self, command: str, path: Path, operator: FileOperator + ) -> None: + """Validate path and command combination based on execution environment.""" + # Check if path is absolute if not path.is_absolute(): - suggested_path = Path("") / path - raise ToolError( - f"The path {path} is not an absolute path, it should start with `/`. Maybe you meant {suggested_path}?" - ) - # Check if path exists - if not path.exists() and command != "create": - raise ToolError( - f"The path {path} does not exist. Please provide a valid path." - ) - if path.exists() and command == "create": - raise ToolError( - f"File already exists at: {path}. Cannot overwrite files using command `create`." - ) - # Check if the path points to a directory - if path.is_dir(): - if command != "view": + raise ToolError(f"The path {path} is not an absolute path") + + # Only check if path exists for non-create commands + if command != "create": + if not await operator.exists(path): + raise ToolError( + f"The path {path} does not exist. Please provide a valid path." + ) + + # Check if path is a directory + is_dir = await operator.is_directory(path) + if is_dir and command != "view": raise ToolError( f"The path {path} is a directory and only the `view` command can be used on directories" ) - async def view(self, path: Path, view_range: list[int] | None = None): - """Implement the view command""" - if path.is_dir(): + # Check if file exists for create command + elif command == "create": + exists = await operator.exists(path) + if exists: + raise ToolError( + f"File already exists at: {path}. Cannot overwrite files using command `create`." 
+ ) + + async def view( + self, + path: PathLike, + view_range: Optional[List[int]] = None, + operator: FileOperator = None, + ) -> CLIResult: + """Display file or directory content.""" + # Determine if path is a directory + is_dir = await operator.is_directory(path) + + if is_dir: + # Directory handling if view_range: raise ToolError( "The `view_range` parameter is not allowed when `path` points to a directory." ) - _, stdout, stderr = await run( - rf"find {path} -maxdepth 2 -not -path '*/\.*'" - ) - if not stderr: - stdout = f"Here's the files and directories up to 2 levels deep in {path}, excluding hidden items:\n{stdout}\n" - return CLIResult(output=stdout, error=stderr) + return await self._view_directory(path, operator) + else: + # File handling + return await self._view_file(path, operator, view_range) - file_content = self.read_file(path) + @staticmethod + async def _view_directory(path: PathLike, operator: FileOperator) -> CLIResult: + """Display directory contents.""" + find_cmd = f"find {path} -maxdepth 2 -not -path '*/\\.*'" + + # Execute command using the operator + returncode, stdout, stderr = await operator.run_command(find_cmd) + + if not stderr: + stdout = ( + f"Here's the files and directories up to 2 levels deep in {path}, " + f"excluding hidden items:\n{stdout}\n" + ) + + return CLIResult(output=stdout, error=stderr) + + async def _view_file( + self, + path: PathLike, + operator: FileOperator, + view_range: Optional[List[int]] = None, + ) -> CLIResult: + """Display file content, optionally within a specified line range.""" + # Read file content + file_content = await operator.read_file(path) init_line = 1 + + # Apply view range if specified if view_range: if len(view_range) != 2 or not all(isinstance(i, int) for i in view_range): raise ToolError( "Invalid `view_range`. It should be a list of two integers." 
) + file_lines = file_content.split("\n") n_lines_file = len(file_lines) init_line, final_line = view_range + + # Validate view range if init_line < 1 or init_line > n_lines_file: raise ToolError( - f"Invalid `view_range`: {view_range}. Its first element `{init_line}` should be within the range of lines of the file: {[1, n_lines_file]}" + f"Invalid `view_range`: {view_range}. Its first element `{init_line}` should be " + f"within the range of lines of the file: {[1, n_lines_file]}" ) if final_line > n_lines_file: raise ToolError( - f"Invalid `view_range`: {view_range}. Its second element `{final_line}` should be smaller than the number of lines in the file: `{n_lines_file}`" + f"Invalid `view_range`: {view_range}. Its second element `{final_line}` should be " + f"smaller than the number of lines in the file: `{n_lines_file}`" ) if final_line != -1 and final_line < init_line: raise ToolError( - f"Invalid `view_range`: {view_range}. Its second element `{final_line}` should be larger or equal than its first `{init_line}`" + f"Invalid `view_range`: {view_range}. 
Its second element `{final_line}` should be " + f"larger or equal than its first `{init_line}`" ) + # Apply range if final_line == -1: file_content = "\n".join(file_lines[init_line - 1 :]) else: file_content = "\n".join(file_lines[init_line - 1 : final_line]) + # Format and return result return CLIResult( output=self._make_output(file_content, str(path), init_line=init_line) ) - def str_replace(self, path: Path, old_str: str, new_str: str | None): - """Implement the str_replace command, which replaces old_str with new_str in the file content""" - # Read the file content - file_content = self.read_file(path).expandtabs() + async def str_replace( + self, + path: PathLike, + old_str: str, + new_str: Optional[str] = None, + operator: FileOperator = None, + ) -> CLIResult: + """Replace a unique string in a file with a new string.""" + # Read file content and expand tabs + file_content = (await operator.read_file(path)).expandtabs() old_str = old_str.expandtabs() new_str = new_str.expandtabs() if new_str is not None else "" @@ -219,6 +301,7 @@ class StrReplaceEditor(BaseTool): f"No replacement was performed, old_str `{old_str}` did not appear verbatim in {path}." ) elif occurrences > 1: + # Find line numbers of occurrences file_content_lines = file_content.split("\n") lines = [ idx + 1 @@ -226,16 +309,17 @@ class StrReplaceEditor(BaseTool): if old_str in line ] raise ToolError( - f"No replacement was performed. Multiple occurrences of old_str `{old_str}` in lines {lines}. Please ensure it is unique" + f"No replacement was performed. Multiple occurrences of old_str `{old_str}` " + f"in lines {lines}. 
Please ensure it is unique" ) # Replace old_str with new_str new_file_content = file_content.replace(old_str, new_str) # Write the new content to the file - self.write_file(path, new_file_content) + await operator.write_file(path, new_file_content) - # Save the content to history + # Save the original content to history self._file_history[path].append(file_content) # Create a snippet of the edited section @@ -253,36 +337,50 @@ class StrReplaceEditor(BaseTool): return CLIResult(output=success_msg) - def insert(self, path: Path, insert_line: int, new_str: str): - """Implement the insert command, which inserts new_str at the specified line in the file content.""" - file_text = self.read_file(path).expandtabs() + async def insert( + self, + path: PathLike, + insert_line: int, + new_str: str, + operator: FileOperator = None, + ) -> CLIResult: + """Insert text at a specific line in a file.""" + # Read and prepare content + file_text = (await operator.read_file(path)).expandtabs() new_str = new_str.expandtabs() file_text_lines = file_text.split("\n") n_lines_file = len(file_text_lines) + # Validate insert_line if insert_line < 0 or insert_line > n_lines_file: raise ToolError( - f"Invalid `insert_line` parameter: {insert_line}. It should be within the range of lines of the file: {[0, n_lines_file]}" + f"Invalid `insert_line` parameter: {insert_line}. 
It should be within " + f"the range of lines of the file: {[0, n_lines_file]}" ) + # Perform insertion new_str_lines = new_str.split("\n") new_file_text_lines = ( file_text_lines[:insert_line] + new_str_lines + file_text_lines[insert_line:] ) + + # Create a snippet for preview snippet_lines = ( file_text_lines[max(0, insert_line - SNIPPET_LINES) : insert_line] + new_str_lines + file_text_lines[insert_line : insert_line + SNIPPET_LINES] ) + # Join lines and write to file new_file_text = "\n".join(new_file_text_lines) snippet = "\n".join(snippet_lines) - self.write_file(path, new_file_text) + await operator.write_file(path, new_file_text) self._file_history[path].append(file_text) + # Prepare success message success_msg = f"The file {path} has been edited. " success_msg += self._make_output( snippet, @@ -290,51 +388,43 @@ class StrReplaceEditor(BaseTool): max(1, insert_line - SNIPPET_LINES + 1), ) success_msg += "Review the changes and make sure they are as expected (correct indentation, no duplicate lines, etc). Edit the file again if necessary." + return CLIResult(output=success_msg) - def undo_edit(self, path: Path): - """Implement the undo_edit command.""" + async def undo_edit( + self, path: PathLike, operator: FileOperator = None + ) -> CLIResult: + """Revert the last edit made to a file.""" if not self._file_history[path]: raise ToolError(f"No edit history found for {path}.") old_text = self._file_history[path].pop() - self.write_file(path, old_text) + await operator.write_file(path, old_text) return CLIResult( output=f"Last edit to {path} undone successfully. 
{self._make_output(old_text, str(path))}" ) - def read_file(self, path: Path): - """Read the content of a file from a given path; raise a ToolError if an error occurs.""" - try: - return path.read_text() - except Exception as e: - raise ToolError(f"Ran into {e} while trying to read {path}") from None - - def write_file(self, path: Path, file: str): - """Write the content of a file to a given path; raise a ToolError if an error occurs.""" - try: - path.write_text(file) - except Exception as e: - raise ToolError(f"Ran into {e} while trying to write to {path}") from None - def _make_output( self, file_content: str, file_descriptor: str, init_line: int = 1, expand_tabs: bool = True, - ): - """Generate output for the CLI based on the content of a file.""" + ) -> str: + """Format file content for display with line numbers.""" file_content = maybe_truncate(file_content) if expand_tabs: file_content = file_content.expandtabs() + + # Add line numbers to each line file_content = "\n".join( [ f"{i + init_line:6}\t{line}" for i, line in enumerate(file_content.split("\n")) ] ) + return ( f"Here's the result of running `cat -n` on {file_descriptor}:\n" + file_content diff --git a/app/tool/terminal.py b/app/tool/terminal.py index df5996e..86b401c 100644 --- a/app/tool/terminal.py +++ b/app/tool/terminal.py @@ -40,7 +40,7 @@ Note: You MUST append a `sleep 0.05` to the end of the command for commands that str: The output, and error of the command execution. 
""" # Split the command by & to handle multiple commands - commands = [cmd.strip() for cmd in command.split('&') if cmd.strip()] + commands = [cmd.strip() for cmd in command.split("&") if cmd.strip()] final_output = CLIResult(output="", error="") for cmd in commands: @@ -61,7 +61,7 @@ Note: You MUST append a `sleep 0.05` to the end of the command for commands that stdout, stderr = await self.process.communicate() result = CLIResult( output=stdout.decode().strip(), - error=stderr.decode().strip() + error=stderr.decode().strip(), ) except Exception as e: result = CLIResult(output="", error=str(e)) @@ -70,9 +70,13 @@ Note: You MUST append a `sleep 0.05` to the end of the command for commands that # Combine outputs if result.output: - final_output.output += (result.output + "\n") if final_output.output else result.output + final_output.output += ( + (result.output + "\n") if final_output.output else result.output + ) if result.error: - final_output.error += (result.error + "\n") if final_output.error else result.error + final_output.error += ( + (result.error + "\n") if final_output.error else result.error + ) # Remove trailing newlines final_output.output = final_output.output.rstrip() @@ -124,14 +128,10 @@ Note: You MUST append a `sleep 0.05` to the end of the command for commands that if os.path.isdir(new_path): self.current_path = new_path return CLIResult( - output=f"Changed directory to {self.current_path}", - error="" + output=f"Changed directory to {self.current_path}", error="" ) else: - return CLIResult( - output="", - error=f"No such directory: {new_path}" - ) + return CLIResult(output="", error=f"No such directory: {new_path}") except Exception as e: return CLIResult(output="", error=str(e)) @@ -152,7 +152,7 @@ Note: You MUST append a `sleep 0.05` to the end of the command for commands that parts = shlex.split(command) if any(cmd in dangerous_commands for cmd in parts): raise ValueError("Use of dangerous commands is restricted.") - except Exception as e: + 
except Exception: # If shlex.split fails, try basic string comparison if any(cmd in command for cmd in dangerous_commands): raise ValueError("Use of dangerous commands is restricted.") diff --git a/app/tool/web_search.py b/app/tool/web_search.py new file mode 100644 index 0000000..7b1018b --- /dev/null +++ b/app/tool/web_search.py @@ -0,0 +1,99 @@ +import asyncio +from typing import List + +from tenacity import retry, stop_after_attempt, wait_exponential + +from app.config import config +from app.tool.base import BaseTool +from app.tool.search import ( + BaiduSearchEngine, + DuckDuckGoSearchEngine, + GoogleSearchEngine, + WebSearchEngine, +) + + +class WebSearch(BaseTool): + name: str = "web_search" + description: str = """Perform a web search and return a list of relevant links. + This function attempts to use the primary search engine API to get up-to-date results. + If an error occurs, it falls back to an alternative search engine.""" + parameters: dict = { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "(required) The search query to submit to the search engine.", + }, + "num_results": { + "type": "integer", + "description": "(optional) The number of search results to return. Default is 10.", + "default": 10, + }, + }, + "required": ["query"], + } + _search_engine: dict[str, WebSearchEngine] = { + "google": GoogleSearchEngine(), + "baidu": BaiduSearchEngine(), + "duckduckgo": DuckDuckGoSearchEngine(), + } + + async def execute(self, query: str, num_results: int = 10) -> List[str]: + """ + Execute a Web search and return a list of URLs. + + Args: + query (str): The search query to submit to the search engine. + num_results (int, optional): The number of search results to return. Default is 10. + + Returns: + List[str]: A list of URLs matching the search query. 
+ """ + engine_order = self._get_engine_order() + for engine_name in engine_order: + engine = self._search_engine[engine_name] + try: + links = await self._perform_search_with_engine( + engine, query, num_results + ) + if links: + return links + except Exception as e: + print(f"Search engine '{engine_name}' failed with error: {e}") + return [] + + def _get_engine_order(self) -> List[str]: + """ + Determines the order in which to try search engines. + Preferred engine is first (based on configuration), followed by the remaining engines. + + Returns: + List[str]: Ordered list of search engine names. + """ + preferred = "google" + if config.search_config and config.search_config.engine: + preferred = config.search_config.engine.lower() + + engine_order = [] + if preferred in self._search_engine: + engine_order.append(preferred) + for key in self._search_engine: + if key not in engine_order: + engine_order.append(key) + return engine_order + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=1, max=10), + ) + async def _perform_search_with_engine( + self, + engine: WebSearchEngine, + query: str, + num_results: int, + ) -> List[str]: + loop = asyncio.get_event_loop() + return await loop.run_in_executor( + None, lambda: list(engine.perform_search(query, num_results=num_results)) + ) diff --git a/assets/logo.jpg b/assets/logo.jpg new file mode 100644 index 0000000..634b8f6 Binary files /dev/null and b/assets/logo.jpg differ diff --git a/config/.gitignore b/config/.gitignore new file mode 100644 index 0000000..eaff182 --- /dev/null +++ b/config/.gitignore @@ -0,0 +1,2 @@ +# prevent the local config file from being uploaded to the remote repository +config.toml diff --git a/config/config.example.toml b/config/config.example.toml index 13648dd..d5750a2 100644 --- a/config/config.example.toml +++ b/config/config.example.toml @@ -1,10 +1,10 @@ # Global LLM configuration [llm] -model = "claude-3-5-sonnet" -base_url = "https://api.openai.com/v1" 
-api_key = "sk-..." -max_tokens = 4096 -temperature = 0.0 +model = "claude-3-7-sonnet-20250219" # The LLM model to use +base_url = "https://api.anthropic.com/v1/" # API endpoint URL +api_key = "YOUR_API_KEY" # Your API key +max_tokens = 8192 # Maximum number of tokens in the response +temperature = 0.0 # Controls randomness # [llm] #AZURE OPENAI: # api_type= 'azure' @@ -15,11 +15,29 @@ temperature = 0.0 # temperature = 0.0 # api_version="AZURE API VERSION" #"2024-08-01-preview" +# [llm] #OLLAMA: +# api_type = 'ollama' +# model = "llama3.2" +# base_url = "http://localhost:11434/v1" +# api_key = "ollama" +# max_tokens = 4096 +# temperature = 0.0 + # Optional configuration for specific LLM models [llm.vision] -model = "claude-3-5-sonnet" -base_url = "https://api.openai.com/v1" -api_key = "sk-..." +model = "claude-3-7-sonnet-20250219" # The vision model to use +base_url = "https://api.anthropic.com/v1/" # API endpoint URL for vision model +api_key = "YOUR_API_KEY" # Your API key for vision model +max_tokens = 8192 # Maximum number of tokens in the response +temperature = 0.0 # Controls randomness for vision model + +# [llm.vision] #OLLAMA VISION: +# api_type = 'ollama' +# model = "llama3.2-vision" +# base_url = "http://localhost:11434/v1" +# api_key = "ollama" +# max_tokens = 4096 +# temperature = 0.0 # Optional configuration for specific browser configuration # [browser] @@ -42,3 +60,18 @@ api_key = "sk-..." # server = "http://proxy-server:port" # username = "proxy-username" # password = "proxy-password" + +# Optional configuration, Search settings. +# [search] +# Search engine for agent to use. Default is "Google", can be set to "Baidu" or "DuckDuckGo". 
+#engine = "Google" + +## Sandbox configuration +#[sandbox] +#use_sandbox = false +#image = "python:3.12-slim" +#work_dir = "/workspace" +#memory_limit = "1g" # 512m +#cpu_limit = 2.0 +#timeout = 300 +#network_enabled = true diff --git a/openmanus_server/README.md b/mcp/README.md similarity index 81% rename from openmanus_server/README.md rename to mcp/README.md index bfd3339..c7892a3 100644 --- a/openmanus_server/README.md +++ b/mcp/README.md @@ -1,6 +1,6 @@ -# OpenManus-server 🤖 +# OpenManus-mcp 🤖 -This project provides a server based on [Model Context Protocol (MCP)](https://modelcontextprotocol.io/) that exposes **OpenManus** tool functionalities as standardized APIs. +Implement a server based on [Model Context Protocol (MCP)](https://modelcontextprotocol.io/) that exposes **OpenManus** tool functionalities as standardized APIs and create a simple client to interact with the server. ## ✨ Features @@ -42,7 +42,8 @@ uv pip install -r requirements.txt 3. Install MCP dependencies: ```bash -uv pip install -r openmanus_server/mcp_requirements.txt +uv pip install -r mcp/mcp_requirements.txt +playwright install ``` ## Demo display @@ -50,7 +51,7 @@ https://github.com/user-attachments/assets/177b1f50-422f-4c2e-ab7d-1f3d7ff27679 ## 📖 Usage -### 1. Testing your server with Claude for Desktop 🖥️ +### 1. Testing the server with Claude for Desktop 🖥️ > ⚠️ **Note**: Claude for Desktop is not yet available on Linux. Linux users can build an MCP client that connects to the server we just built. @@ -75,9 +76,9 @@ In this case, we'll add our single Openmanus server like so: "command": "/ABSOLUTE/PATH/TO/PARENT/FOLDER/uv", "args": [ "--directory", - "/ABSOLUTE/PATH/TO/OpenManus/openmanus_server", + "/ABSOLUTE/PATH/TO/OpenManus/mcp/server", "run", - "openmanus_server.py" + "server.py" ] } } @@ -91,13 +92,13 @@ In this case, we'll add our single Openmanus server like so: #### Step 4: Understanding the Configuration 📝 This tells Claude for Desktop: 1. 
There's an MCP server named "openmanus" 🔌 -2. To launch it by running `uv --directory /ABSOLUTE/PATH/TO/OpenManus/openmanus_server run openmanus_server.py` 🚀 +2. To launch it by running `uv --directory /ABSOLUTE/PATH/TO/OpenManus/mcp/server run server.py` 🚀 #### Step 5: Activation 🔄 Save the file, and restart Claude for Desktop. #### Step 6: Verification ✨ -Let's make sure Claude for Desktop is picking up the six tools we've exposed in our `openmanus` server. You can do this by looking for the hammer icon ![hammer icon](./assets/claude-desktop-mcp-hammer-icon.svg) +Let's make sure Claude for Desktop is picking up the five tools we've exposed in our `openmanus` server. You can do this by looking for the hammer icon ![hammer icon](./assets/claude-desktop-mcp-hammer-icon.svg) ![tools_in_claude](./assets/1.jpg) After clicking on the hammer icon, you should see tools listed: @@ -111,12 +112,12 @@ After clicking on the hammer icon, you should see tools listed: ### 💻 2. Testing with simple Client Example -Check out `openmanus_client.py` to test the openmanus server using the MCP client. +Check out `client.py` to test the openmanus server using the MCP client. 
#### Demo display https://github.com/user-attachments/assets/aeacd93d-9bec-46d1-831b-20e898c7507b ``` -python openmanus_server/openmanus_client.py +python mcp/client/client.py ``` diff --git a/openmanus_server/assets/1.jpg b/mcp/assets/1.jpg similarity index 100% rename from openmanus_server/assets/1.jpg rename to mcp/assets/1.jpg diff --git a/openmanus_server/assets/2.png b/mcp/assets/2.png similarity index 100% rename from openmanus_server/assets/2.png rename to mcp/assets/2.png diff --git a/openmanus_server/assets/claude-desktop-mcp-hammer-icon.svg b/mcp/assets/claude-desktop-mcp-hammer-icon.svg similarity index 100% rename from openmanus_server/assets/claude-desktop-mcp-hammer-icon.svg rename to mcp/assets/claude-desktop-mcp-hammer-icon.svg diff --git a/openmanus_server/assets/demo.mp4 b/mcp/assets/demo.mp4 similarity index 100% rename from openmanus_server/assets/demo.mp4 rename to mcp/assets/demo.mp4 diff --git a/openmanus_server/openmanus_client.py b/mcp/client/client.py similarity index 85% rename from openmanus_server/openmanus_client.py rename to mcp/client/client.py index 0be1f47..5a1ec8d 100644 --- a/openmanus_server/openmanus_client.py +++ b/mcp/client/client.py @@ -1,17 +1,27 @@ -import ast import asyncio +import json import os import sys from contextlib import AsyncExitStack -from pathlib import Path from typing import Optional -import tomli from colorama import Fore, init -from dotenv import load_dotenv +from openai import AsyncOpenAI + from mcp import ClientSession, StdioServerParameters from mcp.client.stdio import stdio_client -from openai import AsyncOpenAI + + +# Add current directory to Python path +current_dir = os.path.dirname(os.path.abspath(__file__)) +parent_dir = os.path.dirname(current_dir) +sys.path.insert(0, parent_dir) +sys.path.insert(0, current_dir) + +# Add root directory to Python path +root_dir = os.path.dirname(parent_dir) +sys.path.insert(0, root_dir) +from app.config import config # Initialize colorama @@ -19,48 +29,31 @@ def 
init_colorama(): init(autoreset=True) -# Load config -def load_config(): - config_path = Path(__file__).parent.parent / "config" / "config.toml" - try: - with open(config_path, "rb") as f: - return tomli.load(f) - except FileNotFoundError: - print(f"Error: config.toml not found at {config_path}") - sys.exit(1) - except tomli.TOMLDecodeError as e: - print(f"Error: Invalid TOML in config.toml: {e}") - sys.exit(1) - - -# Load environment variables (as fallback) -load_dotenv() - - class OpenManusClient: def __init__(self): # Load configuration - self.config = load_config() + # self.config = load_config() # Initialize session and client objects self.session: Optional[ClientSession] = None self.exit_stack = AsyncExitStack() # Initialize AsyncOpenAI client with config - api_key = self.config["llm"]["api_key"] or os.getenv("OPENAI_API_KEY") + self.llm_config = config.llm["default"] + api_key = self.llm_config.api_key or os.getenv("OPENAI_API_KEY") if not api_key: raise ValueError( "OpenAI API key not found in config.toml or environment variables" ) self.openai_client = AsyncOpenAI( - api_key=api_key, base_url=self.config["llm"]["base_url"] + api_key=api_key, base_url=self.llm_config.base_url ) async def connect_to_server(self, server_script_path: str = None): """Connect to the openmanus MCP server""" # Use provided path or default from config - script_path = server_script_path or self.config["server"]["default_script"] + script_path = server_script_path server_params = StdioServerParameters( command="python", args=[script_path], env=None @@ -134,7 +127,7 @@ class OpenManusClient: ] # Initial LLM API call response = await self.openai_client.chat.completions.create( - model=self.config["llm"]["model"], + model=self.llm_config.model, messages=messages, tools=available_tools, tool_choice="auto", @@ -171,7 +164,7 @@ class OpenManusClient: # Convert tool_args from string to dictionary if necessary if isinstance(tool_args, str): try: - tool_args = ast.literal_eval(tool_args) + 
tool_args = json.loads(tool_args) except (ValueError, SyntaxError) as e: print(f"Error converting tool_args to dict: {e}") tool_args = {} @@ -197,7 +190,7 @@ class OpenManusClient: # Get next response from LLM response = await self.openai_client.chat.completions.create( - model=self.config["llm"]["model"], + model=self.llm_config.model, messages=messages, tools=available_tools, tool_choice="auto", @@ -210,7 +203,7 @@ async def main(): if len(sys.argv) > 1: server_script = sys.argv[1] else: - server_script = "./openmanus_server/openmanus_server.py" + server_script = "mcp/server/server.py" client = OpenManusClient() try: diff --git a/openmanus_server/mcp_requirements.txt b/mcp/mcp_requirements.txt similarity index 100% rename from openmanus_server/mcp_requirements.txt rename to mcp/mcp_requirements.txt diff --git a/openmanus_server/openmanus_server.py b/mcp/server/server.py similarity index 89% rename from openmanus_server/openmanus_server.py rename to mcp/server/server.py index 50b4351..1107ef9 100644 --- a/openmanus_server/openmanus_server.py +++ b/mcp/server/server.py @@ -4,7 +4,6 @@ import json import logging import os import sys -from typing import Optional from mcp.server.fastmcp import FastMCP @@ -15,6 +14,10 @@ parent_dir = os.path.dirname(current_dir) sys.path.insert(0, parent_dir) sys.path.insert(0, current_dir) +# Add root directory to Python path +root_dir = os.path.dirname(parent_dir) +sys.path.insert(0, root_dir) + # Configure logging logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -24,7 +27,6 @@ logger = logging.getLogger("mcp-server") # Import OpenManus tools from app.tool.browser_use_tool import BrowserUseTool from app.tool.file_saver import FileSaver -from app.tool.google_search import GoogleSearch from app.tool.python_execute import PythonExecute from app.tool.terminate import Terminate @@ -34,7 +36,6 @@ openmanus = FastMCP("openmanus") # Initialize tool instances browser_tool = 
BrowserUseTool() -google_search_tool = GoogleSearch() python_execute_tool = PythonExecute() file_saver_tool = FileSaver() terminate_tool = Terminate() @@ -95,20 +96,6 @@ async def get_browser_state() -> str: return json.dumps(result.model_dump()) -# Google search tool -@openmanus.tool() -async def google_search(query: str, num_results: int = 10) -> str: - """Execute Google search and return list of relevant links. - - Args: - query: Search query - num_results: Number of results to return (default is 10) - """ - logger.info(f"Executing Google search: {query}") - results = await google_search_tool.execute(query=query, num_results=num_results) - return json.dumps(results) - - # Python execution tool @openmanus.tool() async def python_execute(code: str, timeout: int = 5) -> str: diff --git a/requirements.txt b/requirements.txt index 7ce4b52..77308f0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,12 @@ -pydantic~=2.10.4 -openai~=1.58.1 +pydantic~=2.10.6 +openai~=1.66.3 tenacity~=9.0.0 pyyaml~=6.0.2 loguru~=0.7.3 numpy datasets~=3.2.0 fastapi~=0.115.11 +tiktoken~=0.9.0 html2text~=2024.2.26 gymnasium~=1.0.0 @@ -15,8 +16,14 @@ uvicorn~=0.34.0 unidiff~=0.7.5 browser-use~=0.1.40 googlesearch-python~=1.3.0 +baidusearch~=1.0.3 +duckduckgo_search~=7.5.1 aiofiles~=24.1.0 pydantic_core~=2.27.2 colorama~=0.4.6 -playwright~=1.49.1 +playwright~=1.50.0 + +docker~=7.1.0 +pytest~=8.3.5 +pytest-asyncio~=0.25.3 diff --git a/setup.py b/setup.py index dd46f9c..eb36dac 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ setup( packages=find_packages(), install_requires=[ "pydantic~=2.10.4", - "openai~=1.58.1", + "openai>=1.58.1,<1.67.0", "tenacity~=9.0.0", "pyyaml~=6.0.2", "loguru~=0.7.3", @@ -31,7 +31,7 @@ setup( "browser-use~=0.1.40", "googlesearch-python~=1.3.0", "aiofiles~=24.1.0", - "pydantic_core~=2.27.2", + "pydantic_core>=2.27.2,<2.28.0", "colorama~=0.4.6", ], classifiers=[ diff --git a/tests/sandbox/test_client.py b/tests/sandbox/test_client.py new file mode 
100644 index 0000000..6b2c61f --- /dev/null +++ b/tests/sandbox/test_client.py @@ -0,0 +1,110 @@ +import tempfile +from pathlib import Path +from typing import AsyncGenerator + +import pytest +import pytest_asyncio + +from app.config import SandboxSettings +from app.sandbox.client import LocalSandboxClient, create_sandbox_client + + +@pytest_asyncio.fixture(scope="function") +async def local_client() -> AsyncGenerator[LocalSandboxClient, None]: + """Creates a local sandbox client for testing.""" + client = create_sandbox_client() + try: + yield client + finally: + await client.cleanup() + + +@pytest.fixture(scope="function") +def temp_dir() -> Path: + """Creates a temporary directory for testing.""" + with tempfile.TemporaryDirectory() as tmp_dir: + yield Path(tmp_dir) + + +@pytest.mark.asyncio +async def test_sandbox_creation(local_client: LocalSandboxClient): + """Tests sandbox creation with specific configuration.""" + config = SandboxSettings( + image="python:3.12-slim", + work_dir="/workspace", + memory_limit="512m", + cpu_limit=0.5, + ) + + await local_client.create(config) + result = await local_client.run_command("python3 --version") + assert "Python 3.12" in result + + +@pytest.mark.asyncio +async def test_local_command_execution(local_client: LocalSandboxClient): + """Tests command execution in local sandbox.""" + await local_client.create() + + result = await local_client.run_command("echo 'test'") + assert result.strip() == "test" + + with pytest.raises(Exception): + await local_client.run_command("sleep 10", timeout=1) + + +@pytest.mark.asyncio +async def test_local_file_operations(local_client: LocalSandboxClient, temp_dir: Path): + """Tests file operations in local sandbox.""" + await local_client.create() + + # Test write and read operations + test_content = "Hello, World!" 
+ await local_client.write_file("/workspace/test.txt", test_content) + content = await local_client.read_file("/workspace/test.txt") + assert content.strip() == test_content + + # Test copying file to container + src_file = temp_dir / "src.txt" + src_file.write_text("Copy to container") + await local_client.copy_to(str(src_file), "/workspace/copied.txt") + content = await local_client.read_file("/workspace/copied.txt") + assert content.strip() == "Copy to container" + + # Test copying file from container + dst_file = temp_dir / "dst.txt" + await local_client.copy_from("/workspace/test.txt", str(dst_file)) + assert dst_file.read_text().strip() == test_content + + +@pytest.mark.asyncio +async def test_local_volume_binding(local_client: LocalSandboxClient, temp_dir: Path): + """Tests volume binding in local sandbox.""" + bind_path = str(temp_dir) + volume_bindings = {bind_path: "/data"} + + await local_client.create(volume_bindings=volume_bindings) + + test_file = temp_dir / "test.txt" + test_file.write_text("Volume test") + + content = await local_client.read_file("/data/test.txt") + assert "Volume test" in content + + +@pytest.mark.asyncio +async def test_local_error_handling(local_client: LocalSandboxClient): + """Tests error handling in local sandbox.""" + await local_client.create() + + with pytest.raises(Exception) as exc: + await local_client.read_file("/nonexistent.txt") + assert "not found" in str(exc.value).lower() + + with pytest.raises(Exception) as exc: + await local_client.copy_from("/nonexistent.txt", "local.txt") + assert "not found" in str(exc.value).lower() + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/tests/sandbox/test_docker_terminal.py b/tests/sandbox/test_docker_terminal.py new file mode 100644 index 0000000..bf0821a --- /dev/null +++ b/tests/sandbox/test_docker_terminal.py @@ -0,0 +1,104 @@ +"""Tests for the AsyncDockerizedTerminal implementation.""" + +import docker +import pytest +import pytest_asyncio + +from 
app.sandbox.core.terminal import AsyncDockerizedTerminal + + +@pytest.fixture(scope="module") +def docker_client(): + """Fixture providing a Docker client.""" + return docker.from_env() + + +@pytest_asyncio.fixture(scope="module") +async def docker_container(docker_client): + """Fixture providing a test Docker container.""" + container = docker_client.containers.run( + "python:3.12-slim", + "tail -f /dev/null", + name="test_container", + detach=True, + remove=True, + ) + yield container + container.stop() + + +@pytest_asyncio.fixture +async def terminal(docker_container): + """Fixture providing an initialized AsyncDockerizedTerminal instance.""" + terminal = AsyncDockerizedTerminal( + docker_container, + working_dir="/workspace", + env_vars={"TEST_VAR": "test_value"}, + default_timeout=30, + ) + await terminal.init() + yield terminal + await terminal.close() + + +class TestAsyncDockerizedTerminal: + """Test cases for AsyncDockerizedTerminal.""" + + @pytest.mark.asyncio + async def test_basic_command_execution(self, terminal): + """Test basic command execution functionality.""" + result = await terminal.run_command("echo 'Hello World'") + assert "Hello World" in result + + @pytest.mark.asyncio + async def test_environment_variables(self, terminal): + """Test environment variable setting and access.""" + result = await terminal.run_command("echo $TEST_VAR") + assert "test_value" in result + + @pytest.mark.asyncio + async def test_working_directory(self, terminal): + """Test working directory setup.""" + result = await terminal.run_command("pwd") + assert "/workspace" == result + + @pytest.mark.asyncio + async def test_command_timeout(self, docker_container): + """Test command timeout functionality.""" + terminal = AsyncDockerizedTerminal(docker_container, default_timeout=1) + await terminal.init() + try: + with pytest.raises(TimeoutError): + await terminal.run_command("sleep 5") + finally: + await terminal.close() + + @pytest.mark.asyncio + async def 
test_multiple_commands(self, terminal): + """Test execution of multiple commands in sequence.""" + cmd1 = await terminal.run_command("echo 'First'") + cmd2 = await terminal.run_command("echo 'Second'") + assert "First" in cmd1 + assert "Second" in cmd2 + + @pytest.mark.asyncio + async def test_session_cleanup(self, docker_container): + """Test proper cleanup of resources.""" + terminal = AsyncDockerizedTerminal(docker_container) + await terminal.init() + assert terminal.session is not None + await terminal.close() + # Verify session is properly cleaned up + # Note: session object still exists, but internal connection is closed + assert terminal.session is not None + + +# Configure pytest-asyncio +def pytest_configure(config): + """Configure pytest-asyncio.""" + config.addinivalue_line("asyncio_mode", "strict") + config.addinivalue_line("asyncio_default_fixture_loop_scope", "function") + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/tests/sandbox/test_sandbox.py b/tests/sandbox/test_sandbox.py new file mode 100644 index 0000000..b21dd6f --- /dev/null +++ b/tests/sandbox/test_sandbox.py @@ -0,0 +1,152 @@ +import pytest +import pytest_asyncio + +from app.sandbox.core.sandbox import DockerSandbox, SandboxSettings + + +@pytest.fixture(scope="module") +def sandbox_config(): + """Creates sandbox configuration for testing.""" + return SandboxSettings( + image="python:3.12-slim", + work_dir="/workspace", + memory_limit="1g", + cpu_limit=0.5, + network_enabled=True, + ) + + +@pytest_asyncio.fixture(scope="module") +async def sandbox(sandbox_config): + """Creates and manages a test sandbox instance.""" + sandbox = DockerSandbox(sandbox_config) + await sandbox.create() + try: + yield sandbox + finally: + await sandbox.cleanup() + + +@pytest.mark.asyncio +async def test_sandbox_working_directory(sandbox): + """Tests sandbox working directory configuration.""" + result = await sandbox.terminal.run_command("pwd") + assert result.strip() == 
"/workspace" + + +@pytest.mark.asyncio +async def test_sandbox_file_operations(sandbox): + """Tests sandbox file read/write operations.""" + # Test file writing + test_content = "Hello from sandbox!" + await sandbox.write_file("/workspace/test.txt", test_content) + + # Test file reading + content = await sandbox.read_file("/workspace/test.txt") + assert content.strip() == test_content + + +@pytest.mark.asyncio +async def test_sandbox_python_execution(sandbox): + """Tests Python code execution in sandbox.""" + # Write test file + await sandbox.write_file("/workspace/test.txt", "Hello from file!") + + # Write Python script + python_code = """ +print("Hello from Python!") +with open('/workspace/test.txt') as f: + print(f.read()) +""" + await sandbox.write_file("/workspace/test.py", python_code) + + # Execute script and verify output + result = await sandbox.terminal.run_command("python3 /workspace/test.py") + assert "Hello from Python!" in result + assert "Hello from file!" in result + + +@pytest.mark.asyncio +async def test_sandbox_file_persistence(sandbox): + """Tests file persistence in sandbox.""" + # Create multiple files + files = { + "file1.txt": "Content 1", + "file2.txt": "Content 2", + "nested/file3.txt": "Content 3", + } + + # Write files + for path, content in files.items(): + await sandbox.write_file(f"/workspace/{path}", content) + + # Verify file contents + for path, expected_content in files.items(): + content = await sandbox.read_file(f"/workspace/{path}") + assert content.strip() == expected_content + + +@pytest.mark.asyncio +async def test_sandbox_python_environment(sandbox): + """Tests Python environment configuration.""" + # Test Python version + result = await sandbox.terminal.run_command("python3 --version") + assert "Python 3.12" in result + + # Test basic module imports + python_code = """ +import sys +import os +import json +print("Python is working!") +""" + await sandbox.write_file("/workspace/env_test.py", python_code) + result = await 
sandbox.terminal.run_command("python3 /workspace/env_test.py") + assert "Python is working!" in result + + +@pytest.mark.asyncio +async def test_sandbox_network_access(sandbox): + """Tests sandbox network access.""" + if not sandbox.config.network_enabled: + pytest.skip("Network access is disabled") + + # Test network connectivity + await sandbox.terminal.run_command("apt update && apt install curl -y") + result = await sandbox.terminal.run_command("curl -I https://www.example.com") + assert "HTTP/2 200" in result + + +@pytest.mark.asyncio +async def test_sandbox_cleanup(sandbox_config): + """Tests sandbox cleanup process.""" + sandbox = DockerSandbox(sandbox_config) + await sandbox.create() + + # Create test files + await sandbox.write_file("/workspace/test.txt", "test") + container_id = sandbox.terminal.container.id + # Perform cleanup + await sandbox.cleanup() + + # Verify container has been removed + import docker + + client = docker.from_env() + containers = client.containers.list(all=True) + assert not any(c.id == container_id for c in containers) + + +@pytest.mark.asyncio +async def test_sandbox_error_handling(): + """Tests error handling with invalid configuration.""" + # Test invalid configuration + invalid_config = SandboxSettings(image="nonexistent:latest", work_dir="/invalid") + + sandbox = DockerSandbox(invalid_config) + with pytest.raises(Exception): + await sandbox.create() + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/tests/sandbox/test_sandbox_manager.py b/tests/sandbox/test_sandbox_manager.py new file mode 100644 index 0000000..09f498d --- /dev/null +++ b/tests/sandbox/test_sandbox_manager.py @@ -0,0 +1,138 @@ +import asyncio +import os +import tempfile +from typing import AsyncGenerator + +import pytest +import pytest_asyncio + +from app.sandbox.core.manager import SandboxManager + + +@pytest_asyncio.fixture(scope="function") +async def manager() -> AsyncGenerator[SandboxManager, None]: + """Creates a sandbox 
manager instance. + + Uses function scope to ensure each test case has its own manager instance. + """ + manager = SandboxManager(max_sandboxes=2, idle_timeout=60, cleanup_interval=30) + try: + yield manager + finally: + # Ensure all resources are cleaned up + await manager.cleanup() + + +@pytest.fixture +def temp_file(): + """Creates a temporary test file.""" + with tempfile.NamedTemporaryFile(mode="w+", delete=False) as f: + f.write("test content") + path = f.name + try: + yield path + finally: + if os.path.exists(path): + os.unlink(path) + + +@pytest.mark.asyncio +async def test_create_sandbox(manager): + """Tests sandbox creation.""" + # Create default sandbox + sandbox_id = await manager.create_sandbox() + assert sandbox_id in manager._sandboxes + assert sandbox_id in manager._last_used + + # Verify sandbox functionality + sandbox = await manager.get_sandbox(sandbox_id) + result = await sandbox.run_command("echo 'test'") + assert result.strip() == "test" + + +@pytest.mark.asyncio +async def test_max_sandboxes_limit(manager): + """Tests maximum sandbox limit enforcement.""" + created_sandboxes = [] + try: + # Create maximum number of sandboxes + for _ in range(manager.max_sandboxes): + sandbox_id = await manager.create_sandbox() + created_sandboxes.append(sandbox_id) + + # Verify created sandbox count + assert len(manager._sandboxes) == manager.max_sandboxes + + # Attempting to create additional sandbox should fail + with pytest.raises(RuntimeError) as exc_info: + await manager.create_sandbox() + + # Verify error message + expected_message = ( + f"Maximum number of sandboxes ({manager.max_sandboxes}) reached" + ) + assert str(exc_info.value) == expected_message + + finally: + # Clean up all created sandboxes + for sandbox_id in created_sandboxes: + try: + await manager.delete_sandbox(sandbox_id) + except Exception as e: + print(f"Failed to cleanup sandbox {sandbox_id}: {e}") + + +@pytest.mark.asyncio +async def test_get_nonexistent_sandbox(manager): + """Tests 
retrieving a non-existent sandbox.""" + with pytest.raises(KeyError, match="Sandbox .* not found"): + await manager.get_sandbox("nonexistent-id") + + +@pytest.mark.asyncio +async def test_sandbox_cleanup(manager): + """Tests sandbox cleanup functionality.""" + sandbox_id = await manager.create_sandbox() + assert sandbox_id in manager._sandboxes + + await manager.delete_sandbox(sandbox_id) + assert sandbox_id not in manager._sandboxes + assert sandbox_id not in manager._last_used + + +@pytest.mark.asyncio +async def test_idle_sandbox_cleanup(manager): + """Tests automatic cleanup of idle sandboxes.""" + # Set short idle timeout + manager.idle_timeout = 0.1 + + sandbox_id = await manager.create_sandbox() + assert sandbox_id in manager._sandboxes + + # Wait longer than idle timeout + await asyncio.sleep(0.2) + + # Trigger cleanup + await manager._cleanup_idle_sandboxes() + assert sandbox_id not in manager._sandboxes + + +@pytest.mark.asyncio +async def test_manager_cleanup(manager): + """Tests manager cleanup functionality.""" + # Create multiple sandboxes + sandbox_ids = [] + for _ in range(2): + sandbox_id = await manager.create_sandbox() + sandbox_ids.append(sandbox_id) + + # Clean up all resources + await manager.cleanup() + + # Verify all sandboxes have been cleaned up + assert not manager._sandboxes + assert not manager._last_used + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/workspace/example.txt b/workspace/example.txt new file mode 100644 index 0000000..08a2808 --- /dev/null +++ b/workspace/example.txt @@ -0,0 +1 @@ +This is a sample file. Files generated by OpenManus are stored in the current folder by default.