314 lines
10 KiB
Python
314 lines
10 KiB
Python
import asyncio
|
|
import uuid
|
|
from contextlib import asynccontextmanager
|
|
from typing import Dict, Optional, Set
|
|
|
|
import docker
|
|
from docker.errors import APIError, ImageNotFound
|
|
|
|
from app.config import SandboxSettings
|
|
from app.logger import logger
|
|
from app.sandbox.core.sandbox import DockerSandbox
|
|
|
|
|
|
class SandboxManager:
    """Docker sandbox manager.

    Manages the lifecycle of multiple DockerSandbox instances: creation,
    lookup, idle monitoring, and cleanup. Access to each sandbox is
    serialized through a per-sandbox asyncio lock, and a background task
    periodically deletes sandboxes that have been idle for too long.

    Attributes:
        max_sandboxes: Maximum allowed number of sandboxes.
        idle_timeout: Sandbox idle timeout in seconds.
        cleanup_interval: Cleanup check interval in seconds.
        _sandboxes: Active sandbox instance mapping (sandbox ID -> sandbox).
        _last_used: Last used time (event-loop clock) per sandbox ID.
    """

    def __init__(
        self,
        max_sandboxes: int = 100,
        idle_timeout: int = 3600,
        cleanup_interval: int = 300,
    ):
        """Initializes sandbox manager.

        Args:
            max_sandboxes: Maximum sandbox count limit.
            idle_timeout: Idle timeout in seconds.
            cleanup_interval: Cleanup check interval in seconds.
        """
        self.max_sandboxes = max_sandboxes
        self.idle_timeout = idle_timeout
        self.cleanup_interval = cleanup_interval

        # Docker client
        self._client = docker.from_env()

        # Resource mappings
        self._sandboxes: Dict[str, DockerSandbox] = {}
        self._last_used: Dict[str, float] = {}

        # Concurrency control
        self._locks: Dict[str, asyncio.Lock] = {}
        self._global_lock = asyncio.Lock()
        self._active_operations: Set[str] = set()

        # Cleanup task
        self._cleanup_task: Optional[asyncio.Task] = None
        self._is_shutting_down = False

        # Start automatic cleanup (deferred if no event loop is running yet)
        self.start_cleanup_task()

    async def ensure_image(self, image: str) -> bool:
        """Ensures Docker image is available, pulling it if it is missing.

        Both the local lookup and the pull are blocking docker SDK calls, so
        they are run in the default executor to keep the event loop free.

        Args:
            image: Image name.

        Returns:
            bool: True if the image is present locally or was pulled
                successfully, False if the pull failed.
        """
        loop = asyncio.get_running_loop()
        try:
            await loop.run_in_executor(None, self._client.images.get, image)
            return True
        except ImageNotFound:
            try:
                logger.info(f"Pulling image {image}...")
                await loop.run_in_executor(None, self._client.images.pull, image)
                return True
            except Exception as exc:  # covers docker APIError as well
                logger.error(f"Failed to pull image {image}: {exc}")
                return False

    @asynccontextmanager
    async def sandbox_operation(self, sandbox_id: str):
        """Context manager for sandbox operations.

        Holds the per-sandbox lock for the duration of the block, marks the
        sandbox as having an active operation, and refreshes its last-used
        timestamp.

        Args:
            sandbox_id: Sandbox ID.

        Yields:
            The DockerSandbox registered under ``sandbox_id``.

        Raises:
            KeyError: If sandbox not found.
        """
        # Reject unknown IDs before touching _locks so that lookups of
        # non-existent sandboxes do not leave stray lock entries behind.
        if sandbox_id not in self._sandboxes:
            raise KeyError(f"Sandbox {sandbox_id} not found")

        lock = self._locks.get(sandbox_id)
        if lock is None:
            lock = self._locks[sandbox_id] = asyncio.Lock()

        async with lock:
            # Re-check: the sandbox may have been deleted while we waited.
            if sandbox_id not in self._sandboxes:
                raise KeyError(f"Sandbox {sandbox_id} not found")

            self._active_operations.add(sandbox_id)
            try:
                self._last_used[sandbox_id] = asyncio.get_running_loop().time()
                yield self._sandboxes[sandbox_id]
            finally:
                # discard(): cleanup() may already have cleared the set.
                self._active_operations.discard(sandbox_id)

    async def create_sandbox(
        self,
        config: Optional[SandboxSettings] = None,
        volume_bindings: Optional[Dict[str, str]] = None,
    ) -> str:
        """Creates a new sandbox instance.

        Args:
            config: Sandbox configuration. A default SandboxSettings() is
                used when omitted.
            volume_bindings: Volume mapping configuration.

        Returns:
            str: Sandbox ID.

        Raises:
            RuntimeError: If max sandbox count reached or creation fails.
        """
        # No-op when the cleanup task is already running; covers managers
        # that were constructed before an event loop existed.
        self.start_cleanup_task()

        async with self._global_lock:
            if len(self._sandboxes) >= self.max_sandboxes:
                raise RuntimeError(
                    f"Maximum number of sandboxes ({self.max_sandboxes}) reached"
                )

            config = config or SandboxSettings()
            if not await self.ensure_image(config.image):
                raise RuntimeError(f"Failed to ensure Docker image: {config.image}")

            sandbox_id = str(uuid.uuid4())
            try:
                sandbox = DockerSandbox(config, volume_bindings)
                await sandbox.create()

                self._sandboxes[sandbox_id] = sandbox
                self._last_used[sandbox_id] = asyncio.get_running_loop().time()
                self._locks[sandbox_id] = asyncio.Lock()

                logger.info(f"Created sandbox {sandbox_id}")
                return sandbox_id

            except Exception as e:
                logger.error(f"Failed to create sandbox: {e}")
                # Roll back in place. Calling delete_sandbox() here would try
                # to re-acquire _global_lock, which this coroutine still
                # holds; asyncio.Lock is not reentrant, so that would hang.
                registered = self._sandboxes.pop(sandbox_id, None)
                self._last_used.pop(sandbox_id, None)
                self._locks.pop(sandbox_id, None)
                if registered is not None:
                    try:
                        await registered.cleanup()
                    except Exception as cleanup_error:
                        logger.error(
                            f"Error during cleanup of sandbox {sandbox_id}: "
                            f"{cleanup_error}"
                        )
                raise RuntimeError(f"Failed to create sandbox: {e}") from e

    async def get_sandbox(self, sandbox_id: str) -> DockerSandbox:
        """Gets a sandbox instance.

        The per-sandbox lock is only held while the instance is looked up;
        it is released before the sandbox is returned to the caller.

        Args:
            sandbox_id: Sandbox ID.

        Returns:
            DockerSandbox: Sandbox instance.

        Raises:
            KeyError: If sandbox does not exist.
        """
        async with self.sandbox_operation(sandbox_id) as sandbox:
            return sandbox

    def start_cleanup_task(self) -> None:
        """Starts automatic cleanup task.

        Idempotent: does nothing if the task is already running or the
        manager is shutting down. If no event loop is running (for example
        when the manager is constructed from synchronous code), the start is
        deferred; ``__aenter__`` and ``create_sandbox`` call this again.
        """
        if self._is_shutting_down:
            return
        if self._cleanup_task is not None and not self._cleanup_task.done():
            return

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            logger.warning(
                "No running event loop; automatic sandbox cleanup will start "
                "on first use"
            )
            return

        async def cleanup_loop():
            while not self._is_shutting_down:
                try:
                    await self._cleanup_idle_sandboxes()
                except Exception as e:
                    logger.error(f"Error in cleanup loop: {e}")
                await asyncio.sleep(self.cleanup_interval)

        self._cleanup_task = asyncio.create_task(cleanup_loop())

    async def _cleanup_idle_sandboxes(self) -> None:
        """Cleans up idle sandboxes.

        A sandbox is idle when it has no active operation and its last-used
        timestamp is older than ``idle_timeout`` seconds.
        """
        current_time = asyncio.get_running_loop().time()

        # Snapshot the candidates under the global lock, then delete outside
        # it because deletion acquires the same lock.
        async with self._global_lock:
            to_cleanup = [
                sandbox_id
                for sandbox_id, last_used in self._last_used.items()
                if sandbox_id not in self._active_operations
                and current_time - last_used > self.idle_timeout
            ]

        for sandbox_id in to_cleanup:
            try:
                await self.delete_sandbox(sandbox_id)
            except Exception as e:
                logger.error(f"Error cleaning up sandbox {sandbox_id}: {e}")

    async def cleanup(self) -> None:
        """Cleans up all resources.

        Stops the background cleanup task, deletes every registered sandbox
        concurrently (waiting at most 30 seconds), and clears all internal
        bookkeeping.
        """
        logger.info("Starting manager cleanup...")
        self._is_shutting_down = True

        # Cancel cleanup task
        cleanup_task = self._cleanup_task
        if cleanup_task:
            cleanup_task.cancel()
            try:
                await asyncio.wait_for(cleanup_task, timeout=1.0)
            except (asyncio.CancelledError, asyncio.TimeoutError):
                pass
            self._cleanup_task = None

        # Get all sandbox IDs to clean up
        async with self._global_lock:
            sandbox_ids = list(self._sandboxes)

        # Concurrently clean up all sandboxes
        cleanup_tasks = [
            asyncio.create_task(self._safe_delete_sandbox(sandbox_id))
            for sandbox_id in sandbox_ids
        ]

        if cleanup_tasks:
            # asyncio.wait() does not raise on timeout; unfinished tasks are
            # returned in the second set, so that set must be inspected.
            _, pending = await asyncio.wait(cleanup_tasks, timeout=30.0)
            if pending:
                logger.error("Sandbox cleanup timed out")

        # Clean up remaining references
        self._sandboxes.clear()
        self._last_used.clear()
        self._locks.clear()
        self._active_operations.clear()

        logger.info("Manager cleanup completed")

    async def _safe_delete_sandbox(self, sandbox_id: str) -> None:
        """Safely deletes a single sandbox.

        Waits up to roughly five seconds for an active operation to finish,
        then cleans up the sandbox and removes its records. Errors are
        logged rather than raised.

        Args:
            sandbox_id: Sandbox ID to delete.
        """
        try:
            if sandbox_id in self._active_operations:
                logger.warning(
                    f"Sandbox {sandbox_id} has active operations, waiting for completion"
                )
                for _ in range(10):  # Wait at most 10 times
                    await asyncio.sleep(0.5)
                    if sandbox_id not in self._active_operations:
                        break
                else:
                    # Loop exhausted without the operation finishing;
                    # proceed with deletion anyway.
                    logger.warning(
                        f"Timeout waiting for sandbox {sandbox_id} operations to complete"
                    )

            # Get reference to sandbox object
            sandbox = self._sandboxes.get(sandbox_id)
            if sandbox:
                await sandbox.cleanup()

            # Remove sandbox record from manager
            async with self._global_lock:
                self._sandboxes.pop(sandbox_id, None)
                self._last_used.pop(sandbox_id, None)
                self._locks.pop(sandbox_id, None)
                logger.info(f"Deleted sandbox {sandbox_id}")
        except Exception as e:
            logger.error(f"Error during cleanup of sandbox {sandbox_id}: {e}")

    async def delete_sandbox(self, sandbox_id: str) -> None:
        """Deletes specified sandbox.

        Unknown IDs are ignored. Must not be called while holding
        ``_global_lock``, because the deletion path acquires it.

        Args:
            sandbox_id: Sandbox ID.
        """
        if sandbox_id not in self._sandboxes:
            return

        try:
            await self._safe_delete_sandbox(sandbox_id)
        except Exception as e:
            logger.error(f"Failed to delete sandbox {sandbox_id}: {e}")

    async def __aenter__(self) -> "SandboxManager":
        """Async context manager entry."""
        # Starts the cleanup task if construction happened outside a loop.
        self.start_cleanup_task()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Async context manager exit; releases every managed sandbox."""
        await self.cleanup()

    def get_stats(self) -> Dict:
        """Gets manager statistics.

        Returns:
            Dict: Current sandbox and active-operation counts together with
                the configured limits and the shutdown flag.
        """
        return {
            "total_sandboxes": len(self._sandboxes),
            "active_operations": len(self._active_operations),
            "max_sandboxes": self.max_sandboxes,
            "idle_timeout": self.idle_timeout,
            "cleanup_interval": self.cleanup_interval,
            "is_shutting_down": self._is_shutting_down,
        }
|