import asyncio
import uuid
from contextlib import asynccontextmanager
from typing import Dict, Optional, Set

import docker
from docker.errors import APIError, ImageNotFound

from app.config import SandboxSettings
from app.logger import logger
from app.sandbox.core.sandbox import DockerSandbox


class SandboxManager:
    """Docker sandbox manager.

    Manages the lifecycle of multiple DockerSandbox instances: creation,
    per-sandbox access serialization, idle-timeout eviction, and shutdown
    cleanup.

    Attributes:
        max_sandboxes: Maximum allowed number of sandboxes.
        idle_timeout: Sandbox idle timeout in seconds.
        cleanup_interval: Cleanup check interval in seconds.
        _sandboxes: Active sandbox instances keyed by sandbox ID.
        _last_used: Event-loop timestamp of the last use of each sandbox.
    """

    def __init__(
        self,
        max_sandboxes: int = 100,
        idle_timeout: int = 3600,
        cleanup_interval: int = 300,
    ):
        """Initializes the sandbox manager and starts the idle-cleanup task.

        Must be called while an event loop is running, because the
        background cleanup task is scheduled immediately.

        Args:
            max_sandboxes: Maximum sandbox count limit.
            idle_timeout: Idle timeout in seconds.
            cleanup_interval: Cleanup check interval in seconds.

        Raises:
            RuntimeError: If there is no running event loop.
        """
        self.max_sandboxes = max_sandboxes
        self.idle_timeout = idle_timeout
        self.cleanup_interval = cleanup_interval

        # Docker client
        self._client = docker.from_env()

        # Resource mappings
        self._sandboxes: Dict[str, DockerSandbox] = {}
        self._last_used: Dict[str, float] = {}

        # Concurrency control
        self._locks: Dict[str, asyncio.Lock] = {}
        self._global_lock = asyncio.Lock()
        self._active_operations: Set[str] = set()

        # Cleanup task
        self._cleanup_task: Optional[asyncio.Task] = None
        self._is_shutting_down = False

        # Start automatic cleanup
        self.start_cleanup_task()

    async def ensure_image(self, image: str) -> bool:
        """Ensures a Docker image is available locally, pulling it if needed.

        Args:
            image: Image name.

        Returns:
            bool: True if the image is present or was pulled successfully,
                False if the pull failed.
        """
        loop = asyncio.get_running_loop()
        try:
            # The docker client is synchronous; run the lookup in the default
            # executor (as the pull below does) so the event loop is not blocked.
            await loop.run_in_executor(None, self._client.images.get, image)
            return True
        except ImageNotFound:
            try:
                logger.info(f"Pulling image {image}...")
                await loop.run_in_executor(None, self._client.images.pull, image)
                return True
            except Exception as e:
                logger.error(f"Failed to pull image {image}: {e}")
                return False

    @asynccontextmanager
    async def sandbox_operation(self, sandbox_id: str):
        """Context manager for sandbox operations.

        Serializes access to one sandbox through its per-sandbox lock, marks
        it as active for the duration of the block, and refreshes its
        last-used timestamp.

        Args:
            sandbox_id: Sandbox ID.

        Yields:
            The DockerSandbox registered under ``sandbox_id``.

        Raises:
            KeyError: If sandbox not found.
        """
        # Reject unknown IDs before touching _locks so that lookups of
        # nonexistent sandboxes do not leave stray lock entries behind.
        if sandbox_id not in self._sandboxes:
            raise KeyError(f"Sandbox {sandbox_id} not found")

        lock = self._locks.setdefault(sandbox_id, asyncio.Lock())
        async with lock:
            # Re-check: the sandbox may have been deleted while we were
            # waiting for the lock.
            if sandbox_id not in self._sandboxes:
                raise KeyError(f"Sandbox {sandbox_id} not found")

            self._active_operations.add(sandbox_id)
            try:
                self._last_used[sandbox_id] = asyncio.get_running_loop().time()
                yield self._sandboxes[sandbox_id]
            finally:
                # discard(), not remove(): cleanup() may have cleared the set
                # while this operation was still in flight.
                self._active_operations.discard(sandbox_id)

    async def create_sandbox(
        self,
        config: Optional[SandboxSettings] = None,
        volume_bindings: Optional[Dict[str, str]] = None,
    ) -> str:
        """Creates a new sandbox instance.

        The global lock is held for the whole creation so the sandbox count
        check and the registration happen atomically.

        Args:
            config: Sandbox configuration; a default SandboxSettings is used
                when omitted.
            volume_bindings: Volume mapping configuration.

        Returns:
            str: Sandbox ID.

        Raises:
            RuntimeError: If max sandbox count reached or creation fails.
        """
        async with self._global_lock:
            if len(self._sandboxes) >= self.max_sandboxes:
                raise RuntimeError(
                    f"Maximum number of sandboxes ({self.max_sandboxes}) reached"
                )

            config = config or SandboxSettings()
            if not await self.ensure_image(config.image):
                raise RuntimeError(f"Failed to ensure Docker image: {config.image}")

            sandbox_id = str(uuid.uuid4())
            try:
                sandbox = DockerSandbox(config, volume_bindings)
                await sandbox.create()

                self._sandboxes[sandbox_id] = sandbox
                self._last_used[sandbox_id] = asyncio.get_running_loop().time()
                self._locks[sandbox_id] = asyncio.Lock()

                logger.info(f"Created sandbox {sandbox_id}")
                return sandbox_id
            except Exception as e:
                logger.error(f"Failed to create sandbox: {e}")
                # Roll back inline. Going through delete_sandbox() here would
                # try to re-acquire _global_lock, which this coroutine already
                # holds; asyncio.Lock is not reentrant, so that would deadlock.
                registered = self._sandboxes.pop(sandbox_id, None)
                self._last_used.pop(sandbox_id, None)
                self._locks.pop(sandbox_id, None)
                if registered is not None:
                    try:
                        await registered.cleanup()
                    except Exception as cleanup_error:
                        logger.error(
                            f"Error during cleanup of sandbox {sandbox_id}: "
                            f"{cleanup_error}"
                        )
                raise RuntimeError(f"Failed to create sandbox: {e}") from e

    async def get_sandbox(self, sandbox_id: str) -> DockerSandbox:
        """Gets a sandbox instance and refreshes its last-used time.

        Args:
            sandbox_id: Sandbox ID.

        Returns:
            DockerSandbox: Sandbox instance. The per-sandbox lock is released
                before the instance is returned.

        Raises:
            KeyError: If sandbox does not exist.
        """
        async with self.sandbox_operation(sandbox_id) as sandbox:
            return sandbox

    def start_cleanup_task(self) -> None:
        """Starts the automatic idle-cleanup task.

        Does nothing if a cleanup task is already running, so repeated calls
        do not orphan an earlier task.

        Raises:
            RuntimeError: If there is no running event loop.
        """
        if self._cleanup_task is not None and not self._cleanup_task.done():
            return

        # Resolve the loop first so a missing loop raises before the coroutine
        # object is created (avoiding a "never awaited" warning).
        loop = asyncio.get_running_loop()

        async def cleanup_loop():
            while not self._is_shutting_down:
                try:
                    await self._cleanup_idle_sandboxes()
                except Exception as e:
                    logger.error(f"Error in cleanup loop: {e}")
                await asyncio.sleep(self.cleanup_interval)

        self._cleanup_task = loop.create_task(cleanup_loop())

    async def _cleanup_idle_sandboxes(self) -> None:
        """Deletes sandboxes that are idle past ``idle_timeout``.

        Sandboxes with an operation in progress are skipped.
        """
        current_time = asyncio.get_running_loop().time()

        # Snapshot the candidates under the lock; deletion happens afterwards
        # because _safe_delete_sandbox() takes the global lock itself.
        async with self._global_lock:
            to_cleanup = [
                sandbox_id
                for sandbox_id, last_used in self._last_used.items()
                if sandbox_id not in self._active_operations
                and current_time - last_used > self.idle_timeout
            ]

        for sandbox_id in to_cleanup:
            try:
                await self.delete_sandbox(sandbox_id)
            except Exception as e:
                logger.error(f"Error cleaning up sandbox {sandbox_id}: {e}")

    async def cleanup(self) -> None:
        """Cleans up all resources.

        Stops the background cleanup task, deletes every sandbox concurrently
        (waiting at most 30 seconds overall), and clears all bookkeeping.
        """
        logger.info("Starting manager cleanup...")
        self._is_shutting_down = True

        # Cancel cleanup task
        if self._cleanup_task:
            self._cleanup_task.cancel()
            try:
                await asyncio.wait_for(self._cleanup_task, timeout=1.0)
            except (asyncio.CancelledError, asyncio.TimeoutError):
                pass

        # Get all sandbox IDs to clean up
        async with self._global_lock:
            sandbox_ids = list(self._sandboxes.keys())

        # Concurrently clean up all sandboxes
        cleanup_tasks = [
            asyncio.create_task(self._safe_delete_sandbox(sandbox_id))
            for sandbox_id in sandbox_ids
        ]

        if cleanup_tasks:
            # asyncio.wait() does not raise on timeout; it returns the tasks
            # that are still pending, so the timeout is detected from that set.
            _, pending = await asyncio.wait(cleanup_tasks, timeout=30.0)
            if pending:
                logger.error("Sandbox cleanup timed out")

        # Clean up remaining references
        self._sandboxes.clear()
        self._last_used.clear()
        self._locks.clear()
        self._active_operations.clear()

        logger.info("Manager cleanup completed")

    async def _safe_delete_sandbox(self, sandbox_id: str) -> None:
        """Safely deletes a single sandbox.

        Waits up to about 5 seconds (10 polls of 0.5 s) for an in-flight
        operation to finish, then cleans up the sandbox and removes its
        records. Errors are logged rather than raised; if the sandbox's own
        cleanup fails, its records are left in place.

        Args:
            sandbox_id: Sandbox ID to delete.
        """
        try:
            if sandbox_id in self._active_operations:
                logger.warning(
                    f"Sandbox {sandbox_id} has active operations, waiting for completion"
                )
                for _ in range(10):  # Wait at most 10 times
                    await asyncio.sleep(0.5)
                    if sandbox_id not in self._active_operations:
                        break
                else:
                    logger.warning(
                        f"Timeout waiting for sandbox {sandbox_id} operations to complete"
                    )

            # Get reference to sandbox object
            sandbox = self._sandboxes.get(sandbox_id)
            if sandbox:
                await sandbox.cleanup()

                # Remove sandbox record from manager
                async with self._global_lock:
                    self._sandboxes.pop(sandbox_id, None)
                    self._last_used.pop(sandbox_id, None)
                    self._locks.pop(sandbox_id, None)
                    logger.info(f"Deleted sandbox {sandbox_id}")
        except Exception as e:
            logger.error(f"Error during cleanup of sandbox {sandbox_id}: {e}")

    async def delete_sandbox(self, sandbox_id: str) -> None:
        """Deletes specified sandbox; unknown IDs are ignored.

        Args:
            sandbox_id: Sandbox ID.
        """
        if sandbox_id not in self._sandboxes:
            return

        try:
            await self._safe_delete_sandbox(sandbox_id)
        except Exception as e:
            logger.error(f"Failed to delete sandbox {sandbox_id}: {e}")

    async def __aenter__(self) -> "SandboxManager":
        """Async context manager entry."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Async context manager exit; runs full manager cleanup."""
        await self.cleanup()

    def get_stats(self) -> Dict:
        """Gets manager statistics.

        Returns:
            Dict: Current sandbox and active-operation counts plus the
                configured limits and the shutdown flag.
        """
        return {
            "total_sandboxes": len(self._sandboxes),
            "active_operations": len(self._active_operations),
            "max_sandboxes": self.max_sandboxes,
            "idle_timeout": self.idle_timeout,
            "cleanup_interval": self.cleanup_interval,
            "is_shutting_down": self._is_shutting_down,
        }