diff --git a/Dockerfile b/Dockerfile
index ea08b2a..38cd5c3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,14 @@
 FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04
 
+# GPU Hot - Real-time NVIDIA GPU Monitoring with Disconnect Testing
+#
+# IMPORTANT: For GPU disconnect functionality, this container requires:
+# - privileged: true (to access PCI sysfs)
+# - volumes: /sys/bus/pci:/sys/bus/pci:rw (for PCI operations)
+# - volumes: /sys/devices:/sys/devices:ro (for device enumeration)
+#
+# See docker-compose.yml for a complete configuration example
+
 # Set environment variables
 ENV DEBIAN_FRONTEND=noninteractive
 ENV PYTHONUNBUFFERED=1
diff --git a/README.md b/README.md
index 41693c8..45f503a 100644
--- a/README.md
+++ b/README.md
@@ -58,11 +58,91 @@ docker-compose up --build
 - Historical charts (utilization, temperature, power, clocks)
 - System metrics (CPU, RAM)
 - Scale from 1 to 100+ GPUs
+- **GPU Disconnect Testing** - Simulate GPU failures for fault tolerance testing
 
 **Metrics:** Utilization, temperature, memory, power draw, fan speed, clock speeds, PCIe info, P-State, throttle status, encoder/decoder sessions
 
 ---
 
+## GPU Disconnect Testing
+
+GPU Hot supports fault-tolerance testing through simulated GPU disconnect/reconnect operations, so you can verify how your applications handle GPU failures before they happen in production.
+
+### Features
+- **Multiple disconnect methods** - Auto-selects the most realistic method available:
+  - **Slot Power Toggle** - Cuts and restores slot power (closest to a physical disconnect)
+  - **Hot Reset** - Resets the PCIe link using upstream bridge controls
+  - **Logical Remove** - Software remove and re-scan (no hardware reset)
+  - **NVIDIA Reset** - Uses the NVIDIA driver's reset functionality
+- **Individual GPU control** - Disconnect specific GPUs from the detailed view
+- **Multi-GPU operations** - Select and disconnect multiple GPUs simultaneously
+- **Hub coordination** - The hub can trigger disconnects on remote nodes
+- **Real-time feedback** - Live status updates during operations
+- **Safety features** - Process detection, confirmation dialogs, timeout protection
+
+### Requirements
+
+**For GPU disconnect functionality, the container requires elevated privileges:**
+```bash
+# Docker run with privileged mode
+docker run -d --gpus all --privileged \
+  -v /sys/bus/pci:/sys/bus/pci:rw \
+  -v /sys/devices:/sys/devices:ro \
+  -p 1312:1312 ghcr.io/psalias2006/gpu-hot:latest
+```
+
+**Or use docker-compose (recommended):**
+```bash
+# docker-compose.yml includes the required privileged configuration
+docker-compose up -d
+```
+
+### Usage
+
+1. **Individual GPU**: Click the "Disconnect" button in any GPU's detailed view
+2. **Multiple GPUs**:
+   - Select GPUs using the checkboxes in the overview tab
+   - Click "Disconnect Selected" from the batch toolbar
+3. **Choose method** and duration in the modal dialog
+4. **Monitor progress** with real-time status updates
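+
+### Scripted Example
+
+The steps above can also be driven from a script. The snippet below is a minimal sketch (not part of the shipped code) built on the node endpoints documented in this README; it assumes the dashboard is reachable at `http://localhost:1312`, that GPU index `0` exists, and it uses the `requests` package already pinned in `requirements.txt`:
+
+```python
+import requests
+
+BASE = "http://localhost:1312"
+
+# Check whether this node is ready for disconnect operations
+status = requests.get(f"{BASE}/api/gpu/disconnect/status", timeout=10).json()
+print("ready:", status["ready"], "warnings:", status["warnings"])
+
+# Trigger a 10-second disconnect of GPU 0; the request returns once the GPU
+# has been reconnected (or the operation failed), so allow a generous timeout
+result = requests.post(
+    f"{BASE}/api/gpu/0/disconnect",
+    json={"method": "auto", "down_time": 10},
+    timeout=120,
+)
+result.raise_for_status()
+print(result.json())  # includes success, method_executed, duration_seconds
+```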
+
+### Security & Safety
+
+⚠️ **Important Considerations:**
+- Requires **root privileges** inside the container (privileged mode)
+- Will **interrupt running processes** on affected GPUs
+- Includes **confirmation dialogs** and active-process warnings
+- All operations are **logged** for audit trails
+- **Rate limiting** prevents abuse
+- Works on **dedicated GPU slots** (avoid shared PCIe buses)
+
+### Hub Mode
+The hub can coordinate disconnect operations across multiple nodes:
+```bash
+# Hub triggers disconnect on a specific node
+POST /api/hub/gpu/{node_name}/{gpu_id}/disconnect
+
+# Multi-node batch operations supported
+POST /api/hub/gpu/disconnect-multiple
+```
+
+### Integration Testing
+
+The disconnect API can be exercised directly for integration testing:
+
+**Manual API Testing:**
+```bash
+# Test disconnect functionality
+curl -X POST http://localhost:1312/api/gpu/disconnect-multiple \
+  -H "Content-Type: application/json" \
+  -d '{"gpu_indices": [0], "method": "auto", "down_time": 10}'
+
+# Check disconnect status
+curl http://localhost:1312/api/gpu/disconnect/status
+```
+
+---
+
 ## Configuration
 
 **Environment variables:**
@@ -88,6 +168,19 @@ PORT = 1312        # Server port
 ```bash
 GET /                 # Dashboard
 GET /api/gpu-data     # JSON metrics
+
+# GPU Disconnect API (Node Mode)
+GET  /api/gpu/{gpu_id}/disconnect/methods    # Get available disconnect methods
+POST /api/gpu/{gpu_id}/disconnect            # Disconnect specific GPU
+POST /api/gpu/disconnect-multiple            # Disconnect multiple GPUs
+GET  /api/gpu/disconnect/status              # System disconnect capabilities
+
+# GPU Disconnect API (Hub Mode)
+GET  /api/hub/nodes                                   # List connected nodes
+GET  /api/hub/gpu/{node}/{gpu_id}/disconnect/methods  # Get methods for node GPU
+POST /api/hub/gpu/{node}/{gpu_id}/disconnect          # Disconnect GPU on specific node
+POST /api/hub/gpu/disconnect-multiple                 # Multi-node batch disconnect
+GET  /api/hub/gpu/disconnect/status                   # Hub-wide disconnect status
 ```
 
 ### WebSocket
diff --git a/core/gpu_disconnect.py b/core/gpu_disconnect.py
new file mode 100644
index 0000000..a3f0204
--- /dev/null
+++ b/core/gpu_disconnect.py
@@ -0,0 +1,718 @@
+#!/usr/bin/env python3
+"""
+GPU Disconnect/Reconnect Utility for GPU Hot
+Simulates GPU disconnect/reconnect on Linux for fault tolerance testing
+"""
+
+import asyncio
+import os
+import subprocess
+import logging
+import time
+from pathlib import Path
+from typing import Optional, Dict, List
+from enum import Enum
+import pynvml
+
+logger = logging.getLogger(__name__)
+
+SYSFS_PCI_DEVICES = Path("/sys/bus/pci/devices")
+SYSFS_PCI_SLOTS = Path("/sys/bus/pci/slots")
+SYSFS_PCI_RESCAN = Path("/sys/bus/pci/rescan")
+
+# Global state for simulated disconnects
+_simulated_offline_gpus = set()
+
+
+def is_wsl2() -> bool:
+    """Detect if running in WSL2"""
+    try:
+        with open('/proc/version', 'r') as f:
+            version = f.read().lower()
+            return 'wsl2' in version or 'microsoft' in version
+    except Exception:
+        return False
+
+
+def is_gpu_simulated_offline(gpu_index: int) -> bool:
+    """Check if GPU is in simulated offline state"""
+    return gpu_index in _simulated_offline_gpus
+
+
+class DisconnectMethod(Enum):
+    """Available GPU disconnect methods"""
+    AUTO = "auto"
+    # Real PCI disconnects (Linux native only)
+    SLOT_POWER = "slot"
+    HOT_RESET = "hot"
+    LOGICAL = "logical"
+    # WSL2-compatible methods
+    NVIDIA_RESET = "nvidia"
+    SIMULATED = "simulated"
+    MEMORY_FLOOD = "memory_flood"  # Experimental
+
+
+class GPUDisconnectError(Exception):
+    """Custom exception for GPU 
disconnect operations""" + pass + + +class GPUDisconnector: + """Manages GPU disconnect/reconnect operations""" + + def __init__(self): + self._check_root_permissions() + + def _check_root_permissions(self): + """Check if running with sufficient privileges""" + if os.geteuid() != 0: + logger.warning("GPU disconnect requires root privileges. Operations may fail.") + + # Log environment detection + if is_wsl2(): + logger.info("WSL2 environment detected - PCI methods unavailable, will use WSL2-compatible methods") + else: + logger.info("Native Linux environment detected - all disconnect methods available") + + async def disconnect_gpu( + self, + gpu_index: int, + method: DisconnectMethod = DisconnectMethod.AUTO, + down_time: float = 5.0 + ) -> Dict[str, any]: + """ + Disconnect and reconnect a GPU + + Args: + gpu_index: NVIDIA GPU index (0-based) + method: Disconnect method to use + down_time: Seconds to keep device disconnected + + Returns: + Dict with operation results + """ + try: + # Get GPU PCI bus ID + bdf = await self._get_gpu_bdf(gpu_index) + logger.info(f"Disconnecting GPU {gpu_index} (PCI: {bdf}) using method: {method.value}") + + # Check for active processes + processes = await self._check_gpu_processes(gpu_index) + if processes: + logger.warning(f"GPU {gpu_index} has {len(processes)} active processes") + + # Perform disconnect/reconnect + result = await self._execute_disconnect(bdf, method, down_time, gpu_index) + result.update({ + 'gpu_index': gpu_index, + 'bdf': bdf, + 'method_used': method.value, + 'down_time': down_time, + 'active_processes': len(processes) + }) + + logger.info(f"GPU {gpu_index} disconnect/reconnect completed successfully") + return result + + except Exception as e: + error_msg = f"Failed to disconnect GPU {gpu_index}: {str(e)}" + logger.error(error_msg) + raise GPUDisconnectError(error_msg) from e + + async def disconnect_multiple_gpus( + self, + gpu_indices: List[int], + method: DisconnectMethod = DisconnectMethod.AUTO, + down_time: float = 5.0 + ) -> Dict[str, any]: + """ + Disconnect multiple GPUs simultaneously + + Args: + gpu_indices: List of GPU indices to disconnect + method: Disconnect method to use + down_time: Seconds to keep devices disconnected + + Returns: + Dict with results for each GPU + """ + logger.info(f"Disconnecting {len(gpu_indices)} GPUs: {gpu_indices}") + + # Create tasks for each GPU + tasks = [] + for gpu_index in gpu_indices: + task = asyncio.create_task( + self.disconnect_gpu(gpu_index, method, down_time), + name=f"disconnect_gpu_{gpu_index}" + ) + tasks.append((gpu_index, task)) + + # Wait for all operations to complete + results = {} + errors = {} + + for gpu_index, task in tasks: + try: + results[gpu_index] = await task + except Exception as e: + errors[gpu_index] = str(e) + logger.error(f"GPU {gpu_index} disconnect failed: {e}") + + return { + 'total_gpus': len(gpu_indices), + 'successful': len(results), + 'failed': len(errors), + 'results': results, + 'errors': errors + } + + async def get_available_methods(self, gpu_index: int) -> List[str]: + """Get available disconnect methods for a GPU""" + methods = [] + + try: + bdf = await self._get_gpu_bdf(gpu_index) + + # In WSL2, only memory flood works (experimental) + if is_wsl2(): + methods.append(DisconnectMethod.MEMORY_FLOOD.value) + logger.info("WSL2 detected - Only MEMORY_FLOOD available (experimental)") + else: + # Check slot power (Linux only) + if self._has_slot_power(bdf): + methods.append(DisconnectMethod.SLOT_POWER.value) + + # Check hot reset capability (Linux only) + 
if self._has_hot_reset_capability(bdf): + methods.append(DisconnectMethod.HOT_RESET.value) + + # Logical remove (Linux only) + methods.append(DisconnectMethod.LOGICAL.value) + + # NVIDIA reset (if nvidia-smi available) + if await self._has_nvidia_smi(): + methods.append(DisconnectMethod.NVIDIA_RESET.value) + + # Memory flood experimental method + methods.append(DisconnectMethod.MEMORY_FLOOD.value) + + except Exception as e: + logger.error(f"Error checking methods for GPU {gpu_index}: {e}") + # Fallback to memory flood if error + methods.append(DisconnectMethod.MEMORY_FLOOD.value) + + return methods + + async def _get_gpu_bdf(self, gpu_index: int) -> str: + """Get PCI bus ID for GPU index using nvidia-smi""" + try: + result = await asyncio.create_subprocess_exec( + 'nvidia-smi', '--query-gpu=pci.bus_id', '--format=csv,noheader', '-i', str(gpu_index), + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE + ) + stdout, stderr = await result.communicate() + + if result.returncode != 0: + raise GPUDisconnectError(f"nvidia-smi failed: {stderr.decode()}") + + bdf = stdout.decode().strip() + if bdf.startswith("00000000:"): + bdf = "0000:" + bdf.split(":", 1)[1] + + return bdf + + except Exception as e: + raise GPUDisconnectError(f"Failed to get PCI bus ID for GPU {gpu_index}: {e}") + + async def _check_gpu_processes(self, gpu_index: int) -> List[Dict]: + """Check for active processes on GPU""" + try: + result = await asyncio.create_subprocess_exec( + 'nvidia-smi', '--query-compute-apps=pid,process_name', '--format=csv,noheader', '-i', str(gpu_index), + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE + ) + stdout, stderr = await result.communicate() + + if result.returncode != 0: + return [] + + processes = [] + for line in stdout.decode().strip().splitlines(): + if line.strip() and "No running processes found" not in line: + parts = line.split(',', 1) + if len(parts) == 2: + processes.append({ + 'pid': parts[0].strip(), + 'name': parts[1].strip() + }) + + return processes + + except Exception: + return [] + + async def _execute_disconnect(self, bdf: str, method: DisconnectMethod, down_time: float, gpu_index: int = None) -> Dict: + """Execute the actual disconnect/reconnect operation""" + if method == DisconnectMethod.AUTO: + method = await self._select_best_method(bdf, gpu_index) + + start_time = time.time() + + try: + if method == DisconnectMethod.SLOT_POWER: + await self._slot_power_disconnect(bdf, down_time) + elif method == DisconnectMethod.HOT_RESET: + await self._hot_reset_disconnect(bdf, down_time) + elif method == DisconnectMethod.LOGICAL: + await self._logical_disconnect(bdf, down_time) + elif method == DisconnectMethod.NVIDIA_RESET: + await self._nvidia_reset_disconnect(bdf, down_time, gpu_index) + elif method == DisconnectMethod.SIMULATED: + await self._simulated_disconnect(gpu_index, down_time) + elif method == DisconnectMethod.MEMORY_FLOOD: + await self._memory_flood_disconnect(gpu_index, down_time) + else: + raise GPUDisconnectError(f"Unsupported method: {method}") + + duration = time.time() - start_time + return { + 'success': True, + 'method_executed': method.value, + 'duration_seconds': duration, + 'message': f"Successfully completed {method.value} disconnect/reconnect" + } + + except Exception as e: + duration = time.time() - start_time + return { + 'success': False, + 'method_executed': method.value, + 'duration_seconds': duration, + 'error': str(e) + } + + async def _select_best_method(self, bdf: str, gpu_index: int = None) -> DisconnectMethod: + 
"""Select the best available method based on environment""" + + # WSL2 detection - use memory flood (experimental) + if is_wsl2(): + logger.info("WSL2 detected - using MEMORY_FLOOD disconnect (experimental)") + return DisconnectMethod.MEMORY_FLOOD + + # Native Linux - check PCI capabilities + device_path = SYSFS_PCI_DEVICES / bdf + if not device_path.exists(): + logger.warning(f"PCI device {bdf} not accessible - falling back to MEMORY_FLOOD") + return DisconnectMethod.MEMORY_FLOOD + + # Use real PCI methods in order of preference + if self._has_slot_power(bdf): + return DisconnectMethod.SLOT_POWER + elif self._has_hot_reset_capability(bdf): + return DisconnectMethod.HOT_RESET + else: + return DisconnectMethod.LOGICAL + + def _has_slot_power(self, bdf: str) -> bool: + """Check if slot power control is available""" + try: + dev = SYSFS_PCI_DEVICES / bdf + if not dev.exists(): + return False + + # Check for slot symlink + slot_link = dev / "slot" + if slot_link.exists(): + power_file = slot_link / "power" + return power_file.exists() + + # Check slots directory + if SYSFS_PCI_SLOTS.exists(): + target = bdf.split(".")[0] # Remove function + for slot in SYSFS_PCI_SLOTS.iterdir(): + addr_file = slot / "address" + power_file = slot / "power" + if addr_file.exists() and power_file.exists(): + try: + addr = addr_file.read_text().strip() + if addr == target: + return True + except Exception: + continue + + return False + + except Exception: + return False + + def _has_hot_reset_capability(self, bdf: str) -> bool: + """Check if hot reset is available""" + try: + # Check for upstream bridge reset capability + upstream_bdf = self._get_upstream_bdf(bdf) + if upstream_bdf: + upstream_dev = SYSFS_PCI_DEVICES / upstream_bdf + reset_sub = upstream_dev / "reset_subordinate" + reset_file = upstream_dev / "reset" + return reset_sub.exists() or reset_file.exists() + return False + except Exception: + return False + + def _get_upstream_bdf(self, bdf: str) -> Optional[str]: + """Get upstream bridge BDF""" + try: + dev_path = SYSFS_PCI_DEVICES / bdf + parent = dev_path.resolve().parent.name + if ":" in parent: + return parent + return None + except Exception: + return None + + async def _has_nvidia_smi(self) -> bool: + """Check if nvidia-smi is available""" + try: + result = await asyncio.create_subprocess_exec( + 'nvidia-smi', '--version', + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.DEVNULL + ) + await result.communicate() + return result.returncode == 0 + except Exception: + return False + + async def _slot_power_disconnect(self, bdf: str, down_time: float): + """Execute slot power disconnect""" + logger.info(f"Executing slot power disconnect for {bdf}") + + power_file = self._find_slot_power_file(bdf) + if not power_file: + raise GPUDisconnectError(f"Slot power file not found for {bdf}") + + # Unbind driver first + await self._unbind_driver(bdf) + + # Power off + await self._write_sysfs(power_file, "0") + logger.info(f"Slot powered OFF for {down_time}s") + + # Wait for device to disappear + await self._wait_for_condition( + lambda: not (SYSFS_PCI_DEVICES / bdf).exists(), + timeout=10, + description=f"{bdf} to disappear" + ) + + await asyncio.sleep(down_time) + + # Power on + await self._write_sysfs(power_file, "1") + logger.info("Slot powered ON") + + # Rescan and rebind + await self._write_sysfs(SYSFS_PCI_RESCAN, "1") + await self._wait_for_condition( + lambda: (SYSFS_PCI_DEVICES / bdf).exists(), + timeout=30, + description=f"{bdf} to reappear" + ) + + async def _hot_reset_disconnect(self, 
bdf: str, down_time: float): + """Execute hot reset disconnect""" + logger.info(f"Executing hot reset for {bdf}") + + upstream_bdf = self._get_upstream_bdf(bdf) + if not upstream_bdf: + raise GPUDisconnectError(f"Cannot find upstream bridge for {bdf}") + + # Unbind and remove + await self._unbind_driver(bdf) + await self._write_sysfs(SYSFS_PCI_DEVICES / bdf / "remove", "1") + + await asyncio.sleep(0.25) + + # Try hot reset + upstream_dev = SYSFS_PCI_DEVICES / upstream_bdf + reset_sub = upstream_dev / "reset_subordinate" + reset_file = upstream_dev / "reset" + + if reset_sub.exists(): + await self._write_sysfs(reset_sub, "1") + elif reset_file.exists(): + await self._write_sysfs(reset_file, "1") + else: + raise GPUDisconnectError(f"No reset capability found for upstream {upstream_bdf}") + + await asyncio.sleep(down_time) + + # Rescan + await self._write_sysfs(SYSFS_PCI_RESCAN, "1") + await self._wait_for_condition( + lambda: (SYSFS_PCI_DEVICES / bdf).exists(), + timeout=30, + description=f"{bdf} to reappear" + ) + + async def _logical_disconnect(self, bdf: str, down_time: float): + """Execute logical disconnect (remove/rescan)""" + logger.info(f"Executing logical disconnect for {bdf}") + + device_path = SYSFS_PCI_DEVICES / bdf + + # Unbind and remove + await self._unbind_driver(bdf) + await self._write_sysfs(device_path / "remove", "1") + + # Wait briefly for removal to take effect + await asyncio.sleep(0.5) + + if device_path.exists(): + logger.warning(f"Device {bdf} still exists after removal - may not be properly disconnected") + + # Sleep for down_time + await asyncio.sleep(down_time) + + # Rescan PCI bus + await self._write_sysfs(SYSFS_PCI_RESCAN, "1") + + # Wait for device to reappear + await self._wait_for_condition( + lambda: (SYSFS_PCI_DEVICES / bdf).exists(), + timeout=30, + description=f"{bdf} to reappear" + ) + + async def _nvidia_reset_disconnect(self, bdf: str, down_time: float, gpu_index: int = None): + """Execute NVIDIA GPU reset using nvidia-smi""" + # Find GPU index from BDF if not provided + if gpu_index is None: + gpu_index = await self._get_gpu_index_from_bdf(bdf) + + logger.info(f"Executing NVIDIA reset for GPU {gpu_index}") + + result = await asyncio.create_subprocess_exec( + 'nvidia-smi', '--gpu-reset', '-i', str(gpu_index), + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE + ) + stdout, stderr = await result.communicate() + + if result.returncode != 0: + raise GPUDisconnectError(f"nvidia-smi --gpu-reset failed: {stderr.decode()}") + + await asyncio.sleep(down_time) + + async def _get_gpu_index_from_bdf(self, target_bdf: str) -> int: + """Get GPU index from PCI bus ID""" + result = await asyncio.create_subprocess_exec( + 'nvidia-smi', '--query-gpu=index,pci.bus_id', '--format=csv,noheader', + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE + ) + stdout, stderr = await result.communicate() + + if result.returncode != 0: + raise GPUDisconnectError(f"Failed to query GPU indices: {stderr.decode()}") + + for line in stdout.decode().strip().splitlines(): + parts = line.split(',') + if len(parts) >= 2: + index = int(parts[0].strip()) + bdf = parts[1].strip() + if bdf.startswith("00000000:"): + bdf = "0000:" + bdf.split(":", 1)[1] + if bdf == target_bdf: + return index + + raise GPUDisconnectError(f"GPU index not found for BDF {target_bdf}") + + def _find_slot_power_file(self, bdf: str) -> Optional[Path]: + """Find slot power control file""" + dev = SYSFS_PCI_DEVICES / bdf + slot_link = dev / "slot" + if slot_link.exists(): + power_file = 
slot_link / "power" + if power_file.exists(): + return power_file + + # Check slots directory + if SYSFS_PCI_SLOTS.exists(): + target = bdf.split(".")[0] + for slot in SYSFS_PCI_SLOTS.iterdir(): + addr_file = slot / "address" + power_file = slot / "power" + if addr_file.exists() and power_file.exists(): + try: + addr = addr_file.read_text().strip() + if addr == target: + return power_file + except Exception: + continue + + return None + + async def _unbind_driver(self, bdf: str): + """Unbind driver from device""" + try: + driver_link = SYSFS_PCI_DEVICES / bdf / "driver" + if driver_link.is_symlink(): + driver_name = driver_link.resolve().name + unbind_file = Path(f"/sys/bus/pci/drivers/{driver_name}/unbind") + if unbind_file.exists(): + await self._write_sysfs(unbind_file, bdf) + except Exception as e: + logger.warning(f"Failed to unbind driver for {bdf}: {e}") + + async def _write_sysfs(self, path: Path, value: str): + """Write to sysfs file with proper error handling""" + try: + def write_sync(): + path.write_text(value) + + await asyncio.get_event_loop().run_in_executor(None, write_sync) + + except Exception as e: + raise GPUDisconnectError(f"Failed to write to {path}: {e}") + + async def _wait_for_condition(self, condition, timeout: int, description: str): + """Wait for a condition to be true with timeout""" + start_time = time.time() + while time.time() - start_time < timeout: + if condition(): + return + await asyncio.sleep(0.25) + + raise GPUDisconnectError(f"Timeout waiting for {description}") + + async def _simulated_disconnect(self, gpu_index: int, down_time: float): + """Simulate disconnect in software only - WSL2 safe""" + logger.info(f"Simulating disconnect for GPU {gpu_index} ({down_time}s)") + + # Add to simulated offline set + _simulated_offline_gpus.add(gpu_index) + + try: + await asyncio.sleep(down_time) + finally: + # Remove from offline set + if gpu_index in _simulated_offline_gpus: + _simulated_offline_gpus.remove(gpu_index) + + async def _memory_flood_disconnect(self, gpu_index: int, down_time: float): + """Flood GPU memory to trigger potential OOM/driver reset - EXPERIMENTAL""" + logger.warning(f"Starting EXPERIMENTAL memory flood on GPU {gpu_index} - may cause instability!") + + import ctypes + + allocations = [] + ctx = None + + try: + # Load CUDA driver library + try: + libcuda = ctypes.CDLL('libcuda.so.1') + except OSError as e: + raise GPUDisconnectError(f"CUDA driver library not found: {e}") + + # Define CUDA function signatures + cuInit = libcuda.cuInit + cuInit.argtypes = [ctypes.c_uint] + cuInit.restype = ctypes.c_int + + cuDeviceGet = libcuda.cuDeviceGet + cuDeviceGet.argtypes = [ctypes.POINTER(ctypes.c_int), ctypes.c_int] + cuDeviceGet.restype = ctypes.c_int + + cuCtxCreate = libcuda.cuCtxCreate_v2 + cuCtxCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_uint, ctypes.c_int] + cuCtxCreate.restype = ctypes.c_int + + cuCtxDestroy = libcuda.cuCtxDestroy_v2 + cuCtxDestroy.argtypes = [ctypes.c_void_p] + cuCtxDestroy.restype = ctypes.c_int + + cuMemAlloc = libcuda.cuMemAlloc_v2 + cuMemAlloc.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t] + cuMemAlloc.restype = ctypes.c_int + + cuMemFree = libcuda.cuMemFree_v2 + cuMemFree.argtypes = [ctypes.c_void_p] + cuMemFree.restype = ctypes.c_int + + # Initialize CUDA and create context + if cuInit(0) != 0: + raise GPUDisconnectError(f"CUDA initialization failed") + + device = ctypes.c_int() + if cuDeviceGet(ctypes.byref(device), gpu_index) != 0: + raise GPUDisconnectError(f"Failed to get CUDA device 
{gpu_index}") + + ctx = ctypes.c_void_p() + if cuCtxCreate(ctypes.byref(ctx), 0, device) != 0: + raise GPUDisconnectError(f"Failed to create CUDA context for GPU {gpu_index}") + + # Get GPU memory info + handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index) + mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + free_mem = mem_info.free + + # Allocate memory chunks + allocated_bytes = 0 + chunk_size = 100 * 1024 * 1024 # 100MB chunks + target_bytes = int(free_mem * 0.95) + + while allocated_bytes < target_bytes: + ptr = ctypes.c_void_p() + result = cuMemAlloc(ctypes.byref(ptr), chunk_size) + + if result == 0: + allocations.append(ptr) + allocated_bytes += chunk_size + else: + break + + logger.info(f"Allocated {allocated_bytes / 1e9:.2f}GB on GPU {gpu_index}, holding for {down_time}s") + await asyncio.sleep(down_time) + + except Exception as e: + logger.error(f"Memory flood error: {e}") + raise + finally: + # Release memory + for ptr in allocations: + try: + cuMemFree(ptr) + except Exception: + pass + + # Destroy CUDA context + if ctx and ctx.value: + try: + cuCtxDestroy(ctx) + except Exception: + pass + + +# Global instance +gpu_disconnector = GPUDisconnector() + + +async def disconnect_gpu(gpu_index: int, method: str = "auto", down_time: float = 5.0) -> Dict: + """Async wrapper for GPU disconnect operation""" + method_enum = DisconnectMethod(method) + return await gpu_disconnector.disconnect_gpu(gpu_index, method_enum, down_time) + + +async def disconnect_multiple_gpus(gpu_indices: List[int], method: str = "auto", down_time: float = 5.0) -> Dict: + """Async wrapper for multiple GPU disconnect operation""" + method_enum = DisconnectMethod(method) + return await gpu_disconnector.disconnect_multiple_gpus(gpu_indices, method_enum, down_time) + + +async def get_available_methods(gpu_index: int) -> List[str]: + """Get available disconnect methods for a GPU""" + return await gpu_disconnector.get_available_methods(gpu_index) diff --git a/core/handlers.py b/core/handlers.py index 070ff30..997d137 100644 --- a/core/handlers.py +++ b/core/handlers.py @@ -1,18 +1,34 @@ -"""Async WebSocket handlers for real-time monitoring""" +"""Async WebSocket handlers for real-time monitoring and GPU disconnect API endpoints""" import asyncio import psutil import logging import json from datetime import datetime -from fastapi import WebSocket +from fastapi import WebSocket, HTTPException +from fastapi.responses import JSONResponse +from pydantic import BaseModel from . 
import config +from .gpu_disconnect import disconnect_gpu, disconnect_multiple_gpus, get_available_methods, GPUDisconnectError logger = logging.getLogger(__name__) # Global WebSocket connections websocket_connections = set() + +# Pydantic models for API requests +class DisconnectRequest(BaseModel): + method: str = "auto" + down_time: float = 5.0 + + +class MultiDisconnectRequest(BaseModel): + gpu_indices: list[int] + method: str = "auto" + down_time: float = 5.0 + + def register_handlers(app, monitor): """Register FastAPI WebSocket handlers""" @@ -34,6 +50,194 @@ async def websocket_endpoint(websocket: WebSocket): logger.debug(f'Dashboard client disconnected: {e}') finally: websocket_connections.discard(websocket) + + # GPU Disconnect API Endpoints + @app.get("/api/gpu/{gpu_id}/disconnect/methods") + async def get_disconnect_methods(gpu_id: int): + """Get available disconnect methods for a GPU""" + try: + from .gpu_disconnect import is_wsl2 + + methods = await get_available_methods(gpu_id) + in_wsl2 = is_wsl2() + + return { + "gpu_id": gpu_id, + "available_methods": methods, + "default_method": "auto", + "environment": { + "is_wsl2": in_wsl2, + "recommended_method": "simulated" if in_wsl2 else "auto", + "pci_available": not in_wsl2 + } + } + except Exception as e: + logger.error(f"Error getting disconnect methods for GPU {gpu_id}: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + @app.post("/api/gpu/{gpu_id}/disconnect") + async def disconnect_single_gpu(gpu_id: int, request: DisconnectRequest): + """Disconnect and reconnect a specific GPU""" + try: + logger.info(f"Received disconnect request for GPU {gpu_id}, method: {request.method}, down_time: {request.down_time}s") + + result = await disconnect_gpu( + gpu_index=gpu_id, + method=request.method, + down_time=request.down_time + ) + + return JSONResponse(content=result) + + except GPUDisconnectError as e: + logger.error(f"GPU disconnect error: {e}") + raise HTTPException(status_code=400, detail=str(e)) + except Exception as e: + logger.error(f"Unexpected error during GPU {gpu_id} disconnect: {e}") + raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}") + + @app.post("/api/gpu/disconnect-multiple") + async def disconnect_multiple(request: MultiDisconnectRequest): + """Disconnect and reconnect multiple GPUs simultaneously""" + try: + logger.info(f"Received multi-disconnect request for GPUs {request.gpu_indices}, method: {request.method}, down_time: {request.down_time}s") + + result = await disconnect_multiple_gpus( + gpu_indices=request.gpu_indices, + method=request.method, + down_time=request.down_time + ) + + return JSONResponse(content=result) + + except GPUDisconnectError as e: + logger.error(f"Multi-GPU disconnect error: {e}") + raise HTTPException(status_code=400, detail=str(e)) + except Exception as e: + logger.error(f"Unexpected error during multi-GPU disconnect: {e}") + raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}") + + @app.get("/api/gpu/verify-disconnect/{gpu_id}") + async def verify_gpu_disconnect(gpu_id: int): + """Verify GPU visibility - check if GPU exists via NVML, nvidia-smi, and sysfs""" + import subprocess + from pathlib import Path + + result = { + "gpu_id": gpu_id, + "timestamp": datetime.now().isoformat(), + "checks": {} + } + + # Check NVML device count + try: + import pynvml + device_count = pynvml.nvmlDeviceGetCount() + result["checks"]["nvml_total_devices"] = device_count + result["checks"]["nvml_status"] = "success" + + # Try to get handle for specific GPU 
+ try: + handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id) + pci_info = pynvml.nvmlDeviceGetPciInfo(handle) + result["checks"]["nvml_gpu_exists"] = True + result["checks"]["nvml_pci_bdf"] = pci_info.busId.decode('utf-8') + except Exception as e: + result["checks"]["nvml_gpu_exists"] = False + result["checks"]["nvml_gpu_error"] = str(e) + except Exception as e: + result["checks"]["nvml_status"] = f"error: {e}" + + # Check nvidia-smi + try: + smi_result = subprocess.run( + ['nvidia-smi', '--query-gpu=index,name,pci.bus_id', '--format=csv,noheader'], + capture_output=True, + text=True, + timeout=5 + ) + result["checks"]["nvidia_smi_success"] = smi_result.returncode == 0 + if smi_result.returncode == 0: + gpu_lines = [line for line in smi_result.stdout.strip().split('\n') if line.startswith(str(gpu_id))] + result["checks"]["nvidia_smi_gpu_found"] = len(gpu_lines) > 0 + if gpu_lines: + result["checks"]["nvidia_smi_output"] = gpu_lines[0] + else: + result["checks"]["nvidia_smi_error"] = smi_result.stderr + except Exception as e: + result["checks"]["nvidia_smi_success"] = False + result["checks"]["nvidia_smi_error"] = str(e) + + # Check PCI sysfs path + if "nvml_pci_bdf" in result["checks"]: + bdf = result["checks"]["nvml_pci_bdf"] + pci_path = Path(f"/sys/bus/pci/devices/{bdf}") + result["checks"]["pci_device_exists"] = pci_path.exists() + result["checks"]["pci_device_path"] = str(pci_path) + + return JSONResponse(content=result) + + @app.get("/api/gpu/disconnect/status") + async def get_disconnect_status(): + """Get current disconnect operation status and system capabilities""" + try: + from .gpu_disconnect import is_wsl2 + + # Check root permissions + import os + has_root = os.geteuid() == 0 + + # Check nvidia-smi availability + import shutil + has_nvidia_smi = shutil.which("nvidia-smi") is not None + + # Check sysfs access + from pathlib import Path + sysfs_accessible = Path("/sys/bus/pci/devices").exists() + + # WSL2 detection + in_wsl2 = is_wsl2() + + # Determine readiness based on environment + if in_wsl2: + ready = has_nvidia_smi # WSL2 only needs nvidia-smi for some methods + else: + ready = has_root and has_nvidia_smi and sysfs_accessible + + warnings = [] + if in_wsl2: + warnings.append("WSL2 detected - PCI disconnect unavailable, using simulated/soft methods") + else: + if not has_root: + warnings.append("Root privileges required for PCI operations") + if not has_nvidia_smi: + warnings.append("nvidia-smi not found in PATH") + if not sysfs_accessible: + warnings.append("PCI sysfs interface not accessible") + + return { + "ready": ready, + "environment": { + "is_wsl2": in_wsl2, + "platform": "WSL2" if in_wsl2 else "Native Linux" + }, + "permissions": { + "root_access": has_root, + "nvidia_smi_available": has_nvidia_smi, + "sysfs_accessible": sysfs_accessible + }, + "capabilities": { + "pci_disconnect": not in_wsl2 and sysfs_accessible, + "nvidia_reset": has_nvidia_smi, + "simulated": True, + "memory_flood": True # Uses ctypes + CUDA Driver API (zero dependencies) + }, + "warnings": [w for w in warnings if w] + } + + except Exception as e: + logger.error(f"Error checking disconnect status: {e}") + raise HTTPException(status_code=500, detail=str(e)) async def monitor_loop(monitor, connections): diff --git a/core/hub_handlers.py b/core/hub_handlers.py index 0f02826..5a26dab 100644 --- a/core/hub_handlers.py +++ b/core/hub_handlers.py @@ -1,15 +1,58 @@ -"""Async WebSocket handlers for hub mode""" +"""Async WebSocket handlers for hub mode and GPU disconnect relay endpoints""" import asyncio 
+from datetime import datetime  # used for the hub_timestamp fields in the disconnect relay endpoints below
import logging import json -from fastapi import WebSocket +import aiohttp +from fastapi import WebSocket, HTTPException +from fastapi.responses import JSONResponse +from pydantic import BaseModel +from typing import Dict, Any logger = logging.getLogger(__name__) # Global WebSocket connections websocket_connections = set() + +# Pydantic models for hub disconnect requests +class HubDisconnectRequest(BaseModel): + method: str = "auto" + down_time: float = 5.0 + + +class HubMultiDisconnectRequest(BaseModel): + targets: list[dict] # [{"node_name": "node1", "gpu_id": 0}, ...] + method: str = "auto" + down_time: float = 5.0 + + +async def forward_to_node(node_url: str, endpoint: str, method: str = "GET", data: Dict[str, Any] = None) -> Dict[str, Any]: + """Forward API request to a specific node""" + url = f"{node_url.rstrip('/')}/{endpoint.lstrip('/')}" + + try: + async with aiohttp.ClientSession() as session: + if method.upper() == "GET": + async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as response: + return await response.json() + elif method.upper() == "POST": + async with session.post(url, json=data, timeout=aiohttp.ClientTimeout(total=60)) as response: + if response.status >= 400: + error_text = await response.text() + raise Exception(f"Node returned error {response.status}: {error_text}") + return await response.json() + else: + raise ValueError(f"Unsupported HTTP method: {method}") + + except asyncio.TimeoutError: + raise Exception(f"Timeout connecting to node at {node_url}") + except aiohttp.ClientError as e: + raise Exception(f"Network error connecting to node at {node_url}: {str(e)}") + except Exception as e: + raise Exception(f"Error communicating with node at {node_url}: {str(e)}") + + def register_hub_handlers(app, hub): """Register FastAPI WebSocket handlers for hub mode""" @@ -36,6 +79,211 @@ async def websocket_endpoint(websocket: WebSocket): logger.debug(f'Dashboard client disconnected: {e}') finally: websocket_connections.discard(websocket) + + # Hub GPU Disconnect API Endpoints + @app.get("/api/hub/nodes") + async def get_hub_nodes(): + """Get list of connected nodes and their status""" + try: + nodes_info = {} + for node_name, node_data in hub.nodes.items(): + nodes_info[node_name] = { + 'url': node_data['url'], + 'status': node_data['status'], + 'last_update': node_data['last_update'] + } + + return { + 'total_nodes': len(hub.nodes), + 'online_nodes': sum(1 for n in hub.nodes.values() if n['status'] == 'online'), + 'nodes': nodes_info + } + + except Exception as e: + logger.error(f"Error getting hub nodes: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + @app.get("/api/hub/gpu/{node_name}/{gpu_id}/disconnect/methods") + async def get_node_disconnect_methods(node_name: str, gpu_id: int): + """Get available disconnect methods for a GPU on a specific node""" + try: + if node_name not in hub.nodes: + raise HTTPException(status_code=404, detail=f"Node '{node_name}' not found") + + node_data = hub.nodes[node_name] + if node_data['status'] != 'online': + raise HTTPException(status_code=503, detail=f"Node '{node_name}' is offline") + + node_url = node_data['url'] + endpoint = f"api/gpu/{gpu_id}/disconnect/methods" + + result = await forward_to_node(node_url, endpoint, "GET") + result['node_name'] = node_name + + return result + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error getting disconnect methods for {node_name}/GPU {gpu_id}: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + 
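+    # Example relay (hypothetical node name "node-a"): POST /api/hub/gpu/node-a/0/disconnect
+    # with body {"method": "auto", "down_time": 5} is forwarded to that node's
+    # /api/gpu/0/disconnect endpoint; the node's JSON result is returned with
+    # node_name and hub_timestamp added by the hub.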
@app.post("/api/hub/gpu/{node_name}/{gpu_id}/disconnect") + async def disconnect_node_gpu(node_name: str, gpu_id: int, request: HubDisconnectRequest): + """Disconnect a GPU on a specific node""" + try: + logger.info(f"Hub received disconnect request for {node_name}/GPU {gpu_id}") + + if node_name not in hub.nodes: + raise HTTPException(status_code=404, detail=f"Node '{node_name}' not found") + + node_data = hub.nodes[node_name] + if node_data['status'] != 'online': + raise HTTPException(status_code=503, detail=f"Node '{node_name}' is offline") + + node_url = node_data['url'] + endpoint = f"api/gpu/{gpu_id}/disconnect" + request_data = { + 'method': request.method, + 'down_time': request.down_time + } + + result = await forward_to_node(node_url, endpoint, "POST", request_data) + result['node_name'] = node_name + result['hub_timestamp'] = datetime.now().isoformat() + + logger.info(f"Successfully relayed disconnect request to {node_name}/GPU {gpu_id}") + return JSONResponse(content=result) + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error disconnecting {node_name}/GPU {gpu_id}: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + @app.post("/api/hub/gpu/disconnect-multiple") + async def disconnect_multiple_node_gpus(request: HubMultiDisconnectRequest): + """Disconnect multiple GPUs across multiple nodes""" + try: + logger.info(f"Hub received multi-disconnect request for {len(request.targets)} targets") + + # Group targets by node + node_targets = {} + for target in request.targets: + node_name = target.get('node_name') + gpu_id = target.get('gpu_id') + + if not node_name or gpu_id is None: + raise HTTPException(status_code=400, detail="Each target must have 'node_name' and 'gpu_id'") + + if node_name not in hub.nodes: + raise HTTPException(status_code=404, detail=f"Node '{node_name}' not found") + + if node_name not in node_targets: + node_targets[node_name] = [] + node_targets[node_name].append(gpu_id) + + # Check all nodes are online + for node_name in node_targets: + if hub.nodes[node_name]['status'] != 'online': + raise HTTPException(status_code=503, detail=f"Node '{node_name}' is offline") + + # Create tasks for each node + tasks = [] + for node_name, gpu_ids in node_targets.items(): + node_url = hub.nodes[node_name]['url'] + + if len(gpu_ids) == 1: + # Single GPU disconnect + endpoint = f"api/gpu/{gpu_ids[0]}/disconnect" + request_data = { + 'method': request.method, + 'down_time': request.down_time + } + else: + # Multi-GPU disconnect on same node + endpoint = "api/gpu/disconnect-multiple" + request_data = { + 'gpu_indices': gpu_ids, + 'method': request.method, + 'down_time': request.down_time + } + + task = asyncio.create_task( + forward_to_node(node_url, endpoint, "POST", request_data), + name=f"disconnect_{node_name}" + ) + tasks.append((node_name, task)) + + # Wait for all tasks to complete + results = {} + errors = {} + + for node_name, task in tasks: + try: + result = await task + result['node_name'] = node_name + results[node_name] = result + except Exception as e: + errors[node_name] = str(e) + logger.error(f"Error disconnecting GPUs on {node_name}: {e}") + + response = { + 'total_nodes': len(node_targets), + 'successful_nodes': len(results), + 'failed_nodes': len(errors), + 'results': results, + 'errors': errors, + 'hub_timestamp': datetime.now().isoformat() + } + + logger.info(f"Multi-disconnect completed: {len(results)} successful, {len(errors)} failed") + return JSONResponse(content=response) + + except HTTPException: + raise + 
except Exception as e: + logger.error(f"Error in hub multi-disconnect: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + @app.get("/api/hub/gpu/disconnect/status") + async def get_hub_disconnect_status(): + """Get disconnect capability status for all nodes""" + try: + node_status = {} + + for node_name, node_data in hub.nodes.items(): + if node_data['status'] == 'online': + try: + node_url = node_data['url'] + result = await forward_to_node(node_url, "api/gpu/disconnect/status", "GET") + node_status[node_name] = { + 'status': 'online', + 'capabilities': result + } + except Exception as e: + node_status[node_name] = { + 'status': 'error', + 'error': str(e) + } + else: + node_status[node_name] = { + 'status': 'offline' + } + + total_ready = sum(1 for status in node_status.values() + if status.get('capabilities', {}).get('ready', False)) + + return { + 'hub_ready': total_ready > 0, + 'total_nodes': len(hub.nodes), + 'ready_nodes': total_ready, + 'node_status': node_status + } + + except Exception as e: + logger.error(f"Error getting hub disconnect status: {e}") + raise HTTPException(status_code=500, detail=str(e)) async def hub_loop(hub, connections): diff --git a/core/monitor.py b/core/monitor.py index fa1f946..a5f43fc 100644 --- a/core/monitor.py +++ b/core/monitor.py @@ -7,6 +7,7 @@ from .metrics import MetricsCollector from .nvidia_smi_fallback import parse_nvidia_smi from .config import NVIDIA_SMI +from .gpu_disconnect import is_gpu_simulated_offline logger = logging.getLogger(__name__) @@ -19,6 +20,7 @@ def __init__(self): self.gpu_data = {} self.collector = MetricsCollector() self.use_smi = {} # Track which GPUs use nvidia-smi (decided at boot) + self.last_device_count = None # Track device count changes try: pynvml.nvmlInit() @@ -87,6 +89,16 @@ async def get_gpu_data(self): try: device_count = pynvml.nvmlDeviceGetCount() + + # Log device count changes (indicates GPU disconnect/reconnect) + if self.last_device_count is not None and device_count != self.last_device_count: + logger.warning(f"[MONITOR] *** GPU DEVICE COUNT CHANGED: {self.last_device_count} -> {device_count} ***") + if device_count < self.last_device_count: + logger.warning(f"[MONITOR] *** GPU(s) DISAPPEARED - {self.last_device_count - device_count} device(s) missing ***") + else: + logger.info(f"[MONITOR] *** GPU(s) REAPPEARED - {device_count - self.last_device_count} device(s) added ***") + + self.last_device_count = device_count gpu_data = {} # Get nvidia-smi data once if any GPU needs it @@ -104,6 +116,24 @@ async def get_gpu_data(self): tasks = [] for i in range(device_count): gpu_id = str(i) + + # Check if GPU is in simulated offline state + if is_gpu_simulated_offline(i): + logger.debug(f"[MONITOR] GPU {i} is in simulated offline state - skipping") + # Create offline data + gpu_data[gpu_id] = { + 'index': gpu_id, + 'name': self.gpu_data.get(gpu_id, {}).get('name', 'Unknown GPU'), + 'simulated_offline': True, + 'status': 'Simulated Disconnect', + 'utilization': None, + 'memory_used': 0, + 'memory_total': 0, + 'temperature': None, + 'power_draw': None, + } + continue + if self.use_smi.get(gpu_id, False): # Use nvidia-smi data if smi_data and gpu_id in smi_data: @@ -141,8 +171,16 @@ def _collect_single_gpu(self, gpu_index): try: handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index) return self.collector.collect_all(handle, str(gpu_index)) + except pynvml.NVMLError as e: + # NVML-specific errors might indicate GPU is disconnected + error_str = str(e) + if "Not Found" in error_str or "Unknown Error" in 
error_str or "GPU is lost" in error_str: + logger.warning(f"[MONITOR] GPU {gpu_index}: Cannot access GPU - may be disconnected ({error_str})") + else: + logger.error(f"[MONITOR] GPU {gpu_index}: NVML Error - {e}") + return {} except Exception as e: - logger.error(f"GPU {gpu_index}: Error - {e}") + logger.error(f"[MONITOR] GPU {gpu_index}: Unexpected error - {e}") return {} async def get_processes(self): diff --git a/docker-compose.yml b/docker-compose.yml index 313a3af..1e20a0b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,6 +15,11 @@ services: - driver: nvidia count: all capabilities: [gpu] + # Required for GPU disconnect functionality + privileged: true + volumes: + - /sys/bus/pci:/sys/bus/pci:rw + - /sys/devices:/sys/devices:ro init: true pid: "host" restart: unless-stopped diff --git a/requirements.txt b/requirements.txt index a770860..a7b7cc4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ websockets==12.0 psutil==5.9.6 nvidia-ml-py==13.580.82 requests==2.31.0 -websocket-client==1.6.3 \ No newline at end of file +websocket-client==1.6.3 +aiohttp==3.9.1 \ No newline at end of file diff --git a/static/css/disconnect-controls.css b/static/css/disconnect-controls.css new file mode 100644 index 0000000..8584a80 --- /dev/null +++ b/static/css/disconnect-controls.css @@ -0,0 +1,662 @@ +/* GPU Disconnect Controls Styles */ + +/* Disconnect Button Styling */ +.disconnect-button { + background: linear-gradient(135deg, #ff6b6b, #ee5a52); + color: white; + border: none; + border-radius: 8px; + padding: 8px 16px; + font-size: 0.9rem; + font-weight: 500; + cursor: pointer; + transition: all 0.2s ease; + display: inline-flex; + align-items: center; + gap: 6px; + margin-top: 8px; +} + +.disconnect-button:hover:not(:disabled) { + background: linear-gradient(135deg, #ff5252, #e53935); + transform: translateY(-1px); + box-shadow: 0 4px 12px rgba(255, 107, 107, 0.3); +} + +.disconnect-button:active:not(:disabled) { + transform: translateY(0); +} + +.disconnect-button:disabled { + background: #ccc; + cursor: not-allowed; + transform: none; + box-shadow: none; +} + +.disconnect-icon { + font-size: 1rem; + display: inline-block; +} + +/* GPU Actions Container */ +.gpu-actions { + display: flex; + gap: 8px; + flex-wrap: wrap; + margin-top: 12px; + padding-top: 12px; + border-top: 1px solid rgba(255, 255, 255, 0.1); +} + +/* GPU Disconnect Button (styled like ONLINE badge) */ +.gpu-disconnect-container { + z-index: 10; +} + +.gpu-disconnect-button { + padding: 0.75rem 1.5rem; + background: rgba(255, 107, 107, 0.15); + border: 2px solid rgba(255, 107, 107, 0.4); + border-radius: 30px; + font-size: 0.85rem; + font-weight: 700; + color: #ff6b6b; + display: flex; + align-items: center; + gap: 0.5rem; + letter-spacing: 1px; + box-shadow: 0 0 20px rgba(255, 107, 107, 0.3); + cursor: pointer; + transition: all 0.3s ease; + text-transform: uppercase; +} + +.gpu-disconnect-button:hover { + background: rgba(255, 107, 107, 0.25); + border-color: rgba(255, 107, 107, 0.6); + box-shadow: 0 0 30px rgba(255, 107, 107, 0.5); + transform: translateY(-2px); +} + +.gpu-disconnect-button:active { + transform: translateY(0); + box-shadow: 0 0 15px rgba(255, 107, 107, 0.4); +} + +.disconnect-dot { + width: 8px; + height: 8px; + background: #ff6b6b; + border-radius: 50%; + display: inline-block; + box-shadow: 0 0 10px rgba(255, 107, 107, 0.8); + animation: pulse-disconnect 2s ease-in-out infinite; +} + +@keyframes pulse-disconnect { + 0%, 100% { + box-shadow: 0 0 10px rgba(255, 107, 107, 0.8); 
+ } + 50% { + box-shadow: 0 0 20px rgba(255, 107, 107, 1); + } +} + +.disconnect-text { + text-shadow: 0 0 10px rgba(255, 107, 107, 0.5); +} + +/* Multi-Select Toolbar */ +.multi-select-toolbar { + position: fixed; + bottom: 20px; + left: 50%; + transform: translateX(-50%); + background: rgba(45, 45, 45, 0.95); + backdrop-filter: blur(10px); + border: 1px solid rgba(255, 255, 255, 0.1); + border-radius: 12px; + padding: 16px 24px; + box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3); + z-index: 1000; + display: none; + animation: slideUp 0.3s ease; +} + +@keyframes slideUp { + from { + opacity: 0; + transform: translateX(-50%) translateY(20px); + } + to { + opacity: 1; + transform: translateX(-50%) translateY(0); + } +} + +.toolbar-content { + display: flex; + align-items: center; + gap: 20px; + color: white; + font-weight: 500; +} + +.toolbar-actions { + display: flex; + gap: 12px; +} + +.selected-count { + color: #4fc3f7; + font-weight: 600; +} + +/* Modal Styles */ +.modal-overlay { + position: fixed; + top: 0; + left: 0; + right: 0; + bottom: 0; + background: rgba(0, 0, 0, 0.7); + backdrop-filter: blur(4px); + display: flex; + align-items: center; + justify-content: center; + z-index: 10000; + opacity: 0; + transition: opacity 0.2s ease; +} + +.disconnect-modal { + background: linear-gradient(135deg, #2a2a2a, #1e1e1e); + border: 1px solid rgba(255, 255, 255, 0.1); + border-radius: 16px; + min-width: 480px; + max-width: 90vw; + max-height: 90vh; + overflow: hidden; + transform: scale(0.8); + transition: transform 0.2s ease; + box-shadow: 0 20px 60px rgba(0, 0, 0, 0.4); +} + +.multi-disconnect-modal { + min-width: 560px; +} + +.modal-header { + background: linear-gradient(135deg, #333, #2a2a2a); + padding: 20px 24px; + display: flex; + align-items: center; + justify-content: space-between; + border-bottom: 1px solid rgba(255, 255, 255, 0.1); +} + +.modal-header h3 { + margin: 0; + color: white; + font-size: 1.25rem; + font-weight: 600; +} + +.modal-close { + background: none; + border: none; + color: rgba(255, 255, 255, 0.6); + font-size: 1.5rem; + cursor: pointer; + padding: 0; + width: 32px; + height: 32px; + display: flex; + align-items: center; + justify-content: center; + border-radius: 50%; + transition: all 0.2s ease; +} + +.modal-close:hover { + background: rgba(255, 255, 255, 0.1); + color: white; +} + +.modal-content { + padding: 24px; + color: white; +} + +.disconnect-warning { + background: linear-gradient(135deg, rgba(255, 193, 7, 0.1), rgba(255, 152, 0, 0.1)); + border: 1px solid rgba(255, 193, 7, 0.3); + border-radius: 8px; + padding: 16px; + margin-bottom: 20px; + display: flex; + gap: 12px; + align-items: flex-start; +} + +.multi-warning { + background: linear-gradient(135deg, rgba(244, 67, 54, 0.15), rgba(233, 30, 99, 0.1)); + border-color: rgba(244, 67, 54, 0.4); +} + +.warning-icon { + font-size: 1.2rem; + flex-shrink: 0; +} + +.warning-text { + line-height: 1.5; +} + +.warning-text strong { + color: #ffeb3b; +} + +/* Method Selection */ +.method-selection { + margin-bottom: 20px; +} + +.method-selection label { + display: block; + margin-bottom: 8px; + font-weight: 500; + color: rgba(255, 255, 255, 0.9); +} + +.method-selection select { + width: 100%; + background: rgba(255, 255, 255, 0.1); + border: 1px solid rgba(255, 255, 255, 0.2); + border-radius: 8px; + padding: 12px; + color: white; + font-size: 0.9rem; + margin-bottom: 8px; +} + +.method-selection select option { + background: #2a2a2a; + color: white; + padding: 8px; +} + +.method-selection select:focus { + outline: 
none; + border-color: #4fc3f7; + box-shadow: 0 0 0 2px rgba(79, 195, 247, 0.2); +} + +.method-description { + font-size: 0.85rem; + color: rgba(255, 255, 255, 0.7); + line-height: 1.4; + padding: 8px 12px; + background: rgba(255, 255, 255, 0.05); + border-radius: 6px; +} + +/* Timing Controls */ +.timing-controls { + margin-bottom: 20px; +} + +.timing-controls label { + display: block; + margin-bottom: 8px; + font-weight: 500; + color: rgba(255, 255, 255, 0.9); +} + +.time-options { + display: flex; + gap: 8px; + flex-wrap: wrap; +} + +.time-btn { + background: rgba(255, 255, 255, 0.1); + border: 1px solid rgba(255, 255, 255, 0.2); + border-radius: 6px; + padding: 8px 14px; + color: rgba(255, 255, 255, 0.8); + cursor: pointer; + transition: all 0.2s ease; + font-size: 0.85rem; +} + +.time-btn:hover { + background: rgba(255, 255, 255, 0.15); + border-color: rgba(255, 255, 255, 0.3); +} + +.time-btn.active { + background: linear-gradient(135deg, #4fc3f7, #29b6f6); + border-color: #4fc3f7; + color: white; +} + +#custom-time, #multi-custom-time { + background: rgba(255, 255, 255, 0.1); + border: 1px solid rgba(255, 255, 255, 0.2); + border-radius: 6px; + padding: 8px 12px; + color: white; + width: 120px; + font-size: 0.85rem; +} + +#custom-time:focus, #multi-custom-time:focus { + outline: none; + border-color: #4fc3f7; + box-shadow: 0 0 0 2px rgba(79, 195, 247, 0.2); +} + +/* Selected GPUs Display */ +.selected-gpus { + margin-bottom: 20px; +} + +.selected-gpus label { + display: block; + margin-bottom: 8px; + font-weight: 500; + color: rgba(255, 255, 255, 0.9); +} + +.gpu-list { + background: rgba(255, 255, 255, 0.05); + border: 1px solid rgba(255, 255, 255, 0.1); + border-radius: 8px; + padding: 12px; + font-size: 0.9rem; + color: rgba(255, 255, 255, 0.8); + max-height: 100px; + overflow-y: auto; +} + +/* Active Processes Warning */ +.active-processes-warning { + background: linear-gradient(135deg, rgba(33, 150, 243, 0.1), rgba(3, 169, 244, 0.1)); + border: 1px solid rgba(33, 150, 243, 0.3); + border-radius: 8px; + padding: 16px; + margin-bottom: 20px; + display: flex; + gap: 12px; + align-items: flex-start; +} + +/* Modal Actions */ +.modal-actions { + background: rgba(255, 255, 255, 0.05); + padding: 20px 24px; + display: flex; + gap: 12px; + justify-content: flex-end; + border-top: 1px solid rgba(255, 255, 255, 0.1); +} + +.btn-secondary { + background: rgba(255, 255, 255, 0.1); + border: 1px solid rgba(255, 255, 255, 0.2); + color: rgba(255, 255, 255, 0.8); + padding: 10px 20px; + border-radius: 8px; + cursor: pointer; + transition: all 0.2s ease; + font-weight: 500; +} + +.btn-secondary:hover { + background: rgba(255, 255, 255, 0.15); + color: white; +} + +.btn-danger { + background: linear-gradient(135deg, #f44336, #d32f2f); + border: none; + color: white; + padding: 10px 20px; + border-radius: 8px; + cursor: pointer; + transition: all 0.2s ease; + font-weight: 500; + display: flex; + align-items: center; + gap: 8px; +} + +.btn-danger:hover { + background: linear-gradient(135deg, #e53935, #c62828); + box-shadow: 0 4px 12px rgba(244, 67, 54, 0.3); +} + +/* GPU Status Indicators */ +.disconnect-status { + position: absolute; + top: 12px; + right: 12px; + background: rgba(0, 0, 0, 0.8); + color: white; + padding: 4px 8px; + border-radius: 4px; + font-size: 0.75rem; + display: flex; + align-items: center; + gap: 4px; + z-index: 5; +} + +.gpu-card { + position: relative; +} + +.gpu-card.disconnecting { + opacity: 0.7; + border-color: #ff9800 !important; +} + +.gpu-card.disconnect-completed 
{ + animation: successPulse 2s ease; +} + +.gpu-card.disconnect-failed { + border-color: #f44336 !important; + animation: errorShake 0.5s ease; +} + +@keyframes successPulse { + 0%, 100% { border-color: inherit; } + 50% { border-color: #4caf50; } +} + +@keyframes errorShake { + 0%, 100% { transform: translateX(0); } + 25% { transform: translateX(-2px); } + 75% { transform: translateX(2px); } +} + +.status-spinner { + display: inline-block; + width: 12px; + height: 12px; + border: 2px solid transparent; + border-top: 2px solid #4fc3f7; + border-radius: 50%; + animation: spin 1s linear infinite; +} + +@keyframes spin { + to { transform: rotate(360deg); } +} + +.status-success { + color: #4caf50; + font-weight: bold; +} + +.status-error { + color: #f44336; + font-weight: bold; +} + +/* Button Spinner */ +.btn-spinner { + display: inline-block; + width: 14px; + height: 14px; + border: 2px solid transparent; + border-top: 2px solid currentColor; + border-radius: 50%; + animation: spin 1s linear infinite; +} + +/* Notifications */ +.notification-container { + position: fixed; + top: 20px; + right: 20px; + z-index: 10001; + display: flex; + flex-direction: column; + gap: 8px; +} + +.notification { + background: rgba(45, 45, 45, 0.95); + backdrop-filter: blur(10px); + border-radius: 8px; + padding: 0; + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3); + border-left: 4px solid; + animation: slideInRight 0.3s ease; + max-width: 400px; +} + +.notification-info { + border-left-color: #2196f3; +} + +.notification-success { + border-left-color: #4caf50; +} + +.notification-warning { + border-left-color: #ff9800; +} + +.notification-error { + border-left-color: #f44336; +} + +@keyframes slideInRight { + from { + opacity: 0; + transform: translateX(100%); + } + to { + opacity: 1; + transform: translateX(0); + } +} + +.notification-content { + display: flex; + align-items: center; + justify-content: space-between; + padding: 12px 16px; + color: white; +} + +.notification-message { + flex: 1; + font-size: 0.9rem; +} + +.notification-close { + background: none; + border: none; + color: rgba(255, 255, 255, 0.6); + cursor: pointer; + font-size: 1.2rem; + padding: 0; + margin-left: 12px; + width: 20px; + height: 20px; + display: flex; + align-items: center; + justify-content: center; + border-radius: 50%; + transition: all 0.2s ease; +} + +.notification-close:hover { + background: rgba(255, 255, 255, 0.1); + color: white; +} + +/* Responsive Design */ +@media (max-width: 768px) { + .disconnect-modal { + min-width: 90vw; + margin: 20px; + } + + .multi-select-toolbar { + left: 10px; + right: 10px; + transform: none; + border-radius: 8px; + } + + .toolbar-content { + flex-direction: column; + gap: 12px; + text-align: center; + } + + .time-options { + justify-content: center; + } + + .notification-container { + left: 10px; + right: 10px; + top: 10px; + } + + .notification { + max-width: none; + } + + /* Adjust disconnect button for mobile */ + .gpu-disconnect-button { + padding: 0.5rem 1rem; + font-size: 0.75rem; + gap: 0.35rem; + } + + .gpu-disconnect-container { + right: 10px !important; + top: 60px !important; + } +} + +/* Dark mode adjustments */ +@media (prefers-color-scheme: dark) { + .disconnect-modal { + background: linear-gradient(135deg, #1a1a1a, #0d1117); + border-color: rgba(255, 255, 255, 0.1); + } + + .modal-header { + background: linear-gradient(135deg, #21262d, #1a1a1a); + } +} diff --git a/static/js/gpu-cards.js b/static/js/gpu-cards.js index 889dd34..514de15 100644 --- a/static/js/gpu-cards.js +++ 
b/static/js/gpu-cards.js @@ -9,7 +9,7 @@ function createOverviewCard(gpuId, gpuInfo) { const memPercent = (memory_used / memory_total) * 100; return ` -