diff --git a/README.md b/README.md
index 41693c8..16a2c41 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,14 @@ # GPU Hot
Real-time NVIDIA GPU monitoring dashboard. Web-based, no SSH required.
+**Supports Docker and native installation on Linux & Windows.**

[![Python](https://img.shields.io/badge/Python-3.8+-3776AB?style=flat-square&logo=python&logoColor=white)](https://www.python.org/)
[![Docker](https://img.shields.io/badge/Docker-Ready-2496ED?style=flat-square&logo=docker&logoColor=white)](https://www.docker.com/)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
[![NVIDIA](https://img.shields.io/badge/NVIDIA-GPU-76B900?style=flat-square&logo=nvidia&logoColor=white)](https://www.nvidia.com/)
+[![Linux](https://img.shields.io/badge/Linux-Native-FCC624?style=flat-square&logo=linux&logoColor=black)](https://www.kernel.org/)
+[![Windows](https://img.shields.io/badge/Windows-Native-0078D4?style=flat-square&logo=windows&logoColor=white)](https://www.microsoft.com/windows/)

GPU Hot Dashboard

@@ -39,14 +42,44 @@ Open `http://localhost:1312`
**Process monitoring:** Add `--init --pid=host` to see process names. Note: This allows the container to access host process information.

-**From source:**
+**From source (Docker):**
```bash
git clone https://github.com/psalias2006/gpu-hot
cd gpu-hot
docker-compose up --build
```

-**Requirements:** Docker + [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
+**Native installation (Linux/Windows):**
+```bash
+# Linux
+git clone https://github.com/psalias2006/gpu-hot
+cd gpu-hot
+pip3 install -r requirements.txt
+python3 app.py
+```
+
+```cmd
+REM Windows (Command Prompt)
+git clone https://github.com/psalias2006/gpu-hot
+cd gpu-hot
+pip install -r requirements.txt
+python app.py
+```
+
+```powershell
+# Windows (PowerShell)
+git clone https://github.com/psalias2006/gpu-hot
+cd gpu-hot
+pip install -r requirements.txt
+python app.py
+```
+
+**Docker Requirements:** Docker + [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
+
+**Native Requirements:**
+- Python 3.8+ with pip
+- NVIDIA GPU drivers (recent version recommended)
+- Python dependencies from `requirements.txt` (installed via pip)

---

@@ -65,7 +98,7 @@ docker-compose up --build
## Configuration

-**Environment variables:**
+**Environment variables (Docker & Native):**
```bash
NVIDIA_VISIBLE_DEVICES=0,1    # Specific GPUs (default: all)
NVIDIA_SMI=true               # Force nvidia-smi mode for older GPUs
@@ -74,6 +107,27 @@
NODE_NAME=gpu-server-1        # Node display name (default: hostname)
NODE_URLS=http://host:1312... # Comma-separated node URLs (required for hub mode)
```
+**Native installation examples:**
+```bash
+# Linux - Single machine with specific GPUs
+export NVIDIA_VISIBLE_DEVICES=0,1
+export NVIDIA_SMI=true
+python3 app.py
+```
+
+```cmd
+REM Windows Command Prompt - Hub mode
+set GPU_HOT_MODE=hub
+set NODE_URLS=http://server1:1312,http://server2:1312
+python app.py
+```
+
+```powershell
+# Windows PowerShell - Force nvidia-smi mode
+$env:NVIDIA_SMI="true"
+python app.py
+```
+
**Backend (`core/config.py`):**
```python
UPDATE_INTERVAL = 0.5  # Polling interval

@@ -131,14 +185,73 @@ gpu-hot/
**No GPUs detected:**
```bash
-nvidia-smi  # Verify drivers work
-docker run --rm --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi  # Test Docker GPU access
+# Test NVIDIA drivers
+nvidia-smi  # Should show GPU list
+
+# Docker-specific test
+docker run --rm --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
+
+# Native installation - force nvidia-smi mode for older GPUs
+export NVIDIA_SMI=true       # Linux
+set NVIDIA_SMI=true          # Windows CMD
+$env:NVIDIA_SMI="true"       # Windows PowerShell
+```
+
+**Python/pip issues (Native installation):**
+```bash
+# Linux - Install missing dependencies
+sudo apt update && sudo apt install python3 python3-pip
+pip3 install -r requirements.txt
+
+# Verify Python version (3.8+ required)
+python3 --version
+```
+
+```cmd
+REM Windows - Install Python from python.org
+python --version
+pip install -r requirements.txt
+
+REM If 'python' is not found, try:
+python3 --version
+py --version
+```
+
+**Port 1312 already in use:**
+```bash
+# Linux - Find what's using the port
+sudo lsof -i :1312
+sudo netstat -tulpn | grep :1312
+
+# Kill existing process or change port
+export PORT=1313   # Linux
+set PORT=1313      # Windows
+```
+
+**Permission errors (Windows):**
+```cmd
+REM Run Command Prompt or PowerShell as Administrator
+REM Or install to user directory:
+pip install --user -r requirements.txt
+```
+
+**Import errors:**
+```bash
+# Missing nvidia-ml-py (most common issue)
+pip install nvidia-ml-py
+
+# Missing system packages (Linux)
+sudo apt install build-essential python3-dev
+
+# Verify all dependencies
+python -c "import pynvml, psutil, flask; print('Dependencies OK')"
```

**Hub can't connect to nodes:**
```bash
curl http://node-ip:1312/api/gpu-data  # Test connectivity
-sudo ufw allow 1312/tcp  # Check firewall
+sudo ufw allow 1312/tcp  # Linux firewall
+netsh advfirewall firewall add rule name="GPU Hot" dir=in action=allow protocol=TCP localport=1312  # Windows firewall
```

**Performance issues:** Increase `UPDATE_INTERVAL` in `core/config.py`
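For hub deployments, the connectivity check above can also be scripted from the hub host. The snippet below is an illustrative sketch, not part of this diff: it reuses the `NODE_URLS` format documented above and probes each node's `/api/gpu-data` endpoint with only the standard library.

```python
# Illustrative sketch (not part of this diff): probe each configured node the
# same way the `curl http://node-ip:1312/api/gpu-data` check does.
import os
import urllib.request

for url in os.getenv('NODE_URLS', '').split(','):
    url = url.strip()
    if not url:
        continue
    try:
        with urllib.request.urlopen(f"{url}/api/gpu-data", timeout=5) as resp:
            print(f"{url}: HTTP {resp.status}")   # node reachable
    except OSError as exc:
        print(f"{url}: unreachable ({exc})")      # firewall, DNS, or wrong port
```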
diff --git a/core/config.py b/core/config.py
index 8e462a5..26c5886 100644
--- a/core/config.py
+++ b/core/config.py
@@ -4,6 +4,8 @@
import os
import socket
+import platform
+import sys

# Flask Configuration
SECRET_KEY = 'gpu_hot_secret'
@@ -26,3 +28,39 @@
# NODE_URLS: comma-separated URLs for hub mode (e.g., http://node1:1312,http://node2:1312)
NODE_URLS = [url.strip() for url in os.getenv('NODE_URLS', '').split(',') if url.strip()]
+
+# Platform Detection
+PLATFORM = platform.system()  # 'Windows', 'Linux', 'Darwin', etc.
+
+# Platform-specific configurations
+PYTHON_EXECUTABLE = 'python' if PLATFORM == 'Windows' else 'python3'
+
+def get_platform_info():
+    """Get comprehensive platform information for diagnostics"""
+    return {
+        'platform': PLATFORM,
+        'system': platform.system(),
+        'release': platform.release(),
+        'version': platform.version(),
+        'machine': platform.machine(),
+        'processor': platform.processor(),
+        'python_version': platform.python_version(),
+        'python_executable': sys.executable,
+    }
+
+def get_nvidia_smi_command():
+    """Get the appropriate nvidia-smi command for the current platform"""
+    # The bare command name works on every supported platform: on Windows it
+    # resolves to nvidia-smi.exe via PATH, on Linux/Unix to the nvidia-smi binary.
+    return 'nvidia-smi'
+
+def is_windows():
+    """Check if running on Windows platform"""
+    return PLATFORM == 'Windows'
+
+def is_unix_like():
+    """Check if running on Unix-like platform (Linux, Darwin, etc.)"""
+    return PLATFORM in ['Linux', 'Darwin', 'FreeBSD', 'OpenBSD', 'NetBSD']
+
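The new helpers in `core/config.py` are plain module-level functions, so other parts of the app can use them for startup diagnostics. A minimal usage sketch (illustrative only, not part of this diff; the logging setup is assumed):

```python
# Illustrative usage of the new core/config.py helpers; not part of this diff.
import logging

from core.config import get_platform_info, is_windows

logging.basicConfig(level=logging.INFO)

info = get_platform_info()
logging.info("GPU Hot starting on %s %s (Python %s)",
             info['system'], info['release'], info['python_version'])
if is_windows():
    logging.info("Windows detected; expecting nvidia-smi.exe on PATH")
```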
diff --git a/core/monitor.py b/core/monitor.py
index fa1f946..033ec1a 100644
--- a/core/monitor.py
+++ b/core/monitor.py
@@ -6,7 +6,7 @@
import logging
from .metrics import MetricsCollector
from .nvidia_smi_fallback import parse_nvidia_smi
-from .config import NVIDIA_SMI
+from .config import NVIDIA_SMI, PLATFORM

logger = logging.getLogger(__name__)
@@ -206,14 +206,28 @@ def _get_processes_sync(self):
        return []

    def _get_process_name(self, pid):
-        """Extract readable process name from PID with improved logic"""
+        """Extract readable process name from PID with improved cross-platform logic"""
        try:
            p = psutil.Process(pid)

+            # Define platform-specific process filters
+            if PLATFORM == 'Windows':
+                # Windows-specific interpreters and shells to skip
+                skip_processes = ['python.exe', 'python3.exe', 'cmd.exe', 'powershell.exe',
+                                  'pwsh.exe', 'conhost.exe', 'wscript.exe', 'cscript.exe']
+                skip_names = ['python', 'python3', 'cmd', 'powershell', 'pwsh', 'conhost']
+            else:
+                # Unix-like systems
+                skip_processes = ['python', 'python3', 'sh', 'bash', 'zsh', 'fish', 'dash']
+                skip_names = ['python', 'python3', 'sh', 'bash', 'zsh', 'fish', 'dash']
+
            # First try to get the process name
            try:
                process_name = p.name()
-                if process_name and process_name not in ['python', 'python3', 'sh', 'bash']:
+                if process_name and process_name.lower() not in [s.lower() for s in skip_processes]:
+                    # Remove .exe extension on Windows for cleaner display
+                    if PLATFORM == 'Windows' and process_name.lower().endswith('.exe'):
+                        return process_name[:-4]
                    return process_name
            except (psutil.AccessDenied, psutil.NoSuchProcess, psutil.ZombieProcess):
                pass
@@ -227,24 +241,26 @@ def _get_process_name(self, pid):
                if not arg or arg.startswith('-'):
                    continue

-                # Skip common interpreters and shells
-                if arg in ['python', 'python3', 'node', 'java', 'sh', 'bash', 'zsh']:
-                    continue
-
-                # Extract filename from path
-                filename = arg.split('/')[-1].split('\\')[-1]
-
-                # Skip if it's still a generic name
-                if filename in ['python', 'python3', 'node', 'java', 'sh', 'bash']:
-                    continue
-
-                # Found a meaningful name
-                if filename:
-                    return filename
+                # Extract filename from path (handle both / and \ separators)
+                filename = arg.replace('\\', '/').split('/')[-1]
+
+                # Remove file extensions on Windows for cleaner display
+                if PLATFORM == 'Windows' and '.' in filename:
+                    name_without_ext = filename.rsplit('.', 1)[0]
+                    # Skip if it's still a generic name
+                    if name_without_ext.lower() not in [s.lower() for s in skip_names]:
+                        return name_without_ext
+                else:
+                    # Skip common interpreters and shells
+                    if filename.lower() not in [s.lower() for s in skip_names]:
+                        return filename

            # Fallback to first argument if nothing else worked
            if cmdline[0]:
-                return cmdline[0].split('/')[-1].split('\\')[-1]
+                filename = cmdline[0].replace('\\', '/').split('/')[-1]
+                if PLATFORM == 'Windows' and filename.lower().endswith('.exe'):
+                    filename = filename[:-4]
+                return filename

        except (psutil.AccessDenied, psutil.NoSuchProcess, psutil.ZombieProcess):
            pass
diff --git a/core/nvidia_smi_fallback.py b/core/nvidia_smi_fallback.py
index 6e1d6ad..1021aaa 100644
--- a/core/nvidia_smi_fallback.py
+++ b/core/nvidia_smi_fallback.py
@@ -6,6 +6,7 @@
import subprocess
import logging
from datetime import datetime
+from .config import get_nvidia_smi_command, is_windows

logger = logging.getLogger(__name__)
@@ -13,8 +14,9 @@ def parse_nvidia_smi():
    """Parse nvidia-smi output and extract comprehensive GPU information"""
    try:
+        nvidia_smi_cmd = get_nvidia_smi_command()
        result = subprocess.run([
-            'nvidia-smi',
+            nvidia_smi_cmd,
            '--query-gpu=index,name,uuid,driver_version,vbios_version,'
            'temperature.gpu,utilization.gpu,utilization.memory,'
            'memory.used,memory.total,memory.free,power.draw,power.limit,'
@@ -24,7 +26,7 @@
            'encoder.stats.sessionCount,encoder.stats.averageFps,encoder.stats.averageLatency,'
            'pstate,compute_mode',
            '--format=csv,noheader,nounits'
-        ], capture_output=True, text=True, timeout=10)
+        ], capture_output=True, text=True, timeout=10, shell=is_windows())

        if result.returncode != 0:
            logger.warning(f"nvidia-smi comprehensive query failed (code {result.returncode}), trying basic query")
@@ -95,13 +97,14 @@ def parse_nvidia_smi_fallback():
    """Fallback parser with minimal, widely-supported fields"""
    try:
        logger.info("Using basic nvidia-smi query (minimal fields)")
+        nvidia_smi_cmd = get_nvidia_smi_command()
        result = subprocess.run([
-            'nvidia-smi',
+            nvidia_smi_cmd,
            '--query-gpu=index,name,temperature.gpu,utilization.gpu,utilization.memory,'
            'memory.used,memory.total,power.draw,power.limit,fan.speed,'
            'clocks.gr,clocks.sm,clocks.mem,pstate',
            '--format=csv,noheader,nounits'
-        ], capture_output=True, text=True, timeout=10)
+        ], capture_output=True, text=True, timeout=10, shell=is_windows())

        if result.returncode != 0:
            logger.error(f"Basic nvidia-smi query also failed (code {result.returncode})")
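One practical note for the fallback path: both queries assume `nvidia-smi` is resolvable on `PATH`. A quick pre-flight check along these lines (illustrative sketch, not part of this diff) makes the failure mode obvious on a fresh native install:

```python
# Illustrative pre-flight check, not part of this diff: confirm nvidia-smi is
# on PATH before relying on the fallback parser.
import shutil
import subprocess

if shutil.which('nvidia-smi') is None:
    print("nvidia-smi not found on PATH - install or repair the NVIDIA driver")
else:
    # `nvidia-smi -L` lists detected GPUs, one per line
    result = subprocess.run(['nvidia-smi', '-L'],
                            capture_output=True, text=True, timeout=10)
    print(result.stdout.strip() or result.stderr.strip())
```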