|
#!/bin/bash
#
# Prepares a host for running GPU workloads on Kubernetes:
#   1. Installs kubectl and helm via the sibling install-*.sh scripts.
#   2. Disables BPF JIT hardening (when the kernel exposes the knob).
#   3. If both nvidia-smi and nvidia-ctk are present, configures the Docker
#      runtime for GPUs and deploys the NVIDIA GPU Operator with Helm.
#
# Environment overrides:
#   NVIDIA_SMI_PATH  name or path of nvidia-smi (default: nvidia-smi)
#   NVIDIA_CTK_PATH  name or path of nvidia-ctk (default: nvidia-ctk)
#
# The script is intended to be safe to re-run: the sysctl entry is only
# appended once and the Helm release is installed-or-upgraded.
set -euo pipefail

# Allow users to override the paths for the NVIDIA tools.
: "${NVIDIA_SMI_PATH:=nvidia-smi}"
: "${NVIDIA_CTK_PATH:=nvidia-ctk}"

# --- Debug and Environment Setup ---
echo "Current PATH: $PATH"
echo "Operating System: $(uname -a)"

# Get the script directory to reference local scripts reliably.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# --- Install Prerequisites ---
echo "Installing kubectl and helm..."
bash "$SCRIPT_DIR/install-kubectl.sh"
bash "$SCRIPT_DIR/install-helm.sh"

# --- Configure BPF (if available) ---
if [ -f /proc/sys/net/core/bpf_jit_harden ]; then
  echo "Configuring BPF: Setting net.core.bpf_jit_harden=0"
  BPF_SYSCTL_LINE="net.core.bpf_jit_harden=0"
  # Append only when the exact line is missing so repeated runs do not pile up
  # duplicates in /etc/sysctl.conf. grep errors (e.g. file does not exist yet)
  # are silenced on purpose: in that case the line is simply appended.
  if ! grep -qxF -- "$BPF_SYSCTL_LINE" /etc/sysctl.conf 2>/dev/null; then
    echo "$BPF_SYSCTL_LINE" | sudo tee -a /etc/sysctl.conf
  fi
  sudo sysctl -p
else
  echo "BPF JIT hardening configuration not available, skipping..."
fi

# --- NVIDIA GPU Setup ---
GPU_AVAILABLE=false
NVIDIA_CTK_BIN=""
if command -v "$NVIDIA_SMI_PATH" >/dev/null 2>&1; then
  echo "NVIDIA GPU detected via nvidia-smi at: $(command -v "$NVIDIA_SMI_PATH")"
  if command -v "$NVIDIA_CTK_PATH" >/dev/null 2>&1; then
    # Resolve to a full path now: sudo may use a different PATH (secure_path)
    # than the invoking user, so a bare command name might not be found there.
    NVIDIA_CTK_BIN="$(command -v "$NVIDIA_CTK_PATH")"
    echo "nvidia-ctk found at: $NVIDIA_CTK_BIN"
    GPU_AVAILABLE=true
  else
    echo "nvidia-ctk not found. Please install the NVIDIA Container Toolkit to enable GPU support." >&2
  fi
fi

if [ "$GPU_AVAILABLE" = true ]; then
  # Configure Docker for GPU support.
  echo "Configuring Docker runtime for GPU support..."
  if ! sudo "$NVIDIA_CTK_BIN" runtime configure --runtime=docker; then
    echo "Error: Failed to configure Docker runtime using the NVIDIA Container Toolkit." >&2
    exit 1
  fi
  echo "Restarting Docker to apply changes..."
  sudo systemctl restart docker
  echo "Docker runtime configured successfully."

  # Install the GPU Operator via Helm.
  echo "Adding NVIDIA helm repo and updating..."
  # Run as separate statements: in `a && b`, a failure of `a` does not trigger
  # `set -e`, so a failed `helm repo add` would otherwise go unnoticed.
  helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
  helm repo update
  echo "Installing GPU Operator..."
  # `upgrade --install` installs the release when absent and upgrades it when
  # present, so re-running the script does not fail on an existing release.
  helm upgrade --install --wait gpu-operator nvidia/gpu-operator \
    -n gpu-operator --create-namespace --version=v24.9.1
else
  echo "GPU tooling not fully available; skipping Docker GPU runtime and GPU Operator setup."
fi

echo "NVIDIA GPU Setup complete."
0 commit comments