
Commit 087d81f

[Doc] Added nvidia gpu setup script for each node.
Signed-off-by: insukim1994 <insu.kim@moreh.io>
1 parent b5b9674

File tree

2 files changed: +63 -2 lines
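
Per the commit message, the new utils/nvidia-gpu-setup-k8s.sh script (shown below) is meant to be run once on each node. A minimal invocation, assuming the repository is checked out on the node, might look like the following; the /usr/local/nvidia path is only a hypothetical example of overriding the tool paths the script exposes:

    # Run with the default nvidia-smi / nvidia-ctk lookup on PATH.
    bash utils/nvidia-gpu-setup-k8s.sh

    # Or point the script at a non-default NVIDIA Container Toolkit install.
    NVIDIA_CTK_PATH=/usr/local/nvidia/bin/nvidia-ctk bash utils/nvidia-gpu-setup-k8s.sh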

utils/install-cri-o.sh

Lines changed: 2 additions & 2 deletions
@@ -26,5 +26,5 @@ sudo apt-get install -y cri-o
 sudo systemctl start crio.service

 # Install CNI (container network interface) plugins
-wget https://raw.githubusercontent.com/cri-o/cri-o/refs/heads/main/contrib/cni/10-crio-bridge.conflist
-sudo cp 10-crio-bridge.conflist /etc/cni/net.d
+wget https://raw.githubusercontent.com/cri-o/cri-o/refs/heads/main/contrib/cni/11-crio-ipv4-bridge.conflist
+sudo cp 11-crio-ipv4-bridge.conflist /etc/cni/net.d
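
To confirm CRI-O actually picks up the renamed bridge config, a quick check after rerunning the script (a sketch; assumes crictl is installed alongside CRI-O):

    sudo systemctl restart crio.service
    ls /etc/cni/net.d                        # should list 11-crio-ipv4-bridge.conflist
    sudo crictl info | grep -i networkready  # the NetworkReady condition should report true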

utils/nvidia-gpu-setup-k8s.sh

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+#!/bin/bash
+set -e
+
+# Allow users to override the paths for the NVIDIA tools.
+: "${NVIDIA_SMI_PATH:=nvidia-smi}"
+: "${NVIDIA_CTK_PATH:=nvidia-ctk}"
+
+# --- Debug and Environment Setup ---
+echo "Current PATH: $PATH"
+echo "Operating System: $(uname -a)"
+
+# Get the script directory to reference local scripts reliably.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# --- Install Prerequisites ---
+echo "Installing kubectl and helm..."
+bash "$SCRIPT_DIR/install-kubectl.sh"
+bash "$SCRIPT_DIR/install-helm.sh"
+
+
+# --- Configure BPF (if available) ---
+if [ -f /proc/sys/net/core/bpf_jit_harden ]; then
+  echo "Configuring BPF: Setting net.core.bpf_jit_harden=0"
+  echo "net.core.bpf_jit_harden=0" | sudo tee -a /etc/sysctl.conf
+  sudo sysctl -p
+else
+  echo "BPF JIT hardening configuration not available, skipping..."
+fi
+
+# --- NVIDIA GPU Setup ---
+GPU_AVAILABLE=false
+if command -v "$NVIDIA_SMI_PATH" >/dev/null 2>&1; then
+  echo "NVIDIA GPU detected via nvidia-smi at: $(command -v "$NVIDIA_SMI_PATH")"
+  if command -v "$NVIDIA_CTK_PATH" >/dev/null 2>&1; then
+    echo "nvidia-ctk found at: $(command -v "$NVIDIA_CTK_PATH")"
+    GPU_AVAILABLE=true
+  else
+    echo "nvidia-ctk not found. Please install the NVIDIA Container Toolkit to enable GPU support."
+  fi
+fi
+
+if [ "$GPU_AVAILABLE" = true ]; then
+  # Configure Docker for GPU support.
+  echo "Configuring Docker runtime for GPU support..."
+  if sudo "$NVIDIA_CTK_PATH" runtime configure --runtime=docker; then
+    echo "Restarting Docker to apply changes..."
+    sudo systemctl restart docker
+    echo "Docker runtime configured successfully."
+  else
+    echo "Error: Failed to configure Docker runtime using the NVIDIA Container Toolkit."
+    exit 1
+  fi
+
+  # Install the GPU Operator via Helm.
+  echo "Adding NVIDIA helm repo and updating..."
+  helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update
+  echo "Installing GPU Operator..."
+  helm install --wait gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator --version=v24.9.1
+fi
+
+echo "NVIDIA GPU Setup complete."
