Skip to content

Commit

Permalink
Fixed the GCP/GPU vm image build up
Browse files Browse the repository at this point in the history
  • Loading branch information
sidoruka committed Nov 2, 2023
1 parent ab251d7 commit 18e8659
Showing 1 changed file with 12 additions and 13 deletions.
25 changes: 12 additions & 13 deletions deploy/infra/gcp/install-gpu-node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ yum --enablerepo=elrepo-kernel install kernel-ml \
sed -i '/GRUB_DEFAULT=/c\GRUB_DEFAULT=0' /etc/default/grub && \
grub2-mkconfig -o /boot/grub2/grub.cfg
grep 'menuentry ' /boot/grub2/grub.cfg | cut -f 2 -d "'" | nl -v 0
grub2-set-default 'CentOS Linux (5.7.7-1.el7.elrepo.x86_64) 7 (Core)'
# Select the elrepo mainline kernel as the default boot entry by its menu title.
# NOTE(review): the kernel version is hard-coded in this title, while kernel-ml
# appears to be installed without a version pin; the title has to be one of the
# entries printed by the menuentry listing above, so re-check it on every rebuild.
grub2-set-default 'CentOS Linux (6.5.9-1.el7.elrepo.x86_64) 7 (Core)'

###########
reboot
Expand All @@ -48,7 +48,9 @@ wget -q "https://cloud-pipeline-oss-builds.s3.amazonaws.com/tools/jq/jq-1.6/jq-l
chmod +x /usr/bin/jq

# Install nvidia driver deps
yum install -y gcc
# Install the Software Collections release package, then the devtoolset-9
# toolchain from it, and source its enable script into the current shell so the
# steps that follow in this script run with that toolchain enabled.
# NOTE(review): presumably the NVIDIA driver build needs a newer compiler than
# the distribution's stock gcc — confirm.
yum install -y centos-release-scl && \
yum install -y devtoolset-9 && \
source /opt/rh/devtoolset-9/enable

# Install Docker
yum install -y yum-utils \
Expand Down Expand Up @@ -120,9 +122,9 @@ yum install -y \
kubelet-1.15.4-0.x86_64

# Install nvidia driver
wget http://us.download.nvidia.com/XFree86/Linux-x86_64/440.100/NVIDIA-Linux-x86_64-440.100.run && \
sh NVIDIA-Linux-x86_64-440.100.run --silent && \
rm -f NVIDIA-Linux-x86_64-440.100.run
# Download the NVIDIA Tesla driver installer, run it non-interactively and
# remove it afterwards. The installer file name is pinned instead of globbed
# (NVIDIA-*.run): a glob could match a leftover installer in the working
# directory, which would then be executed or passed as an extra argument.
# "-O" makes wget overwrite an existing file rather than saving a ".1" copy.
nvidia_driver_version="535.129.03"
nvidia_driver_run="NVIDIA-Linux-x86_64-${nvidia_driver_version}.run"
wget -O "${nvidia_driver_run}" "https://us.download.nvidia.com/tesla/${nvidia_driver_version}/${nvidia_driver_run}" && \
sh "./${nvidia_driver_run}" --silent && \
rm -f -- "${nvidia_driver_run}"

# Install nvidia docker
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
Expand All @@ -136,11 +138,8 @@ curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.re
yum install nvidia-docker2-2.0.3-1.docker18.03* \
nvidia-container-runtime-2.0.0-1.docker18.03* -y

# According to https://aws.amazon.com/ru/premiumsupport/knowledge-center/g2-rhel-boot/ the following must be done for p3 instances (p2 instances work fine):
# 1. Resize the instance, choosing any instance other than one in the g2 series.
# 2. Edit /etc/default/grub and add the following values to the GRUB_CMDLINE_LINUX line:
# rd.driver.blacklist=nouveau nouveau.modeset=0
# 3. Rebuild the grub configuration:
# grub2-mkconfig -o /boot/grub2/grub.cfg
# Prepend the nouveau blacklist parameters to the GRUB_CMDLINE_LINUX_DEFAULT
# value in /etc/default/grub, then regenerate the grub configuration.
# NOTE(review): the steps above name GRUB_CMDLINE_LINUX, but this sed only
# matches GRUB_CMDLINE_LINUX_DEFAULT; if that variable is absent from
# /etc/default/grub the substitution changes nothing — confirm which is intended.
sed -i 's/GRUB_CMDLINE_LINUX_DEFAULT="/GRUB_CMDLINE_LINUX_DEFAULT="rd.driver.blacklist=nouveau nouveau.modeset=0 /g' /etc/default/grub
grub2-mkconfig -o /boot/grub2/grub.cfg
# Install "nvidia-container-runtime-hook"; without it every container fails,
# including the kube containers.
# The distribution id (ID + VERSION_ID from /etc/os-release) selects the repo
# file to download. curl runs with -f so that an HTTP error is reported instead
# of the error page being written into the yum repo file.
distribution="$(. /etc/os-release; echo "${ID}${VERSION_ID}")"
curl -fsSL "https://nvidia.github.io/libnvidia-container/${distribution}/libnvidia-container.repo" | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo && \
yum install -y nvidia-container-toolkit

0 comments on commit 18e8659

Please sign in to comment.