Skip to content

Commit

Permalink
Install NVIDIA Drivers on Instances Missing the Drivers (#1684)
Browse files Browse the repository at this point in the history
Summary:
- Use the pytorch/test-infra action to install NVIDIA drivers properly if the instance is missing the drivers

Pull Request resolved: #1684

Reviewed By: shintaro-iwasaki

Differential Revision: D44603925

Pulled By: q10

fbshipit-source-id: 712bdf5c2af67c5a6f540567abcc47ed892912c1
  • Loading branch information
q10 authored and facebook-github-bot committed Apr 2, 2023
1 parent cef67a5 commit 5ad0bf1
Show file tree
Hide file tree
Showing 8 changed files with 40 additions and 63 deletions.
42 changes: 18 additions & 24 deletions .github/scripts/setup_env.bash
Original file line number Diff line number Diff line change
Expand Up @@ -248,12 +248,14 @@ free_disk_space () {

print_gpu_info () {
echo "################################################################################"
echo "[INFO] Check GPU info ..."
echo "[INFO] Printing general display info ..."
install_system_packages lshw
print_exec sudo lshw -C display

echo "################################################################################"
echo "[INFO] Check NVIDIA GPU info ..."
echo "[INFO] Printing NVIDIA GPU info ..."

(lspci -v | grep -e 'controller.*NVIDIA') || true

if [[ "${ENFORCE_NVIDIA_GPU}" ]]; then
# Ensure that nvidia-smi is available and returns GPU entries
Expand All @@ -270,6 +272,11 @@ print_gpu_info () {
fi
fi

echo "################################################################################"
echo "[INFO] Printing AMD GPU info ..."

(lspci -v | grep -e 'Display controller: Advanced') || true

if [[ "${ENFORCE_AMD_GPU}" ]]; then
# Ensure that rocm-smi is available and returns GPU entries
if ! rocm-smi; then
Expand All @@ -288,28 +295,32 @@ print_gpu_info () {

# Print diagnostic information about a Linux host: ldd version, CPU details,
# PCI devices, and distribution/kernel identification.
#
# Each section is introduced by a banner line followed by exactly one
# "[INFO] Print ..." message, so the log label always matches the command
# output that follows it.
#
# Arguments: none
# Outputs:   banner and [INFO] lines on stdout, plus whatever `print_exec`
#            emits for each command.
# NOTE(review): `print_exec` is defined elsewhere in this script; presumably it
# echoes the command and then runs it - confirm against its definition.
__print_system_info_linux () {
  echo "################################################################################"
  echo "[INFO] Print ldd version ..."
  print_exec ldd --version

  echo "################################################################################"
  echo "[INFO] Print CPU info ..."
  print_exec nproc
  print_exec cat /proc/cpuinfo

  # lspci is provided by the pciutils package, which the CI containers install.
  echo "################################################################################"
  echo "[INFO] Print PCI info ..."
  print_exec lspci -v

  echo "################################################################################"
  echo "[INFO] Print Linux distribution info ..."
  print_exec uname -a
  print_exec cat /proc/version
  print_exec cat /etc/os-release
}

# Print diagnostic information about a macOS host: CPU details and OS version.
#
# Each section is introduced by a banner line followed by exactly one
# "[INFO] Print ..." message.
#
# Arguments: none
# Outputs:   banner and [INFO] lines on stdout, the sysctl entries matching
#            machdep.cpu, plus whatever `print_exec` emits for each command.
# NOTE(review): `print_exec` is defined elsewhere in this script; presumably it
# echoes the command and then runs it - confirm against its definition.
__print_system_info_macos () {
  echo "################################################################################"
  echo "[INFO] Print CPU info ..."
  # Pattern is quoted so the shell always passes it to grep verbatim; grep still
  # interprets it as a regular expression, exactly as before.
  sysctl -a | grep 'machdep.cpu'

  echo "################################################################################"
  echo "[INFO] Print MacOS version info ..."
  print_exec uname -a
  print_exec sw_vers
}
Expand Down Expand Up @@ -684,23 +695,6 @@ install_pytorch_pip () {
# CUDA Setup Functions
################################################################################

# Install the NVIDIA driver package on a CentOS/RHEL 7 host via yum.
#
# Prints a timestamped section header, registers the EPEL and NVIDIA CUDA
# (rhel7, x86_64) repositories with yum, refreshes the yum cache, and then
# requests the nvidia-driver-latest-dkms package.
#
# Arguments: none
# Outputs:   header and [SETUP] progress lines on stdout, plus whatever the
#            helper commands emit.
# NOTE(review): `print_exec` and `install_system_packages` are defined
# elsewhere in this script; their exact behavior is not visible here.
install_nvidia_drivers_centos () {
  local rule="################################################################################"

  # One printf call emits the whole header; the format is reused per argument.
  printf '%s\n' \
    "${rule}" \
    "# Install NVIDIA Drivers" \
    "#" \
    "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" \
    "${rule}" \
    ""

  printf '%s\n' "[SETUP] Adding NVIDIA repos to yum ..."
  print_exec sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
  print_exec sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
  # Expire cached metadata so the newly added repos are picked up.
  print_exec sudo yum clean expire-cache

  printf '%s\n' "[SETUP] Installing NVIDIA drivers ..."
  install_system_packages nvidia-driver-latest-dkms
}

install_cuda () {
local env_name="$1"
local cuda_version="$2"
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
- name: Setup Build Container
run: |
apt update -y
apt install -y binutils build-essential cmake git libblas-dev python3 sudo wget
apt install -y binutils build-essential cmake git libblas-dev pciutils python3 sudo wget
git config --global --add safe.directory '*'
- name: Checkout the Repository
Expand Down Expand Up @@ -120,7 +120,7 @@ jobs:
- name: Setup Build Container
run: |
apt update -y
apt install -y binutils build-essential cmake git libblas-dev python3 sudo unzip wget
apt install -y binutils build-essential cmake git libblas-dev pciutils python3 sudo unzip wget
git config --global --add safe.directory '*'
- name: Checkout the Repository
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:
- name: Setup Build Container
run: |
apt update -y
apt install -y binutils git sudo wget
apt install -y binutils git pciutils sudo wget
git config --global --add safe.directory '*'
- name: Checkout the Repository
Expand Down Expand Up @@ -177,7 +177,7 @@ jobs:
- name: Setup Build Container
run: |
apt update -y
apt install -y binutils build-essential git sudo wget
apt install -y binutils build-essential git pciutils sudo wget
git config --global --add safe.directory '*'
- name: Checkout the Repository
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_cpu_nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ jobs:

steps:
- name: Setup Build Container
run: yum update -y; yum install -y binutils findutils git sudo wget which
run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

- name: Checkout the Repository
uses: actions/checkout@v3
Expand Down Expand Up @@ -119,7 +119,7 @@ jobs:

steps:
- name: Setup Build Container
run: yum update -y; yum install -y binutils findutils git sudo wget which
run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

- name: Checkout the Repository
uses: actions/checkout@v3
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_cpu_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ jobs:

steps:
- name: Setup Build Container
run: yum update -y; yum install -y binutils findutils git sudo wget which
run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

- name: Checkout the Repository
uses: actions/checkout@v3
Expand Down Expand Up @@ -110,7 +110,7 @@ jobs:

steps:
- name: Setup Build Container
run: yum update -y; yum install -y binutils findutils git sudo wget which
run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

- name: Checkout the Repository
uses: actions/checkout@v3
Expand Down
23 changes: 8 additions & 15 deletions .github/workflows/fbgemm_gpu_cuda_nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ jobs:

steps:
- name: Setup Build Container
run: yum update -y; yum install -y binutils findutils git sudo tar wget which
run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which

- name: Checkout the Repository
uses: actions/checkout@v3
Expand Down Expand Up @@ -108,10 +108,9 @@ jobs:

# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
# runs-on: linux.4xlarge.nvidia.gpu
# Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
runs-on: linux.g5.4xlarge.nvidia.gpu
container:
image: ${{ matrix.container-image }}
options: --user root --gpus all
defaults:
run:
shell: bash
Expand All @@ -122,20 +121,13 @@ jobs:
strategy:
fail-fast: false
matrix:
container-image: [ "nvidia/cuda:11.8.0-base-ubuntu20.04" ]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
cuda-version: [ "11.7.1", "11.8.0" ]
# Specify exactly ONE CUDA version for artifact publish
cuda-version-publish: [ "11.7.1" ]
needs: build_artifact

steps:
- name: Setup Build Container
run: |
apt update -y
apt install -y binutils curl git sudo wget
git config --global --add safe.directory '*'
- name: Checkout the Repository
uses: actions/checkout@v3
with:
Expand All @@ -146,6 +138,10 @@ jobs:
with:
name: fbgemm_gpu_nightly_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl

# Use PyTorch test infrastructure action - https://github.com/pytorch/test-infra/blob/main/.github/actions/setup-nvidia/action.yml
- name: Install NVIDIA Drivers and NVIDIA-Docker Runtime
uses: pytorch/test-infra/.github/actions/setup-nvidia@main

- name: Display System Info
run: . $PRELUDE; print_system_info; print_ec2_info

Expand All @@ -169,10 +165,7 @@ jobs:
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Install FBGEMM_GPU Nightly
run: |
. $PRELUDE
pwd; ls -la .
install_fbgemm_gpu_package $BUILD_ENV *.whl
run: . $PRELUDE; install_fbgemm_gpu_package $BUILD_ENV *.whl

- name: Test with PyTest
timeout-minutes: 10
Expand Down
20 changes: 5 additions & 15 deletions .github/workflows/fbgemm_gpu_cuda_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:

steps:
- name: Setup Build Container
run: yum update -y; yum install -y binutils findutils git sudo tar wget which
run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which

- name: Checkout the Repository
uses: actions/checkout@v3
Expand Down Expand Up @@ -100,9 +100,6 @@ jobs:
# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
runs-on: linux.g5.4xlarge.nvidia.gpu
container:
image: ${{ matrix.container-image }}
options: --user root --gpus all
defaults:
run:
shell: bash
Expand All @@ -113,20 +110,13 @@ jobs:
strategy:
fail-fast: false
matrix:
container-image: [ "nvidia/cuda:11.8.0-base-ubuntu20.04" ]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
cuda-version: [ "11.7.1", "11.8.0" ]
# Specify exactly ONE CUDA version for artifact publish
cuda-version-publish: [ "11.7.1" ]
needs: build_artifact

steps:
- name: Setup Build Container
run: |
apt update -y
apt install -y binutils curl git sudo wget
git config --global --add safe.directory '*'
- name: Checkout the Repository
uses: actions/checkout@v3
with:
Expand All @@ -137,6 +127,9 @@ jobs:
with:
name: fbgemm_gpu_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl

- name: Install NVIDIA Drivers and NVIDIA-Docker Runtime
uses: pytorch/test-infra/.github/actions/setup-nvidia@main

- name: Display System Info
run: . $PRELUDE; print_system_info; print_ec2_info

Expand All @@ -159,10 +152,7 @@ jobs:
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Install FBGEMM_GPU
run: |
. $PRELUDE
pwd; ls -la .
install_fbgemm_gpu_package $BUILD_ENV *.whl
run: . $PRELUDE; install_fbgemm_gpu_package $BUILD_ENV *.whl

- name: Test with PyTest
timeout-minutes: 10
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:

steps:
- name: Setup Build Container
run: yum update -y; yum install -y binutils findutils git rsync sudo tar wget which
run: yum update -y; yum install -y binutils findutils git pciutils rsync sudo tar wget which

- name: Checkout the Repository
uses: actions/checkout@v3
Expand Down

0 comments on commit 5ad0bf1

Please sign in to comment.