From 66645917cf0618abfa847ecdbd70e3bd3372d982 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 17 May 2024 11:13:15 +0200 Subject: [PATCH 1/9] {2023.06}[system] cuDNN/8.9.2.26-CUDA-12.1.1 --- EESSI-install-software.sh | 5 +- create_lmodsitepackage.py | 29 ++- .../eessi-2023.06-eb-4.9.1-001-system.yml | 1 + eb_hooks.py | 79 +++++++ install_scripts.sh | 2 +- .../nvidia/install_cudnn_host_injections.sh | 210 ++++++++++++++++++ 6 files changed, 321 insertions(+), 5 deletions(-) create mode 100644 scripts/gpu_support/nvidia/install_cudnn_host_injections.sh diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 8a5789c2b2..5d848fc7e4 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -217,7 +217,7 @@ pr_diff=$(ls [0-9]*.diff | head -1) # for now, this just reinstalls all scripts. Note the most elegant, but works ${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX} -# Install full CUDA SDK in host_injections +# Install full CUDA SDK and cu* libraries in host_injections # Hardcode this for now, see if it works # TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install # Allow skipping CUDA SDK install in e.g. CI environments @@ -234,8 +234,9 @@ fi if [ -z "${skip_cuda_install}" ] || [ ! 
"${skip_cuda_install}" ]; then ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula + ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cudnn_host_injections.sh -c 12.1.1 --accept-cuda-eula else - echo "Skipping installation of CUDA SDK in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found" + echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found" fi # Install drivers in host_injections diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index 47aa20e51e..5e44ba8187 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -172,13 +172,38 @@ end end +local function eessi_cudnn_enabled_load_hook(t) + local frameStk = require("FrameStk"):singleton() + local mt = frameStk:mt() + local simpleName = string.match(t.modFullName, "(.-)/") + -- If we try to load cuDNN itself, check if the full cuDNN package was installed on the host in host_injections. + -- This is required for end users to build additional cuDNN dependent software. If the full SDK isn't present, refuse + -- to load the cuDNN module and print an informative message on how to set up GPU support for EESSI + local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n" + if simpleName == 'cuDNN' then + -- get the full host_injections path + local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') + -- build final path where the cuDNN software should be installed + local cudnnEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild" + local cudnnDirExists = isDir(cudnnEasyBuildDir) + if not cudnnDirExists then + local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI " + advice = advice .. "due to licencing. 
You will need to install a full copy of the cuDNN package where EESSI " + advice = advice .. "can find it.\\n" + advice = advice .. refer_to_docs + LmodError("\\nYou requested to load ", simpleName, " ", advice) + end + end +end + -- Combine both functions into a single one, as we can only register one function as load hook in lmod -- Also: make it non-local, so it can be imported and extended by other lmodrc files if needed function eessi_load_hook(t) - -- Only apply CUDA hooks if the loaded module is in the EESSI prefix - -- This avoids getting an Lmod Error when trying to load a CUDA module from a local software stack + -- Only apply CUDA and cuDNN hooks if the loaded module is in the EESSI prefix + -- This avoids getting an Lmod Error when trying to load a CUDA and cuDNN module from a local software stack if from_eessi_prefix(t) then eessi_cuda_enabled_load_hook(t) + eessi_cudnn_enabled_load_hook(t) end end diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.1-001-system.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.1-001-system.yml index 46ac979719..4e043f3c48 100644 --- a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.1-001-system.yml +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.1-001-system.yml @@ -3,3 +3,4 @@ easyconfigs: options: from-pr: 20299 - EESSI-extend-2023.06-easybuild.eb + - cuDNN-8.9.2.26-CUDA-12.1.1.eb diff --git a/eb_hooks.py b/eb_hooks.py index 8b0a11b0ed..27bb68bef1 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -632,6 +632,64 @@ def post_sanitycheck_cuda(self, *args, **kwargs): raise EasyBuildError("CUDA-specific hook triggered for non-CUDA easyconfig?!") + +def post_sanitycheck_cudnn(self, *args, **kwargs): + """ + Remove files from cuDNN installation that we are not allowed to ship, + and replace them with a symlink to a corresponding installation under host_injections. 
+ """ + if self.name == 'cuDNN': + print_msg("Replacing files in cuDNN installation that we can not ship with symlinks to host_injections...") + + allowlist = ['LICENSE'] + + # read cuDNN LICENSE, construct allowlist based on section 2. Distribution + # that specifies list of files that can be shipped + license_path = os.path.join(self.installdir, 'LICENSE') + search_string = "2. Distribution. The following portions of the SDK are distributable under the Agreement:" + with open(license_path) as infile: + for line in infile: + if line.strip().startswith(search_string): + # remove search string, split into words, remove trailing + # dots '.' and only retain words starting with a dot '.' + distributable = line[len(search_string):] + for word in distributable.split(): + if word[0] == '.': + allowlist.append(word.rstrip('.')) + + allowlist = sorted(set(allowlist)) + self.log.info("Allowlist for files in cuDNN installation that can be redistributed: " + ', '.join(allowlist)) + + # iterate over all files in the CUDA installation directory + for dir_path, _, files in os.walk(self.installdir): + for filename in files: + full_path = os.path.join(dir_path, filename) + # we only really care about real files, i.e. not symlinks + if not os.path.islink(full_path): + # check if the current file is part of the allowlist + basename = filename.split('.')[0] + if '.' in filename: + extension = '.' + filename.split('.')[1] + if basename in allowlist: + self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path) + elif '.' 
in filename and extension in allowlist: + self.log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path) + else: + self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s", + filename, full_path) + # if it is not in the allowlist, delete the file and create a symlink to host_injections + host_inj_path = full_path.replace('versions', 'host_injections') + # make sure source and target of symlink are not the same + if full_path == host_inj_path: + raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you " + "are using this hook for a NESSI installation?", + full_path, host_inj_path) + remove_file(full_path) + symlink(host_inj_path, full_path) + else: + raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!") + + def inject_gpu_property(ec): """ Add 'gpu' property, via modluafooter easyconfig parameter @@ -656,6 +714,26 @@ def inject_gpu_property(ec): ec[key] = '\n'.join([ec_dict[key], value]) else: ec[key] = value + + # Check if cuDNN is in the dependencies, if so add the 'gpu' Lmod property + if ('cuDNN' in [dep[0] for dep in iter(ec_dict['dependencies'])]): + ec.log.info("Injecting gpu as Lmod arch property and envvar with cuDNN version") + key = 'modluafooter' + value = 'add_property("arch","gpu")' + cudnn_version = 0 + for dep in iter(ec_dict['dependencies']): + # Make cuDNN a build dependency only (rpathing saves us from link errors) + if 'cuDNN' in dep[0]: + cudnn_version = dep[1] + ec_dict['dependencies'].remove(dep) + if dep not in ec_dict['builddependencies']: + ec_dict['builddependencies'].append(dep) + value = '\n'.join([value, 'setenv("EESSICUDNNVERSION","%s")' % cudnn_version]) + if key in ec_dict: + if not value in ec_dict[key]: + ec[key] = '\n'.join([ec_dict[key], value]) + else: + ec[key] = value return ec @@ -709,4 +787,5 @@ def inject_gpu_property(ec): POST_SANITYCHECK_HOOKS = { 'CUDA': post_sanitycheck_cuda, + 'cuDNN': post_sanitycheck_cudnn, } diff 
--git a/install_scripts.sh b/install_scripts.sh index 17f0b81008..ee15a715f3 100755 --- a/install_scripts.sh +++ b/install_scripts.sh @@ -110,7 +110,7 @@ copy_files_by_list ${TOPDIR}/scripts ${INSTALL_PREFIX}/scripts "${script_files[@ # Copy files for the scripts/gpu_support/nvidia directory nvidia_files=( - install_cuda_host_injections.sh link_nvidia_host_libraries.sh + install_cuda_host_injections.sh install_cudnn_host_injections.sh link_nvidia_host_libraries.sh ) copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}" diff --git a/scripts/gpu_support/nvidia/install_cudnn_host_injections.sh b/scripts/gpu_support/nvidia/install_cudnn_host_injections.sh new file mode 100644 index 0000000000..d23cd5ebb7 --- /dev/null +++ b/scripts/gpu_support/nvidia/install_cudnn_host_injections.sh @@ -0,0 +1,210 @@ +#!/usr/bin/env bash + +# This script can be used to install cuDNN under the `.../host_injections` directory. +# This provides the parts of the cuDNN installation that cannot be redistributed as +# part of EESSI due to license limitations. While GPU-based software from EESSI will +# _run_ without these, installation of additional software that builds upon cuDNN +# requires the cuDNN installation(s) under `host_injections` to be present. +# +# The `host_injections` directory is a variant symlink that by default points to +# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see +# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). For the +# installation to be successful, this directory needs to be writeable by the user +# executing this script. 
+ +# Initialise our bash functions +TOPDIR=$(dirname $(realpath $BASH_SOURCE)) +source "$TOPDIR"/../../utils.sh + +# Function to display help message +show_help() { + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " --help Display this help message" + echo " -c, --cuda-version CUDA_VERSION Specify a version of CUDA to be used" + echo " when installing cuDNN (must" + echo " have a corresponding easyconfig in the" + echo " EasyBuild release)" + echo " -d, --cudnn-version CUDNN_VERSION Specify a version of cuDNN to install (must" + echo " have a corresponding easyconfig in the" + echo " EasyBuild release)" + echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" + echo " storage during the cuDNN install" + echo " (must have >10GB available)" +} + +# Initialize variables +cuda_version="" +cudnn_version="" + +# Parse command-line options +while [[ $# -gt 0 ]]; do + case "$1" in + --help) + show_help + exit 0 + ;; + -c|--cuda-version) + if [ -n "$2" ]; then + cuda_version="$2" + shift 2 + else + echo "Error: Argument required for $1" + show_help + exit 1 + fi + ;; + -d|--cudnn-version) + if [ -n "$2" ]; then + cudnn_version="$2" + shift 2 + else + echo "Error: Argument required for $1" + show_help + exit 1 + fi + ;; + -t|--temp-dir) + if [ -n "$2" ]; then + CUDA_TEMP_DIR="$2" + shift 2 + else + echo "Error: Argument required for $1" + show_help + exit 1 + fi + ;; + *) + show_help + fatal_error "Error: Unknown option: $1" + ;; + esac +done + +# Make sure EESSI is initialised +check_eessi_initialised + +# Make sure the CUDA version supplied is a semantic version +is_semantic_version() { + local version=$1 + local regex='^[0-9]+\.[0-9]+\.[0-9]+$' + + if [[ $version =~ $regex ]]; then + return 0 # Return success (0) if it's a semantic version + else + return 1 # Return failure (1) if it's not a semantic version + fi +} +if ! 
is_semantic_version "$cuda_version"; then + show_help + error="\nYou must provide a semantic version for CUDA (e.g., 12.1.1) via the appropriate\n" + error="${error}command line option. This script is intended for use with EESSI so the 'correct'\n" + error="${error}version to provide is probably one of those available under\n" + error="${error}$EESSI_SOFTWARE_PATH/software/cuDNN\n" + fatal_error "${error}" +fi + +# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` +cudnn_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} + +# Only install cuDNN if specified version is not found. +# (existence of easybuild subdir implies a successful install) +if [ -d "${cudnn_install_parent}"/software/cuDNN/*-CUDA-"${cuda_version}"/easybuild ]; then + echo_green "cuDNN software found! No need to install cuDNN again." +else + # We need to be able write to the installation space so let's make sure we can + if ! create_directory_structure "${cudnn_install_parent}"/software/cuDNN ; then + fatal_error "No write permissions to directory ${cudnn_install_parent}/software/cuDNN" + fi + + # we need a directory we can use for temporary storage + if [[ -z "${CUDA_TEMP_DIR}" ]]; then + tmpdir=$(mktemp -d) + else + tmpdir="${CUDA_TEMP_DIR}"/temp + if ! 
mkdir "$tmpdir" ; then + fatal_error "Could not create directory ${tmpdir}" + fi + fi + + required_space_in_tmpdir=50000 + # Let's see if we have sources and build locations defined if not, we use the temporary space + if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then + export EASYBUILD_BUILDPATH=${tmpdir}/build + required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) + fi + if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then + export EASYBUILD_SOURCEPATH=${tmpdir}/sources + required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) + fi + + # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), + # need to do a space check before we proceed + avail_space=$(df --output=avail "${cudnn_install_parent}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < 5000000 )); then + fatal_error "Need at least 5GB disk space to install cuDNN under ${cudnn_install_parent}, exiting now..." + fi + avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < required_space_in_tmpdir )); then + error="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" + error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check." + error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH " + error="${error}to reduce this requirement. Exiting now..." + fatal_error "${error}" + fi + + if ! command -v "eb" &>/dev/null; then + echo_yellow "Attempting to load an EasyBuild module to do actual install" + module load EasyBuild + # There are some scenarios where this may fail + if [ $? -ne 0 ]; then + error="'eb' command not found in your environment and\n" + error="${error} module load EasyBuild\n" + error="${error}failed for some reason.\n" + error="${error}Please re-run this script with the 'eb' command available." 
+ fatal_error "${error}" + fi + fi + + cudnn_easyconfig="cuDNN-${cudnn_version}-CUDA-${cuda_version}.eb" + + # Check the easyconfig file is available in the release + # (eb search always returns 0, so we need a grep to ensure a usable exit code) + eb --search ^${cudnn_easyconfig}|grep cuDNN > /dev/null 2>&1 + # Check the exit code + if [ $? -ne 0 ]; then + eb_version=$(eb --version) + available_cudnn_easyconfigs=$(eb --search ^cuDNN-*.eb|grep cuDNN) + + error="The easyconfig ${cudnn_easyconfig} was not found in EasyBuild version:\n" + error="${error} ${eb_version}\n" + error="${error}You either need to give a different version of CUDA to install _or_ \n" + error="${error}use a different version of EasyBuild for the installation.\n" + error="${error}\nThe versions of available with the current eb command are:\n" + error="${error}${available_cudnn_easyconfigs}" + fatal_error "${error}" + fi + + # We need the --rebuild option, as the cuDNN module may or may not be on the + # `MODULEPATH` yet. Even if it is, we still want to redo this installation + # since it will provide the symlinked targets for the parts of the cuDNN + # installation in the `.../versions/...` prefix + # We install the module in our `tmpdir` since we do not need the modulefile, + # we only care about providing the targets for the symlinks. + extra_args="--rebuild --installpath-modules=${tmpdir}" + + # We don't want hooks used in this install, we need a vanilla cuDNN installation + touch "$tmpdir"/none.py + # shellcheck disable=SC2086 # Intended splitting of extra_args + eb --prefix="$tmpdir" ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cudnn_install_parent}"/ "${cudnn_easyconfig}" + ret=$? + if [ $ret -ne 0 ]; then + eb_last_log=$(unset EB_VERBOSE; eb --last-log) + cp -a ${eb_last_log} . + fatal_error "cuDNN installation failed, please check EasyBuild logs $(basename ${eb_last_log})..." 
+ else + echo_green "cuDNN installation at ${cudnn_install_parent}/software/cuDNN/${cudnn_version}-CUDA-${cuda_version} succeeded!" + fi + # clean up tmpdir + rm -rf "${tmpdir}" +fi From 0d744e7a45052258e88d6ee27cbddba8b7fdadce Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 17 May 2024 11:14:25 +0200 Subject: [PATCH 2/9] add x permissions to install script --- scripts/gpu_support/nvidia/install_cudnn_host_injections.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/gpu_support/nvidia/install_cudnn_host_injections.sh diff --git a/scripts/gpu_support/nvidia/install_cudnn_host_injections.sh b/scripts/gpu_support/nvidia/install_cudnn_host_injections.sh old mode 100644 new mode 100755 From 0d8a896a20a2e7875b4e6c0551089161142a5d32 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 17 May 2024 12:42:42 +0200 Subject: [PATCH 3/9] fix arguments to cuDNN install script --- EESSI-install-software.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 5d848fc7e4..111fb67e53 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -234,7 +234,7 @@ fi if [ -z "${skip_cuda_install}" ] || [ ! 
"${skip_cuda_install}" ]; then ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula - ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cudnn_host_injections.sh -c 12.1.1 --accept-cuda-eula + ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cudnn_host_injections.sh -c 12.1.1 -d 8.9.2.26 else echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found" fi From bd3469e998b5f9209695bc801bb219dacaa74755 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 17 May 2024 14:53:01 +0200 Subject: [PATCH 4/9] handle multiple dependencies to CUDA and related packages --- eb_hooks.py | 78 ++++++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index 27bb68bef1..17da9ca256 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -660,7 +660,7 @@ def post_sanitycheck_cudnn(self, *args, **kwargs): allowlist = sorted(set(allowlist)) self.log.info("Allowlist for files in cuDNN installation that can be redistributed: " + ', '.join(allowlist)) - # iterate over all files in the CUDA installation directory + # iterate over all files in the cuDNN installation directory for dir_path, _, files in os.walk(self.installdir): for filename in files: full_path = os.path.join(dir_path, filename) @@ -682,7 +682,7 @@ def post_sanitycheck_cudnn(self, *args, **kwargs): # make sure source and target of symlink are not the same if full_path == host_inj_path: raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you " - "are using this hook for a NESSI installation?", + "are using this hook for a EESSI installation?", full_path, host_inj_path) remove_file(full_path) symlink(host_inj_path, full_path) @@ -692,48 +692,48 @@ def post_sanitycheck_cudnn(self, *args, **kwargs): def inject_gpu_property(ec): """ - Add 'gpu' property, via modluafooter 
easyconfig parameter + Add 'gpu' property and EESSI<PACKAGE>VERSION envvars (via the modluafooter easyconfig + parameter), and turn the CUDA/cuDNN dependencies into build dependencies """ ec_dict = ec.asdict() - # Check if CUDA is in the dependencies, if so add the 'gpu' Lmod property - if ('CUDA' in [dep[0] for dep in iter(ec_dict['dependencies'])]): - ec.log.info("Injecting gpu as Lmod arch property and envvar with CUDA version") + # check if CUDA, cuDNN, you-name-it is in the dependencies, if so + # - drop dependency to build dependency + # - add 'gpu' Lmod property + # - add envvar with package version + packages_list = ( "CUDA", "cuDNN" ) + packages_version = { } + add_gpu_property = '' + + for package in packages_list: + # Check if package is in the dependencies, if so drop dependency to build + # dependency and set variable for later adding the 'gpu' Lmod property + if (package in [dep[0] for dep in iter(ec_dict['dependencies'])]): + add_gpu_property = 'add_property("arch","gpu")' + for dep in iter(ec_dict['dependencies']): + if package in dep[0]: + # make package a build dependency only (rpathing saves us from link errors) + ec.log.info("Dropping dependency on %s to build dependency" % package) + ec_dict['dependencies'].remove(dep) + if dep not in ec_dict['builddependencies']: + ec_dict['builddependencies'].append(dep) + # take note of version for creating the modluafooter + packages_version[package] = dep[1] + if add_gpu_property: + ec.log.info("Injecting gpu as Lmod arch property and envvars for dependencies with their version") key = 'modluafooter' - value = 'add_property("arch","gpu")' - cuda_version = 0 - for dep in iter(ec_dict['dependencies']): - # Make CUDA a build dependency only (rpathing saves us from link errors) - if 'CUDA' in dep[0]: - cuda_version = dep[1] - ec_dict['dependencies'].remove(dep) - if dep not in ec_dict['builddependencies']: - ec_dict['builddependencies'].append(dep) - value = '\n'.join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version]) - if key in 
ec_dict: - if not value in ec_dict[key]: - ec[key] = '\n'.join([ec_dict[key], value]) + values = [add_gpu_property] + for package, version in packages_version.items(): + envvar = "EESSI%sVERSION" %s package.upper() + values.append('setenv("%s","%s")' % (envvar, version)) + if not key in ec_dict: + ec[key] = '\n'.join(values) else: - ec[key] = value + new_value = ec_dict[key] + for value in values: + if not value in new_value: + new_value = '\n'.join([new_value, value]) + ec[key] = new_value - # Check if cuDNN is in the dependencies, if so add the 'gpu' Lmod property - if ('cuDNN' in [dep[0] for dep in iter(ec_dict['dependencies'])]): - ec.log.info("Injecting gpu as Lmod arch property and envvar with cuDNN version") - key = 'modluafooter' - value = 'add_property("arch","gpu")' - cudnn_version = 0 - for dep in iter(ec_dict['dependencies']): - # Make cuDNN a build dependency only (rpathing saves us from link errors) - if 'cuDNN' in dep[0]: - cudnn_version = dep[1] - ec_dict['dependencies'].remove(dep) - if dep not in ec_dict['builddependencies']: - ec_dict['builddependencies'].append(dep) - value = '\n'.join([value, 'setenv("EESSICUDNNVERSION","%s")' % cudnn_version]) - if key in ec_dict: - if not value in ec_dict[key]: - ec[key] = '\n'.join([ec_dict[key], value]) - else: - ec[key] = value return ec From db85e23a24225396335fbd675c0e676167a23b22 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 17 May 2024 15:08:28 +0200 Subject: [PATCH 5/9] fix syntax --- eb_hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eb_hooks.py b/eb_hooks.py index 17da9ca256..46e79810d9 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -723,7 +723,7 @@ def inject_gpu_property(ec): key = 'modluafooter' values = [add_gpu_property] for package, version in packages_version.items(): - envvar = "EESSI%sVERSION" %s package.upper() + envvar = "EESSI%sVERSION" % package.upper() values.append('setenv("%s","%s")' % (envvar, version)) if not key in ec_dict: ec[key] = 
'\n'.join(values) From 3aad5d903786cdd0a4d763ddb08ef29c5078eee4 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Sun, 19 May 2024 01:11:46 +0200 Subject: [PATCH 6/9] generalized CUDA/libraries installation script and easystack file --- .../eessi-2023.06-cuda-and-libraries.yml | 3 + .../nvidia/install_cuda_and_libraries.sh | 199 ++++++++++++++++++ 2 files changed, 202 insertions(+) create mode 100644 scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml create mode 100755 scripts/gpu_support/nvidia/install_cuda_and_libraries.sh diff --git a/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml b/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml new file mode 100644 index 0000000000..e0e47bf2d8 --- /dev/null +++ b/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml @@ -0,0 +1,3 @@ +easyconfigs: + - CUDA-12.1.1.eb + - cuDNN-8.9.2.26-CUDA-12.1.1.eb diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh new file mode 100755 index 0000000000..fbaba6e587 --- /dev/null +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -0,0 +1,199 @@ +#!/usr/bin/env bash + +# This script can be used to install CUDA and other libraries by NVIDIA under +# the `.../host_injections` directory. +# +# This provides the parts of the CUDA installation and other libraries that +# cannot be redistributed as part of EESSI due to license limitations. While +# GPU-based software from EESSI will _run_ without these, installation of +# additional software that builds upon CUDA or other libraries requires that +# these installations are present under `host_injections`. +# +# The `host_injections` directory is a variant symlink that by default points to +# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see +# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). 
For the +# installation to be successful, this directory needs to be writeable by the user +# executing this script. + +# Initialise our bash functions +TOPDIR=$(dirname $(realpath $BASH_SOURCE)) +source "$TOPDIR"/../../utils.sh + +# Function to display help message +show_help() { + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " --help Display this help message" + echo " --accept-cuda-eula You _must_ accept the CUDA EULA to install" + echo " CUDA, see the EULA at" + echo " https://docs.nvidia.com/cuda/eula/index.html" + echo " -e, --easystack EASYSTACKFILE Path to easystack file that defines which" + echo " packages shall be installed" + echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" + echo " storage during the installation of CUDA" + echo " and/or other libraries (must have" + echo " several GB available; depends on the number of installations)" +} + +# Initialize variables +eula_accepted=0 +EASYSTACKFILE= +TEMP_DIR= + +# Parse command-line options +while [[ $# -gt 0 ]]; do + case "$1" in + --help) + show_help + exit 0 + ;; + --accept-cuda-eula) + eula_accepted=1 + shift 1 + ;; + -e|--easystack) + if [ -n "$2" ]; then + EASYSTACKFILE="$2" + shift 2 + else + echo "Error: Argument required for $1" + show_help + exit 1 + fi + ;; + -t|--temp-dir) + if [ -n "$2" ]; then + TEMP_DIR="$2" + shift 2 + else + echo "Error: Argument required for $1" + show_help + exit 1 + fi + ;; + *) + show_help + fatal_error "Error: Unknown option: $1" + ;; + esac +done + +if [[ -z "${EASYSTACKFILE}" ]]; then + fatal_error "Need the name/path to an easystack file. 
See command line options\n" +fi + +# Make sure EESSI is initialised +check_eessi_initialised + +# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` +# (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) +export EESSI_SITE_INSTALL=${EESSI_SOFTWARE_PATH/versions/host_injections} + +# we need a directory we can use for temporary storage +if [[ -z "${TEMP_DIR}" ]]; then + tmpdir=$(mktemp -d) +else + tmpdir="${TEMP_DIR}"/temp + if ! mkdir "$tmpdir" ; then + fatal_error "Could not create directory ${tmpdir}" + fi +fi + +# load EESSI-extend/2023.06-easybuild module && verify that it is loaded +EESSI_EXTEND_MODULE="EESSI-extend/2023.06-easybuild" +module load ${EESSI_EXTEND_MODULE} +ret=$? +if [ "${ret}" -ne 0 ]; then + fatal_error "An error occured while trying to load ${EESSI_EXTEND_MODULE}\n" +fi + +# do a 'eb --dry-run-short' with the EASYSTACKFILE and determine list of packages +# to be installed +echo ">> Determining if packages specified in ${EASYSTACKFILE} are missing under ${EESSI_SITE_INSTALL}" +eb_dry_run_short_out=${tmpdir}/eb_dry_run_short.out +eb --dry-run-short --rebuild --easystack ${EASYSTACKFILE} 2>&1 | tee ${eb_dry_run_short_out} +ret=$? + +# Check if CUDA shall be installed +cuda_install_needed=0 +cat ${eb_dry_run_short_out} | grep "^ \* \[R\]" | grep "\/c\/CUDA\/" +ret=$? +if [ "${ret}" -eq 0 ]; then + cuda_install_needed=1 +fi + +# Make sure the CUDA EULA is accepted if it shall be installed +if [ "${cuda_install_needed}" -eq 1 ] && [ "${eula_accepted}" -ne 1 ]; then + show_help + error="\nCUDA shall be installed. 
However, the CUDA EULA has not been accepted\nYou _must_ accept the CUDA EULA via the appropriate command line option.\n" + fatal_error "${error}" +fi + +# determine the number of packages to be installed (assume 5 GB + num_packages * +# 3GB space needed) +number_of_packages=$(cat ${eb_dry_run_short_out} | grep "^ \* \[R\]" | wc -l) +base_storage_space=$((5000000 + ${number_of_packages} * 3000000)) + +required_space_in_tmpdir=${base_storage_space} +# Let's see if we have sources and build locations defined if not, we use the temporary space +if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then + export EASYBUILD_BUILDPATH=${tmpdir}/build + required_space_in_tmpdir=$((required_space_in_tmpdir + ${base_storage_space})) +fi +if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then + export EASYBUILD_SOURCEPATH=${tmpdir}/sources + required_space_in_tmpdir=$((required_space_in_tmpdir + ${base_storage_space})) +fi + +# The install is pretty fat, you need lots of space for download/unpack/install +# (~3*${base_storage_space}*1000 Bytes), +# need to do a space check before we proceed +avail_space=$(df --output=avail "${EESSI_SITE_INSTALL}"/ | tail -n 1 | awk '{print $1}') +min_disk_storage=$((3 * ${base_storage_space})) +if (( avail_space < ${min_disk_storage} )); then + fatal_error "Need at least $(echo "${min_disk_storage} / 1000000" | bc) GB disk space to install CUDA and other libraries under ${EESSI_SITE_INSTALL}, exiting now..." +fi +avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') +if (( avail_space < required_space_in_tmpdir )); then + error="Need at least $(echo "${required_space_in_tmpdir} / 1000000" | bc) temporary disk space under ${tmpdir}.\n" + error="${error}Set the environment variable TEMP_DIR to a location with adequate space to pass this check." + error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH" + error="${error}to reduce this requirement. Exiting now..." 
+ fatal_error "${error}" +fi + +# Brief explanation of parameters: +# - prefix: using $tmpdir as default base directory for several EB settings +# - rebuild: we need the --rebuild option, as the CUDA module may or may not be on the +# `MODULEPATH` yet. Even if it is, we still want to redo this installation +# since it will provide the symlinked targets for the parts of the CUDA +# and/or other installation in the `.../versions/...` prefix +# - install-path-modules: We install the module in our `tmpdir` since we do not need the modulefile, +# we only care about providing the targets for the symlinks. +# - ${cuda_arg}: We only set the --accept-eula-for=CUDA option if CUDA will be installed and if +# this script was called with the argument --accept-cuda-eula. +# - hooks: We don't want hooks used in this install, we need vanilla +# installations of CUDA and/or other libraries +# - easystack: Path to easystack file that defines which packages shall be +# installed +cuda_arg= +if [[ ]]; then + cuda_arg="--accept-eula-for=CUDA" +fi +touch "$tmpdir"/none.py +eb --prefix="$tmpdir" \ + --rebuild \ + --install-path-modules=${tmpdir} \ + {cuda_arg} \ + --hooks="$tmpdir"/none.py \ + --easystack ${EASYSTACKFILE} +ret=$? +if [ $ret -ne 0 ]; then + eb_last_log=$(unset EB_VERBOSE; eb --last-log) + cp -a ${eb_last_log} . + fatal_error "some installation failed, please check EasyBuild logs $(basename ${eb_last_log})..." +else + echo_green "all installations at ${EESSI_SITE_INSTALL}/software/... succeeded!" 
+fi +# clean up tmpdir +rm -rf "${tmpdir}" From e1919051ed09cdec41ee81c20cca9487e15ffd14 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 21 May 2024 09:31:19 +0200 Subject: [PATCH 7/9] small improvements after testing script --- .../nvidia/install_cuda_and_libraries.sh | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index fbaba6e587..140fd8d4e1 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -99,6 +99,9 @@ else fi fi +# workaround for EasyBuild not being found when loading "extend" module +module load EasyBuild/4.9.1 + # load EESSI-extend/2023.06-easybuild module && verify that it is loaded EESSI_EXTEND_MODULE="EESSI-extend/2023.06-easybuild" module load ${EESSI_EXTEND_MODULE} @@ -116,7 +119,7 @@ ret=$? # Check if CUDA shall be installed cuda_install_needed=0 -cat ${eb_dry_run_short_out} | grep "^ \* \[R\]" | grep "\/c\/CUDA\/" +cat ${eb_dry_run_short_out} | grep "^ \* \[[xR]\]" | grep "module: CUDA/" ret=$? if [ "${ret}" -eq 0 ]; then cuda_install_needed=1 @@ -131,7 +134,8 @@ fi # determine the number of packages to be installed (assume 5 GB + num_packages * # 3GB space needed) -number_of_packages=$(cat ${eb_dry_run_short_out} | grep "^ \* \[R\]" | wc -l) +number_of_packages=$(cat ${eb_dry_run_short_out} | grep "^ \* \[[xR]\]" | sed -e 's/^.*module: //' | uniq | wc -l) +echo "number of packages to be (re-)installed: '${number_of_packages}'" base_storage_space=$((5000000 + ${number_of_packages} * 3000000)) required_space_in_tmpdir=${base_storage_space} @@ -168,7 +172,7 @@ fi # `MODULEPATH` yet. 
Even if it is, we still want to redo this installation # since it will provide the symlinked targets for the parts of the CUDA # and/or other installation in the `.../versions/...` prefix -# - install-path-modules: We install the module in our `tmpdir` since we do not need the modulefile, +# - installpath-modules: We install the module in our `tmpdir` since we do not need the modulefile, # we only care about providing the targets for the symlinks. # - ${cuda_arg}: We only set the --accept-eula-for=CUDA option if CUDA will be installed and if # this script was called with the argument --accept-cuda-eula. @@ -177,14 +181,14 @@ fi # - easystack: Path to easystack file that defines which packages shall be # installed cuda_arg= -if [[ ]]; then +if [[ ${eula_accepted} -eq 1 ]]; then cuda_arg="--accept-eula-for=CUDA" fi touch "$tmpdir"/none.py eb --prefix="$tmpdir" \ --rebuild \ - --install-path-modules=${tmpdir} \ - {cuda_arg} \ + --installpath-modules=${tmpdir} \ + "${cuda_arg}" \ --hooks="$tmpdir"/none.py \ --easystack ${EASYSTACKFILE} ret=$? 
From 74a9a5596596e159a73606016a25ae255201fe67 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 23 May 2024 11:09:49 +0200 Subject: [PATCH 8/9] various updates to take suggestions into account - `EESSI-install-software.sh` - use `scripts/gpu_support/nvidia/install_cuda_and_libraries.sh` with `scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml` - `create_lmodsitepackage.py` - consolidate `eessi_{cuda,cudnn}_enabled_load_hook` functions in a single one (`eessi_cuda_and_libraries_enabled_load_hook`) - the remaining hook is prepared to easily add new modules, e.g., cuTENSOR - `eb_hooks.py` - put code that iterates over all files replacing non-distributable ones with symlinks into `host_injections` with a common function (`replace_non_distributable_files_with_symlinks`) - `install_scripts.sh` - add files to copy to CVMFS (see `nvidia_files`) - `scripts/gpu_support/nvidia/install_cuda_and_libraries.sh` - improved creation of tmp directory --- EESSI-install-software.sh | 6 +- create_lmodsitepackage.py | 59 ++--- eb_hooks.py | 98 ++++---- install_scripts.sh | 6 +- .../nvidia/install_cuda_and_libraries.sh | 6 +- .../nvidia/install_cudnn_host_injections.sh | 210 ------------------ 6 files changed, 81 insertions(+), 304 deletions(-) delete mode 100755 scripts/gpu_support/nvidia/install_cudnn_host_injections.sh diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 111fb67e53..0299d5d61b 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -233,8 +233,10 @@ else fi if [ -z "${skip_cuda_install}" ] || [ ! 
"${skip_cuda_install}" ]; then - ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula - ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cudnn_host_injections.sh -c 12.1.1 -d 8.9.2.26 + ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh \ + -e ${EESSI_PREFIX}/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml \ + -t /tmp/temp \ + --accept-cuda-eula else echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found" fi diff --git a/create_lmodsitepackage.py b/create_lmodsitepackage.py index 5e44ba8187..33796e56cb 100755 --- a/create_lmodsitepackage.py +++ b/create_lmodsitepackage.py @@ -107,35 +107,41 @@ end -local function eessi_cuda_enabled_load_hook(t) +local function eessi_cuda_and_libraries_enabled_load_hook(t) local frameStk = require("FrameStk"):singleton() local mt = frameStk:mt() local simpleName = string.match(t.modFullName, "(.-)/") - -- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections. - -- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse - -- to load the CUDA module and print an informative message on how to set up GPU support for EESSI + local packagesList = { ["CUDA"] = true, ["cuDNN"] = true } + -- If we try to load any of the modules in packagesList, we check if the + -- full package was installed on the host in host_injections. + -- This is required for end users to build additional software that depends + -- on the package. 
If the full SDK isn't present, refuse + -- to load the module and print an informative message on how to set up GPU support for EESSI local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n" - if simpleName == 'CUDA' then + if packagesList[simpleName] then + -- simpleName is a module in packagesList -- get the full host_injections path local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') - -- build final path where the CUDA software should be installed - local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild" - local cudaDirExists = isDir(cudaEasyBuildDir) - if not cudaDirExists then + + -- build final path where the software should be installed + local packageEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild" + local packageDirExists = isDir(packageEasyBuildDir) + if not packageDirExists then local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI " - advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where EESSI " + advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where EESSI " advice = advice .. "can find it.\\n" advice = advice .. refer_to_docs LmodError("\\nYou requested to load ", simpleName, " ", advice) end end - -- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker, + -- when loading CUDA (and cu*) enabled modules check if the necessary driver libraries are accessible to the EESSI linker, -- otherwise, refuse to load the requested module and print error message local haveGpu = mt:haveProperty(simpleName,"arch","gpu") if haveGpu then local arch = os.getenv("EESSI_CPU_FAMILY") or "" - local cudaVersionFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. 
"/latest/cuda_version.txt" - local cudaDriverFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/libcuda.so" + local cvmfs_repo = os.getenv("EESSI_CVMFS_REPO") or "" + local cudaVersionFile = cvmfs_repo .. "/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt" + local cudaDriverFile = cvmfs_repo .. "/host_injections/nvidia/" .. arch .. "/latest/libcuda.so" local cudaDriverExists = isFile(cudaDriverFile) local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so") if not (cudaDriverExists or singularityCudaExists) then @@ -172,38 +178,13 @@ end end -local function eessi_cudnn_enabled_load_hook(t) - local frameStk = require("FrameStk"):singleton() - local mt = frameStk:mt() - local simpleName = string.match(t.modFullName, "(.-)/") - -- If we try to load cuDNN itself, check if the full cuDNN package was installed on the host in host_injections. - -- This is required for end users to build additional cuDNN dependent software. If the full SDK isn't present, refuse - -- to load the cuDNN module and print an informative message on how to set up GPU support for EESSI - local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n" - if simpleName == 'cuDNN' then - -- get the full host_injections path - local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections') - -- build final path where the cuDNN software should be installed - local cudnnEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild" - local cudnnDirExists = isDir(cudnnEasyBuildDir) - if not cudnnDirExists then - local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI " - advice = advice .. "due to licencing. You will need to install a full copy of the cuDNN package where EESSI " - advice = advice .. "can find it.\\n" - advice = advice .. 
refer_to_docs - LmodError("\\nYou requested to load ", simpleName, " ", advice) - end - end -end - -- Combine both functions into a single one, as we can only register one function as load hook in lmod -- Also: make it non-local, so it can be imported and extended by other lmodrc files if needed function eessi_load_hook(t) -- Only apply CUDA and cuDNN hooks if the loaded module is in the EESSI prefix -- This avoids getting an Lmod Error when trying to load a CUDA and cuDNN module from a local software stack if from_eessi_prefix(t) then - eessi_cuda_enabled_load_hook(t) - eessi_cudnn_enabled_load_hook(t) + eessi_cuda_and_libraries_enabled_load_hook(t) end end diff --git a/eb_hooks.py b/eb_hooks.py index 46e79810d9..d3fefe2558 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -567,6 +567,47 @@ def post_sanitycheck_hook(self, *args, **kwargs): POST_SANITYCHECK_HOOKS[self.name](self, *args, **kwargs) +def replace_non_distributable_files_with_symlinks(log, install_dir, package, allowlist): + """ + Replace files that cannot be distributed with symlinks into host_injections + """ + extension_based = { "CUDA": False, "cuDNN": True } + if not package in extension_based: + raise EasyBuildError("Don't know how to strip non-distributable files from package %s.", package) + + # iterate over all files in the package installation directory + for dir_path, _, files in os.walk(install_dir): + for filename in files: + full_path = os.path.join(dir_path, filename) + # we only really care about real files, i.e. not symlinks + if not os.path.islink(full_path): + # check if the current file name stub is part of the allowlist + basename = filename.split('.')[0] + if extension_based[package]: + if '.' in filename: + extension = '.' + filename.split('.')[1] + if basename in allowlist: + log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path) + elif extension_based[package] and '.' 
in filename and extension in allowlist: + log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path) + else: + if extension_based[package]: + print_name = filename + else: + print_name = basename + log.debug("%s is not found in allowlist, so replacing it with symlink: %s", + print_name, full_path) + # if it is not in the allowlist, delete the file and create a symlink to host_injections + host_inj_path = full_path.replace('versions', 'host_injections') + # make sure source and target of symlink are not the same + if full_path == host_inj_path: + raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you " + "are using this hook for an EESSI installation?", + full_path, host_inj_path) + remove_file(full_path) + symlink(host_inj_path, full_path) + + def post_sanitycheck_cuda(self, *args, **kwargs): """ Remove files from CUDA installation that we are not allowed to ship, @@ -606,28 +647,9 @@ def post_sanitycheck_cuda(self, *args, **kwargs): if 'libcudart' not in allowlist: raise EasyBuildError("Did not find 'libcudart' in allowlist: %s" % allowlist) - # iterate over all files in the CUDA installation directory - for dir_path, _, files in os.walk(self.installdir): - for filename in files: - full_path = os.path.join(dir_path, filename) - # we only really care about real files, i.e. 
not symlinks - if not os.path.islink(full_path): - # check if the current file name stub is part of the allowlist - basename = filename.split('.')[0] - if basename in allowlist: - self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path) - else: - self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s", - basename, full_path) - # if it is not in the allowlist, delete the file and create a symlink to host_injections - host_inj_path = full_path.replace('versions', 'host_injections') - # make sure source and target of symlink are not the same - if full_path == host_inj_path: - raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you " - "are using this hook for an EESSI installation?", - full_path, host_inj_path) - remove_file(full_path) - symlink(host_inj_path, full_path) + # replace files that are not distributable with symlinks into + # host_injections + replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist) else: raise EasyBuildError("CUDA-specific hook triggered for non-CUDA easyconfig?!") @@ -643,8 +665,7 @@ def post_sanitycheck_cudnn(self, *args, **kwargs): allowlist = ['LICENSE'] - # read cuDNN LICENSE, construct allowlist based on section 2. Distribution - # that specifies list of files that can be shipped + # read cuDNN LICENSE, construct allowlist based on section "2. Distribution" that specifies list of files that can be shipped license_path = os.path.join(self.installdir, 'LICENSE') search_string = "2. Distribution. 
The following portions of the SDK are distributable under the Agreement:" with open(license_path) as infile: @@ -660,32 +681,9 @@ def post_sanitycheck_cudnn(self, *args, **kwargs): allowlist = sorted(set(allowlist)) self.log.info("Allowlist for files in cuDNN installation that can be redistributed: " + ', '.join(allowlist)) - # iterate over all files in the cuDNN installation directory - for dir_path, _, files in os.walk(self.installdir): - for filename in files: - full_path = os.path.join(dir_path, filename) - # we only really care about real files, i.e. not symlinks - if not os.path.islink(full_path): - # check if the current file is part of the allowlist - basename = filename.split('.')[0] - if '.' in filename: - extension = '.' + filename.split('.')[1] - if basename in allowlist: - self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path) - elif '.' in filename and extension in allowlist: - self.log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path) - else: - self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s", - filename, full_path) - # if it is not in the allowlist, delete the file and create a symlink to host_injections - host_inj_path = full_path.replace('versions', 'host_injections') - # make sure source and target of symlink are not the same - if full_path == host_inj_path: - raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you " - "are using this hook for a EESSI installation?", - full_path, host_inj_path) - remove_file(full_path) - symlink(host_inj_path, full_path) + # replace files that are not distributable with symlinks into + # host_injections + replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist) else: raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!") diff --git a/install_scripts.sh b/install_scripts.sh index 05891d2551..8c435cef80 100755 --- a/install_scripts.sh 
+++ b/install_scripts.sh @@ -110,7 +110,11 @@ copy_files_by_list ${TOPDIR}/scripts ${INSTALL_PREFIX}/scripts "${script_files[@ # Copy files for the scripts/gpu_support/nvidia directory nvidia_files=( - install_cuda_host_injections.sh install_cudnn_host_injections.sh link_nvidia_host_libraries.sh + eessi-2023.06-cuda-and-libraries.yml + install_cuda_and_libraries.sh + install_cuda_host_injections.sh + install_cudnn_host_injections.sh + link_nvidia_host_libraries.sh ) copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}" diff --git a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh index 140fd8d4e1..f04aa1aff6 100755 --- a/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh +++ b/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh @@ -93,11 +93,13 @@ export EESSI_SITE_INSTALL=${EESSI_SOFTWARE_PATH/versions/host_injections} if [[ -z "${TEMP_DIR}" ]]; then tmpdir=$(mktemp -d) else - tmpdir="${TEMP_DIR}"/temp - if ! mkdir "$tmpdir" ; then + mkdir -p ${TEMP_DIR} + tmpdir=$(mktemp -d --tmpdir=${TEMP_DIR} cuda_n_co.XXX) + if [[ ! -d "$tmpdir" ]] ; then fatal_error "Could not create directory ${tmpdir}" fi fi +echo "Created temporary directory '${tmpdir}'" # workaround for EasyBuild not being found when loading "extend" module module load EasyBuild/4.9.1 diff --git a/scripts/gpu_support/nvidia/install_cudnn_host_injections.sh b/scripts/gpu_support/nvidia/install_cudnn_host_injections.sh deleted file mode 100755 index d23cd5ebb7..0000000000 --- a/scripts/gpu_support/nvidia/install_cudnn_host_injections.sh +++ /dev/null @@ -1,210 +0,0 @@ -#!/usr/bin/env bash - -# This script can be used to install cuDNN under the `.../host_injections` directory. -# This provides the parts of the cuDNN installation that cannot be redistributed as -# part of EESSI due to license limitations. 
While GPU-based software from EESSI will -# _run_ without these, installation of additional software that requires the cuDNN -# installation(s) under `host_injections` to be present. -# -# The `host_injections` directory is a variant symlink that by default points to -# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see -# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). For the -# installation to be successful, this directory needs to be writeable by the user -# executing this script. - -# Initialise our bash functions -TOPDIR=$(dirname $(realpath $BASH_SOURCE)) -source "$TOPDIR"/../../utils.sh - -# Function to display help message -show_help() { - echo "Usage: $0 [OPTIONS]" - echo "Options:" - echo " --help Display this help message" - echo " -c, --cuda-version CUDA_VERSION Specify a version of CUDA to be used" - echo " when installing cuDNN (must" - echo " have a corresponding easyconfig in the" - echo " EasyBuild release)" - echo " -d, --cudnn-version CUDNN_VERSION Specify a version of cuDNN to install (must" - echo " have a corresponding easyconfig in the" - echo " EasyBuild release)" - echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" - echo " storage during the cuDNN install" - echo " (must have >10GB available)" -} - -# Initialize variables -cuda_version="" -cudnn_version="" - -# Parse command-line options -while [[ $# -gt 0 ]]; do - case "$1" in - --help) - show_help - exit 0 - ;; - -c|--cuda-version) - if [ -n "$2" ]; then - cuda_version="$2" - shift 2 - else - echo "Error: Argument required for $1" - show_help - exit 1 - fi - ;; - -d|--cudnn-version) - if [ -n "$2" ]; then - cudnn_version="$2" - shift 2 - else - echo "Error: Argument required for $1" - show_help - exit 1 - fi - ;; - -t|--temp-dir) - if [ -n "$2" ]; then - CUDA_TEMP_DIR="$2" - shift 2 - else - echo "Error: Argument required for $1" - show_help - exit 1 - fi - ;; - *) - show_help - fatal_error "Error: Unknown 
option: $1" - ;; - esac -done - -# Make sure EESSI is initialised -check_eessi_initialised - -# Make sure the CUDA version supplied is a semantic version -is_semantic_version() { - local version=$1 - local regex='^[0-9]+\.[0-9]+\.[0-9]+$' - - if [[ $version =~ $regex ]]; then - return 0 # Return success (0) if it's a semantic version - else - return 1 # Return failure (1) if it's not a semantic version - fi -} -if ! is_semantic_version "$cuda_version"; then - show_help - error="\nYou must provide a semantic version for CUDA (e.g., 12.1.1) via the appropriate\n" - error="${error}command line option. This script is intended for use with EESSI so the 'correct'\n" - error="${error}version to provide is probably one of those available under\n" - error="${error}$EESSI_SOFTWARE_PATH/software/cuDNN\n" - fatal_error "${error}" -fi - -# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` -cudnn_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} - -# Only install cuDNN if specified version is not found. -# (existence of easybuild subdir implies a successful install) -if [ -d "${cudnn_install_parent}"/software/cuDNN/*-CUDA-"${cuda_version}"/easybuild ]; then - echo_green "cuDNN software found! No need to install cuDNN again." -else - # We need to be able write to the installation space so let's make sure we can - if ! create_directory_structure "${cudnn_install_parent}"/software/cuDNN ; then - fatal_error "No write permissions to directory ${cudnn_install_parent}/software/cuDNN" - fi - - # we need a directory we can use for temporary storage - if [[ -z "${CUDA_TEMP_DIR}" ]]; then - tmpdir=$(mktemp -d) - else - tmpdir="${CUDA_TEMP_DIR}"/temp - if ! 
mkdir "$tmpdir" ; then - fatal_error "Could not create directory ${tmpdir}" - fi - fi - - required_space_in_tmpdir=50000 - # Let's see if we have sources and build locations defined if not, we use the temporary space - if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then - export EASYBUILD_BUILDPATH=${tmpdir}/build - required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) - fi - if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then - export EASYBUILD_SOURCEPATH=${tmpdir}/sources - required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) - fi - - # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), - # need to do a space check before we proceed - avail_space=$(df --output=avail "${cudnn_install_parent}"/ | tail -n 1 | awk '{print $1}') - if (( avail_space < 5000000 )); then - fatal_error "Need at least 5GB disk space to install cuDNN under ${cudnn_install_parent}, exiting now..." - fi - avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') - if (( avail_space < required_space_in_tmpdir )); then - error="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" - error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check." - error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH " - error="${error}to reduce this requirement. Exiting now..." - fatal_error "${error}" - fi - - if ! command -v "eb" &>/dev/null; then - echo_yellow "Attempting to load an EasyBuild module to do actual install" - module load EasyBuild - # There are some scenarios where this may fail - if [ $? -ne 0 ]; then - error="'eb' command not found in your environment and\n" - error="${error} module load EasyBuild\n" - error="${error}failed for some reason.\n" - error="${error}Please re-run this script with the 'eb' command available." 
- fatal_error "${error}" - fi - fi - - cudnn_easyconfig="cuDNN-${cudnn_version}-CUDA-${cuda_version}.eb" - - # Check the easyconfig file is available in the release - # (eb search always returns 0, so we need a grep to ensure a usable exit code) - eb --search ^${cudnn_easyconfig}|grep cuDNN > /dev/null 2>&1 - # Check the exit code - if [ $? -ne 0 ]; then - eb_version=$(eb --version) - available_cudnn_easyconfigs=$(eb --search ^cuDNN-*.eb|grep cuDNN) - - error="The easyconfig ${cudnn_easyconfig} was not found in EasyBuild version:\n" - error="${error} ${eb_version}\n" - error="${error}You either need to give a different version of CUDA to install _or_ \n" - error="${error}use a different version of EasyBuild for the installation.\n" - error="${error}\nThe versions of available with the current eb command are:\n" - error="${error}${available_cudnn_easyconfigs}" - fatal_error "${error}" - fi - - # We need the --rebuild option, as the cuDNN module may or may not be on the - # `MODULEPATH` yet. Even if it is, we still want to redo this installation - # since it will provide the symlinked targets for the parts of the cuDNN - # installation in the `.../versions/...` prefix - # We install the module in our `tmpdir` since we do not need the modulefile, - # we only care about providing the targets for the symlinks. - extra_args="--rebuild --installpath-modules=${tmpdir}" - - # We don't want hooks used in this install, we need a vanilla cuDNN installation - touch "$tmpdir"/none.py - # shellcheck disable=SC2086 # Intended splitting of extra_args - eb --prefix="$tmpdir" ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cudnn_install_parent}"/ "${cudnn_easyconfig}" - ret=$? - if [ $ret -ne 0 ]; then - eb_last_log=$(unset EB_VERBOSE; eb --last-log) - cp -a ${eb_last_log} . - fatal_error "cuDNN installation failed, please check EasyBuild logs $(basename ${eb_last_log})..." 
- else - echo_green "cuDNN installation at ${cudnn_install_parent}/software/cuDNN/${cudnn_version}-CUDA-${cuda_version} succeeded!" - fi - # clean up tmpdir - rm -rf "${tmpdir}" -fi From 7cd0d00e6d28908f7d454e522c0852c3a7cb155f Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 23 May 2024 11:18:17 +0200 Subject: [PATCH 9/9] don't copy removed file --- install_scripts.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/install_scripts.sh b/install_scripts.sh index 8c435cef80..a8a6c127ea 100755 --- a/install_scripts.sh +++ b/install_scripts.sh @@ -113,7 +113,6 @@ nvidia_files=( eessi-2023.06-cuda-and-libraries.yml install_cuda_and_libraries.sh install_cuda_host_injections.sh - install_cudnn_host_injections.sh link_nvidia_host_libraries.sh ) copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}"