Skip to content

Commit

Permalink
Merge branch '2023.06-software.eessi.io-cuDNN-8.9.2.26-part-1' of git…
Browse files Browse the repository at this point in the history
…hub-trz:trz42/software-layer into 2023.06-software.eessi.io-cuDNN-8.9.2.26-part-1
  • Loading branch information
truib committed Oct 14, 2024
2 parents a27683e + d5572ea commit b68fdfa
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 145 deletions.
9 changes: 2 additions & 7 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -260,16 +260,11 @@ fi

temp_install_storage=${TMPDIR}/temp_install_storage
mkdir -p ${temp_install_storage}
# Note the eessi...CUDA.yml file(s) is(are) copied by 'install_scripts.sh' from
# the EESSI/software-layer easystacks/software.eessi.io/2023.06/accel/nvidia
# directory to /cvmfs to avoid keeping them in sync manually. If more than one
# such file is used (e.g., because different EasyBuild versions were used), the
# install script 'install_cuda_and_libraries.sh' has to be run multiple times.
if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh \
-e ${EESSI_PREFIX}/scripts/gpu_support/nvidia/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml \
-t ${temp_install_storage} \
--accept-cuda-eula
--accept-cuda-eula \
--accept-cudnn-eula
else
echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found"
fi
Expand Down
14 changes: 5 additions & 9 deletions install_scripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -128,16 +128,12 @@ nvidia_files=(
)
copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}"

# special treatment for the easystack file(s) that lists CUDA and cu* libraries
# To be picked up by a build job they have to be stored under
# easystacks/software.eessi.io/2023.06/accel/nvidia/ on GitHub.
# To avoid keeping that file and the one that we distribute via CernVM-FS so
# users/sites can install the full CUDA SDK and cu* libraries under
# 'host_injections' we copy the above file to the right location under /cvmfs.
nvidia_host_injections_files=(
eessi-2023.06-eb-4.9.4-2023a-CUDA.yml
# Easystacks to be used to install software in host injections
host_injections_easystacks=(
eessi-2023.06-eb-4.9.4-2023a-CUDA-host-injections.yml
)
copy_files_by_list ${TOPDIR}/easystacks/software.eessi.io/2023.06/accel/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_host_injections_files[@]}"
copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia/easystacks \
${INSTALL_PREFIX}/scripts/gpu_support/nvidia/easystacks "${host_injections_easystacks[@]}"

# Copy over EasyBuild hooks file used for installations
hook_files=(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# This EasyStack provides a list of all the EasyConfigs that should be installed in host_injections
# for nvidia GPU support, because they cannot (fully) be shipped as part of EESSI due to license constraints
easyconfigs:
- CUDA-12.1.1.eb
- cuDNN-8.9.2.26-CUDA-12.1.1.eb:
options:
# Needed for support for --accept-eula-for option
include-easyblocks-from-commit: 11afb88ec55e0ca431cbe823696aa43e2a9bfca8
270 changes: 141 additions & 129 deletions scripts/gpu_support/nvidia/install_cuda_and_libraries.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,17 @@ show_help() {
echo " --accept-cuda-eula You _must_ accept the CUDA EULA to install"
echo " CUDA, see the EULA at"
echo " https://docs.nvidia.com/cuda/eula/index.html"
echo " -e, --easystack EASYSTACK_FILE Path to easystack file that defines which"
echo " packages shall be installed"
echo " --accept-cudnn-eula You _must_ accept the cuDNN EULA to install"
echo " cuDNN, see the EULA at"
echo " https://docs.nvidia.com/deeplearning/cudnn/latest/reference/eula.html"
echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary"
echo " storage during the installation of CUDA"
echo " and/or other libraries (must have"
echo " several GB available; depends on the number of installations)"
}

# Initialize variables
# Every EULA flag starts at 0 (not accepted) so that the numeric tests further
# down never see an empty string when the matching --accept-*-eula option was
# not given on the command line.
eula_accepted=0
cuda_eula_accepted=0
cudnn_eula_accepted=0
EASYSTACK_FILE=
TEMP_DIR=

Expand All @@ -48,18 +49,12 @@ while [[ $# -gt 0 ]]; do
exit 0
;;
--accept-cuda-eula)
eula_accepted=1
cuda_eula_accepted=1
shift 1
;;
-e|--easystack)
if [ -n "$2" ]; then
EASYSTACK_FILE="$2"
shift 2
else
echo "Error: Argument required for $1"
show_help
exit 1
fi
--accept-cudnn-eula)
cudnn_eula_accepted=1
shift 1
;;
-t|--temp-dir)
if [ -n "$2" ]; then
Expand All @@ -78,16 +73,11 @@ while [[ $# -gt 0 ]]; do
esac
done

if [[ -z "${EASYSTACK_FILE}" ]]; then
fatal_error "Need the name/path to an easystack file. See command line options\n"
fi

# Make sure EESSI is initialised
check_eessi_initialised

# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections`
# (CUDA is a binary installation so no need to worry too much about the EasyBuild setup)
export EESSI_SITE_INSTALL=${EESSI_SOFTWARE_PATH/versions/host_injections}
# Make sure that `EESSI-extend` will install in the site installation path EESSI_SITE_SOFTWARE_PATH
export EESSI_SITE_INSTALL=1

# we need a directory we can use for temporary storage
if [[ -z "${TEMP_DIR}" ]]; then
Expand All @@ -101,119 +91,141 @@ else
fi
echo "Created temporary directory '${tmpdir}'"

echo "MODULEPATH=${MODULEPATH}"
echo "List available *CUDA* modules before loading EESSI-extend/${EESSI_VERSION}-easybuild"
module avail CUDA
# use install_path/modules/all as MODULEPATH
SAVE_MODULEPATH=${MODULEPATH}

# load EESSI-extend/2023.06-easybuild module && verify that it is loaded
EESSI_EXTEND_MODULE="EESSI-extend/${EESSI_VERSION}-easybuild"
module load ${EESSI_EXTEND_MODULE}
ret=$?
if [ "${ret}" -ne 0 ]; then
fatal_error "An error occured while trying to load ${EESSI_EXTEND_MODULE}\n"
fi
for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do
echo -e "Processing easystack file ${EASYSTACK_FILE}...\n\n"

# determine version of EasyBuild module to load based on EasyBuild version included in name of easystack file
eb_version=$(echo ${EASYSTACK_FILE} | sed 's/.*eb-\([0-9.]*\).*/\1/g')

# Load EasyBuild version for this easystack file _before_ loading EESSI-extend
module load EasyBuild/${eb_version}
module load EESSI-extend/${EESSI_VERSION}-easybuild

# Install modules in hidden .modules dir to keep track of what was installed before
MODULEPATH=${EASYBUILD_INSTALLPATH}/.modules/all
echo "set MODULEPATH=${MODULEPATH}"

# show EasyBuild configuration
echo "Show EasyBuild configuration"
eb --show-config

# do a 'eb --dry-run-short' with the EASYSTACK_FILE and determine list of packages
# to be installed
echo ">> Determining if packages specified in ${EASYSTACK_FILE} are missing under ${EESSI_SITE_SOFTWARE_PATH}"
eb_dry_run_short_out=${tmpdir}/eb_dry_run_short.out
eb --dry-run-short --easystack ${EASYSTACK_FILE} 2>&1 | tee ${eb_dry_run_short_out}
ret=$?

# Check if CUDA shall be installed
cuda_install_needed=0
cat ${eb_dry_run_short_out} | grep "^ \* \[[ ]\]" | grep "module: CUDA/" > /dev/null
ret=$?
if [ "${ret}" -eq 0 ]; then
cuda_install_needed=1
fi

echo "MODULEPATH=${MODULEPATH}"
echo "List available *CUDA* modules after loading EESSI-extend/${EESSI_VERSION}-easybuild"
module avail CUDA
# Make sure the CUDA EULA is accepted if it shall be installed
if [ "${cuda_install_needed}" -eq 1 ] && [ "${cuda_eula_accepted}" -ne 1 ]; then
show_help
error="\nCUDA shall be installed. However, the CUDA EULA has not been accepted\nYou _must_ accept the CUDA EULA via the appropriate command line option.\n"
fatal_error "${error}"
fi

# use install_path/modules/all as MODULEPATH
SAVE_MODULEPATH=${MODULEPATH}
MODULEPATH=${EASYBUILD_INSTALLPATH}/.modules/all
echo "set MODULEPATH=${MODULEPATH}"

# show EasyBuild configuration
echo "Show EasyBuild configuration"
eb --show-config

# do a 'eb --dry-run-short' with the EASYSTACK_FILE and determine list of packages
# to be installed
echo ">> Determining if packages specified in ${EASYSTACK_FILE} are missing under ${EESSI_SITE_INSTALL}"
eb_dry_run_short_out=${tmpdir}/eb_dry_run_short.out
eb --dry-run-short --rebuild --easystack ${EASYSTACK_FILE} 2>&1 | tee ${eb_dry_run_short_out}
ret=$?

# Check if CUDA shall be installed
cuda_install_needed=0
cat ${eb_dry_run_short_out} | grep "^ \* \[[xR]\]" | grep "module: CUDA/"
ret=$?
if [ "${ret}" -eq 0 ]; then
cuda_install_needed=1
fi
# Check if cuDNN shall be installed
cudnn_install_needed=0
cat ${eb_dry_run_short_out} | grep "^ \* \[[ ]\]" | grep "module: cuDNN/" > /dev/null
ret=$?
if [ "${ret}" -eq 0 ]; then
cudnn_install_needed=1
fi

# Make sure the CUDA EULA is accepted if it shall be installed
if [ "${cuda_install_needed}" -eq 1 ] && [ "${eula_accepted}" -ne 1 ]; then
show_help
error="\nCUDA shall be installed. However, the CUDA EULA has not been accepted\nYou _must_ accept the CUDA EULA via the appropriate command line option.\n"
fatal_error "${error}"
fi
# Make sure the cuDNN EULA is accepted if it shall be installed.
# Both flags fall back to 0 when unset or empty: an empty operand would make
# the numeric test fail with "integer expression expected", which reads as
# "false" and would silently skip this guard.
if [ "${cudnn_install_needed:-0}" -eq 1 ] && [ "${cudnn_eula_accepted:-0}" -ne 1 ]; then
  show_help
  error="\ncuDNN shall be installed. However, the cuDNN EULA has not been accepted\nYou _must_ accept the cuDNN EULA via the appropriate command line option.\n"
  fatal_error "${error}"
fi

# determine the number of packages to be installed (assume 5 GB + num_packages *
# 3GB space needed)
number_of_packages=$(cat ${eb_dry_run_short_out} | grep "^ \* \[[ ]\]" | sed -e 's/^.*module: //' | sort -u | wc -l)
echo "number of packages to be (re-)installed: '${number_of_packages}'"
base_storage_space=$((5000000 + ${number_of_packages} * 3000000))

required_space_in_tmpdir=${base_storage_space}
# Let's see if we have sources and build locations defined if not, we use the temporary space
if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then
export EASYBUILD_BUILDPATH=${tmpdir}/build
required_space_in_tmpdir=$((required_space_in_tmpdir + ${base_storage_space}))
fi
if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then
export EASYBUILD_SOURCEPATH=${tmpdir}/sources
required_space_in_tmpdir=$((required_space_in_tmpdir + ${base_storage_space}))
fi
# determine the number of packages to be installed (assume 5 GB + num_packages *
# 3GB space needed). Both CUDA and cuDNN are about this size
number_of_packages=$(cat ${eb_dry_run_short_out} | grep "^ \* \[[ ]\]" | sed -e 's/^.*module: //' | sort -u | wc -l)
echo "number of packages to be (re-)installed: '${number_of_packages}'"
base_storage_space=$((5000000 + ${number_of_packages} * 3000000))

required_space_in_tmpdir=${base_storage_space}
# Let's see if we have sources and build locations defined if not, we use the temporary space
if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then
export EASYBUILD_BUILDPATH=${tmpdir}/build
required_space_in_tmpdir=$((required_space_in_tmpdir + ${base_storage_space}))
fi
if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then
export EASYBUILD_SOURCEPATH=${tmpdir}/sources
required_space_in_tmpdir=$((required_space_in_tmpdir + ${base_storage_space}))
fi

# The install is pretty fat, you need lots of space for download/unpack/install
# (~3*${base_storage_space}*1000 Bytes),
# need to do a space check before we proceed
avail_space=$(df --output=avail "${EESSI_SITE_SOFTWARE_PATH}"/ | tail -n 1 | awk '{print $1}')
min_disk_storage=$((3 * ${base_storage_space}))
if (( avail_space < ${min_disk_storage} )); then
fatal_error "Need at least $(echo "${min_disk_storage} / 1000000" | bc) GB disk space to install CUDA and other libraries under ${EESSI_SITE_SOFTWARE_PATH}, exiting now..."
fi
# Check that the temporary directory has enough free space for the build.
# The available space reported by df is compared against
# required_space_in_tmpdir; the value shown to the user is divided by 1000000,
# the same conversion used for the GB figure of the installation-path check.
avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}')
if (( avail_space < required_space_in_tmpdir )); then
  error="Need at least $(( required_space_in_tmpdir / 1000000 )) GB temporary disk space under ${tmpdir}.\n"
  # Each fragment ends with a space so the concatenated sentences stay readable
  error="${error}Set the environment variable TEMP_DIR to a location with adequate space to pass this check. "
  error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH "
  error="${error}to reduce this requirement. Exiting now..."
  fatal_error "${error}"
fi

# The install is pretty fat, you need lots of space for download/unpack/install
# (~3*${base_storage_space}*1000 Bytes),
# need to do a space check before we proceed
avail_space=$(df --output=avail "${EESSI_SITE_INSTALL}"/ | tail -n 1 | awk '{print $1}')
min_disk_storage=$((3 * ${base_storage_space}))
if (( avail_space < ${min_disk_storage} )); then
fatal_error "Need at least $(echo "${min_disk_storage} / 1000000" | bc) GB disk space to install CUDA and other libraries under ${EESSI_SITE_INSTALL}, exiting now..."
fi
avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}')
if (( avail_space < required_space_in_tmpdir )); then
error="Need at least $(echo "${required_space_in_tmpdir} / 1000000" | bc) temporary disk space under ${tmpdir}.\n"
error="${error}Set the environment variable TEMP_DIR to a location with adequate space to pass this check."
error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH"
error="${error}to reduce this requirement. Exiting now..."
fatal_error "${error}"
fi
# Brief explanation of parameters:
# - prefix: using $tmpdir as default base directory for several EB settings
# - installpath-modules: We install the module in a hidden .modules, so that next time this script
# is run, it is not reinstalled.
# - ${accept_eula_opt}: We only set the --accept-eula-for=CUDA option if CUDA will be installed and if
# this script was called with the argument --accept-cuda-eula.
# - hooks: We don't want hooks used in this install, we need vanilla
# installations of CUDA and/or other libraries
# - easystack: Path to easystack file that defines which packages shall be
# installed
# Collect the names of the software whose EULA was accepted on the command
# line; they are joined with a comma to form the value of --accept-eula-for.
accept_eula_opt=
if [[ ${cuda_eula_accepted:-0} -eq 1 ]]; then
  accept_eula_opt="CUDA"
fi
if [[ ${cudnn_eula_accepted:-0} -eq 1 ]]; then
  # Prepend the existing value plus a comma only if something is already listed
  accept_eula_opt="${accept_eula_opt:+${accept_eula_opt},}cuDNN"
fi

# Empty hooks file, so that no hooks are applied to this installation
touch "${tmpdir}/none.py"

# Build the argument list as an array so that paths containing whitespace or
# glob characters reach eb as single, unmodified arguments.
eb_args=(
  "--prefix=${tmpdir}"
  "--installpath-modules=${EASYBUILD_INSTALLPATH}/.modules"
  "--hooks=${tmpdir}/none.py"
  --easystack "${EASYSTACK_FILE}"
)
if [[ -n "${accept_eula_opt}" ]]; then
  eb_args+=("--accept-eula-for=${accept_eula_opt}")
fi
echo "Running eb ${eb_args[*]}"
# Must stay the last command of this block: the following statement reads the
# exit status of eb via $?
eb "${eb_args[@]}"
ret=$?
if [ $ret -ne 0 ]; then
eb_last_log=$(unset EB_VERBOSE; eb --last-log)
cp -a ${eb_last_log} .
fatal_error "some installation failed, please check EasyBuild logs ${PWD}/$(basename ${eb_last_log})..."
else
echo_green "all installations at ${EESSI_SITE_SOFTWARE_PATH}/software/... succeeded!"
fi

# Brief explanation of parameters:
# - prefix: using $tmpdir as default base directory for several EB settings
# - rebuild: we need the --rebuild option, as the CUDA module may or may not be on the
# `MODULEPATH` yet. Even if it is, we still want to redo this installation
# since it will provide the symlinked targets for the parts of the CUDA
# and/or other installation in the `.../versions/...` prefix
# - installpath-modules: We install the module in our `tmpdir` since we do not need the modulefile,
# we only care about providing the targets for the symlinks.
# - ${accept_eula_opt}: We only set the --accept-eula-for=CUDA option if CUDA will be installed and if
# this script was called with the argument --accept-cuda-eula.
# - hooks: We don't want hooks used in this install, we need vanilla
# installations of CUDA and/or other libraries
# - easystack: Path to easystack file that defines which packages shall be
# installed
accept_eula_opt=
if [[ ${eula_accepted} -eq 1 ]]; then
accept_eula_opt="--accept-eula-for=CUDA"
fi
touch "$tmpdir"/none.py
eb --prefix="$tmpdir" \
--rebuild \
--installpath-modules=${EASYBUILD_INSTALLPATH}/.modules \
"${accept_eula_opt}" \
--hooks="$tmpdir"/none.py \
--easystack ${EASYSTACK_FILE}
ret=$?
if [ $ret -ne 0 ]; then
eb_last_log=$(unset EB_VERBOSE; eb --last-log)
cp -a ${eb_last_log} .
fatal_error "some installation failed, please check EasyBuild logs ${PWD}/$(basename ${eb_last_log})..."
else
echo_green "all installations at ${EESSI_SITE_INSTALL}/software/... succeeded!"
fi
# clean up tmpdir
rm -rf "${tmpdir}"
# clean up tmpdir
rm -rf "${tmpdir}"

# Restore MODULEPATH for next loop iteration (it was pointed at the hidden
# .modules directory earlier in the loop body)
MODULEPATH=${SAVE_MODULEPATH}
done

0 comments on commit b68fdfa

Please sign in to comment.