Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{2023.06}[system] cuDNN/8.9.2.26-CUDA-12.1.1 #581

9 changes: 6 additions & 3 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ pr_diff=$(ls [0-9]*.diff | head -1)
# for now, this just reinstalls all scripts. Not the most elegant, but it works
${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX}

# Install full CUDA SDK in host_injections
# Install full CUDA SDK and cu* libraries in host_injections
# Hardcode this for now, see if it works
# TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install
# Allow skipping CUDA SDK install in e.g. CI environments
Expand All @@ -233,9 +233,12 @@ else
fi

if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_and_libraries.sh \
-e ${EESSI_PREFIX}/scripts/gpu_support/nvidia/eessi-2023.06-cuda-and-libraries.yml \
-t /tmp/temp \
--accept-cuda-eula
else
echo "Skipping installation of CUDA SDK in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found"
echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found"
fi

# Install drivers in host_injections
Expand Down
38 changes: 22 additions & 16 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,35 +107,41 @@
end


local function eessi_cuda_enabled_load_hook(t)
local function eessi_cuda_and_libraries_enabled_load_hook(t)
local frameStk = require("FrameStk"):singleton()
local mt = frameStk:mt()
local simpleName = string.match(t.modFullName, "(.-)/")
-- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections.
-- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse
-- to load the CUDA module and print an informative message on how to set up GPU support for EESSI
local packagesList = { ["CUDA"] = true, ["cuDNN"] = true }
-- If we try to load any of the modules in packagesList, we check if the
-- full package was installed on the host in host_injections.
-- This is required for end users to build additional software that depends
-- on the package. If the full SDK isn't present, refuse
-- to load the module and print an informative message on how to set up GPU support for EESSI
local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
if simpleName == 'CUDA' then
if packagesList[simpleName] then
-- simpleName is a module in packagesList
-- get the full host_injections path
local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
-- build final path where the CUDA software should be installed
local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
local cudaDirExists = isDir(cudaEasyBuildDir)
if not cudaDirExists then

-- build final path where the software should be installed
local packageEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
local packageDirExists = isDir(packageEasyBuildDir)
if not packageDirExists then
local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI "
advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where EESSI "
advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where EESSI "
advice = advice .. "can find it.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYou requested to load ", simpleName, " ", advice)
end
end
-- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker,
-- when loading CUDA (and cu*) enabled modules check if the necessary driver libraries are accessible to the EESSI linker,
-- otherwise, refuse to load the requested module and print error message
local haveGpu = mt:haveProperty(simpleName,"arch","gpu")
if haveGpu then
local arch = os.getenv("EESSI_CPU_FAMILY") or ""
local cudaVersionFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
local cudaDriverFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
local cvmfs_repo = os.getenv("EESSI_CVMFS_REPO") or ""
local cudaVersionFile = cvmfs_repo .. "/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
local cudaDriverFile = cvmfs_repo .. "/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
local cudaDriverExists = isFile(cudaDriverFile)
local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
if not (cudaDriverExists or singularityCudaExists) then
Expand Down Expand Up @@ -175,10 +181,10 @@
-- Combine both functions into a single one, as we can only register one function as load hook in lmod
-- Also: make it non-local, so it can be imported and extended by other lmodrc files if needed
function eessi_load_hook(t)
-- Only apply CUDA hooks if the loaded module is in the EESSI prefix
-- This avoids getting an Lmod Error when trying to load a CUDA module from a local software stack
-- Only apply CUDA and cuDNN hooks if the loaded module is in the EESSI prefix
-- This avoids getting an Lmod Error when trying to load a CUDA or cuDNN module from a local software stack
if from_eessi_prefix(t) then
eessi_cuda_enabled_load_hook(t)
eessi_cuda_and_libraries_enabled_load_hook(t)
end
end

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ easyconfigs:
options:
from-pr: 20299
- EESSI-extend-2023.06-easybuild.eb
- cuDNN-8.9.2.26-CUDA-12.1.1.eb
157 changes: 117 additions & 40 deletions eb_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,47 @@ def post_sanitycheck_hook(self, *args, **kwargs):
POST_SANITYCHECK_HOOKS[self.name](self, *args, **kwargs)


def replace_non_distributable_files_with_symlinks(log, install_dir, package, allowlist):
    """
    Replace files that cannot be distributed with symlinks into host_injections.

    Walks ``install_dir`` and, for every regular (non-symlink) file that is not
    covered by ``allowlist``, removes the file and puts a symlink in its place.
    The symlink target is the file's own path with the first ``versions`` path
    component text replaced by ``host_injections``.

    :param log: logger-like object; only its ``debug`` method is used
    :param install_dir: root of the installation directory to process
    :param package: name of the package being processed ('CUDA' or 'cuDNN')
    :param allowlist: collection of file name stubs (text before the first dot)
                      and, for extension-based packages, extensions (e.g. '.so')
                      of files that must be kept as-is
    :raises EasyBuildError: if ``package`` is not a known package, or if the
                            computed symlink target equals the file's own path
    """
    # whether, for a given package, a file may also be kept based on its extension
    extension_based = {"CUDA": False, "cuDNN": True}
    if package not in extension_based:
        raise EasyBuildError("Don't know how to strip non-distributable files from package %s.", package)
    match_extension = extension_based[package]

    # iterate over all files in the package installation directory
    for dir_path, _, files in os.walk(install_dir):
        for filename in files:
            full_path = os.path.join(dir_path, filename)
            # we only really care about real files, i.e. not symlinks
            if os.path.islink(full_path):
                continue

            # file name stub is the part before the first dot; the "extension" is
            # the first dot-separated component after it (e.g. '.so' for libfoo.so.8)
            parts = filename.split('.')
            basename = parts[0]
            extension = None
            if match_extension and len(parts) > 1:
                extension = '.' + parts[1]

            if basename in allowlist:
                log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path)
                continue
            if extension is not None and extension in allowlist:
                log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path)
                continue

            print_name = filename if match_extension else basename
            log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
                      print_name, full_path)
            # not in the allowlist: delete the file and create a symlink to host_injections;
            # only the first 'versions' is rewritten, so that a file or subdirectory that
            # happens to have 'versions' in its own name keeps that name in the target
            host_inj_path = full_path.replace('versions', 'host_injections', 1)
            # make sure source and target of symlink are not the same
            if full_path == host_inj_path:
                raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "
                                     "are using this hook for an EESSI installation?",
                                     full_path, host_inj_path)
            remove_file(full_path)
            symlink(host_inj_path, full_path)


def post_sanitycheck_cuda(self, *args, **kwargs):
"""
Remove files from CUDA installation that we are not allowed to ship,
Expand Down Expand Up @@ -606,56 +647,91 @@ def post_sanitycheck_cuda(self, *args, **kwargs):
if 'libcudart' not in allowlist:
raise EasyBuildError("Did not find 'libcudart' in allowlist: %s" % allowlist)

# iterate over all files in the CUDA installation directory
for dir_path, _, files in os.walk(self.installdir):
for filename in files:
full_path = os.path.join(dir_path, filename)
# we only really care about real files, i.e. not symlinks
if not os.path.islink(full_path):
# check if the current file name stub is part of the allowlist
basename = filename.split('.')[0]
if basename in allowlist:
self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path)
else:
self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
basename, full_path)
# if it is not in the allowlist, delete the file and create a symlink to host_injections
host_inj_path = full_path.replace('versions', 'host_injections')
# make sure source and target of symlink are not the same
if full_path == host_inj_path:
raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "
"are using this hook for an EESSI installation?",
full_path, host_inj_path)
remove_file(full_path)
symlink(host_inj_path, full_path)
# replace files that are not distributable with symlinks into
# host_injections
replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
else:
raise EasyBuildError("CUDA-specific hook triggered for non-CUDA easyconfig?!")



def post_sanitycheck_cudnn(self, *args, **kwargs):
    """
    Strip the cuDNN installation of files we are not allowed to ship: every
    file that is not allowlisted is replaced with a symlink to a corresponding
    installation under host_injections.

    The allowlist always holds 'LICENSE'; on top of that it holds the
    dot-prefixed words (trailing dots removed) found on the line of the LICENSE
    file that starts with the "2. Distribution" sentence.

    :raises EasyBuildError: if this hook is triggered for an easyconfig other than cuDNN
    """
    if self.name != 'cuDNN':
        raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!")

    print_msg("Replacing files in cuDNN installation that we can not ship with symlinks to host_injections...")

    # sentence in the cuDNN LICENSE that introduces the list of files that can be shipped
    marker = "2. Distribution. The following portions of the SDK are distributable under the Agreement:"
    keep = {'LICENSE'}

    with open(os.path.join(self.installdir, 'LICENSE')) as license_file:
        for raw_line in license_file:
            if not raw_line.strip().startswith(marker):
                continue
            # whatever follows the marker is split into words; only words that
            # start with a dot are retained, minus any trailing dots
            remainder = raw_line[len(marker):]
            keep.update(token.rstrip('.') for token in remainder.split() if token.startswith('.'))

    allowlist = sorted(keep)
    self.log.info("Allowlist for files in cuDNN installation that can be redistributed: " + ', '.join(allowlist))

    # hand over to the shared helper that swaps non-distributable files
    # for symlinks into host_injections
    replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)


def inject_gpu_property(ec):
"""
Add 'gpu' property, via modluafooter easyconfig parameter
Add 'gpu' property and EESSI<PACKAGE>VERSION envvars via the modluafooter
easyconfig parameter, and demote the matching dependencies to build dependencies
"""
ec_dict = ec.asdict()
# Check if CUDA is in the dependencies, if so add the 'gpu' Lmod property
if ('CUDA' in [dep[0] for dep in iter(ec_dict['dependencies'])]):
ec.log.info("Injecting gpu as Lmod arch property and envvar with CUDA version")
# check if CUDA, cuDNN, you-name-it is in the dependencies, if so
# - drop dependency to build dependency
# - add 'gpu' Lmod property
# - add envvar with package version
packages_list = ( "CUDA", "cuDNN" )
packages_version = { }
add_gpu_property = ''

for package in packages_list:
# Check if package is in the dependencies, if so drop dependency to build
# dependency and set variable for later adding the 'gpu' Lmod property
if (package in [dep[0] for dep in iter(ec_dict['dependencies'])]):
add_gpu_property = 'add_property("arch","gpu")'
for dep in iter(ec_dict['dependencies']):
if package in dep[0]:
# make package a build dependency only (rpathing saves us from link errors)
ec.log.info("Dropping dependency on %s to build dependency" % package)
ec_dict['dependencies'].remove(dep)
if dep not in ec_dict['builddependencies']:
ec_dict['builddependencies'].append(dep)
# take note of version for creating the modluafooter
packages_version[package] = dep[1]
if add_gpu_property:
ec.log.info("Injecting gpu as Lmod arch property and envvars for dependencies with their version")
key = 'modluafooter'
value = 'add_property("arch","gpu")'
cuda_version = 0
for dep in iter(ec_dict['dependencies']):
# Make CUDA a build dependency only (rpathing saves us from link errors)
if 'CUDA' in dep[0]:
cuda_version = dep[1]
ec_dict['dependencies'].remove(dep)
if dep not in ec_dict['builddependencies']:
ec_dict['builddependencies'].append(dep)
value = '\n'.join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version])
if key in ec_dict:
if not value in ec_dict[key]:
ec[key] = '\n'.join([ec_dict[key], value])
values = [add_gpu_property]
for package, version in packages_version.items():
envvar = "EESSI%sVERSION" % package.upper()
values.append('setenv("%s","%s")' % (envvar, version))
if not key in ec_dict:
ec[key] = '\n'.join(values)
else:
ec[key] = value
new_value = ec_dict[key]
for value in values:
if not value in new_value:
new_value = '\n'.join([new_value, value])
ec[key] = new_value

return ec


Expand Down Expand Up @@ -709,4 +785,5 @@ def inject_gpu_property(ec):

POST_SANITYCHECK_HOOKS = {
'CUDA': post_sanitycheck_cuda,
'cuDNN': post_sanitycheck_cudnn,
}
5 changes: 4 additions & 1 deletion install_scripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,10 @@ copy_files_by_list ${TOPDIR}/scripts ${INSTALL_PREFIX}/scripts "${script_files[@

# Copy files for the scripts/gpu_support/nvidia directory
nvidia_files=(
install_cuda_host_injections.sh link_nvidia_host_libraries.sh
eessi-2023.06-cuda-and-libraries.yml
install_cuda_and_libraries.sh
install_cuda_host_injections.sh
link_nvidia_host_libraries.sh
)
copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}"

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
easyconfigs:
- CUDA-12.1.1.eb
- cuDNN-8.9.2.26-CUDA-12.1.1.eb
Loading
Loading