diff --git a/compass/job/__init__.py b/compass/job/__init__.py index 21ec57f58..151d951cb 100644 --- a/compass/job/__init__.py +++ b/compass/job/__init__.py @@ -93,6 +93,12 @@ def write_job_script(config, machine, target_cores, min_cores, work_dir, job_name = 'compass' else: job_name = f'compass_{suite}' + + if config.has_option('parallel', 'gpus_per_node'): + gpus_per_node = config.get('parallel', 'gpus_per_node') + else: + gpus_per_node = '' + wall_time = config.get('job', 'wall_time') template = Template(resources.read_text( @@ -101,7 +107,8 @@ def write_job_script(config, machine, target_cores, min_cores, work_dir, text = template.render(job_name=job_name, account=account, nodes=f'{nodes}', wall_time=wall_time, qos=qos, partition=partition, constraint=constraint, - reservation=reservation, suite=suite, + reservation=reservation, + gpus_per_node=gpus_per_node, suite=suite, pre_run_commands=pre_run_commands, post_run_commands=post_run_commands) text = _clean_up_whitespace(text) diff --git a/compass/job/job_script.template b/compass/job/job_script.template index 37a384a28..fec4d599d 100644 --- a/compass/job/job_script.template +++ b/compass/job/job_script.template @@ -19,6 +19,9 @@ {% if constraint != '' -%} #SBATCH --constraint={{ constraint }} {%- endif %} +{% if gpus_per_node != '' -%} +#SBATCH --gpus-per-node={{ gpus_per_node }} +{%- endif %} source load_compass_env.sh {{ pre_run_commands }} diff --git a/compass/machines/pm-gpu.cfg b/compass/machines/pm-gpu.cfg new file mode 100644 index 000000000..b82faf518 --- /dev/null +++ b/compass/machines/pm-gpu.cfg @@ -0,0 +1,44 @@ + +# The paths section describes paths that are used within the ocean core test +# cases. 
+[paths] + +# A shared root directory where MPAS standalone data can be found +database_root = /global/cfs/cdirs/e3sm/mpas_standalonedata + +# the path to the base conda environment where compass environments have +# been created +compass_envs = /global/common/software/e3sm/compass/pm-gpu/base + + +# Options related to deploying a compass conda environment on supported +# machines +[deploy] + +# the compiler set to use for system libraries and MPAS builds +compiler = gnugpu + +# the system MPI library to use for gnugpu compiler +mpi_gnugpu = mpich + +# the system MPI library to use for nvidiagpu compiler +mpi_nvidiagpu = mpich + +# the base path for spack environments used by compass +spack = /global/cfs/cdirs/e3sm/software/compass/pm-gpu/spack + +# whether to use the same modules for hdf5, netcdf-c, netcdf-fortran and +# pnetcdf as E3SM (spack modules are used otherwise) +use_e3sm_hdf5_netcdf = True + +# The parallel section describes options related to running jobs in parallel. +# Most options in this section come from mache so here we just add or override +# some defaults +[parallel] + +# cores per node on the machine +cores_per_node = 64 + +# threads per core (set to 1 because trying to hyperthread seems to be causing +# hanging on perlmutter) +threads_per_core = 1 diff --git a/compass/parallel.py b/compass/parallel.py index cf49c930a..08d0e7d80 100644 --- a/compass/parallel.py +++ b/compass/parallel.py @@ -66,6 +66,11 @@ def get_available_parallel_resources(config): cores_per_node=cores_per_node, mpi_allowed=mpi_allowed ) + + if config.has_option('parallel', 'gpus_per_node'): + available_resources['gpus_per_node'] = \ + config.getint('parallel', 'gpus_per_node') + return available_resources diff --git a/compass/version.py b/compass/version.py index 91e460c18..1b2e6af6b 100644 --- a/compass/version.py +++ b/compass/version.py @@ -1 +1 @@ -__version__ = '1.4.0-alpha.7' +__version__ = '1.5.0-alpha.1' diff --git a/conda/albany_supported.txt 
b/conda/albany_supported.txt index 12ce2817a..d969e7a20 100644 --- a/conda/albany_supported.txt +++ b/conda/albany_supported.txt @@ -3,4 +3,5 @@ chicoma-cpu, gnu, mpich chrysalis, gnu, openmpi pm-cpu, gnu, mpich +pm-gpu, gnugpu, mpich morpheus, gnu, openmpi diff --git a/conda/bootstrap.py b/conda/bootstrap.py index c113fb0e5..ff2f74209 100755 --- a/conda/bootstrap.py +++ b/conda/bootstrap.py @@ -476,6 +476,9 @@ def build_spack_env(config, update_spack, machine, compiler, mpi, # noqa: C901 scorpio = config.get('deploy', 'scorpio') parallelio = config.get('deploy', 'parallelio') + # for now, we'll assume CUDA is needed anytime GPUs are present + with_cuda = config.has_option('parallel', 'gpus_per_node') + if config.has_option('deploy', 'spack_mirror'): spack_mirror = config.get('deploy', 'spack_mirror') else: @@ -536,8 +539,14 @@ def build_spack_env(config, update_spack, machine, compiler, mpi, # noqa: C901 f'@{parallelio}+pnetcdf~timing"') if albany != 'None': - specs.append(f'"trilinos-for-albany@{albany}"') - specs.append(f'"albany@{albany}+mpas~py+unit_tests"') + if with_cuda: + albany_cuda = '+cuda+uvm+sfad sfadsize=12' + trilinos_cuda = '+cuda+uvm' + else: + albany_cuda = '' + trilinos_cuda = '' + specs.append(f'"trilinos-for-albany@{albany}{trilinos_cuda}"') + specs.append(f'"albany@{albany}+mpas~py+unit_tests{albany_cuda}"') yaml_template = f'{spack_template_path}/{machine}_{compiler}_{mpi}.yaml' if not os.path.exists(yaml_template): @@ -1082,8 +1091,9 @@ def main(): # noqa: C901 print('Install local mache\n') commands = f'source {conda_base}/etc/profile.d/conda.sh && ' \ f'conda activate {conda_env_name} && ' \ - 'cd ../build_mache/mache && ' \ - 'python -m pip install --no-deps .' + f'cd ../build_mache/mache && ' \ + f'conda install -y --file spec-file.txt && ' \ + f'python -m pip install --no-deps .'
check_call(commands, logger=logger) previous_conda_env = conda_env_name diff --git a/conda/compass_env/spec-file.template b/conda/compass_env/spec-file.template index e7ae69b96..ee6e0e7b1 100644 --- a/conda/compass_env/spec-file.template +++ b/conda/compass_env/spec-file.template @@ -16,7 +16,7 @@ ipython jupyter lxml {% if include_mache %} -mache=1.23.0 +mache=1.25.0 {% endif %} matplotlib-base >=3.9.1 metis @@ -49,8 +49,8 @@ cmake cxx-compiler fortran-compiler libnetcdf=4.9.2={{ mpi_prefix }}_* -libpnetcdf=1.12.3={{ mpi_prefix }}_* -parallelio=2.6.2={{ mpi_prefix }}_* +libpnetcdf=1.13.0={{ mpi_prefix }}_* +parallelio=2.6.3={{ mpi_prefix }}_* m4 make {{ mpi }} diff --git a/conda/configure_compass_env.py b/conda/configure_compass_env.py index 6ad866196..bb2eb51f8 100755 --- a/conda/configure_compass_env.py +++ b/conda/configure_compass_env.py @@ -100,7 +100,7 @@ def main(): if local_mache: mache = '' else: - mache = '"mache=1.23.0"' + mache = '"mache=1.25.0"' setup_install_env(env_name, activate_base, args.use_local, logger, args.recreate, conda_base, mache) @@ -114,6 +114,7 @@ def main(): f'git clone -b {args.mache_branch} ' \ f'git@github.com:{args.mache_fork}.git mache && ' \ f'cd mache && ' \ + f'conda install -y --file spec-file.txt && ' \ f'python -m pip install --no-deps .' 
check_call(commands, logger=logger) diff --git a/conda/default.cfg b/conda/default.cfg index 83e13ee11..6cae25589 100644 --- a/conda/default.cfg +++ b/conda/default.cfg @@ -29,9 +29,9 @@ lapack = 3.9.1 metis = 5.1.0 moab = 5.5.1 netcdf_c = 4.9.2 -netcdf_fortran = 4.6.0 +netcdf_fortran = 4.6.1 petsc = 3.19.1 -pnetcdf = 1.12.3 -scorpio = 1.6.3 -# parallelio = 2.6.2 +pnetcdf = 1.13.0 +scorpio = 1.6.5 +# parallelio = 2.6.3 parallelio = None diff --git a/conda/unsupported.txt b/conda/unsupported.txt index 5014d526e..cfce62076 100644 --- a/conda/unsupported.txt +++ b/conda/unsupported.txt @@ -15,7 +15,8 @@ compy, pgi, mvapich2 pm-cpu, nvidia, mpich pm-cpu, aocc, mpich pm-cpu, amdclang, mpich - +pm-gpu, gnu, mpich +pm-gpu, nvidia, mpich # compiles but tests unreliable (errors or hanging), # see https://github.com/MPAS-Dev/compass/issues/336