Skip to content

Commit

Permalink
Add CI workflow for GPU NVHPC build (#7)
Browse files Browse the repository at this point in the history
* First pass to get GPU ci test working

* Turn off label requirement for GPU ci workflow

* Add -y flag for installing lmod on self-hosted runner

* NVHPC has been installed on the self-hosted runner, so we can skip that

* Try using -Kieee to fix debug floating point error for nvfortran

* Simplify -ta option for nvfortran

* Turn off debug build for gpu workflow because the build hangs

* Change from old GPU options to new ones

* Autodetect the compute capability when building

* Set stacksize limit to max value

* Add script to capture commands needed to set up the GHA self-hosted EC2 instance

* Cleanup GPU CI

* Update gcc used for macos gnu ci

* Use gcc-11 for macos CI, turn off GPU label

* Turn off label criteria for gpu ci

* Try a different label method
  • Loading branch information
christopherwharrop-noaa authored Aug 14, 2024
1 parent 19c2e2e commit e66e695
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 26 deletions.
79 changes: 61 additions & 18 deletions .github/workflows/gpu_nvhpc.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
name: Linux GPU NVHPC
# triggered events (push, pull_request) for the develop branch
name: Linux NVHPC GPU
# triggered events (push, pull_request) for the master branch
on:
pull_request:
branches: [ develop ]
types: [ labeled ]
branches: [ master ]
types: [ labeled, opened, synchronize, reopened ]
workflow_dispatch:

#defaults:
Expand All @@ -13,29 +13,62 @@ on:
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:

GPU_build:
if: ${{ github.event.label.name == 'GPU_Test' }}
ubuntu_build:
if: contains(github.event.pull_request.labels.*.name, 'GPU-CI')

name: GPU Build
name: Ubuntu NVHPC GPU Build
# Run on self-hosted
runs-on: self-hosted

steps:

# Load NVHPC module
- name: Load NVHPC Module
## Install Lmod
#- name: Install Lmod
# run: |
# sudo apt-get update -y
# sudo apt-get install -y lmod
# echo "source /usr/share/lmod/lmod/init/bash" >> ~/.bash_profile
# source /usr/share/lmod/lmod/init/bash
# module list
#
## Install NVIDIA HPC SDK
#- name: Install NVIDIA HPC SDK
# run: |
# curl https://developer.download.nvidia.com/hpc-sdk/ubuntu/DEB-GPG-KEY-NVIDIA-HPC-SDK | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg
# echo 'deb [signed-by=/usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg] https://developer.download.nvidia.com/hpc-sdk/ubuntu/amd64 /' | sudo tee /etc/apt/sources.list.d/nvhpc.list
# sudo apt-get update -y
# sudo apt-get install -y nvhpc-24-7

# Remove label
#- name: Remove GPU-CI label
# - uses: actions-ecosystem/action-remove-labels@v1
# with:
# labels: GPU-CI

# Check location of installed NVHPC compilers
- name: Check compiler install
run: |
pwd
ls -al
echo $SHELL
source /usr/share/lmod/lmod/init/bash
module use /opt/nvidia/hpc_sdk/modulefiles
module load nvhpc
which nvc
which nvfortran
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- name: Checkout repository
uses: actions/checkout@v2
# Test debug mode
# Turn this off because the compiler hangs while building in debug mode
#- name: Build gf debug
# run: |
# source /usr/share/lmod/lmod/init/bash
# module use /opt/nvidia/hpc_sdk/modulefiles
# module load nvhpc
# cd ref
# rm -rf build
# mkdir build
# cd build
# #export OMP_NUM_THREADS=4
# cmake -DCMAKE_BUILD_TYPE=debug -DENABLE_GPU=on ..
# make VERBOSE=1
# ctest --output-on-failure

# Test release mode
- name: Build gf release
Expand All @@ -47,7 +80,17 @@ jobs:
rm -rf build
mkdir build
cd build
export OMP_NUM_THREADS=4
#export OMP_NUM_THREADS=4
cmake -DCMAKE_BUILD_TYPE=release -DENABLE_GPU=on ..
make
ctest --output-on-failure -R gpu_kernel
make VERBOSE=1
ulimit -s hard
ctest --output-on-failure
# Debug session for failures
-
name: Debug session
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3
timeout-minutes: 60
with:
limit-access-to-actor: true
8 changes: 6 additions & 2 deletions .github/workflows/macos_gnu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v2

# Install gcc11
- name: Install GCC
run: brew install gcc@11

# Install OpenMPI
- name: Install OpenMPI
run: brew install open-mpi
Expand All @@ -38,7 +42,7 @@ jobs:
cd build
#export OMP_NUM_THREADS=4
export CC=gcc-11
export FC=gfortran-11
export FC=gfortran-11
cmake -DCMAKE_BUILD_TYPE=debug -DENABLE_GPU=off ..
make VERBOSE=1
ctest --output-on-failure
Expand All @@ -52,7 +56,7 @@ jobs:
cd build
#export OMP_NUM_THREADS=4
export CC=gcc-11
export FC=gfortran-11
export FC=gfortran-11
cmake -DCMAKE_BUILD_TYPE=release -DENABLE_GPU=off ..
make VERBOSE=1
ctest --output-on-failure
Expand Down
2 changes: 0 additions & 2 deletions ref/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ find_package( OpenMP COMPONENTS C Fortran )
find_package( MPI COMPONENTS C Fortran )

if ( ENABLE_GPU )
set( OpenACC_ACCEL_TARGET tesla )
find_package( OpenACC REQUIRED )
find_package( CUDAToolkit REQUIRED )
add_compile_definitions(ENABLE_GPU)
Expand All @@ -36,7 +35,6 @@ if ( ENABLE_GPU )
string(REPLACE "." "" CUDA_ARCH_LIST "${INSTALLED_GPU_CCS_4}")
message( STATUS "CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}" )
SET(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_LIST})
add_compile_options("-Minfo=accel")
endif()

add_subdirectory(src)
Expand Down
4 changes: 2 additions & 2 deletions ref/cmake/compiler_flags_NVHPC_Fortran.cmake
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
####################################################################
# COMMON FLAGS
####################################################################
set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -g -traceback -Mnofma")
set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -g -traceback -Mnofma -Kieee" )

####################################################################
# RELEASE FLAGS
Expand All @@ -14,7 +14,7 @@ set( CMAKE_Fortran_FLAGS_RELEASE "-fast -mp -Mnovect" )
# DEBUG FLAGS
####################################################################

set( CMAKE_Fortran_FLAGS_DEBUG "${CMAKE_Fortran_FLAGS_DEBUG} -O0 -Mbounds -Mchkptr -Mchkstk" )
set( CMAKE_Fortran_FLAGS_DEBUG "${CMAKE_Fortran_FLAGS_DEBUG} -O0 -Mbounds -Mchkptr -Mchkstk -Ktrap=fp" )

####################################################################
# FLAGS FOR GPU
Expand Down
4 changes: 2 additions & 2 deletions ref/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ add_executable( gf_kernel_cpu ${gf_kernel_common_files} )

if(ENABLE_GPU)
add_executable( gf_kernel_gpu ${gf_kernel_common_files} )
target_compile_options(gf_kernel_gpu PUBLIC ${OpenACC_Fortran_OPTIONS}:cc${CUDA_ARCH_LIST},cuda${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR})
target_link_options(gf_kernel_gpu PUBLIC ${OpenACC_Fortran_OPTIONS}:cc${CUDA_ARCH_LIST})
target_compile_options(gf_kernel_gpu PUBLIC -Minfo=accel ${OpenACC_Fortran_OPTIONS}=gpu -gpu=ccnative -gpu=cuda${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR})
target_link_options(gf_kernel_gpu PUBLIC -Minfo=accel ${OpenACC_Fortran_OPTIONS}=gpu -gpu=ccnative -gpu=cuda${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR})
endif()

if(OpenMP_FOUND)
Expand Down
45 changes: 45 additions & 0 deletions ref/test/tools/setup-gpu-runner.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
#
# Captures the commands used to provision a GitHub Actions self-hosted runner
# for the GPU (NVHPC) CI workflow: NVIDIA CUDA toolkit and drivers, Lmod,
# the NVIDIA HPC SDK, CMake, and the GitHub Actions runner itself.
#
# The package repositories used below target Ubuntu 24.04 / x86_64 (CUDA) and
# Ubuntu / amd64 (HPC SDK).
#
# Usage:
#   RUNNER_TOKEN=<registration token> ./setup-gpu-runner.sh
#
# RUNNER_TOKEN is the registration token given by the GitHub interface when
# adding a new self-hosted runner to the repository. It is only needed for the
# final runner configuration step; the script stops there with an error if it
# is not set.

# Install drivers
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get -y install cuda-toolkit-12-6
# -y keeps this step non-interactive, matching the other install commands
sudo apt-get install -y nvidia-gds
sudo apt-get install -y cuda-drivers

# reboot
# NOTE(review): the reboot is recorded as a manual step only; this script does
# not reboot the machine. Confirm whether the remaining steps must be run
# after a reboot.

# Install LMOD
sudo apt-get update -y
sudo apt-get install -y lmod
# Make the module command available in future login shells...
echo "source /usr/share/lmod/lmod/init/bash" >> ~/.bash_profile
# ...and in the current shell
source /usr/share/lmod/lmod/init/bash
module list

# Install NVIDIA HPC SDK
curl https://developer.download.nvidia.com/hpc-sdk/ubuntu/DEB-GPG-KEY-NVIDIA-HPC-SDK | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg
echo 'deb [signed-by=/usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg] https://developer.download.nvidia.com/hpc-sdk/ubuntu/amd64 /' | sudo tee /etc/apt/sources.list.d/nvhpc.list
sudo apt-get update -y
sudo apt-get install -y nvhpc-24-7

# Install cmake
sudo apt-get install -y cmake


# Run persistence driver - not needed?
#sudo /usr/bin/nvidia-persistenced --verbose

# Create a folder
mkdir actions-runner && cd actions-runner
# Download the latest runner package
curl -o actions-runner-linux-x64-2.319.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.319.0/actions-runner-linux-x64-2.319.0.tar.gz
# Optional: Validate the hash
echo "52b8f9c5abb1a47cc506185a1a20ecea19daf0d94bbf4ddde7e617e7be109b14  actions-runner-linux-x64-2.319.0.tar.gz" | shasum -a 256 -c
# Extract the installer
tar xzf ./actions-runner-linux-x64-2.319.0.tar.gz

# Create the runner and start the configuration experience.
# The token comes from the environment; ${VAR:?msg} aborts with the message
# when RUNNER_TOKEN is unset or empty instead of passing a bogus token.
./config.sh --url https://github.com/NOAA-GSL/SENA-gf --token "${RUNNER_TOKEN:?Set RUNNER_TOKEN to the registration token given by the GitHub interface}"
# Last step, run it!
./run.sh

0 comments on commit e66e695

Please sign in to comment.