Add CI workflow for GPU NVHPC build (#7)

* First pass to get GPU ci test working * Turn off label requirement for GPU ci workflow * Add -y flag for installing lmod on self-hosted runner * NVHPC has been installed on the self-hosted runner, so we can skip that * Try using -Kieee to fix debug floating point error for nvfortran * Simplify -ta option for nvfortran * Turn off debug build for gpu workflow because the build hangs * Change from old GPU options to new ones * Autodetect the compute capability when building * Set stacksize limit to max value * Add script to capture commands needed to set up the GHA self-hosted EC2 instance * Clean up GPU CI * Update gcc used for macos gnu ci * Use gcc-11 for macos CI, turn off GPU label * Turn off label criteria for gpu ci * Try a different label method
NOAA-GSL · Aug 14, 2024 · e66e695 · e66e695
1 parent 19c2e2e
commit e66e695
Show file tree

Hide file tree

Showing 6 changed files with 116 additions and 26 deletions.
diff --git a/.github/workflows/gpu_nvhpc.yml b/.github/workflows/gpu_nvhpc.yml
@@ -1,9 +1,9 @@
-name: Linux GPU NVHPC
-# triggered events (push, pull_request) for the develop branch
+name: Linux NVHPC GPU
+# triggered events (push, pull_request) for the master branch
 on:
   pull_request:
-    branches: [ develop ]
-    types: [ labeled ]
+    branches: [ master ]
+    types: [ labeled, opened, synchronize, reopened ]
   workflow_dispatch:
 
 #defaults:
@@ -13,29 +13,62 @@ on:
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
 
-  GPU_build:
-    if: ${{ github.event.label.name == 'GPU_Test' }}
+  ubuntu_build:
+    if: contains(github.event.pull_request.labels.*.name, 'GPU-CI')
 
-    name: GPU Build
+    name: Ubuntu NVHPC GPU Build
     # Run on self-hosted
     runs-on: self-hosted
 
     steps:
 
-      # Load NVHPC module
-      - name: Load NVHPC Module
+      ## Install Lmod
+      #- name: Install Lmod
+      #  run: |
+      #    sudo apt-get update -y
+      #    sudo apt-get install -y lmod
+      #    echo "source /usr/share/lmod/lmod/init/bash" >> ~/.bash_profile
+      #    source /usr/share/lmod/lmod/init/bash
+      #    module list
+      #
+      ## Install NVIDIA HPC SDK
+      #- name: Install NVIDIA HPC SDK
+      #  run: |
+      #    curl https://developer.download.nvidia.com/hpc-sdk/ubuntu/DEB-GPG-KEY-NVIDIA-HPC-SDK | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg
+      #    echo 'deb [signed-by=/usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg] https://developer.download.nvidia.com/hpc-sdk/ubuntu/amd64 /' | sudo tee /etc/apt/sources.list.d/nvhpc.list
+      #    sudo apt-get update -y
+      #    sudo apt-get install -y nvhpc-24-7
+
+      # Remove label
+      #- name: Remove GPU-CI label
+      #  - uses: actions-ecosystem/action-remove-labels@v1
+      #  with:
+      #    labels: GPU-CI
+
+      # Check location of installed NVHPC compilers
+      - name: Check compiler install
         run: |
-          pwd
-          ls -al
-          echo $SHELL
+          source /usr/share/lmod/lmod/init/bash
           module use /opt/nvidia/hpc_sdk/modulefiles
           module load nvhpc
           which nvc
           which nvfortran
 
-      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-      - name: Checkout repository
-        uses: actions/checkout@v2
+      # Test debug mode
+      # Turn this off because the compiler hangs while building in debug mode
+      #- name: Build gf debug
+      #  run: |
+      #    source /usr/share/lmod/lmod/init/bash
+      #    module use /opt/nvidia/hpc_sdk/modulefiles
+      #    module load nvhpc
+      #    cd ref
+      #    rm -rf build
+      #    mkdir build
+      #    cd build
+      #    #export OMP_NUM_THREADS=4
+      #    cmake -DCMAKE_BUILD_TYPE=debug -DENABLE_GPU=on ..
+      #    make VERBOSE=1
+      #    ctest --output-on-failure
 
       # Test release mode
       - name: Build gf release
@@ -47,7 +80,17 @@ jobs:
           rm -rf build
           mkdir build
           cd build
-          export OMP_NUM_THREADS=4
+          #export OMP_NUM_THREADS=4
           cmake -DCMAKE_BUILD_TYPE=release -DENABLE_GPU=on ..
-          make
-          ctest --output-on-failure -R gpu_kernel
+          make VERBOSE=1
+          ulimit -s hard
+          ctest --output-on-failure
+
+      # Debug session for failures
+      -
+        name: Debug session
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3
+        timeout-minutes: 60
+        with:
+          limit-access-to-actor: true
diff --git a/.github/workflows/macos_gnu.yml b/.github/workflows/macos_gnu.yml
@@ -25,6 +25,10 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v2
 
+      # Install gcc11
+      - name: Install GCC
+        run: brew install gcc@11
+
       # Install OpenMPI
       - name: Install OpenMPI
         run: brew install open-mpi
@@ -38,7 +42,7 @@ jobs:
           cd build
           #export OMP_NUM_THREADS=4
           export CC=gcc-11
-          export FC=gfortran-11          
+          export FC=gfortran-11
           cmake -DCMAKE_BUILD_TYPE=debug -DENABLE_GPU=off .. 
           make VERBOSE=1
           ctest --output-on-failure
@@ -52,7 +56,7 @@ jobs:
           cd build
           #export OMP_NUM_THREADS=4
           export CC=gcc-11
-          export FC=gfortran-11           
+          export FC=gfortran-11
           cmake -DCMAKE_BUILD_TYPE=release -DENABLE_GPU=off ..
           make VERBOSE=1
           ctest --output-on-failure

diff --git a/ref/CMakeLists.txt b/ref/CMakeLists.txt
@@ -24,7 +24,6 @@ find_package( OpenMP COMPONENTS C Fortran )
 find_package( MPI COMPONENTS C Fortran )
 
 if ( ENABLE_GPU )
-  set( OpenACC_ACCEL_TARGET tesla )
   find_package( OpenACC REQUIRED )
   find_package( CUDAToolkit REQUIRED )
   add_compile_definitions(ENABLE_GPU)
@@ -36,7 +35,6 @@ if ( ENABLE_GPU )
   string(REPLACE "." "" CUDA_ARCH_LIST "${INSTALLED_GPU_CCS_4}")
   message( STATUS "CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}" )
   SET(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_LIST})
-  add_compile_options("-Minfo=accel")
 endif()
 
 add_subdirectory(src)

diff --git a/ref/cmake/compiler_flags_NVHPC_Fortran.cmake b/ref/cmake/compiler_flags_NVHPC_Fortran.cmake
@@ -1,7 +1,7 @@
 ####################################################################
 # COMMON FLAGS
 ####################################################################
-set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -g -traceback -Mnofma")
+set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -g -traceback -Mnofma -Kieee" )
 
 ####################################################################
 # RELEASE FLAGS
@@ -14,7 +14,7 @@ set( CMAKE_Fortran_FLAGS_RELEASE "-fast -mp -Mnovect" )
 # DEBUG FLAGS
 ####################################################################
 
-set( CMAKE_Fortran_FLAGS_DEBUG "${CMAKE_Fortran_FLAGS_DEBUG} -O0 -Mbounds -Mchkptr -Mchkstk" )
+set( CMAKE_Fortran_FLAGS_DEBUG "${CMAKE_Fortran_FLAGS_DEBUG} -O0 -Mbounds -Mchkptr -Mchkstk -Ktrap=fp" )
 
 ####################################################################
 # FLAGS FOR GPU

diff --git a/ref/src/CMakeLists.txt b/ref/src/CMakeLists.txt
@@ -14,8 +14,8 @@ add_executable( gf_kernel_cpu ${gf_kernel_common_files} )
 
 if(ENABLE_GPU)
   add_executable( gf_kernel_gpu ${gf_kernel_common_files} )
-  target_compile_options(gf_kernel_gpu PUBLIC ${OpenACC_Fortran_OPTIONS}:cc${CUDA_ARCH_LIST},cuda${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR})
-  target_link_options(gf_kernel_gpu PUBLIC ${OpenACC_Fortran_OPTIONS}:cc${CUDA_ARCH_LIST})
+  target_compile_options(gf_kernel_gpu PUBLIC -Minfo=accel ${OpenACC_Fortran_OPTIONS}=gpu -gpu=ccnative -gpu=cuda${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR})
+  target_link_options(gf_kernel_gpu PUBLIC -Minfo=accel ${OpenACC_Fortran_OPTIONS}=gpu -gpu=ccnative -gpu=cuda${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR})
 endif()
 
 if(OpenMP_FOUND)

diff --git a/ref/test/tools/setup-gpu-runner.sh b/ref/test/tools/setup-gpu-runner.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+# Provision an Ubuntu 24.04 x86_64 host as the GitHub Actions self-hosted GPU runner; requires RUNNER_TOKEN in the environment.
+: "${RUNNER_TOKEN:?set RUNNER_TOKEN to the registration token given by the GitHub interface}"
+
+# Install drivers
+wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt-get update
+sudo apt-get -y install cuda-toolkit-12-6
+sudo apt-get install -y nvidia-gds
+sudo apt-get install -y cuda-drivers
+
+# reboot (manual step: this script does not reboot the host itself)
+
+# Install LMOD
+sudo apt-get update -y
+sudo apt-get install -y lmod
+echo "source /usr/share/lmod/lmod/init/bash" >> ~/.bash_profile
+source /usr/share/lmod/lmod/init/bash
+module list
+
+# Install NVIDIA HPC SDK
+curl https://developer.download.nvidia.com/hpc-sdk/ubuntu/DEB-GPG-KEY-NVIDIA-HPC-SDK | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg
+echo 'deb [signed-by=/usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg] https://developer.download.nvidia.com/hpc-sdk/ubuntu/amd64 /' | sudo tee /etc/apt/sources.list.d/nvhpc.list
+sudo apt-get update -y
+sudo apt-get install -y nvhpc-24-7
+
+# Install cmake
+sudo apt-get install -y cmake
+
+# Run persistence driver - not needed?
+#sudo /usr/bin/nvidia-persistenced --verbose
+
+# Create a folder for the runner; stop if we cannot enter it so nothing below runs in the wrong directory
+mkdir -p actions-runner && cd actions-runner || exit 1
+# Download the runner package (pinned to v2.319.0)
+curl -o actions-runner-linux-x64-2.319.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.319.0/actions-runner-linux-x64-2.319.0.tar.gz
+# Validate the hash; stop before extracting if it does not match
+echo "52b8f9c5abb1a47cc506185a1a20ecea19daf0d94bbf4ddde7e617e7be109b14  actions-runner-linux-x64-2.319.0.tar.gz" | shasum -a 256 -c || exit 1
+# Extract the installer
+tar xzf ./actions-runner-linux-x64-2.319.0.tar.gz
+# Create the runner and start the configuration experience (token comes from RUNNER_TOKEN, checked at the top)
+./config.sh --url https://github.com/NOAA-GSL/SENA-gf --token "${RUNNER_TOKEN}"
+# Last step, run it!
+./run.sh