Skip to content

Address CUDA MPI/IPC issue with Kokkos <=4.4.1 #1193

Address CUDA MPI/IPC issue with Kokkos <=4.4.1

Address CUDA MPI/IPC issue with Kokkos <=4.4.1 #1193

Workflow file for this run

name: CI extended

on:
  # Nightly run at 06:00 UTC.
  schedule:
    - cron: '0 6 * * *'
  # Manual trigger.
  workflow_dispatch:
  # Triggered when auto merge gets enabled on a PR (workaround that makes
  # sure this extended suite has run before the merge happens).
  pull_request:
    types:
      - auto_merge_enabled

# Cancel "duplicated" workflows triggered by pushes to internal
# branches that also have an associated PR: runs sharing the same group
# replace each other.
concurrency:
  group: ${{ github.ref }}-${{ github.head_ref }}-CI-extended
  cancel-in-progress: true

# Environment shared by all jobs (values are passed on as strings).
env:
  CTEST_OUTPUT_ON_FAILURE: '1'
  # Number of threads used for the build.
  CMAKE_BUILD_PARALLEL_LEVEL: '5'
  MACHINE_CFG: cmake/machinecfg/CI.cmake
  OMPI_MCA_mpi_common_cuda_event_max: '1000'
  # See https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
  OMPI_MCA_btl_vader_single_copy_mechanism: none

jobs:
perf-and-regression:
  # Builds the code for every device/parallel combination of the matrix on
  # the self-hosted A100 runner, then runs the performance tests (serial
  # variants only), the regression tests (all variants) and the Ascent
  # integration test (cuda + mpi only). Logs and convergence results are
  # uploaded as an artifact at the end.
  strategy:
    matrix:
      device: ['cuda', 'host']
      parallel: ['serial', 'mpi']
  runs-on: [self-hosted, A100]
  container:
    image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
    # Map to local user id on CI machine to allow writing to build cache.
    options: --user 1001
  steps:
    - uses: actions/checkout@v3
      with:
        submodules: 'true'

    - name: Setup cache for gold standard
      uses: actions/cache@v3
      with:
        path: tst/regression/gold_standard/
        key: gold-standard

    # Exposes `enable_asan` (ON for host builds, OFF otherwise) as a step
    # output that is consumed by the Configure step.
    - name: Set vars based on matrix
      id: cmake-vars
      run: |
        # The expression below is substituted with the literal word `true`
        # or `false` before the shell sees it, i.e., the shell builtin of
        # that name decides the branch.
        if ${{ matrix.device == 'host' }}; then
          echo "enable_asan=ON" >> "$GITHUB_OUTPUT"
        else
          echo "enable_asan=OFF" >> "$GITHUB_OUTPUT"
        fi

    - name: Configure
      run: |
        cmake -B build \
          -DCMAKE_BUILD_TYPE=Release \
          -DENABLE_ASAN=${{ steps.cmake-vars.outputs.enable_asan }} \
          -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }}

    - name: Build
      run: cmake --build build

    # Run performance "unit" tests (none use MPI).
    - name: Performance tests
      if: ${{ matrix.parallel == 'serial' }}
      run: |
        cd build
        # Pick GPU with most available memory
        export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
        # Sanitizers options (leak detection is disabled)
        export ASAN_OPTIONS=abort_on_error=1:fast_unwind_on_malloc=1
        export UBSAN_OPTIONS=print_stacktrace=0
        export LSAN_OPTIONS=detect_leaks=0
        ctest -L performance -LE perf-reg

    # Run regression tests.
    - name: Regression tests
      run: |
        cd build
        # Pick GPU with most available memory
        export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
        # Sanitizers options (leak detection is disabled for all variants,
        # not only the MPI ones; the motivation are leaks in OpenMPI)
        export ASAN_OPTIONS=abort_on_error=1:fast_unwind_on_malloc=1
        export UBSAN_OPTIONS=print_stacktrace=0
        export LSAN_OPTIONS=detect_leaks=0
        ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600

    # Test Ascent integration (only most complex setup with MPI and on device).
    - name: Ascent tests
      if: ${{ matrix.parallel == 'mpi' && matrix.device == 'cuda' }}
      run: |
        cmake -B build-ascent \
          -DCMAKE_BUILD_TYPE=Release \
          -DMACHINE_VARIANT=${{ matrix.device }}-${{ matrix.parallel }} \
          -DPARTHENON_ENABLE_ASCENT=ON \
          -DAscent_DIR=/usr/local/ascent-develop/lib/cmake/ascent
        cmake --build build-ascent
        cd example/advection/
        # Pick GPU with most available memory
        export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
        mpirun -np 2 ../../build-ascent/example/advection/advection-example \
          -i parthinput.advection \
          parthenon/output5/dt=0.05 \
          parthenon/time/tlim=0.1
        # check if file exists
        if [ ! -f "ascent_render_57.png" ]; then
          echo "'ascent_render_57.png' does not exist."
          exit 1
        fi

    # Upload logs and results. v3 of the artifact actions is no longer
    # available on github.com, hence v4. v4 requires artifact names to be
    # unique within a run, which the matrix-derived name guarantees.
    # `!cancelled()` keeps the upload running after a failed test step
    # (when the logs are needed most) while still skipping cancelled runs.
    - uses: actions/upload-artifact@v4
      if: ${{ !cancelled() }}
      with:
        name: log-and-convergence-${{ matrix.device }}-${{ matrix.parallel }}
        path: |
          build/CMakeFiles/CMakeOutput.log
          build/tst/regression/outputs/advection_convergence*/advection-errors.dat
          build/tst/regression/outputs/advection_convergence*/advection-errors.png
          example/advection/ascent_render_57.png
        retention-days: 3
perf-and-regression-amdgpu:
  # HIP counterpart of the performance/regression job: builds with hipcc for
  # the serial and MPI variants on the self-hosted navi1030 runner, runs the
  # performance tests (serial only) and the regression tests, and uploads
  # logs and convergence results as an artifact.
  strategy:
    matrix:
      parallel: ['serial', 'mpi']
  runs-on: [self-hosted, navi1030]
  container:
    image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5
    # Map to local user id on CI machine to allow writing to build cache and
    # forward device handles to access AMD GPU within container.
    options: --user 1000 -w /home/ci --device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined
  # Job-level settings; CMAKE_BUILD_PARALLEL_LEVEL replaces the
  # workflow-wide value for this job.
  env:
    CMAKE_GENERATOR: Ninja
    CMAKE_BUILD_PARALLEL_LEVEL: 8 # num threads for build
  steps:
    - uses: actions/checkout@v3
      with:
        submodules: 'true'

    - name: Setup cache for gold standard
      uses: actions/cache@v3
      with:
        path: tst/regression/gold_standard/
        key: gold-standard

    - name: Configure
      run: |
        # MACHINE_CFG is given explicitly as an absolute path here; the
        # expansion is quoted so a workspace path with spaces stays intact.
        cmake -B build \
          -DMACHINE_CFG="${PWD}/cmake/machinecfg/GitHubActions.cmake" \
          -DCMAKE_BUILD_TYPE=Release \
          -DMACHINE_VARIANT=hip-${{ matrix.parallel }} \
          -DCMAKE_CXX_COMPILER=hipcc

    - name: Build
      run: cmake --build build

    # Run performance "unit" tests (none use MPI).
    - name: Performance tests
      if: ${{ matrix.parallel == 'serial' }}
      run: |
        cd build
        ctest -L performance -LE perf-reg

    # Run regression tests.
    - name: Regression tests
      run: |
        cd build
        ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600

    # Upload logs and results. v3 of the artifact actions is no longer
    # available on github.com, hence v4. v4 requires artifact names to be
    # unique within a run; the name below differs per matrix entry.
    # `!cancelled()` keeps the upload running after a failed test step
    # (when the logs are needed most) while still skipping cancelled runs.
    - uses: actions/upload-artifact@v4
      if: ${{ !cancelled() }}
      with:
        name: log-and-convergence-${{ matrix.parallel }}
        path: |
          build/CMakeFiles/CMakeOutput.log
          build/tst/regression/outputs/advection_convergence*/advection-errors.dat
          build/tst/regression/outputs/advection_convergence*/advection-errors.png
        retention-days: 3