From 8a330f9135a527c170b6f8006151540fb44669e4 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 20 Sep 2024 10:57:02 -0700 Subject: [PATCH] Update ROCm CI (#357) Co-authored-by: Binyang Li --- .azure-pipelines/integration-test-rocm.yml | 1 - .github/workflows/codeql-analysis.yml | 58 +++++++++++++++++++--- README.md | 3 +- 3 files changed, 53 insertions(+), 9 deletions(-) diff --git a/.azure-pipelines/integration-test-rocm.yml b/.azure-pipelines/integration-test-rocm.yml index c098ab085..029259ffb 100644 --- a/.azure-pipelines/integration-test-rocm.yml +++ b/.azure-pipelines/integration-test-rocm.yml @@ -64,7 +64,6 @@ jobs: set -e git clone https://$(GIT_USER):$(GIT_PAT)@msazure.visualstudio.com/DefaultCollection/One/_git/azure-mscclpp cd azure-mscclpp - git checkout binyli/ci mkdir execution-files python3 algos/allreduce_mi300_packet.py 8 8 > execution-files/allreduce_mi300_packet.json python3 algos/allreduce_mi300_sm_mscclpp.py 8 8 > execution-files/allreduce_mi300_sm_mscclpp.json diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 7295171e9..73496445d 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -9,11 +9,11 @@ on: - cron: "30 1 * * 1" jobs: - analyze: - name: Analyze + analyze-cuda: + name: Analyze (CUDA) runs-on: 'ubuntu-latest' container: - image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda-version }} + image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.version }} permissions: actions: read @@ -24,7 +24,7 @@ jobs: fail-fast: false matrix: language: [ 'cpp', 'python' ] - cuda-version: [ 'cuda11.8', 'cuda12.2' ] + version: [ 'cuda11.8', 'cuda12.2' ] steps: - name: Checkout repository @@ -45,10 +45,56 @@ jobs: - name: Build run: | - cmake -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON . + rm -rf build && mkdir build && cd build + cmake -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .. make -j - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v2 with: - category: "/language:${{matrix.language}}/cuda-version:${{matrix.cuda-version}}" + category: "/language:${{matrix.language}}/version:${{matrix.version}}" + + analyze-rocm: + name: Analyze (ROCm) + runs-on: 'ubuntu-latest' + container: + image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.version }} + + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'cpp', 'python' ] + version: [ 'rocm6.2' ] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Check disk space + run: | + df -h + + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + + - name: Dubious ownership exception + run: | + git config --global --add safe.directory /__w/mscclpp/mscclpp + + - name: Build + run: | + rm -rf build && mkdir build && cd build + CXX=/opt/rocm/bin/hipcc cmake -DBYPASS_GPU_CHECK=ON -DUSE_ROCM=ON .. + make -j + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 + with: + category: "/language:${{matrix.language}}/version:${{matrix.version}}" diff --git a/README.md b/README.md index 9796179d3..cd8e80790 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,7 @@ |--------------------------|-------------------| | Unit Tests (CUDA) | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fmscclpp-ut?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=4&branchName=main) | | Integration Tests (CUDA) | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fmscclpp-test?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=3&branchName=main) | - -*NOTE (Nov 2023): Azure pipelines for ROCm will be added soon.* +| Integration Tests (ROCm) | [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fmscclpp-test-rocm?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=7&branchName=main) | A GPU-driven communication stack for scalable AI applications.