use nccl test

NVIDIA · Aug 20, 2024 · df243b9 · df243b9
1 parent 813ed22
commit df243b9
Showing 1 changed file with 51 additions and 34 deletions.
diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml
@@ -5,37 +5,54 @@ on:
 
 jobs:
   sandbox:
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Login to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Print usage
-        run: |
-          cat << EOF
-          This is an empty workflow file located in the main branch of your
-          repository. It serves as a testing ground for new GitHub Actions on
-          development branches before merging them to the main branch. By
-          defining and overloading this workflow on your development branch,
-          you can test new actions without affecting your main branch, ensuring
-          a smooth integration process once the changes are ready to be merged.
-
-          Usage:
-          
-          1. In your development branch, modify the sandbox.yml workflow file
-             to include the new actions you want to test. Make sure to commit
-             the changes to the development branch.
-          2. Navigate to the 'Actions' tab in your repository, select the
-             '~Sandbox' workflow, and choose your development branch from the
-             branch dropdown menu. Click on 'Run workflow' to trigger the
-             workflow on your development branch.
-          3. Once you have tested and verified the new actions in the Sandbox
-             workflow, you can incorporate them into your main workflow(s) and
-             merge the development branch into the main branch. Remember to
-             revert the changes to the sandbox.yml file in the main branch to
-             keep it empty for future testing.
-          EOF
+    # Run the JAX NCCL test under nsys-jax through the shared unit-test workflow.
+    uses: ./.github/workflows/_test_unit.yaml
+    with:
+      TEST_NAME: nsys-jax
+      # Profiles the NCCL test in three process layouts, each with a full and a
+      # partial capture. The script's exit code is the number of failed runs.
+      EXECUTE: |
+        set -o pipefail
+        IMAGE_NAME=ghcr.io/nvidia/jax-toolbox-internal:10352352914-maxtext-amd64
+        nvidia-smi
+        num_failures=0
+        GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU')
+        for mode in 1-process 2-process process-per-gpu; do
+          # ${PWD} is mounted at /opt/output, so anything the container writes
+          # there lands in the host working directory.
+          DOCKER="docker run --shm-size=1g --gpus all -v ${PWD}:/opt/output ${IMAGE_NAME}"
+          if [[ "${mode}" == "1-process" ]]; then
+            APP="jax-nccl-test.py"
+          elif [[ "${mode}" == "2-process" ]]; then
+            # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that
+            # this will flush out more bugs than process-per-node or process-per-GPU.
+            APP="jax-nccl-test-multiprocess.sh 2 --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2))"
+          else
+            APP="jax-nccl-test-multiprocess.sh ${GPUS_PER_NODE} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1"
+          fi
+          for collection in full partial; do
+            # Rebuild the command for every collection so flags from the previous
+            # iteration (notably --output) do not accumulate.
+            NSYS_JAX="nsys-jax --nsys-jax-analysis communication --nsys-jax-analysis summary"
+            NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution.zip"
+            if [[ "${collection}" == "partial" ]]; then
+              NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop"
+              # nvbug/4801401
+              NSYS_JAX+=" --sample=none"
+            fi
+            # tee runs on the host, so the log goes to the working directory (the
+            # host side of the /opt/output mount) rather than to /opt/output.
+            ${DOCKER} ${NSYS_JAX} -- ${APP} |& tee "${mode}-${collection}-execution.log"
+            # With pipefail set, $? is non-zero if any stage of the pipeline failed.
+            num_failures=$((num_failures + ($? != 0)))
+          done
+        done
+        ls -R .
+        exit $num_failures
+      STATISTICS_SCRIPT: |
+        # errors=$(cat test-*.log | grep -c 'ERROR:' || true)
+        # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true)
+        # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true)
+        # total_tests=$((failed_tests + passed_tests))
+        # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+        # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+        # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+        # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+        # NOTE(review): the values below are hard-coded and do not reflect the
+        # actual run; the commented-out lines above derive them from test-*.log.
+        echo "TOTAL_TESTS=3" >> $GITHUB_OUTPUT
+        echo "ERRORS=1" >> $GITHUB_OUTPUT
+        echo "PASSED_TESTS=1" >> $GITHUB_OUTPUT
+        echo "FAILED_TESTS=1" >> $GITHUB_OUTPUT
+      # One log per (mode, collection) pair written by the EXECUTE script.
+      ARTIFACTS: |
+        1-process-full-execution.log
+        1-process-partial-execution.log
+        2-process-full-execution.log
+        2-process-partial-execution.log
+        process-per-gpu-full-execution.log
+        process-per-gpu-partial-execution.log
+    secrets: inherit