diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml
index 7b90b72ca..b87e6993b 100644
--- a/.github/workflows/_sandbox.yaml
+++ b/.github/workflows/_sandbox.yaml
@@ -5,37 +5,70 @@ on:
 
 jobs:
   sandbox:
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Login to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Print usage
-        run: |
-          cat << EOF
-          This is an empty workflow file located in the main branch of your
-          repository. It serves as a testing ground for new GitHub Actions on
-          development branches before merging them to the main branch. By
-          defining and overloading this workflow on your development branch,
-          you can test new actions without affecting your main branch, ensuring
-          a smooth integration process once the changes are ready to be merged.
-
-          Usage:
-
-          1. In your development branch, modify the sandbox.yml workflow file
-             to include the new actions you want to test. Make sure to commit
-             the changes to the development branch.
-          2. Navigate to the 'Actions' tab in your repository, select the
-             '~Sandbox' workflow, and choose your development branch from the
-             branch dropdown menu. Click on 'Run workflow' to trigger the
-             workflow on your development branch.
-          3. Once you have tested and verified the new actions in the Sandbox
-             workflow, you can incorporate them into your main workflow(s) and
-             merge the development branch into the main branch. Remember to
-             revert the changes to the sandbox.yml file in the main branch to
-             keep it empty for future testing.
-          EOF
+    # Sandbox job: calls the _test_unit.yaml workflow to run the jax-nccl-test
+    # application under nsys-jax, once per (process layout, collection mode)
+    # combination, and reports the number of failed runs as the exit status.
+    uses: ./.github/workflows/_test_unit.yaml
+    with:
+      TEST_NAME: nsys-jax
+      EXECUTE: |
+        set -o pipefail
+        IMAGE_NAME=ghcr.io/nvidia/jax-toolbox-internal:10352352914-maxtext-amd64
+        nvidia-smi
+        num_failures=0
+        GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU')
+        # ${PWD} is bind-mounted at /opt/output inside the container, so files
+        # nsys-jax writes there show up in the host working directory.
+        DOCKER="docker run --shm-size=1g --gpus all -v ${PWD}:/opt/output ${IMAGE_NAME}"
+        for mode in 1-process 2-process process-per-gpu; do
+          if [[ "${mode}" == "1-process" ]]; then
+            APP="jax-nccl-test.py"
+          elif [[ "${mode}" == "2-process" ]]; then
+            # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that
+            # this will flush out more bugs than process-per-node or process-per-GPU.
+            APP="jax-nccl-test-multiprocess.sh 2 --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2))"
+          else
+            APP="jax-nccl-test-multiprocess.sh ${GPUS_PER_NODE} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1"
+          fi
+          for collection in full partial; do
+            # Rebuild the command for every run so that flags from the previous
+            # collection mode (notably --output) do not accumulate.
+            NSYS_JAX="nsys-jax --nsys-jax-analysis communication --nsys-jax-analysis summary"
+            NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution.zip"
+            if [[ "${collection}" == "partial" ]]; then
+              NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop"
+              # nvbug/4801401
+              NSYS_JAX+=" --sample=none"
+            fi
+            # tee runs on the host, so the log is written to the host side of the
+            # mount. The command strings are left unquoted so that they word-split.
+            if ! ${DOCKER} ${NSYS_JAX} -- ${APP} |& tee "${PWD}/${mode}-${collection}-execution.log"; then
+              num_failures=$((num_failures + 1))
+            fi
+          done
+        done
+        ls -R .
+        exit "${num_failures}"
+      # NOTE(review): the values written below are hard-coded; the commented-out
+      # lines count patterns in test-*.log files instead. Confirm which is wanted.
+      STATISTICS_SCRIPT: |
+        # errors=$(cat test-*.log | grep -c 'ERROR:' || true)
+        # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true)
+        # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true)
+        # total_tests=$((failed_tests + passed_tests))
+        # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+        # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+        # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+        # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+        echo "TOTAL_TESTS=3" >> "${GITHUB_OUTPUT}"
+        echo "ERRORS=1" >> "${GITHUB_OUTPUT}"
+        echo "PASSED_TESTS=1" >> "${GITHUB_OUTPUT}"
+        echo "FAILED_TESTS=1" >> "${GITHUB_OUTPUT}"
+      ARTIFACTS: |
+        1-process-full-execution.log
+        1-process-partial-execution.log
+        2-process-full-execution.log
+        2-process-partial-execution.log
+        process-per-gpu-full-execution.log
+        process-per-gpu-partial-execution.log
+    secrets: inherit