NVIDIA · DwarKapex · Aug 26, 2024 · Aug 26, 2024 · Aug 26, 2024 · Aug 26, 2024
diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
@@ -275,12 +275,12 @@ jobs:
     with:
       TEST_NAME: jax
       EXECUTE: |
-        docker run -i --shm-size=1g --gpus all \
+        docker run -i --shm-size=1g --gpus all --ulimit nofile=65536:65536 \
         ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
         bash <<"EOF" |& tee test-backend-independent.log
           test-jax.sh -b backend-independent 
         EOF
-        docker run -i --shm-size=1g --gpus all \
+        docker run -i --shm-size=1g --gpus all --ulimit nofile=65536:65536 \
         ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-        bash <<"EOF" |& tee tee test-gpu.log
+        bash <<"EOF" |& tee test-gpu.log
           test-jax.sh -b gpu
@@ -432,7 +432,7 @@ jobs:
     with:
       TEST_NAME: pallas
       EXECUTE: |
-        docker run -i --shm-size=1g --gpus all --volume $PWD:/output \
+        docker run -i --shm-size=1g --gpus all --ulimit nofile=65536:65536 --volume $PWD:/output \
         ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
         bash <<"EOF" |& tee test-pallas.log
           python /opt/jax/tests/pallas/pallas_test.py --xml_output_file /output/pallas_test.xml
@@ -458,7 +458,7 @@ jobs:
     with:
       TEST_NAME: triton
       EXECUTE: |
-        docker run -i --shm-size=1g --gpus all --volume $PWD:/output \
+        docker run -i --shm-size=1g --gpus all --ulimit nofile=65536:65536 --volume $PWD:/output \
         ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \
         bash <<"EOF" |& tee test-triton.log
           # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch...
@@ -486,7 +486,7 @@ jobs:
     with:
       TEST_NAME: levanter
       EXECUTE: |
-        docker run -i --gpus all --shm-size=1g \
+        docker run -i --gpus all --shm-size=1g --ulimit nofile=65536:65536 \
         ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \
         bash <<"EOF" |& tee test-levanter.log
           pip install flake8 pytest soundfile librosa
@@ -513,7 +513,7 @@ jobs:
     with:
       TEST_NAME: te
       EXECUTE: |
-        docker run -i --gpus all --shm-size=1g -v $PWD:/log \
+        docker run -i --gpus all --shm-size=1g --ulimit nofile=65536:65536 -v $PWD:/log \
         ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \
         bash <<"EOF" |& tee test-te.log
           pip install pytest-reportlog
@@ -557,7 +557,7 @@ jobs:
     with:
       TEST_NAME: gemma
       EXECUTE: |
-        docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \
+        docker run --shm-size=1g --gpus all --ulimit nofile=65536:65536 ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \
         bash -ec \
         "cd /opt/gemma && pip install -e .[test] && pytest ." | tee test-gemma.log
       STATISTICS_SCRIPT: |

diff --git a/.github/workflows/_runner_ondemand_slurm.yaml b/.github/workflows/_runner_ondemand_slurm.yaml
@@ -91,6 +91,7 @@ jobs:
           time docker run \
             --network host \
             --gpus all \
+            --ulimit nofile=65536:65536 \
             --privileged \
             -v /runner \
             -e RUNNER_NAME="${{ steps.meta.outputs.JOB_NAME }}" \

diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml
@@ -4,38 +4,24 @@ on:
   workflow_dispatch:
 
 jobs:
-  sandbox:
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Login to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Print usage
-        run: |
-          cat << EOF
-          This is an empty workflow file located in the main branch of your
-          repository. It serves as a testing ground for new GitHub Actions on
-          development branches before merging them to the main branch. By
-          defining and overloading this workflow on your development branch,
-          you can test new actions without affecting your main branch, ensuring
-          a smooth integration process once the changes are ready to be merged.
-
-          Usage:
-
-          1. In your development branch, modify the sandbox.yml workflow file
-             to include the new actions you want to test. Make sure to commit
-             the changes to the development branch.
-          2. Navigate to the 'Actions' tab in your repository, select the
-             '~Sandbox' workflow, and choose your development branch from the
-             branch dropdown menu. Click on 'Run workflow' to trigger the
-             workflow on your development branch.
-          3. Once you have tested and verified the new actions in the Sandbox
-             workflow, you can incorporate them into your main workflow(s) and
-             merge the development branch into the main branch. Remember to
-             revert the changes to the sandbox.yml file in the main branch to
-             keep it empty for future testing.
-          EOF
+  test-levanter:  # Sandbox job: runs the Levanter unit tests with raised container ulimits.
+    uses: ./.github/workflows/_test_unit.yaml
+    with:  # EXECUTE must produce test-levanter.log: STATISTICS_SCRIPT and ARTIFACTS both read it.
+      TEST_NAME: levanter
+      EXECUTE: |
+        docker run --gpus all --shm-size=1g --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --ulimit nofile=1048576:1048576 \
+        ghcr.io/nvidia/jax:levanter \
+        bash -ec "ulimit -a; pip install flake8 pytest soundfile librosa; PYTHONPATH=/opt/levanter/tests:\$PYTHONPATH pytest /opt/levanter/tests" |& tee test-levanter.log
+      STATISTICS_SCRIPT: |
+        summary_line=$(tail -n1 test-levanter.log)
+        errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
+        failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
+        passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
+        total_tests=$((failed_tests + passed_tests))
+        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+      ARTIFACTS: |
+        test-levanter.log
+    secrets: inherit  # Forward the calling workflow's secrets to the reusable workflow.
diff --git a/.github/workflows/_test_rosetta.yaml b/.github/workflows/_test_rosetta.yaml
@@ -47,7 +47,7 @@ jobs:
           docker tag ${{ inputs.ROSETTA_IMAGE }} rosetta:latest
 
       - name: Run Rosetta tests w/ docker
-        shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log rosetta:latest bash -x -e /cmd.sh
+        shell: docker run --gpus all --ulimit nofile=65536:65536 -v {0}:/cmd.sh -v /log:/log rosetta:latest bash -x -e /cmd.sh
         run: |
           ROSETTA_PATH=$(dirname $(python -c "import rosetta; print(*rosetta.__path__)"))
           pip install "${ROSETTA_PATH}[test]" pytest-reportlog

diff --git a/.github/workflows/mjx-build-test.yaml b/.github/workflows/mjx-build-test.yaml
@@ -185,7 +185,7 @@ jobs:
         shell: bash -x -e {0}
         continue-on-error: true
         run: |
-          docker run --gpus=all --shm-size=1g ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} bash -ec "mjx-testspeed --mjcf=humanoid/humanoid.xml --batch_size=8192 --unroll=4 --output=tsv" | tee -a test-mjx.log
+          docker run --gpus=all --ulimit nofile=65536:65536 --shm-size=1g ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} bash -ec "mjx-testspeed --mjcf=humanoid/humanoid.xml --batch_size=8192 --unroll=4 --output=tsv" | tee -a test-mjx.log
 
       - name: Save perf to summary
         shell: bash -x -e {0}