Update badge links in README (#538)

This is a preliminary update to the landing page badges, due to our recent transition to a unified nightly/presubmit workflow. Going forward, we will no longer use any badges generated by GitHub. Instead, all badge endpoint JSON files are created in the CI workflow on a per-job basis, and collectively uploaded as a GitHub gist by `_finalize.yaml`. Known issues to be addressed in follow-up work: - docker container badges currently are not linked to the latest ghcr.io tags. This is because the GitHub Container Registry uses a 'version number' to key the tag, and that prevents us from having a static link that points to the latest tag of an image. - a few badges are missing: - TE multi-GPU badges are created but have the wrong suffix and hence are not properly recognized during workflow finalization. - Levanter tests were previously disabled and have just been enabled by this PR. The associated badges will be created during the next scheduled run. - Rosetta T5X test seems to be failing.
NVIDIA · Feb 15, 2024 · 1f5e4d6 · 1f5e4d6
1 parent 8498409
commit 1f5e4d6
Show file tree

Hide file tree

Showing 8 changed files with 290 additions and 184 deletions.
diff --git a/.github/workflows/_build.yaml b/.github/workflows/_build.yaml
@@ -135,7 +135,7 @@ jobs:
             BUILD_DATE=${{ inputs.BUILD_DATE }}
 
       - name: Generate sitrep
-        if: success() || failure()
+        if: "!cancelled()"
         shell: bash -x -e {0}
         run: |
           # bring in utility functions
@@ -169,6 +169,7 @@ jobs:
           > ${{ env.BADGE_FILENAME_FULL }}
 
       - name: Upload sitrep and badge
+        if: "!cancelled()"
         uses: actions/upload-artifact@v4
         with:
           name: ${{ inputs.ARTIFACT_NAME }}-${{ inputs.ARCHITECTURE }}

diff --git a/.github/workflows/_build_base.yaml b/.github/workflows/_build_base.yaml
@@ -169,7 +169,7 @@ jobs:
             ${{ inputs.BASE_IMAGE != 'latest' && format('BASE_IMAGE={0}', inputs.BASE_IMAGE) }}
         
       - name: Generate sitrep
-        if: success() || failure()
+        if: "!cancelled()"
         shell: bash -x -e {0}
         run: |
           # bring in utility functions
@@ -203,6 +203,7 @@ jobs:
           > ${{ env.BADGE_FILENAME_FULL }}
 
       - name: Upload sitrep and badge
+        if: "!cancelled()"
         uses: actions/upload-artifact@v4
         with:
           name: ${{ inputs.ARTIFACT_NAME }}-${{ inputs.ARCHITECTURE }}

diff --git a/.github/workflows/_build_jax.yaml b/.github/workflows/_build_jax.yaml
@@ -163,7 +163,7 @@ jobs:
             BUILD_DATE=${{ inputs.BUILD_DATE }}
 
       - name: Generate sitrep
-        if: success() || failure()
+        if: "!cancelled()"
         shell: bash -x -e {0}
         run: |
           # bring in utility functions
@@ -212,6 +212,7 @@ jobs:
       #     path: image-name.txt
 
       - name: Upload sitrep and badge
+        if: "!cancelled()"
         uses: actions/upload-artifact@v4
         with:
           name: ${{ inputs.ARTIFACT_NAME }}-${{ inputs.ARCHITECTURE }}

diff --git a/.github/workflows/_build_rosetta.yaml b/.github/workflows/_build_rosetta.yaml
@@ -142,7 +142,7 @@ jobs:
             BASE_IMAGE=${{ steps.defaults.outputs.BASE_IMAGE }}
             
       - name: Generate sitrep
-        if: success() || failure()
+        if: "!cancelled()"
         shell: bash -x -e {0}
         run: |
           # bring in utility functions
@@ -176,6 +176,7 @@ jobs:
           > ${{ env.BADGE_FILENAME_FULL }}
 
       - name: Upload sitrep and badge
+        if: "!cancelled()"
         uses: actions/upload-artifact@v4
         with:
           name: ${{ env.ARTIFACT_NAME_FULL }}

diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
@@ -244,6 +244,17 @@ jobs:
       BADGE_FILENAME: badge-pallas-unit-test
     secrets: inherit
 
+  # Levanter unit tests, delegated to the shared unit-test workflow and run
+  # against the image produced by the build-levanter job.
+  test-levanter:
+    needs: build-levanter
+    if: inputs.ARCHITECTURE == 'amd64'  # no arm64 runners available
+    uses: ./.github/workflows/_test_unit.yaml
+    with:
+      TEST_NAME: levanter
+      IMAGE: ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}
+      BADGE_FILENAME: badge-levanter-unit-test
+      ARTIFACT_NAME: artifact-levanter-unit-test
+    secrets: inherit
+
   test-upstream-pax:
     needs: build-pax
     if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
@@ -259,12 +270,3 @@ jobs:
     with:
       PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}
     secrets: inherit
-
-  # TODO: re-activate after 20-workflow limit resolved
-  # test-levanter:
-  #   needs: build-levanter
-  #   if: inputs.ARCHITECTURE == 'amd64'  # arm64 runners n/a
-  #   uses: ./.github/workflows/_test_levanter.yaml
-  #   with:
-  #     JAX_IMAGE: ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}
-  #   secrets: inherit
diff --git a/.github/workflows/_finalize.yaml b/.github/workflows/_finalize.yaml
@@ -3,6 +3,11 @@ name: ~finalize workflow run with job report and badge updates
 on:
   workflow_call:
     inputs:
+      BUILD_DATE:
+        type: string
+        description: 'Date of the build'
+        default: ''
+        required: false
       PUBLISH_BADGE:
         type: boolean
         description: Update the landing page badges with run outcomes
@@ -14,32 +19,42 @@ on:
         required: false
         default: 'artifact-final-report'
 jobs:
-  # show-containers:
-  #   runs-on: ubuntu-22.04
-  #   steps:
-  #     - name: Generate job summary for container build
-  #       shell: bash -x -e {0}
-  #       run: |
-  #         cat > $GITHUB_STEP_SUMMARY << EOF
-  #         # Images created
-
-  #         | Image        | Link                                               |
-  #         | ------------ | -------------------------------------------------- |
-  #         | Base         | ${{ needs.amd64.outputs.TAG_BASE }}                |
-  #         |              | ${{ needs.arm64.outputs.TAG_BASE }}                |
-  #         | JAX          | ${{ needs.amd64.outputs.TAG_JAX }}                 |
-  #         |              | ${{ needs.arm64.outputs.TAG_JAX }}                 |
-  #         | T5X          | ${{ needs.amd64.outputs.TAG_T5X }}                 |
-  #         |              | ${{ needs.arm64.outputs.TAG_T5X }}                 |
-  #         | PAX          | ${{ needs.amd64.outputs.TAG_PAX }}                 |
-  #         |              | ${{ needs.arm64.outputs.TAG_PAX }}                 |
-  #         EOF
-
-  #         # | ROSETTA(t5x) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} |
-  #         # | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} |
+  # create shields.io badge endpoint JSON for workflow metadata
+  # (run id plus build date) and upload it as an artifact
+  workflow-badge:
+    runs-on: ubuntu-22.04
+    env:
+      ARTIFACT_NAME: 'artifact-workflow-metadata'
+      BADGE_FILENAME: 'badge-workflow-metadata.json'
+    steps:
+      - name: Checkout the repository
+        uses: actions/checkout@v4
+
+      - name: Generate badge
+        shell: bash -x -e {0}
+        env:
+          # Hand the expression result to the script through the environment
+          # rather than splicing it into the script text, so that its content
+          # can never be interpreted as shell syntax.
+          # NOTE(review): github.event.created_at may be absent from some
+          # event payloads, which would leave the date empty - confirm for
+          # the triggers in use.
+          BADGE_DATE: ${{ inputs.BUILD_DATE || github.event.created_at }}
+        run: |
+          # bring in utility functions
+          source .github/workflows/scripts/to_json.sh
+
+          badge_label='workflow metadata'
+          # GITHUB_RUN_ID is the runner-provided equivalent of github.run_id
+          badge_message="Run ${GITHUB_RUN_ID}, ${BADGE_DATE}"
+
+          schemaVersion=1 \
+          label="${badge_label}" \
+          message="${badge_message}" \
+          color="bisque" \
+          to_json schemaVersion label message color \
+          > "${BADGE_FILENAME}"
+
+      - name: Upload sitrep and badge
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.ARTIFACT_NAME }}
+          path: |
+            ${{ env.BADGE_FILENAME }}
 
   upload-badge:
     runs-on: ubuntu-22.04
+    needs: [workflow-badge]
     env:
       # Name/bash regex for shields.io endpoint JSON files
       BADGE_FILES: '*badge*.json'
@@ -154,6 +169,7 @@ jobs:
         with:
           name: ${{ inputs.ARTIFACT_NAME }}
           path: ./sitrep.json
+
   publish-badge:
     needs: [upload-badge]
     if: inputs.PUBLISH_BADGE == true
@@ -176,9 +192,21 @@ jobs:
               gist_id: srcId
             });
 
-            // Prepare file upload
+            // Fetch existing files from destination gist
+            const { data: dstData } = await github.rest.gists.get({
+              gist_id: dstId
+            });
+
+            // Mark existing files in destination gist for deletion
             let filesToUpdate = {};
-            pattern = new RegExp(`${PUBLISH_BADGE_FILES}`);
+            for (const filename of Object.keys(dstData.files)) {
+              filesToUpdate[filename] = {
+                content: null
+              };
+            }
+
+            // Add or update files based on the pattern
+            const pattern = new RegExp(`${PUBLISH_BADGE_FILES}`);
             for (const [filename, fileObj] of Object.entries(srcData.files)) {
               if (filename.match(pattern)) {
                 filesToUpdate[filename] = {
@@ -194,4 +222,3 @@ jobs:
             });
             console.log("Files copied successfully.");
             console.log(Object.keys(filesToUpdate));
-            
diff --git a/.github/workflows/_test_unit.yaml b/.github/workflows/_test_unit.yaml
@@ -214,3 +214,88 @@ jobs:
             test-pallas.log
             sitrep.json
             ${{ env.BADGE_FILENAME_FULL }}
+
+  # Runs the Levanter pytest suite inside the image under test, once per GPU
+  # architecture, then writes a sitrep and a badge JSON file and uploads both
+  # together with the test log.
+  levanter-unit-test:
+    if: ${{ inputs.TEST_NAME == 'levanter' }}
+    strategy:
+      fail-fast: false
+      matrix:
+        GPU_ARCH: [V100, A100]
+    # ensures A100 job lands on dedicated runner for this particular job
+    runs-on: [self-hosted, "${{ matrix.GPU_ARCH == 'A100' && format('{0}:{1}', matrix.GPU_ARCH, github.run_id) || matrix.GPU_ARCH }}"]
+    env:
+      BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ matrix.GPU_ARCH }}.json
+    steps:
+      - name: Print environment variables
+        run: env
+
+      - name: Print GPU information
+        run: nvidia-smi
+
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Pull Levanter image
+        shell: bash -x -e {0}
+        run: |
+          docker pull ${{ inputs.IMAGE }}
+
+      # The pytest output is captured in test-levanter.log, which the sitrep
+      # step below parses to build the badge.
+      - name: Test with pytest
+        run: |
+          docker run --gpus all --shm-size=1g ${{ inputs.IMAGE }} bash -ec "pip install flake8 pytest test_util && pytest /opt/levanter/tests" | tee test-levanter.log
+
+      - name: Generate sitrep
+        # still report (as an error badge) when an earlier step failed
+        if: "!cancelled()"
+        shell: bash -x -e {0}
+        run: |
+          # bring in utility functions
+          source .github/workflows/scripts/to_json.sh
+
+          badge_label='${{ matrix.GPU_ARCH }} Unit'
+
+          # pytest ends its report with a summary line such as
+          #   ==== 2 failed, 40 passed, 3 skipped in 12.34s ====
+          # Take the last such line and read the counters from it. A missing
+          # log or summary line leaves every counter at 0, which is reported
+          # as an error below.
+          summary_line=$(cat test-*.log | grep -E '^=+ .* in [0-9.]+s' | tail -n 1 || true)
+
+          # print the number preceding the word matched by $1, or 0 if absent
+          count_of() {
+            local n
+            n=$(grep -oE "[0-9]+ $1" <<< "${summary_line}" | head -n 1 | grep -oE '^[0-9]+' || true)
+            echo "${n:-0}"
+          }
+
+          errors=$(count_of 'errors?')
+          failed_tests=$(count_of 'failed')
+          passed_tests=$(count_of 'passed')
+          total_tests=$((failed_tests + passed_tests))
+
+          # arithmetic comparison; [[ a > b ]] would compare as strings
+          if (( errors > 0 )) || (( total_tests == 0 )); then
+            badge_message='error'
+            badge_color=red
+            summary='Levanter unit test on ${{ matrix.GPU_ARCH }} did not complete due to errors.'
+          else
+            badge_message="${passed_tests}/${total_tests} passed"
+            if (( failed_tests == 0 )); then
+              badge_color=brightgreen
+            else
+              badge_color=yellow
+            fi
+            summary="Levanter unit test on ${{ matrix.GPU_ARCH }}: $badge_message"
+          fi
+
+          to_json \
+            summary \
+            errors total_tests passed_tests failed_tests \
+            badge_label badge_color badge_message \
+          > sitrep.json
+
+          schemaVersion=1 \
+          label="${badge_label}" \
+          message="${badge_message}" \
+          color="${badge_color}" \
+          to_json schemaVersion label message color \
+          > ${{ env.BADGE_FILENAME_FULL }}
+
+      - name: Upload artifacts
+        # upload whatever was produced, even if an earlier step failed
+        if: "!cancelled()"
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ inputs.ARTIFACT_NAME }}-${{ matrix.GPU_ARCH }}
+          path: |
+            test-levanter.log
+            sitrep.json
+            ${{ env.BADGE_FILENAME_FULL }}