From 6ace13ba7063321bf1ab69e10a32c2d916f535b5 Mon Sep 17 00:00:00 2001 From: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> Date: Thu, 22 Jun 2023 10:49:57 -0500 Subject: [PATCH 01/16] Pax new completion badge (#94) * save slurm status to json * publish badge using slurm statuses --- .github/workflows/_test_pax.yaml | 10 ++++++ .github/workflows/nightly-pax-test-mgmn.yaml | 32 +++++++++++++++++++- README.md | 2 +- 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index e3c4c9051..b14e8f3d4 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -132,6 +132,16 @@ jobs: output/ \ ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true + - name: Write SLURM job status to file + shell: bash -x -e {0} + run: | + python << EOF + import json + with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: + dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} + json.dump(dump, f) + EOF + - name: Upload training logs as artifacts uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/nightly-pax-test-mgmn.yaml b/.github/workflows/nightly-pax-test-mgmn.yaml index 5f8cbb594..bed6dc68c 100644 --- a/.github/workflows/nightly-pax-test-mgmn.yaml +++ b/.github/workflows/nightly-pax-test-mgmn.yaml @@ -33,6 +33,7 @@ jobs: outputs: BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} PAX_IMAGE: ${{ steps.date.outputs.PAX_IMAGE }} + PUBLISH: ${{ steps.date.outputs.PUBLISH }} steps: - name: Set metadata id: date @@ -44,6 +45,7 @@ jobs: PAX_IMAGE=${{ inputs.PAX_IMAGE }} PAX_IMAGE=${PAX_IMAGE:-${{ env.DEFAULT_PAX_IMAGE }}} echo "PAX_IMAGE=${PAX_IMAGE}" >> $GITHUB_OUTPUT + echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT run-jobs: needs: metadata @@ -100,8 +102,36 @@ jobs: EOF ) | tee 
$GITHUB_STEP_SUMMARY + publish-completion: + needs: [metadata, run-jobs] + uses: ./.github/workflows/_publish_badge.yaml + if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + secrets: inherit + with: + ENDPOINT_FILENAME: 'pax-test-completion-status.json' + PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }} + SCRIPT: | + EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*TP*PP/*-status.json" + PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) + FAILED_TESTS=$(jq -r '. | select ((.state == "FAILED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) + TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l) + + echo "Test statuses:" + jq -rc 'input_filename,.' $EXIT_STATUSES + + if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; then + BADGE_COLOR=brightgreen + elif [[ $PASSED_TESTS -eq 0 ]]; then + BADGE_COLOR=red + else + BADGE_COLOR=yellow + fi + echo "LABEL='Completion'" >> $GITHUB_OUTPUT + echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT + echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT + if-upstream-failed: runs-on: ubuntu-latest if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch' steps: - - run: echo 'Upstream workflow failed, aborting run' && exit 1 \ No newline at end of file + - run: echo 'Upstream workflow failed, aborting run' && exit 1 diff --git a/README.md b/README.md index 5dcfcf535..092b69084 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ [test-badge-jax]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fjax-unit-test-status.json&logo=nvidia [test-badge-t5x]: 
https://img.shields.io/github/actions/workflow/status/NVIDIA/JAX-Toolbox/nightly-t5x-test-mgmn.yaml?branch=main&label=A100%20MGMN&logo=nvidia -[test-badge-pax]: https://img.shields.io/github/actions/workflow/status/NVIDIA/JAX-Toolbox/nightly-pax-test-mgmn.yaml?branch=main&label=A100%20MGMN&logo=nvidia +[test-badge-pax]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fpax-test-completion-status.json&logo=nvidia [unit-test-badge-te]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fte-unit-test-status.json&logo=nvidia [integration-test-badge-te]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fte-integration-test-status.json&logo=nvidia [test-badge-rosetta-t5x]: https://img.shields.io/github/actions/workflow/status/NVIDIA/JAX-Toolbox/nightly-rosetta-t5x-build.yaml?branch=main&label=A100%20MGMN&logo=nvidia From 2f88b5ccc07f7914251f001102cb304d791ec02c Mon Sep 17 00:00:00 2001 From: Abhinav Goel Date: Thu, 22 Jun 2023 11:11:16 -0700 Subject: [PATCH 02/16] Update patchlist-praxis.txt --- rosetta/patchlist-praxis.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rosetta/patchlist-praxis.txt b/rosetta/patchlist-praxis.txt index 632e963ce..8ecf1fd00 100644 --- a/rosetta/patchlist-praxis.txt +++ b/rosetta/patchlist-praxis.txt @@ -5,3 +5,6 @@ # - External Pull Requests (These are pull requests with upstream praxis and are of the form "pull/$PULLID/head") # - Note: Only the first column is used as a git-ref, so anything after is a comment + +pull/17/head # This PR creates an option to remove the NaN check at the end of every microbatch when using Pipeline Parallelism. This allows cublas to fuse the GeMMs with the gradient accumulation leading to a 6% perf improvement. 
+ From f2199a7cd5e5e11ab7aa5643250c3c80be50226d Mon Sep 17 00:00:00 2001 From: Abhinav Goel Date: Thu, 22 Jun 2023 11:13:01 -0700 Subject: [PATCH 03/16] Update patchlist-praxis.txt --- rosetta/patchlist-praxis.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/rosetta/patchlist-praxis.txt b/rosetta/patchlist-praxis.txt index 8ecf1fd00..42c1acc6b 100644 --- a/rosetta/patchlist-praxis.txt +++ b/rosetta/patchlist-praxis.txt @@ -4,6 +4,7 @@ # - Internal patches (These are branches that start with "patch/") # - External Pull Requests (These are pull requests with upstream praxis and are of the form "pull/$PULLID/head") # - Note: Only the first column is used as a git-ref, so anything after is a comment +pull/18/head # This PR allows XLA:GPU to detect the MHA pattern more easily to call fused kernels from cublas. pull/17/head # This PR creates an option to remove the NaN check at the end of every microbatch when using Pipeline Parallelism. This allows cublas to fuse the GeMMs with the gradient accumulation leading to a 6% perf improvement. 
From 6079a442c41edc86f7a41f7c0bd7b163f18c2aa4 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Mon, 26 Jun 2023 09:26:58 -0700 Subject: [PATCH 04/16] adds proper build/test badges for rosetta && fixes gist upload for new files --- .github/workflows/_publish_badge.yaml | 15 ++++- .github/workflows/_test_rosetta.yaml | 27 +++++++- .../workflows/nightly-rosetta-pax-build.yaml | 63 ++++++++++++++++++- .../workflows/nightly-rosetta-t5x-build.yaml | 63 ++++++++++++++++++- README.md | 12 ++-- 5 files changed, 166 insertions(+), 14 deletions(-) diff --git a/.github/workflows/_publish_badge.yaml b/.github/workflows/_publish_badge.yaml index a8e2325f3..2d3b4b350 100644 --- a/.github/workflows/_publish_badge.yaml +++ b/.github/workflows/_publish_badge.yaml @@ -79,6 +79,15 @@ jobs: gist_id: gistId, files: { [filename]: { content }, - ...Object.fromEntries(Object.entries(gist.files).filter(([name]) => name !== filename)) - } - }); + ...Object.fromEntries( + Object.entries(gist.files) + .filter(([name]) => name !== filename) + .map(([name, value]) => [ + name, + Object.fromEntries( + Object.entries(value).filter(([nestedKey]) => nestedKey === "content") + ), + ]) + ), + }, + }); \ No newline at end of file diff --git a/.github/workflows/_test_rosetta.yaml b/.github/workflows/_test_rosetta.yaml index ddf4700f1..d233f7338 100644 --- a/.github/workflows/_test_rosetta.yaml +++ b/.github/workflows/_test_rosetta.yaml @@ -8,14 +8,23 @@ on: description: 'Rosetta image build by NVIDIA/JAX-Toolbox' required: true default: 'ghcr.io/nvidia/rosetta-t5x:latest' + outputs: + TEST_ARTIFACT_NAME: + description: 'Name of the unit test artifact for downstream workflows' + value: ${{ jobs.rosetta-tests.outputs.TEST_ARTIFACT_NAME }} + +env: + TEST_ARTIFACT_NAME: test-logs jobs: rosetta-tests: strategy: matrix: - MARKERS: ["", "-m integration"] + TEST_TYPE: ["unit", "integration"] fail-fast: false runs-on: [self-hosted, compute, V100] + outputs: + TEST_ARTIFACT_NAME: ${{ env.TEST_ARTIFACT_NAME }} 
steps: - name: Print environment variables run: | @@ -35,9 +44,21 @@ jobs: shell: bash -x -e {0} run: | docker pull ${{ inputs.ROSETTA_IMAGE }} + docker tag ${{ inputs.ROSETTA_IMAGE }} rosetta:latest - name: Run Rosetta tests w/ docker - shell: bash -x -e {0} + shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log rosetta:latest bash -x -e /cmd.sh run: | - docker run --gpus all ${{ inputs.ROSETTA_IMAGE }} sh -c "pip install '/opt/rosetta[test]' && pytest /opt/rosetta ${{ matrix.MARKERS }}" + EXTRA_ARGS="" + if [[ ${{ matrix.TEST_TYPE }} == integration ]]; then + EXTRA_ARGS="-m integration" + fi + ROSETTA_PATH=$(dirname $(python -c "import rosetta; print(*rosetta.__path__)")) + pip install "${ROSETTA_PATH}[test]" pytest-reportlog + pytest --report-log=/log/${{ matrix.TEST_TYPE }}-report.jsonl ${ROSETTA_PATH} ${EXTRA_ARGS} || true + - name: Upload unit test json logs + uses: actions/upload-artifact@v3 + with: + name: ${{ env.TEST_ARTIFACT_NAME }} + path: /log/${{ matrix.TEST_TYPE }}-report.jsonl \ No newline at end of file diff --git a/.github/workflows/nightly-rosetta-pax-build.yaml b/.github/workflows/nightly-rosetta-pax-build.yaml index f0be2d1d2..a69cd35b7 100644 --- a/.github/workflows/nightly-rosetta-pax-build.yaml +++ b/.github/workflows/nightly-rosetta-pax-build.yaml @@ -36,6 +36,7 @@ jobs: BUILD_DATE: ${{ steps.meta-vars.outputs.BUILD_DATE }} BASE_LIBRARY: ${{ steps.meta-vars.outputs.BASE_LIBRARY }} BASE_IMAGE: ${{ steps.meta-vars.outputs.BASE_IMAGE }} + PUBLISH: ${{ steps.meta-vars.outputs.PUBLISH }} steps: - name: Set build metadata id: meta-vars @@ -50,6 +51,7 @@ jobs: echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT echo "BASE_IMAGE=${BASE_IMAGE}" >> $GITHUB_OUTPUT + echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT build: if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' @@ -61,6 +63,26 @@ jobs: 
BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE }} secrets: inherit + publish-build: + needs: [metadata, build] + uses: ./.github/workflows/_publish_badge.yaml + if: ( success() || failure() ) && (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + secrets: inherit + with: + ENDPOINT_FILENAME: 'rosetta-pax-build-status.json' + PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }} + SCRIPT: | + if [[ ${{ needs.build.result }} == "success" ]]; then + BADGE_COLOR=brightgreen + MSG=passing + else + BADGE_COLOR=red + MSG=failing + fi + echo "LABEL='nightly'" >> $GITHUB_OUTPUT + echo "MESSAGE='${MSG}'" >> $GITHUB_OUTPUT + echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT + test: if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' needs: build @@ -69,7 +91,46 @@ jobs: ROSETTA_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} secrets: inherit - publish: + publish-test: + needs: [metadata, build, test] + uses: ./.github/workflows/_publish_badge.yaml + if: ( success() || failure() ) && (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + secrets: inherit + with: + ENDPOINT_FILENAME: 'rosetta-pax-test-status.json' + PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }} + SCRIPT: | + ARTIFACTS="${{ needs.test.outputs.TEST_ARTIFACT_NAME }}/*.jsonl" + all_outcomes() { + cat $ARTIFACTS | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome' + } + cnt_type() { + cat $ARTIFACTS | jq '. 
| select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l + } + SKIPPED_TESTS=$(cnt_type skipped) + FAILED_TESTS=$(cnt_type failed) + PASSED_TESTS=$(cnt_type passed) + TOTAL_TESTS=$(all_outcomes | wc -l) + echo "Unit/Integration test breakdown:" + all_outcomes | sort | uniq -c + if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]]; then + BADGE_COLOR=brightgreen + else + if [[ $PASSED_TESTS -eq 0 ]]; then + BADGE_COLOR=red + else + BADGE_COLOR=yellow + fi + fi + echo "LABEL='V100'" >> $GITHUB_OUTPUT + if [[ ${{ needs.build.result }} == "success" ]]; then + echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT + else + echo "MESSAGE='n/a'" >> $GITHUB_OUTPUT + fi + echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT + + publish-container: if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) needs: [metadata, build] uses: ./.github/workflows/_publish_container.yaml diff --git a/.github/workflows/nightly-rosetta-t5x-build.yaml b/.github/workflows/nightly-rosetta-t5x-build.yaml index 38c38f8ad..3a26e2c13 100644 --- a/.github/workflows/nightly-rosetta-t5x-build.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build.yaml @@ -36,6 +36,7 @@ jobs: BUILD_DATE: ${{ steps.meta-vars.outputs.BUILD_DATE }} BASE_LIBRARY: ${{ steps.meta-vars.outputs.BASE_LIBRARY }} BASE_IMAGE: ${{ steps.meta-vars.outputs.BASE_IMAGE }} + PUBLISH: ${{ steps.meta-vars.outputs.PUBLISH }} steps: - name: Set build metadata id: meta-vars @@ -50,6 +51,7 @@ jobs: echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT echo "BASE_IMAGE=${BASE_IMAGE}" >> $GITHUB_OUTPUT + echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT build: if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || 
github.event_name == 'workflow_dispatch' @@ -61,6 +63,26 @@ jobs: BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE }} secrets: inherit + publish-build: + needs: [metadata, build] + uses: ./.github/workflows/_publish_badge.yaml + if: ( success() || failure() ) && (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + secrets: inherit + with: + ENDPOINT_FILENAME: 'rosetta-t5x-build-status.json' + PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }} + SCRIPT: | + if [[ ${{ needs.build.result }} == "success" ]]; then + BADGE_COLOR=brightgreen + MSG=passing + else + BADGE_COLOR=red + MSG=failing + fi + echo "LABEL='nightly'" >> $GITHUB_OUTPUT + echo "MESSAGE='${MSG}'" >> $GITHUB_OUTPUT + echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT + test: if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' needs: build @@ -69,7 +91,46 @@ jobs: ROSETTA_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} secrets: inherit - publish: + publish-test: + needs: [metadata, build, test] + uses: ./.github/workflows/_publish_badge.yaml + if: ( success() || failure() ) && (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + secrets: inherit + with: + ENDPOINT_FILENAME: 'rosetta-t5x-test-status.json' + PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }} + SCRIPT: | + ARTIFACTS="${{ needs.test.outputs.TEST_ARTIFACT_NAME }}/*.jsonl" + all_outcomes() { + cat $ARTIFACTS | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome' + } + cnt_type() { + cat $ARTIFACTS | jq '. 
| select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l + } + SKIPPED_TESTS=$(cnt_type skipped) + FAILED_TESTS=$(cnt_type failed) + PASSED_TESTS=$(cnt_type passed) + TOTAL_TESTS=$(all_outcomes | wc -l) + echo "Unit/Integration test breakdown:" + all_outcomes | sort | uniq -c + if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]]; then + BADGE_COLOR=brightgreen + else + if [[ $PASSED_TESTS -eq 0 ]]; then + BADGE_COLOR=red + else + BADGE_COLOR=yellow + fi + fi + echo "LABEL='V100'" >> $GITHUB_OUTPUT + if [[ ${{ needs.build.result }} == "success" ]]; then + echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT + else + echo "MESSAGE='n/a'" >> $GITHUB_OUTPUT + fi + echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT + + publish-container: if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) needs: [metadata, build] uses: ./.github/workflows/_publish_container.yaml diff --git a/README.md b/README.md index 092b69084..e0e887d22 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,8 @@ | [![container-badge-t5x]][container-link-t5x] | [![build-badge-t5x]][workflow-t5x] | [![test-badge-t5x]][workflow-t5x-perf] | | [![container-badge-pax]][container-link-pax] | [![build-badge-pax]][workflow-pax] | [![test-badge-pax]][workflow-pax-perf] | | [![container-badge-te]][container-link-te] | [![build-badge-te]][workflow-te] | [![unit-test-badge-te]][workflow-te-test]
[![integration-test-badge-te]][workflow-te-test] | -| [![container-badge-rosetta-t5x]][container-link-rosetta-t5x] | [![build-badge-rosetta-t5x]][workflow-rosetta-t5x] | [![test-badge-rosetta-t5x]][workflow-rosetta-t5x] (dummy) | -| [![container-badge-rosetta-pax]][container-link-rosetta-pax] | [![build-badge-rosetta-pax]][workflow-rosetta-pax] | [![test-badge-rosetta-pax]][workflow-rosetta-pax] (dummy) | +| [![container-badge-rosetta-t5x]][container-link-rosetta-t5x] | [![build-badge-rosetta-t5x]][workflow-rosetta-t5x] | [![test-badge-rosetta-t5x]][workflow-rosetta-t5x] | +| [![container-badge-rosetta-pax]][container-link-rosetta-pax] | [![build-badge-rosetta-pax]][workflow-rosetta-pax] | [![test-badge-rosetta-pax]][workflow-rosetta-pax] | [container-badge-base]: https://img.shields.io/static/v1?label=&message=.base&color=gray&logo=docker [container-badge-jax]: https://img.shields.io/static/v1?label=&message=JAX&color=gray&logo=docker @@ -30,8 +30,8 @@ [build-badge-jax]: https://img.shields.io/github/actions/workflow/status/NVIDIA/JAX-Toolbox/nightly-jax-build.yaml?branch=main&label=nightly&logo=github-actions&logoColor=dddddd [build-badge-t5x]: https://img.shields.io/github/actions/workflow/status/NVIDIA/JAX-Toolbox/nightly-t5x-build.yaml?branch=main&label=nightly&logo=github-actions&logoColor=dddddd [build-badge-pax]: https://img.shields.io/github/actions/workflow/status/NVIDIA/JAX-Toolbox/nightly-pax-build.yaml?branch=main&label=nightly&logo=github-actions&logoColor=dddddd -[build-badge-rosetta-t5x]: https://img.shields.io/github/actions/workflow/status/NVIDIA/JAX-Toolbox/nightly-rosetta-t5x-build.yaml?branch=main&label=nightly&logo=github-actions&logoColor=dddddd -[build-badge-rosetta-pax]: https://img.shields.io/github/actions/workflow/status/NVIDIA/JAX-Toolbox/nightly-rosetta-pax-build.yaml?branch=main&label=nightly&logo=github-actions&logoColor=dddddd +[build-badge-rosetta-t5x]: 
https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Frosetta-t5x-build-status.json&logo=github-actions&logoColor=dddddd +[build-badge-rosetta-pax]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Frosetta-pax-build-status.json&logo=github-actions&logoColor=dddddd [build-badge-te]: https://img.shields.io/github/actions/workflow/status/NVIDIA/JAX-Toolbox/nightly-te-build.yaml?branch=main&label=nightly&logo=github-actions&logoColor=dddddd [workflow-base]: https://github.com/NVIDIA/JAX-Toolbox/actions/workflows/weekly-base-build.yaml @@ -47,8 +47,8 @@ [test-badge-pax]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fpax-test-completion-status.json&logo=nvidia [unit-test-badge-te]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fte-unit-test-status.json&logo=nvidia [integration-test-badge-te]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fte-integration-test-status.json&logo=nvidia -[test-badge-rosetta-t5x]: https://img.shields.io/github/actions/workflow/status/NVIDIA/JAX-Toolbox/nightly-rosetta-t5x-build.yaml?branch=main&label=A100%20MGMN&logo=nvidia -[test-badge-rosetta-pax]: https://img.shields.io/github/actions/workflow/status/NVIDIA/JAX-Toolbox/nightly-rosetta-pax-build.yaml?branch=main&label=A100%20MGMN&logo=nvidia +[test-badge-rosetta-t5x]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Frosetta-t5x-test-status.json&logo=nvidia +[test-badge-rosetta-pax]: 
https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Frosetta-pax-test-status.json&logo=nvidia [workflow-jax-unit]: https://github.com/NVIDIA/JAX-Toolbox/actions/workflows/nightly-jax-test-unit.yaml [workflow-t5x-perf]: https://github.com/NVIDIA/JAX-Toolbox/actions/workflows/nightly-t5x-test-mgmn.yaml From f300f08454d8931f5fef670f950cb0c471a65304 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Tue, 27 Jun 2023 19:01:41 +0100 Subject: [PATCH 05/16] fix JAX unit test due to missing matplotlib --- .github/container/test-jax.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/container/test-jax.sh b/.github/container/test-jax.sh index ac4691159..5c94c5eb4 100755 --- a/.github/container/test-jax.sh +++ b/.github/container/test-jax.sh @@ -165,6 +165,10 @@ set -ex ## Install dependencies pip install -r `jax_source_dir`/build/test-requirements.txt +# Reason for manually installing matplotlib: +# https://github.com/google/jax/commit/6b76937c530bd8ee185cc9e1991b3696bd10e831 +# https://github.com/google/jax/blob/6bc74d2a9874e1fe93a45191bb829c07dfee04fa/tests/BUILD#L134 +pip install matplotlib ## Run tests From ed1158cdcd6b70d6744b6a2bf6e09107834e7b4e Mon Sep 17 00:00:00 2001 From: sahilj Date: Tue, 27 Jun 2023 12:54:42 -0700 Subject: [PATCH 06/16] Added T5x patch and README --- rosetta/patchlist-t5x.txt | 1 + rosetta/rosetta/projects/t5x/README.md | 146 +++++++++++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 rosetta/rosetta/projects/t5x/README.md diff --git a/rosetta/patchlist-t5x.txt b/rosetta/patchlist-t5x.txt index a53e0457d..165081c9e 100644 --- a/rosetta/patchlist-t5x.txt +++ b/rosetta/patchlist-t5x.txt @@ -5,3 +5,4 @@ # - External Pull Requests (These are pull requests with upstream t5x and are of the form "pull/$PULLID/head") # - Note: Only the first column is used as a git-ref, so anything after is a comment +pull/1320/head diff --git 
a/rosetta/rosetta/projects/t5x/README.md b/rosetta/rosetta/projects/t5x/README.md new file mode 100644 index 000000000..bd1bec92a --- /dev/null +++ b/rosetta/rosetta/projects/t5x/README.md @@ -0,0 +1,146 @@ +# GPU Scripts and Usage + +The [t5x/contrib/gpu/scripts_gpu](../../t5x/contrib/gpu/scripts_gpu) directory contains scripts optimized for GPU usage and includes FP8 support via [Transformer Engine](https://github.com/NVIDIA/TransformerEngine). + +Install with `pip install -r pile_requirements.txt` to get all pile dependencies. + +## Building the container +We provide a fully built and ready-to-use container here: [ghcr.io/nvidia/t5x:te-fp8-reference](ghcr.io/nvidia/t5x:te-fp8-reference) +If you'd like you build your own, +The Dockerfile in `t5x/contrib/gpu` will build a container with all gpu/pile dependencies. It can be built with `t5x/contrib/gpu/docker/build.sh `. However, we **highly** recommend using the pre-built container. + +## Running interactively +Note: this should only be done with singlenode jobs and/or for downloading the pile. Use `t5x/contrib/gpu/docker/interactive_pull_and_launch.sh`. This takes arguments for the URL to pull a container from and the location of the dataset directory to mount. For example: + +`t5x/contrib/gpu/docker/interactive_pull_and_launch.sh [URL] /my/dataset/dir` + +## Downloading The Pile +We use The Pile for our pretraining experiments. If you would like to as well, run `download_the_pile.py` to download it. The download is approximately 1TB. It will download to the directory set in the environment variable: `TFDS_DATA_DIR`. After that, set the `TFDS_DATA_DIR` to the same directory in your scripts to use. + +## Single Node runs +Pretraining and Finetuning can be done with `singlenode_*.sh`. These will build a T5X model with the Adam optimizer and relevant parameters. These will allow multi-gpu on one host. 
+ +## Multi Node runs +For a SLURM+pyxis cluster, `example*.sub` files provide example slurm submit files (edit with your details), which call `multiprocess*.sh` to execute training. You can add a binding script in the `.sub` file for your cluster, or remove it entirely (dropping some throughput) + +## Convergence and performance +For our Pile convergence runs, we used a Global batch size of 2304 for XXL and 2016-2048 for all other models, where GBS is defined as #GPUs * BS/GPU / Tensor Parallel(TP). Below are example (tested) hardware topologies on NVIDIA DGX A100 (8x A100-SXM4-80G) and H100-SXM-80G nodes. + +| size | GPU | Precision | #GPUs | TP | BS / GPU | Sequences/Sec | Seq/Sec/GPU | Est. Walltime | GPU-days | MNLI 2.0 - matched | SQuAD v1.1 (EM/F1) | Convergence Log | Config | +| ---- | ------------ | --------- | ----- | ----- | -------- | ------------- | ----------- | ------------- | -------- |------------------ | ------------------ | --------------- | ---- | +| [T5-v1.1-small](../t5/t5_1_1/small.gin) | A100 80G SXM | bf16 | 8 | 1 | 256 | ~5712 | 714 | 4.2 days | 33 | 83.06% | 78.33 / 86.63 | [log](https://tensorboard.dev/experiment/lWnHal7PRnOLeZuewyWVxQ/#scalars&_smoothingWeight=0) | [pile](../t5/t5_1_1/examples/small_pile_pretrain.gin) +| [T5-v1.1-large](../t5/t5_1_1/large.gin) | A100 80G SXM | bf16 | 64 | 1 | 32 | ~4853 | 75.8 | 4.8 days | 309 | 90.50% | 87.31 / 94.04 | [log](https://tensorboard.dev/experiment/aOxJBIvTQBeTJ8XGXxaL6Q/#scalars&_smoothingWeight=0) |[pile](../t5/t5_1_1/examples/large_pile_pretrain.gin) +| [T5-v1.1-xl](../t5/t5_1_1/xl.gin) | A100 80G SXM | bf16 | 144 | 1 | 8 | ~3021 | 21.0 | 7.9 days | 1,133 | N/A(perf test) | N/A (perf test) | |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) +| [T5-v1.1-xl](../t5/t5_1_1/xl.gin) | A100 80G SXM | bf16 | 256 | 1 | 8 | ~4322 | 16.9 | 5.5 days | 1,408 | 91.15% | 89.36 / 95.29 | [log](https://tensorboard.dev/experiment/vuRoEYgkRgWiEtbvgxlOqw/#scalars&_smoothingWeight=0) 
|[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) +| [T5-v1.1-xxl](../t5/t5_1_1/xxl.gin) | A100 80G SXM | bf16 | 512 | 8 | 36 | ~1887 | 3.69 | 12.6 days | 6,431 |N/A(partial run) | N/A(partial run) | |[pile](../t5/t5_1_1/examples/xxl_pile_pretrain.gin) +| [T5-v1.1-large](../t5/t5_1_1/large.gin) | **H100 80G SXM** | TE-fp8 | 64 | 1 | 32 | ~10156 | **158.7** | **2.3 days** | **147** | 89.1% | 86.36 / 93.5 | |[pile](../t5/t5_1_1/examples/large_pile_pretrain.gin) +| [T5-v1.1-xl](../t5/t5_1_1/xl.gin) | **H100 80G SXM** | TE-fp8 | 144 | 1 | 14 | ~7257 | **50.4** | **3.3 days** | **475** | N/A (perf test) | N/A (perf test) | |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) +| [T5-v1.1-xl](../t5/t5_1_1/xl.gin) | **H100 80G SXM** | TE-fp8 | 256 | 1 | 8 | ~9688 | **37.8** | **2.4 days** | **614** | N/A (perf test) | N/A (perf test) | |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) + +Note: Convergence (as shown in log) was not necessarily done with the hardware topology listed, but the listed topology is tested. Estimated Walltime is calculated assuming full throughput (seq/sec) continuously. In practice, there are compilation overheads at the beginning of each run/restart(in cluster settings) + checkpointing overheads (if any). + +Other hyperparameters are specified in the associated pile `gin` files in the `contrib/gpu/t5/t5_1_1/examples` directory. + +## Pretraining run commands + +### Multinode +Arguments are set by environment variable as such: + +`PREC={PRECISION} T5_SIZE={SIZE} BSIZE_PER_GPU={BSIZE} ..... sbatch -N {NODE_CT} t5x/contrib/gpu/t5/scripts_gpu/example_slurm_pretrain_pile.sub {GPUS_PER_NODE}` + +All parameters can be found in the relevant script. + +### Example Pretraining Commands +Assumes 8GPU 80GB A100/H100 Nodes. `ENABLE_FP8` uses transformer engine (included in container) and requires H100 + +* Note: To use, FP8 set `ENABLE_FP8` to `1`. This will automatically set `PREC` to `bfloat16` as is required by internals for `FP8` usage. 
+#### [T5-v1.1-small](../t5/t5_1_1/small.gin) (60M): +```sh +PREC=bfloat16 T5_SIZE=small BSIZE_PER_GPU=256 TRAIN_STEPS=1000000 NUM_MICROBATCHES=1 ENABLE_FP8=1 TP_SIZE=1 \ +sbatch -N1 t5x/contrib/gpu/t5/scripts_gpu/example_slurm_pretrain_pile.sub +``` + +#### [T5-v1.1-large](../t5/t5_1_1/large.gin) (770M): +```sh +PREC=bfloat16 T5_SIZE=large BSIZE_PER_GPU=32 TRAIN_STEPS=1000000 NUM_MICROBATCHES=1 ENABLE_FP8=1 TP_SIZE=1 \ +sbatch -N8 t5x/contrib/gpu/t5/scripts_gpu/example_slurm_pretrain_pile.sub +``` + +#### [T5-v1.1-xl](../t5/t5_1_1/xl.gin) (3B): +```sh +PREC=bfloat16 T5_SIZE=large BSIZE_PER_GPU=8 TRAIN_STEPS=1000000 NUM_MICROBATCHES=1 ENABLE_FP8=1 TP_SIZE=1 \ +sbatch -N 32 t5x/contrib/gpu/t5/scripts_gpu/example_slurm_pretrain_pile.sub +``` + +### Example Finetuning Commands +Finetuning commands simply change the script and have an additional `{FT_TASK}` as the first argument (along with relevant hyperparameter changes). Your `MODEL_DIR` should contain the pretrained checkpoint to restore from. + +#### MNLI v2: +```sh +FT_TASK=mnli2 PREC=bfloat16 T5_SIZE={SIZE} BSIZE_PER_GPU={BSIZE} NUM_MICROBATCHES=1 ENABLE_FP8=1 TP_SIZE=1 \ +sbatch -N{NODE_CT} t5x/contrib/gpu/t5/scripts_gpu/example_slurm_ft_frompile.sub +``` + +#### SQuAD v1.1: +```sh +FT_TASK=squad1 PREC=bfloat16 T5_SIZE={SIZE} BSIZE_PER_GPU={BSIZE} NUM_MICROBATCHES=1 ENABLE_FP8=1 TP_SIZE=1 \ +sbatch -N{NODE_CT} t5x/contrib/gpu/t5/scripts_gpu/example_slurm_ft_frompile.sub + +``` + +## Performance Settings: +There are 3 major performance settings: `ENABLE_FP8`, `FUSE_QKV` and `TRANSPOSE_BS` (all of which are controllable via env var in the commands above). +We recommend always enabling `TRANSPOSE_BS` (default), but only using `FUSE_QKV` when using `ENABLE_FP8` for optimal performance. + +On all finetuning runs, we use a Global Batch Size of 256 with bfloat16 precision + FP8. 
+ +WARNING: Finetuning is configured by default to save every checkpoint and delete none (to avoid accidentally deleting your pretrained checkpoint). Watch your disk space! This behavior can be changed in `t5x/configs/runs/finetune_{TASK}.gin`, however this puts the pretrained checkpoint at risk unless backed up. + +### Singlenode (single process) +small: + +```sh +t5x/contrib/gpu/scripts_gpu/singlenode_pretrain_pile.sh \ + small \ + bfloat16 \ + 8 \ + 256 \ + {LOGDIR - create before running} \ + {MODEL_DIR} \ + {GRADIENT_ACCUMULATION (1 by default)} \ + {ENABLE_FP8 (1 by default)} \ + {TRANSPOSE_BS (1 by default)} \ + {FUSE_QKV (1 by default)} \ + {PACK (0 by default)} +``` + +Finetuning: +MNLI v2: +```sh +t5x/contrib/gpu/scripts_gpu/singlenode_ft_frompile.sh \ + mnli2 \ + small \ + bfloat16 \ + 8 \ + 256 \ + {LOGDIR - create before running} \ + {MODEL_DIR(to restore pretrained checkpoint from)} \ + {GRADIENT_ACCUMULATION (1 by default)} \ + {MAKE_FT_DIR (false by default)} + {ENABLE_FP8 (1 by default)} \ + {TRANSPOSE_BS (1 by default)} \ + {FUSE_QKV (1 by default)} \ + {PACK (0 by default)} +``` + +# Changelog +- Added Transformer Engine + FP8 support +- Added the Transposed Batch-Sequence GPU optimization +- A100 Perf gains! 
(BF16) + - 80% speedup - T5-small + - 23% speedup - T5-large + - 18% speedup - T5-xl + - 40% speedup - T5-xxl +- H100 FP8 support, with gains over A100 + - 2.08x faster - T5-large (FP8) + - 2.24x faster - T5-xl (FP8) From 752b729d87b36dd3162f6d82d144bd63b3510d8e Mon Sep 17 00:00:00 2001 From: sahilj Date: Tue, 27 Jun 2023 12:57:11 -0700 Subject: [PATCH 07/16] Updated broken links --- rosetta/rosetta/projects/t5x/README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/rosetta/rosetta/projects/t5x/README.md b/rosetta/rosetta/projects/t5x/README.md index bd1bec92a..500de98ce 100644 --- a/rosetta/rosetta/projects/t5x/README.md +++ b/rosetta/rosetta/projects/t5x/README.md @@ -1,6 +1,6 @@ # GPU Scripts and Usage -The [t5x/contrib/gpu/scripts_gpu](../../t5x/contrib/gpu/scripts_gpu) directory contains scripts optimized for GPU usage and includes FP8 support via [Transformer Engine](https://github.com/NVIDIA/TransformerEngine). +The t5x/contrib/gpu/scripts_gpu directory contains scripts optimized for GPU usage and includes FP8 support via [Transformer Engine](https://github.com/NVIDIA/TransformerEngine). Install with `pip install -r pile_requirements.txt` to get all pile dependencies. @@ -28,14 +28,14 @@ For our Pile convergence runs, we used a Global batch size of 2304 for XXL and 2 | size | GPU | Precision | #GPUs | TP | BS / GPU | Sequences/Sec | Seq/Sec/GPU | Est. 
Walltime | GPU-days | MNLI 2.0 - matched | SQuAD v1.1 (EM/F1) | Convergence Log | Config | | ---- | ------------ | --------- | ----- | ----- | -------- | ------------- | ----------- | ------------- | -------- |------------------ | ------------------ | --------------- | ---- | -| [T5-v1.1-small](../t5/t5_1_1/small.gin) | A100 80G SXM | bf16 | 8 | 1 | 256 | ~5712 | 714 | 4.2 days | 33 | 83.06% | 78.33 / 86.63 | [log](https://tensorboard.dev/experiment/lWnHal7PRnOLeZuewyWVxQ/#scalars&_smoothingWeight=0) | [pile](../t5/t5_1_1/examples/small_pile_pretrain.gin) -| [T5-v1.1-large](../t5/t5_1_1/large.gin) | A100 80G SXM | bf16 | 64 | 1 | 32 | ~4853 | 75.8 | 4.8 days | 309 | 90.50% | 87.31 / 94.04 | [log](https://tensorboard.dev/experiment/aOxJBIvTQBeTJ8XGXxaL6Q/#scalars&_smoothingWeight=0) |[pile](../t5/t5_1_1/examples/large_pile_pretrain.gin) -| [T5-v1.1-xl](../t5/t5_1_1/xl.gin) | A100 80G SXM | bf16 | 144 | 1 | 8 | ~3021 | 21.0 | 7.9 days | 1,133 | N/A(perf test) | N/A (perf test) | |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) -| [T5-v1.1-xl](../t5/t5_1_1/xl.gin) | A100 80G SXM | bf16 | 256 | 1 | 8 | ~4322 | 16.9 | 5.5 days | 1,408 | 91.15% | 89.36 / 95.29 | [log](https://tensorboard.dev/experiment/vuRoEYgkRgWiEtbvgxlOqw/#scalars&_smoothingWeight=0) |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) -| [T5-v1.1-xxl](../t5/t5_1_1/xxl.gin) | A100 80G SXM | bf16 | 512 | 8 | 36 | ~1887 | 3.69 | 12.6 days | 6,431 |N/A(partial run) | N/A(partial run) | |[pile](../t5/t5_1_1/examples/xxl_pile_pretrain.gin) -| [T5-v1.1-large](../t5/t5_1_1/large.gin) | **H100 80G SXM** | TE-fp8 | 64 | 1 | 32 | ~10156 | **158.7** | **2.3 days** | **147** | 89.1% | 86.36 / 93.5 | |[pile](../t5/t5_1_1/examples/large_pile_pretrain.gin) -| [T5-v1.1-xl](../t5/t5_1_1/xl.gin) | **H100 80G SXM** | TE-fp8 | 144 | 1 | 14 | ~7257 | **50.4** | **3.3 days** | **475** | N/A (perf test) | N/A (perf test) | |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) -| [T5-v1.1-xl](../t5/t5_1_1/xl.gin) | **H100 
80G SXM** | TE-fp8 | 256 | 1 | 8 | ~9688 | **37.8** | **2.4 days** | **614** | N/A (perf test) | N/A (perf test) | |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) +| T5-v1.1-small | A100 80G SXM | bf16 | 8 | 1 | 256 | ~5712 | 714 | 4.2 days | 33 | 83.06% | 78.33 / 86.63 | [log](https://tensorboard.dev/experiment/lWnHal7PRnOLeZuewyWVxQ/#scalars&_smoothingWeight=0) | [pile](../t5/t5_1_1/examples/small_pile_pretrain.gin) +| T5-v1.1-large | A100 80G SXM | bf16 | 64 | 1 | 32 | ~4853 | 75.8 | 4.8 days | 309 | 90.50% | 87.31 / 94.04 | [log](https://tensorboard.dev/experiment/aOxJBIvTQBeTJ8XGXxaL6Q/#scalars&_smoothingWeight=0) |[pile](../t5/t5_1_1/examples/large_pile_pretrain.gin) +| T5-v1.1-xl | A100 80G SXM | bf16 | 144 | 1 | 8 | ~3021 | 21.0 | 7.9 days | 1,133 | N/A(perf test) | N/A (perf test) | |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) +| T5-v1.1-xl | A100 80G SXM | bf16 | 256 | 1 | 8 | ~4322 | 16.9 | 5.5 days | 1,408 | 91.15% | 89.36 / 95.29 | [log](https://tensorboard.dev/experiment/vuRoEYgkRgWiEtbvgxlOqw/#scalars&_smoothingWeight=0) |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) +| T5-v1.1-xxl | A100 80G SXM | bf16 | 512 | 8 | 36 | ~1887 | 3.69 | 12.6 days | 6,431 |N/A(partial run) | N/A(partial run) | |[pile](../t5/t5_1_1/examples/xxl_pile_pretrain.gin) +| T5-v1.1-large | **H100 80G SXM** | TE-fp8 | 64 | 1 | 32 | ~10156 | **158.7** | **2.3 days** | **147** | 89.1% | 86.36 / 93.5 | |[pile](../t5/t5_1_1/examples/large_pile_pretrain.gin) +| T5-v1.1-xl | **H100 80G SXM** | TE-fp8 | 144 | 1 | 14 | ~7257 | **50.4** | **3.3 days** | **475** | N/A (perf test) | N/A (perf test) | |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) +| T5-v1.1-xl | **H100 80G SXM** | TE-fp8 | 256 | 1 | 8 | ~9688 | **37.8** | **2.4 days** | **614** | N/A (perf test) | N/A (perf test) | |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) Note: Convergence (as shown in log) was not necessarily done with the hardware topology listed, but the listed topology is tested. 
Estimated Walltime is calculated assuming full throughput (seq/sec) continuously. In practice, there are compilation overheads at the beginning of each run/restart(in cluster settings) + checkpointing overheads (if any). @@ -54,19 +54,19 @@ All parameters can be found in the relevant script. Assumes 8GPU 80GB A100/H100 Nodes. `ENABLE_FP8` uses transformer engine (included in container) and requires H100 * Note: To use, FP8 set `ENABLE_FP8` to `1`. This will automatically set `PREC` to `bfloat16` as is required by internals for `FP8` usage. -#### [T5-v1.1-small](../t5/t5_1_1/small.gin) (60M): +#### T5-v1.1-small (60M): ```sh PREC=bfloat16 T5_SIZE=small BSIZE_PER_GPU=256 TRAIN_STEPS=1000000 NUM_MICROBATCHES=1 ENABLE_FP8=1 TP_SIZE=1 \ sbatch -N1 t5x/contrib/gpu/t5/scripts_gpu/example_slurm_pretrain_pile.sub ``` -#### [T5-v1.1-large](../t5/t5_1_1/large.gin) (770M): +#### T5-v1.1-large (770M): ```sh PREC=bfloat16 T5_SIZE=large BSIZE_PER_GPU=32 TRAIN_STEPS=1000000 NUM_MICROBATCHES=1 ENABLE_FP8=1 TP_SIZE=1 \ sbatch -N8 t5x/contrib/gpu/t5/scripts_gpu/example_slurm_pretrain_pile.sub ``` -#### [T5-v1.1-xl](../t5/t5_1_1/xl.gin) (3B): +#### T5-v1.1-xl (3B): ```sh PREC=bfloat16 T5_SIZE=large BSIZE_PER_GPU=8 TRAIN_STEPS=1000000 NUM_MICROBATCHES=1 ENABLE_FP8=1 TP_SIZE=1 \ sbatch -N 32 t5x/contrib/gpu/t5/scripts_gpu/example_slurm_pretrain_pile.sub From 5d4fe3607108da3f2436f13b304987aa4ddf9f66 Mon Sep 17 00:00:00 2001 From: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> Date: Tue, 27 Jun 2023 22:58:27 -0700 Subject: [PATCH 08/16] Changelog documentation update (#101) Changelog documentation update --- rosetta/rosetta/projects/t5x/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rosetta/rosetta/projects/t5x/README.md b/rosetta/rosetta/projects/t5x/README.md index 500de98ce..c9b81ca23 100644 --- a/rosetta/rosetta/projects/t5x/README.md +++ b/rosetta/rosetta/projects/t5x/README.md @@ -135,7 +135,7 @@ 
t5x/contrib/gpu/scripts_gpu/singlenode_ft_frompile.sh \ # Changelog - Added Transformer Engine + FP8 support -- Added the Transposed Batch-Sequence GPU optimization +- Updated T5x and JAX=0.4.11 - A100 Perf gains! (BF16) - 80% speedup - T5-small - 23% speedup - T5-large From 774231f2e4c76beb71d08610884ce84b0b7b6e89 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 28 Jun 2023 10:53:31 -0700 Subject: [PATCH 09/16] Revert "Update patchlist-praxis.txt" This reverts commit 2f88b5ccc07f7914251f001102cb304d791ec02c. --- rosetta/patchlist-praxis.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/rosetta/patchlist-praxis.txt b/rosetta/patchlist-praxis.txt index 42c1acc6b..0f97db91a 100644 --- a/rosetta/patchlist-praxis.txt +++ b/rosetta/patchlist-praxis.txt @@ -6,6 +6,3 @@ # - Note: Only the first column is used as a git-ref, so anything after is a comment pull/18/head # This PR allows XLA:GPU to detect the MHA pattern more easily to call fused kernels from cublas. - -pull/17/head # This PR creates an option to remove the NaN check at the end of every microbatch when using Pipeline Parallelism. This allows cublas to fuse the GeMMs with the gradient accumulation leading to a 6% perf improvement. - From cd995e3e5d71ea92df6ec968cb4816273744ba27 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 28 Jun 2023 13:39:04 -0700 Subject: [PATCH 10/16] Add a supported models section at the top level readme --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 092b69084..8dd456203 100644 --- a/README.md +++ b/README.md @@ -58,3 +58,11 @@ ## Note This repo currently hosts a public CI for JAX on NVIDIA GPUs and covers some JAX libraries like: [T5x](https://github.com/google-research/t5x), [PAXML](https://github.com/google/paxml), [Transformer Engine](https://github.com/NVIDIA/TransformerEngine), and others to come soon. 
+ +## Supported Models +We currently enable training and evaluation for the following models: +| Model Name | Pretraining | Fine-tuning | Evaluation | +| :--- | :---: | :---: | :---: | +| [t5(t5x)](./rosetta/rosetta/projects/t5x) | ✔️ | ✔️ | ✔️ | + +We will update this table as new models become available, so stay tuned. \ No newline at end of file From b9376838856eff1d0a004d5eec364ef14b209e6b Mon Sep 17 00:00:00 2001 From: Abhinav Goel Date: Wed, 28 Jun 2023 17:12:15 -0700 Subject: [PATCH 11/16] made a new PR --- rosetta/patchlist-praxis.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rosetta/patchlist-praxis.txt b/rosetta/patchlist-praxis.txt index 0f97db91a..74792f885 100644 --- a/rosetta/patchlist-praxis.txt +++ b/rosetta/patchlist-praxis.txt @@ -4,5 +4,5 @@ # - Internal patches (These are branches that start with "patch/") # - External Pull Requests (These are pull requests with upstream praxis and are of the form "pull/$PULLID/head") # - Note: Only the first column is used as a git-ref, so anything after is a comment -pull/18/head # This PR allows XLA:GPU to detect the MHA pattern more easily to call fused kernels from cublas. +pull/19/head # This PR allows XLA:GPU to detect the MHA pattern more easily to call fused kernels from cublas. 
From 4d6a13d121d12383b09b01b8dd688156d571b15b Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 28 Jun 2023 13:56:29 -0700 Subject: [PATCH 12/16] Adds description for t5x's PR1320 --- rosetta/patchlist-t5x.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rosetta/patchlist-t5x.txt b/rosetta/patchlist-t5x.txt index 165081c9e..48ce0955a 100644 --- a/rosetta/patchlist-t5x.txt +++ b/rosetta/patchlist-t5x.txt @@ -5,4 +5,4 @@ # - External Pull Requests (These are pull requests with upstream t5x and are of the form "pull/$PULLID/head") # - Note: Only the first column is used as a git-ref, so anything after is a comment -pull/1320/head +pull/1320/head # https://github.com/google-research/t5x/pull/1320: Adds transformer engine support and GPU optimizations to T5x (enables H100) From 3d1a253eaed6a300276f63d8ebfcb03fc7c7efff Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 28 Jun 2023 22:34:48 -0700 Subject: [PATCH 13/16] updates t5x readme to account for code being within container --- rosetta/rosetta/projects/t5x/README.md | 88 ++++++++++++++++++++++---- 1 file changed, 74 insertions(+), 14 deletions(-) diff --git a/rosetta/rosetta/projects/t5x/README.md b/rosetta/rosetta/projects/t5x/README.md index c9b81ca23..70696586a 100644 --- a/rosetta/rosetta/projects/t5x/README.md +++ b/rosetta/rosetta/projects/t5x/README.md @@ -1,25 +1,81 @@ -# GPU Scripts and Usage +# T5x -The t5x/contrib/gpu/scripts_gpu directory contains scripts optimized for GPU usage and includes FP8 support via [Transformer Engine](https://github.com/NVIDIA/TransformerEngine). +[T5x](https://github.com/google-research/t5x) is a project developed by Google, which is maintained as a [distribution](../../../docs/DEVELOPMENT.md) within rosetta. -Install with `pip install -r pile_requirements.txt` to get all pile dependencies. 
+Any `t5x/*` relative directory/file can be found in [google-research/t5x](https://github.com/google-research/t5x), but to +view the most up to date version of that directory/file, please see ["Inspecting the source code"](#inspecting-the-source-code) -## Building the container +## GPU Scripts and Usage +The `t5x/contrib/gpu/scripts_gpu` directory contains scripts optimized for GPU usage and includes FP8 support via [Transformer Engine](https://github.com/NVIDIA/TransformerEngine). + + +## Prerequisites +The examples below will reuse these environment variables. Feel free to change them: +```bash +CONTAINER=ghcr.io/nvidia/t5x:te-fp8-reference +DATASET_PATH= +T5X_DIR= # Root path of t5x +WORKSPACE_PATH="" # Path used for run outputs (unspecified = /t5x_home/workspace) +``` + +## Container We provide a fully built and ready-to-use container here: [ghcr.io/nvidia/t5x:te-fp8-reference](ghcr.io/nvidia/t5x:te-fp8-reference) -If you'd like you build your own, -The Dockerfile in `t5x/contrib/gpu` will build a container with all gpu/pile dependencies. It can be built with `t5x/contrib/gpu/docker/build.sh `. However, we **highly** recommend using the pre-built container. -## Running interactively -Note: this should only be done with singlenode jobs and/or for downloading the pile. Use `t5x/contrib/gpu/docker/interactive_pull_and_launch.sh`. This takes arguments for the URL to pull a container from and the location of the dataset directory to mount. For example: +We **highly** recommend using the pre-built container, but if you'd like to build your own container with all of the gpu/pile dependencies, +here is how you can build your own: +```bash +CONTAINER=t5x:te-fp8 +git clone git@github.com:google-research/t5x.git $T5X_DIR +cd $T5X_DIR +git fetch origin pull/1320/head:te-distribution && git switch te-distribution -`t5x/contrib/gpu/docker/interactive_pull_and_launch.sh [URL] /my/dataset/dir` +docker build -t t5x:te-fp8 -f t5x/contrib/gpu/Dockerfile . 
+```

 ## Downloading The Pile
-We use The Pile for our pretraining experiments. If you would like to as well, run `download_the_pile.py` to download it. The download is approximately 1TB. It will download to the directory set in the environment variable: `TFDS_DATA_DIR`. After that, set the `TFDS_DATA_DIR` to the same directory in your scripts to use.
+We use The Pile for our pretraining experiments. If you would like to as well, run `download_the_pile.py` to download it. The download is approximately 1TB. It will download to the directory set in the environment variable: `TFDS_DATA_DIR`. After that, set the `TFDS_DATA_DIR` to the same directory in your scripts to use. Here is how you would run it:
+
+```bash
+docker run --rm -e TFDS_DATA_DIR=/t5x_home/datasets -v ${DATASET_PATH}:/t5x_home/datasets $CONTAINER python -m t5x.contrib.gpu.scripts_gpu.download_the_pile
+```
+
+## Running interactively
+**Note**: this should only be done with singlenode jobs and/or for downloading The Pile.
+
+```bash
+docker run --rm --gpus=all -it --net=host --ipc=host -v ${PWD}:/t5x_home -v ${DATASET_PATH}:/t5x_home/datasets -v ${WORKSPACE_PATH:-${PWD}/workspace}:/t5x_home/workspace --privileged $CONTAINER bash
+```
+
+## Inspecting the source code
+If you would like to inspect t5x's source code (`t5x/*`) to learn more about what is being run, you can do so by inspecting
+the source within the container. Here are some examples:
+
+```bash
+# (Interactive = already in container): navigate to t5x/contrib/gpu/scripts_gpu/
+cd $(python -c 'import t5x; print(*t5x.__path__)')/../t5x/contrib/gpu/scripts_gpu
+
+# (Non-interactive): View t5x/contrib/gpu/Dockerfile
+FILE=t5x/contrib/gpu/Dockerfile
+docker run --entrypoint="" --rm $CONTAINER sh -c 'cat $(python -c "import t5x; print(*t5x.__path__)" 2>/dev/null)/../'$FILE
+```

 ## Single Node runs
 Pretraining and Finetuning can be done with `singlenode_*.sh`. These will build a T5X model with the Adam optimizer and relevant parameters.
These will allow multi-gpu on one host. +```bash +# Pretraining (interactive: already inside container) +bash t5x/contrib/gpu/scripts_gpu/singlenode_pretrain_pile.sh + +# Pretraining (non-interactive) +docker run --rm --gpus=all --net=host --ipc=host -v ${DATASET_PATH}:/t5x_home/datasets $CONTAINER bash t5x/contrib/gpu/scripts_gpu/singlenode_pretrain_pile.sh + +# Finetuning (interactive: already inside container) +bash t5x/contrib/gpu/scripts_gpu/singlenode_ft_frompile.sh + +# Finetuning (non-interactive) +docker run --rm --gpus=all --net=host --ipc=host -v ${DATASET_PATH}:/t5x_home/datasets $CONTAINER bash t5x/contrib/gpu/scripts_gpu/singlenode_ft_frompile.sh +``` + ## Multi Node runs For a SLURM+pyxis cluster, `example*.sub` files provide example slurm submit files (edit with your details), which call `multiprocess*.sh` to execute training. You can add a binding script in the `.sub` file for your cluster, or remove it entirely (dropping some throughput) @@ -37,16 +93,20 @@ For our Pile convergence runs, we used a Global batch size of 2304 for XXL and 2 | T5-v1.1-xl | **H100 80G SXM** | TE-fp8 | 144 | 1 | 14 | ~7257 | **50.4** | **3.3 days** | **475** | N/A (perf test) | N/A (perf test) | |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) | T5-v1.1-xl | **H100 80G SXM** | TE-fp8 | 256 | 1 | 8 | ~9688 | **37.8** | **2.4 days** | **614** | N/A (perf test) | N/A (perf test) | |[pile](../t5/t5_1_1/examples/xl_pile_pretrain.gin) -Note: Convergence (as shown in log) was not necessarily done with the hardware topology listed, but the listed topology is tested. Estimated Walltime is calculated assuming full throughput (seq/sec) continuously. In practice, there are compilation overheads at the beginning of each run/restart(in cluster settings) + checkpointing overheads (if any). +Note: Convergence (as shown in log) was not necessarily done with the hardware topology listed, but the listed topology is tested. 
Estimated Walltime is calculated assuming full throughput (seq/sec) continuously. In practice, there are compilation overheads at the beginning of each run/restart (in cluster settings) + checkpointing overheads (if any). -Other hyperparameters are specified in the associated pile `gin` files in the `contrib/gpu/t5/t5_1_1/examples` directory. +Other hyperparameters are specified in the associated pile `gin` files in the `t5x/contrib/gpu/t5/t5_1_1/examples` directory. ## Pretraining run commands +All commands below assume you are in `$T5X_DIR` and have the scripts and slurm scripts locally. ### Multinode Arguments are set by environment variable as such: -`PREC={PRECISION} T5_SIZE={SIZE} BSIZE_PER_GPU={BSIZE} ..... sbatch -N {NODE_CT} t5x/contrib/gpu/t5/scripts_gpu/example_slurm_pretrain_pile.sub {GPUS_PER_NODE}` +```sh +PREC={PRECISION} T5_SIZE={SIZE} BSIZE_PER_GPU={BSIZE} ..... \ + sbatch -N {NODE_CT} t5x/contrib/gpu/t5/scripts_gpu/example_slurm_pretrain_pile.sub {GPUS_PER_NODE} +``` All parameters can be found in the relevant script. 
@@ -143,4 +203,4 @@ t5x/contrib/gpu/scripts_gpu/singlenode_ft_frompile.sh \ - 40% speedup - T5-xxl - H100 FP8 support, with gains over A100 - 2.08x faster - T5-large (FP8) - - 2.24x faster - T5-xl (FP8) + - 2.24x faster - T5-xl (FP8) \ No newline at end of file From 34cb214c215600eb60a7e3a5bf18286ef3689d76 Mon Sep 17 00:00:00 2001 From: "Yu-Hang \"Maxin\" Tang" Date: Tue, 4 Jul 2023 23:06:56 -0700 Subject: [PATCH 14/16] Update self-hosted runner tag --- .github/workflows/_build_jax.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_build_jax.yaml b/.github/workflows/_build_jax.yaml index 255544b47..4d9c35d48 100644 --- a/.github/workflows/_build_jax.yaml +++ b/.github/workflows/_build_jax.yaml @@ -51,7 +51,7 @@ jobs: build: outputs: DOCKER_TAGS: ${{ steps.meta.outputs.tags }} - runs-on: [self-hosted, builder] + runs-on: [self-hosted, large-builder] steps: - name: Print environment variables run: env @@ -118,4 +118,4 @@ jobs: REPO_JAX=${{ inputs.REPO_JAX }} REPO_XLA=${{ inputs.REPO_XLA }} REF_JAX=${{ inputs.REF_JAX }} - REF_XLA=${{ inputs.REF_XLA }} \ No newline at end of file + REF_XLA=${{ inputs.REF_XLA }} From 0f75bd771f9552dbfe26755be2a6c7d5c3318fc4 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Thu, 6 Jul 2023 22:21:35 -0700 Subject: [PATCH 15/16] Fixes issue in rosetta dockerfiles where distributions were copied, but deletions weren't being respected. 
Switched away from multi-stage builder pattern to fix --- rosetta/Dockerfile.pax | 9 ++------- rosetta/Dockerfile.t5x | 8 ++------ 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/rosetta/Dockerfile.pax b/rosetta/Dockerfile.pax index da3c03ecf..29529bb51 100644 --- a/rosetta/Dockerfile.pax +++ b/rosetta/Dockerfile.pax @@ -15,7 +15,7 @@ ADD --keep-git-dir=true https://github.com/google/praxis.git#main / FROM scratch as flax-mirror-source ADD --keep-git-dir=true https://github.com/google/flax.git#main / -FROM ${BASE_IMAGE} AS distribution-builder +FROM ${BASE_IMAGE} AS rosetta ARG GIT_USER_EMAIL ARG GIT_USER_NAME @@ -47,14 +47,9 @@ bash create-distribution.sh \ rm -rf $(find /opt -name "__pycache__") EOF -FROM ${BASE_IMAGE} AS rosetta -COPY --link --from=distribution-builder /opt/paxml /opt/paxml -COPY --link --from=distribution-builder /opt/praxis /opt/praxis -COPY --link --from=distribution-builder /opt/flax /opt/flax -COPY --link --from=distribution-builder /opt/rosetta /opt/rosetta - WORKDIR /opt/rosetta RUN < Date: Fri, 7 Jul 2023 19:07:03 +0100 Subject: [PATCH 16/16] Generate tensorboard links on HTTPS:443 port instead of HTTP:6006 --- .github/workflows/_test_pax.yaml | 2 +- .github/workflows/_test_t5x.yaml | 2 +- .github/workflows/nightly-pax-test-mgmn.yaml | 2 +- .github/workflows/nightly-t5x-test-mgmn.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index b14e8f3d4..8029477cf 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -159,7 +159,7 @@ jobs: ## PAX MGMN training - [view metrics](http://${{ vars.HOSTNAME_TENSORBOARD }}:6006/#scalars®exInput=${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per) + [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars®exInput=${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per) EOF ) | tee $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/_test_t5x.yaml 
b/.github/workflows/_test_t5x.yaml
index d9aaef356..411cdd927 100644
--- a/.github/workflows/_test_t5x.yaml
+++ b/.github/workflows/_test_t5x.yaml
@@ -230,7 +230,7 @@ jobs:
 
         ## T5X MGMN training
 
-        [view metrics](http://${{ vars.HOSTNAME_TENSORBOARD }}:6006/#scalars&regexInput=${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per)
+        [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per)
 
         EOF
         ) | tee $GITHUB_STEP_SUMMARY
\ No newline at end of file
diff --git a/.github/workflows/nightly-pax-test-mgmn.yaml b/.github/workflows/nightly-pax-test-mgmn.yaml
index bed6dc68c..91396615a 100644
--- a/.github/workflows/nightly-pax-test-mgmn.yaml
+++ b/.github/workflows/nightly-pax-test-mgmn.yaml
@@ -97,7 +97,7 @@ jobs:
 
         ## PAX MGMN nightly training: ${{ needs.metadata.outputs.BUILD_DATE }}
 
-        [view metrics](http://${{ vars.HOSTNAME_TENSORBOARD }}:6006/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per)
+        [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per)
 
         EOF
         ) | tee $GITHUB_STEP_SUMMARY
diff --git a/.github/workflows/nightly-t5x-test-mgmn.yaml b/.github/workflows/nightly-t5x-test-mgmn.yaml
index 7f8c306d3..55e3d45b6 100644
--- a/.github/workflows/nightly-t5x-test-mgmn.yaml
+++ b/.github/workflows/nightly-t5x-test-mgmn.yaml
@@ -95,7 +95,7 @@ jobs:
 
         ## T5X MGMN nightly training: ${{ needs.metadata.outputs.BUILD_DATE }}
 
-        [view metrics](http://${{ vars.HOSTNAME_TENSORBOARD }}:6006/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per)
+        [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per)
 
         EOF
         ) | tee $GITHUB_STEP_SUMMARY