From 023dc8828d324a75c985bd3aa246f5eb0ee494c0 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Tue, 18 Jul 2023 19:23:39 +0100 Subject: [PATCH 1/7] install TE and JAX side-by-side --- .github/container/Dockerfile.jax | 27 +++++++++--- .github/container/Dockerfile.te | 12 ------ .github/container/install-te.sh | 70 -------------------------------- .github/workflows/_sandbox.yaml | 45 +++++--------------- 4 files changed, 33 insertions(+), 121 deletions(-) delete mode 100644 .github/container/Dockerfile.te delete mode 100755 .github/container/install-te.sh diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index ef1781f29..c06896660 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -1,10 +1,13 @@ ARG BASE_IMAGE=ghcr.io/nvidia/jax-toolbox:base ARG REPO_JAX="https://github.com/google/jax.git" ARG REPO_XLA="https://github.com/openxla/xla.git" +ARG REPO_TE="https://github.com/NVIDIA/TransformerEngine.git" ARG REF_JAX=main ARG REF_XLA=main -ARG SRC_PATH_JAX=/opt/jax-source +ARG REF_TE=main ARG SRC_PATH_XLA=/opt/xla-source +ARG SRC_PATH_JAX=/opt/jax-source +ARG SRC_PATH_TE=/opt/transformer-engine ARG BAZEL_CACHE=/tmp ARG BUILD_DATE @@ -15,16 +18,20 @@ ARG BUILD_DATE FROM ${BASE_IMAGE} as jax-builder ARG REPO_JAX ARG REPO_XLA +ARG REPO_TE ARG REF_JAX ARG REF_XLA +ARG REF_TE ARG SRC_PATH_JAX ARG SRC_PATH_XLA +ARG SRC_PATH_TE ARG BAZEL_CACHE RUN git clone "${REPO_JAX}" "${SRC_PATH_JAX}" && cd "${SRC_PATH_JAX}" && git checkout ${REF_JAX} RUN --mount=type=ssh \ --mount=type=secret,id=SSH_KNOWN_HOSTS,target=/root/.ssh/known_hosts \ git clone "${REPO_XLA}" "${SRC_PATH_XLA}" && cd "${SRC_PATH_XLA}" && git checkout ${REF_XLA} +RUN git clone "${REPO_TE}" "${SRC_PATH_TE}" && cd "${SRC_PATH_TE}" && git checkout ${REF_TE} && git submodule init && git submodule update --recursive ADD build-jax.sh local_cuda_arch test-jax.sh /usr/local/bin/ RUN build-jax.sh \ @@ -36,6 +43,7 @@ RUN build-jax.sh \ RUN cp -r 
${SRC_PATH_JAX} ${SRC_PATH_JAX}-no-git && rm -rf ${SRC_PATH_JAX}-no-git/.git RUN cp -r ${SRC_PATH_XLA} ${SRC_PATH_XLA}-no-git && rm -rf ${SRC_PATH_XLA}-no-git/.git +RUN cp -r ${SRC_PATH_TE} ${SRC_PATH_TE}-no-git && rm -rf ${SRC_PATH_TE}-no-git/.git ############################################################################### ## Build 'runtime' flavor without the git metadata @@ -45,15 +53,22 @@ ARG BASE_IMAGE FROM ${BASE_IMAGE} as runtime-image ARG SRC_PATH_JAX ARG SRC_PATH_XLA +ARG SRC_PATH_TE ARG BUILD_DATE ENV BUILD_DATE=${BUILD_DATE} COPY --from=jax-builder ${SRC_PATH_JAX}-no-git ${SRC_PATH_JAX} COPY --from=jax-builder ${SRC_PATH_XLA}-no-git ${SRC_PATH_XLA} +COPY --from=jax-builder ${SRC_PATH_TE}-no-git ${SRC_PATH_TE} -RUN pip --disable-pip-version-check install ${SRC_PATH_JAX}/dist/*.whl && \ - pip --disable-pip-version-check install -e ${SRC_PATH_JAX} && \ - rm -rf ~/.cache/pip/ +# Transformer Engine installation dependencies +RUN pip install --no-cache-dir pybind11 ninja packaging && rm -rf ~/.cache/pip/ +# Install JAX + Transformer Engine +RUN NVTE_FRAMEWORK=jax pip --disable-pip-version-check install -e \ + ${SRC_PATH_JAX}/dist/*.whl \ + ${SRC_PATH_JAX} \ + ${SRC_PATH_TE} \ + && rm -rf ~/.cache/pip/ # Install software stack in JAX ecosystem # Made this optional since tensorstore cannot build on Ubuntu 20.04 + ARM @@ -69,8 +84,10 @@ RUN { pip install flax || true; } && rm -rf ~/.cache/pip FROM runtime-image as devel-image ARG SRC_PATH_JAX ARG SRC_PATH_XLA +ARG SRC_PATH_TE ADD build-jax.sh local_cuda_arch test-jax.sh /usr/local/bin/ COPY --from=jax-builder ${SRC_PATH_JAX}/.git ${SRC_PATH_JAX}/.git -COPY --from=jax-builder ${SRC_PATH_XLA}/.git ${SRC_PATH_XLA}/.git \ No newline at end of file +COPY --from=jax-builder ${SRC_PATH_XLA}/.git ${SRC_PATH_XLA}/.git +COPY --from=jax-builder ${SRC_PATH_TE}/.git ${SRC_PATH_TE}/.git \ No newline at end of file diff --git a/.github/container/Dockerfile.te b/.github/container/Dockerfile.te deleted file mode 100644 index 
ca8cd53d1..000000000 --- a/.github/container/Dockerfile.te +++ /dev/null @@ -1,12 +0,0 @@ -############################################################################### -## T5X -############################################################################### - -ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest -FROM ${BASE_IMAGE} -ARG REPO_TE="https://github.com/NVIDIA/TransformerEngine.git" -ARG REF_TE=main -ARG SRC_PATH_TE=/opt/transformer-engine - -ADD install-te.sh /usr/local/bin -RUN install-te.sh --from=${REPO_TE} --ref=${REF_TE} --dir=${SRC_PATH_TE} \ No newline at end of file diff --git a/.github/container/install-te.sh b/.github/container/install-te.sh deleted file mode 100755 index f7b89bbcb..000000000 --- a/.github/container/install-te.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash - -## Parse command-line arguments - -usage() { - echo "Usage: $0 [OPTION]..." - echo " -d, --dir=PATH Path to store TE source. Defaults to /opt/transformer-engine" - echo " -f, --from=URL URL of the TE repo. Defaults to https://github.com/NVIDIA/TransformerEngine.git" - echo " -h, --help Print usage." - echo " -r, --ref=REF Git commit hash or tag name that specifies the version of TE to install. Defaults to HEAD." - exit $1 -} - -args=$(getopt -o d:f:hr: --long dir:,from:,help,ref: -- "$@") -if [[ $? 
-ne 0 ]]; then - exit 1 -fi - -eval set -- "$args" -while [ : ]; do - case "$1" in - -d | --dir) - INSTALL_DIR="$2" - shift 2 - ;; - -f | --from) - TE_REPO="$2" - shift 2 - ;; - -h | --help) - usage - ;; - -r | --ref) - TE_REF="$2" - shift 2 - ;; - --) - shift; - break - ;; - esac -done - -if [[ $# -ge 1 ]]; then - echo "Un-recognized argument: $*" && echo - usage 1 -fi - -## Set default arguments if not provided via command-line - -TE_REF="${TE_REF:-HEAD}" -TE_REPO="${TE_REPO:-https://github.com/NVIDIA/TransformerEngine.git}" -INSTALL_DIR="${INSTALL_DIR:-/opt/transformer-engine}" - -echo "Installing TE $TE_REF from $TE_REPO to $INSTALL_DIR" - -set -ex - -## Install dependencies - -pip install --no-cache-dir pybind11 ninja packaging - -## Install TE - -git clone ${TE_REPO} ${INSTALL_DIR} -cd ${INSTALL_DIR} -git checkout ${TE_REF} -git submodule init -git submodule update --recursive -NVTE_FRAMEWORK=jax pip install -e . diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 37fa6ca68..ce6e20bc8 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -3,39 +3,16 @@ name: "~Sandbox" on: workflow_dispatch: -jobs: - sandbox: - runs-on: ubuntu-22.04 - steps: - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} +permissions: + contents: read # to fetch code + actions: write # to cancel previous workflows + packages: write # to upload container - - name: Print usage - run: | - cat << EOF - This is an empty workflow file located in the main branch of your - repository. It serves as a testing ground for new GitHub Actions on - development branches before merging them to the main branch. 
By - defining and overloading this workflow on your development branch, - you can test new actions without affecting your main branch, ensuring - a smooth integration process once the changes are ready to be merged. +jobs: - Usage: - - 1. In your development branch, modify the sandbox.yml workflow file - to include the new actions you want to test. Make sure to commit - the changes to the development branch. - 2. Navigate to the 'Actions' tab in your repository, select the - '~Sandbox' workflow, and choose your development branch from the - branch dropdown menu. Click on 'Run workflow' to trigger the - workflow on your development branch. - 3. Once you have tested and verified the new actions in the Sandbox - workflow, you can incorporate them into your main workflow(s) and - merge the development branch into the main branch. Remember to - revert the changes to the sandbox.yml file in the main branch to - keep it empty for future testing. - EOF \ No newline at end of file + build: + needs: metadata + uses: ./.github/workflows/_build_jax.yaml + with: + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + secrets: inherit From 2120c4c5ab2f7a42c5b0e0e1147b841e05af030e Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Tue, 18 Jul 2023 19:24:18 +0100 Subject: [PATCH 2/7] install TE and JAX side-by-side --- .github/workflows/_sandbox.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index ce6e20bc8..0f37cf160 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -11,8 +11,5 @@ permissions: jobs: build: - needs: metadata uses: ./.github/workflows/_build_jax.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} secrets: inherit From 503ad91705861eee9f599c2f928a254b0f22c66f Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Tue, 18 Jul 2023 22:35:59 +0100 Subject: [PATCH 3/7] use RUN heredoc --- .github/container/Dockerfile.jax | 14 
++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index c06896660..b0d2972dd 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -61,14 +61,16 @@ COPY --from=jax-builder ${SRC_PATH_JAX}-no-git ${SRC_PATH_JAX} COPY --from=jax-builder ${SRC_PATH_XLA}-no-git ${SRC_PATH_XLA} COPY --from=jax-builder ${SRC_PATH_TE}-no-git ${SRC_PATH_TE} +RUN < Date: Wed, 19 Jul 2023 02:11:20 +0100 Subject: [PATCH 4/7] remove TE build from all workflows and use JAX build instead --- .github/workflows/_build_pax.yaml | 2 +- .github/workflows/_build_t5x.yaml | 2 +- .github/workflows/_build_te.yaml | 90 ------------------------- .github/workflows/_test_te.yaml | 2 +- .github/workflows/ci.yaml | 17 +---- .github/workflows/nightly-te-build.yaml | 64 ------------------ .github/workflows/nightly-te-test.yaml | 6 +- README.md | 7 +- 8 files changed, 10 insertions(+), 180 deletions(-) delete mode 100644 .github/workflows/_build_te.yaml delete mode 100644 .github/workflows/nightly-te-build.yaml diff --git a/.github/workflows/_build_pax.yaml b/.github/workflows/_build_pax.yaml index 630db4202..0c1ea75b5 100644 --- a/.github/workflows/_build_pax.yaml +++ b/.github/workflows/_build_pax.yaml @@ -7,7 +7,7 @@ on: type: string description: 'Base docker image that provides JAX' required: false - default: ghcr.io/nvidia/jax-te:latest + default: ghcr.io/nvidia/jax:latest BUILD_DATE: type: string description: "Build date in YYYY-MM-DD format" diff --git a/.github/workflows/_build_t5x.yaml b/.github/workflows/_build_t5x.yaml index c4cd4475a..0b6b8c29a 100644 --- a/.github/workflows/_build_t5x.yaml +++ b/.github/workflows/_build_t5x.yaml @@ -7,7 +7,7 @@ on: type: string description: 'Base docker image that provides JAX' required: false - default: ghcr.io/nvidia/jax-te:latest + default: ghcr.io/nvidia/jax:latest BUILD_DATE: type: string description: "Build date in YYYY-MM-DD format" 
diff --git a/.github/workflows/_build_te.yaml b/.github/workflows/_build_te.yaml deleted file mode 100644 index 1919be104..000000000 --- a/.github/workflows/_build_te.yaml +++ /dev/null @@ -1,90 +0,0 @@ -name: ~build Transformer Engine container - -on: - workflow_call: - inputs: - BASE_IMAGE: - type: string - description: 'Base docker image that provides JAX' - required: false - default: ghcr.io/nvidia/jax:latest - BUILD_DATE: - type: string - description: "Build date in YYYY-MM-DD format" - required: false - default: 'NOT SPECIFIED' - REPO_TE: - type: string - description: URL of TE repository to check out - required: false - default: "https://github.com/NVIDIA/TransformerEngine.git" - REF_TE: - type: string - description: Git commit, tag, or branch for TE - required: false - default: main - outputs: - DOCKER_TAGS: - description: "Tags of the image built" - value: ${{ jobs.build.outputs.DOCKER_TAGS }} - -env: - UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container - -jobs: - - build: - outputs: - DOCKER_TAGS: ${{ steps.meta.outputs.tags }} - runs-on: [self-hosted, small-builder] - steps: - - name: Print environment variables - run: env - - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v3 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set docker metadata - id: meta - uses: docker/metadata-action@v4 - with: - images: | - ${{ env.UPLD_IMAGE }} - flavor: | - latest=false - tags: | - type=raw,value=${{ github.run_id }}-te - labels: - org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - with: - driver-opts: | - image=moby/buildkit:v0.10.6 - - - name: Build docker images - 
uses: docker/build-push-action@v4 - with: - context: .github/container - push: true - file: .github/container/Dockerfile.te - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - build-args: | - BASE_IMAGE=${{ inputs.BASE_IMAGE }} - BUILD_DATE=${{ inputs.BUILD_DATE }} - REPO_TE=${{ inputs.REPO_TE }} - REF_TE=${{ inputs.REF_TE }} \ No newline at end of file diff --git a/.github/workflows/_test_te.yaml b/.github/workflows/_test_te.yaml index 399631c3f..b4609765d 100644 --- a/.github/workflows/_test_te.yaml +++ b/.github/workflows/_test_te.yaml @@ -8,7 +8,7 @@ on: type: string description: 'JAX-TE image build by NVIDIA/JAX-Toolbox' required: true - default: 'ghcr.io/nvidia/jax-te:latest' + default: 'ghcr.io/nvidia/jax:latest' outputs: UNIT_TEST_ARTIFACT_NAME: description: 'Name of the unit test artifact for downstream workflows' diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3e9f78d4c..068801384 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -119,16 +119,6 @@ jobs: REF_XLA: ${{ needs.metadata.outputs.REF_XLA }} secrets: inherit - build-te: - needs: [metadata, build-jax] - uses: ./.github/workflows/_build_te.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} - REPO_TE: ${{ needs.metadata.outputs.REPO_TE }} - REF_TE: ${{ needs.metadata.outputs.REF_TE }} - secrets: inherit - build-t5x: needs: [metadata, build-jax] uses: ./.github/workflows/_build_t5x.yaml @@ -170,7 +160,7 @@ jobs: secrets: inherit build-summary: - needs: [build-base, build-jax, build-te, build-t5x, build-pax, build-rosetta-t5x, build-rosetta-pax] + needs: [build-base, build-jax, build-t5x, build-pax, build-rosetta-t5x, build-rosetta-pax] runs-on: ubuntu-22.04 steps: - name: Generate job summary for container build @@ -183,7 +173,6 @@ jobs: | ------------ | -------------------------------------------------- | | Base | ${{ 
needs.build-base.outputs.DOCKER_TAGS }} | | JAX | ${{ needs.build-jax.outputs.DOCKER_TAGS }} | - | JAX-TE | ${{ needs.build-te.outputs.DOCKER_TAGS }} | | T5X | ${{ needs.build-t5x.outputs.DOCKER_TAGS }} | | PAX | ${{ needs.build-pax.outputs.DOCKER_TAGS }} | | ROSETTA(t5x) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | @@ -198,10 +187,10 @@ jobs: secrets: inherit test-te: - needs: build-te + needs: [build-jax, test-jax] uses: ./.github/workflows/_test_te.yaml with: - JAX_TE_IMAGE: ${{ needs.build-te.outputs.DOCKER_TAGS }} + JAX_TE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} secrets: inherit test-t5x: diff --git a/.github/workflows/nightly-te-build.yaml b/.github/workflows/nightly-te-build.yaml deleted file mode 100644 index 3fecd1067..000000000 --- a/.github/workflows/nightly-te-build.yaml +++ /dev/null @@ -1,64 +0,0 @@ -name: Nightly Transformer Engine build - -on: - workflow_run: - workflows: [Nightly JAX build] - types: [completed] - branches: [main] - workflow_dispatch: - inputs: - PUBLISH: - type: boolean - description: Publish dated images and update the 'latest' tag? 
- default: false - required: false - -env: - TARGET: jax-te - DOCKER_REGISTRY: ghcr.io/nvidia - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container - -jobs: - - metadata: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' - runs-on: ubuntu-22.04 - outputs: - BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} - steps: - - name: Set build date - id: date - shell: bash -x -e {0} - run: | - BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - - build: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' - needs: metadata - uses: ./.github/workflows/_build_te.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - secrets: inherit - - publish: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build] - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} - TARGET_IMAGE: jax-te - TARGET_TAGS: | - type=raw,value=latest,priority=1000 - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - - if-upstream-failed: - runs-on: ubuntu-latest - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch' - steps: - - run: echo 'Upstream workflow failed, aborting run' && exit 1 \ No newline at end of file diff --git a/.github/workflows/nightly-te-test.yaml b/.github/workflows/nightly-te-test.yaml index c030af044..182cdf641 100644 --- a/.github/workflows/nightly-te-test.yaml +++ b/.github/workflows/nightly-te-test.yaml @@ -2,7 +2,7 @@ name: Nightly 
Transformer Engine test on: workflow_run: - workflows: [Nightly Transformer Engine build] + workflows: [Nightly JAX build] types: [completed] branches: [main] workflow_dispatch: @@ -11,7 +11,7 @@ on: type: string description: 'JAX-TE image build by NVIDIA/JAX-Toolbox' required: true - default: 'ghcr.io/nvidia/jax-te:latest' + default: 'ghcr.io/nvidia/jax:latest' PUBLISH: type: boolean description: Update status badge? @@ -24,7 +24,7 @@ permissions: packages: write # to upload container env: - DEFAULT_JAX_TE_IMAGE: 'ghcr.io/nvidia/jax-te:latest' + DEFAULT_JAX_TE_IMAGE: 'ghcr.io/nvidia/jax:latest' jobs: diff --git a/README.md b/README.md index 150b557e1..1bc49618f 100644 --- a/README.md +++ b/README.md @@ -3,10 +3,9 @@ | Image | Build | Test | | ---------------------------------------------------- | ------------------------------------------ | -------------------------------------- | | [![container-badge-base]][container-link-base] | [![build-badge-base]][workflow-base] | n/a | -| [![container-badge-jax]][container-link-jax] | [![build-badge-jax]][workflow-jax] | [![test-badge-jax]][workflow-jax-unit] | +| [![container-badge-jax]][container-link-jax] | [![build-badge-jax]][workflow-jax] | [![test-badge-jax]][workflow-jax-unit]
[![unit-test-badge-te]][workflow-te-test]
[![integration-test-badge-te]][workflow-te-test] | | [![container-badge-t5x]][container-link-t5x] | [![build-badge-t5x]][workflow-t5x] | [![test-badge-t5x]][workflow-t5x-perf] | | [![container-badge-pax]][container-link-pax] | [![build-badge-pax]][workflow-pax] | [![test-badge-pax]][workflow-pax-perf] | -| [![container-badge-te]][container-link-te] | [![build-badge-te]][workflow-te] | [![unit-test-badge-te]][workflow-te-test]
[![integration-test-badge-te]][workflow-te-test] | | [![container-badge-rosetta-t5x]][container-link-rosetta-t5x] | [![build-badge-rosetta-t5x]][workflow-rosetta-t5x] | [![test-badge-rosetta-t5x]][workflow-rosetta-t5x] | | [![container-badge-rosetta-pax]][container-link-rosetta-pax] | [![build-badge-rosetta-pax]][workflow-rosetta-pax] | [![test-badge-rosetta-pax]][workflow-rosetta-pax] | @@ -16,13 +15,11 @@ [container-badge-pax]: https://img.shields.io/static/v1?label=&message=PAX&color=gray&logo=docker [container-badge-rosetta-t5x]: https://img.shields.io/static/v1?label=&message=ROSETTA(T5X)&color=gray&logo=docker [container-badge-rosetta-pax]: https://img.shields.io/static/v1?label=&message=ROSETTA(PAX)&color=gray&logo=docker -[container-badge-te]: https://img.shields.io/static/v1?label=&message=TE&color=gray&logo=docker [container-link-base]: https://github.com/NVIDIA/JAX-Toolbox/pkgs/container/jax-toolbox [container-link-jax]: https://github.com/NVIDIA/JAX-Toolbox/pkgs/container/jax [container-link-t5x]: https://github.com/NVIDIA/JAX-Toolbox/pkgs/container/t5x [container-link-pax]: https://github.com/NVIDIA/JAX-Toolbox/pkgs/container/pax -[container-link-te]: https://github.com/NVIDIA/JAX-Toolbox/pkgs/container/jax-te [container-link-rosetta-t5x]: https://github.com/NVIDIA/JAX-Toolbox/pkgs/container/rosetta-t5x [container-link-rosetta-pax]: https://github.com/NVIDIA/JAX-Toolbox/pkgs/container/rosetta-pax @@ -32,7 +29,6 @@ [build-badge-pax]: https://img.shields.io/github/actions/workflow/status/NVIDIA/JAX-Toolbox/nightly-pax-build.yaml?branch=main&label=nightly&logo=github-actions&logoColor=dddddd [build-badge-rosetta-t5x]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Frosetta-t5x-build-status.json&logo=github-actions&logoColor=dddddd [build-badge-rosetta-pax]: 
https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Frosetta-pax-build-status.json&logo=github-actions&logoColor=dddddd -[build-badge-te]: https://img.shields.io/github/actions/workflow/status/NVIDIA/JAX-Toolbox/nightly-te-build.yaml?branch=main&label=nightly&logo=github-actions&logoColor=dddddd [workflow-base]: https://github.com/NVIDIA/JAX-Toolbox/actions/workflows/weekly-base-build.yaml [workflow-jax]: https://github.com/NVIDIA/JAX-Toolbox/actions/workflows/nightly-jax-build.yaml @@ -40,7 +36,6 @@ [workflow-pax]: https://github.com/NVIDIA/JAX-Toolbox/actions/workflows/nightly-pax-build.yaml [workflow-rosetta-t5x]: https://github.com/NVIDIA/JAX-Toolbox/actions/workflows/nightly-rosetta-t5x-build.yaml [workflow-rosetta-pax]: https://github.com/NVIDIA/JAX-Toolbox/actions/workflows/nightly-rosetta-pax-build.yaml -[workflow-te]: https://github.com/NVIDIA/JAX-Toolbox/actions/workflows/nightly-te-build.yaml [test-badge-jax]: https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fjax-unit-test-status.json&logo=nvidia [test-badge-t5x]: https://img.shields.io/github/actions/workflow/status/NVIDIA/JAX-Toolbox/nightly-t5x-test-mgmn.yaml?branch=main&label=A100%20MGMN&logo=nvidia From 8344548a0660952ca7fff0fa836d9c687ec833b8 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Wed, 19 Jul 2023 02:13:38 +0100 Subject: [PATCH 5/7] fix line order --- .github/container/Dockerfile.jax | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index b0d2972dd..3937df852 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -5,8 +5,8 @@ ARG REPO_TE="https://github.com/NVIDIA/TransformerEngine.git" ARG REF_JAX=main ARG REF_XLA=main ARG REF_TE=main -ARG SRC_PATH_XLA=/opt/xla-source ARG SRC_PATH_JAX=/opt/jax-source +ARG 
SRC_PATH_XLA=/opt/xla-source ARG SRC_PATH_TE=/opt/transformer-engine ARG BAZEL_CACHE=/tmp ARG BUILD_DATE From d014f0a23f829ba73dc695d7cb303a361263fa15 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Wed, 19 Jul 2023 02:17:34 +0100 Subject: [PATCH 6/7] add the TE installer script back as a reference point for future refactor --- .github/container/install-te.sh | 70 +++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100755 .github/container/install-te.sh diff --git a/.github/container/install-te.sh b/.github/container/install-te.sh new file mode 100755 index 000000000..f7b89bbcb --- /dev/null +++ b/.github/container/install-te.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +## Parse command-line arguments + +usage() { + echo "Usage: $0 [OPTION]..." + echo " -d, --dir=PATH Path to store TE source. Defaults to /opt/transformer-engine" + echo " -f, --from=URL URL of the TE repo. Defaults to https://github.com/NVIDIA/TransformerEngine.git" + echo " -h, --help Print usage." + echo " -r, --ref=REF Git commit hash or tag name that specifies the version of TE to install. Defaults to HEAD." + exit $1 +} + +args=$(getopt -o d:f:hr: --long dir:,from:,help,ref: -- "$@") +if [[ $?
-ne 0 ]]; then + exit 1 +fi + +eval set -- "$args" +while [ : ]; do + case "$1" in + -d | --dir) + INSTALL_DIR="$2" + shift 2 + ;; + -f | --from) + TE_REPO="$2" + shift 2 + ;; + -h | --help) + usage + ;; + -r | --ref) + TE_REF="$2" + shift 2 + ;; + --) + shift; + break + ;; + esac +done + +if [[ $# -ge 1 ]]; then + echo "Un-recognized argument: $*" && echo + usage 1 +fi + +## Set default arguments if not provided via command-line + +TE_REF="${TE_REF:-HEAD}" +TE_REPO="${TE_REPO:-https://github.com/NVIDIA/TransformerEngine.git}" +INSTALL_DIR="${INSTALL_DIR:-/opt/transformer-engine}" + +echo "Installing TE $TE_REF from $TE_REPO to $INSTALL_DIR" + +set -ex + +## Install dependencies + +pip install --no-cache-dir pybind11 ninja packaging + +## Install TE + +git clone ${TE_REPO} ${INSTALL_DIR} +cd ${INSTALL_DIR} +git checkout ${TE_REF} +git submodule init +git submodule update --recursive +NVTE_FRAMEWORK=jax pip install -e . From 8897c4f6d3ae27ebf1b3619506d241f90b7f1157 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Wed, 19 Jul 2023 06:30:03 +0100 Subject: [PATCH 7/7] fix editable/non-editable installation issue --- .github/container/Dockerfile.jax | 2 +- .github/workflows/_test_te.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 3937df852..a22577136 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -65,8 +65,8 @@ RUN <> ~/.ssh/known_hosts << EOF + cat >> ~/.ssh/known_hosts <