From abb6f9704b8e32ed331ac56bd767338796224c3c Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Mon, 23 Oct 2023 08:37:53 +0000 Subject: [PATCH 001/146] wip Co-authored-by: Terry Kong --- .github/container/Dockerfile.base | 2 +- .github/container/Dockerfile.jax | 44 ++++++++----------- .github/container/Dockerfile.t5x | 2 +- .../{install-flax.sh => get-flax.sh} | 0 .../container/{install-t5x.sh => get-t5x.sh} | 0 5 files changed, 20 insertions(+), 28 deletions(-) rename .github/container/{install-flax.sh => get-flax.sh} (100%) rename .github/container/{install-t5x.sh => get-t5x.sh} (100%) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index 6e71ed702..6060cb39c 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -23,7 +23,7 @@ RUN apt-get update && \ && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN pip install --upgrade --no-cache-dir pip +RUN pip install --upgrade --no-cache-dir pip pip-tools && rm -rf ~/.cache/* ############################################################################### ## Install cuDNN diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 9914b28c3..7ecba4070 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -12,7 +12,7 @@ ARG BUILD_DATE ## Build JAX ############################################################################### -FROM ${BASE_IMAGE} as jax-builder +FROM ${BASE_IMAGE} as builder ARG REPO_JAX ARG REPO_XLA ARG REF_JAX @@ -34,15 +34,12 @@ RUN build-jax.sh \ --sm all \ --clean -RUN cp -r ${SRC_PATH_JAX} ${SRC_PATH_JAX}-no-git && rm -rf ${SRC_PATH_JAX}-no-git/.git -RUN cp -r ${SRC_PATH_XLA} ${SRC_PATH_XLA}-no-git && rm -rf ${SRC_PATH_XLA}-no-git/.git - ############################################################################### -## Build 'runtime' flavor without the git metadata +## Copy wheels and source dirs into pre-installation image 
############################################################################### ARG BASE_IMAGE -FROM ${BASE_IMAGE} as runtime-image +FROM ${BASE_IMAGE} as pre-install ARG SRC_PATH_JAX ARG SRC_PATH_XLA ARG BUILD_DATE @@ -53,29 +50,24 @@ ENV CUDA_DEVICE_MAX_CONNECTIONS=1 ENV NCCL_IB_SL=1 ENV NCCL_NVLS_ENABLE=0 -COPY --from=jax-builder ${SRC_PATH_JAX}-no-git ${SRC_PATH_JAX} -COPY --from=jax-builder ${SRC_PATH_XLA}-no-git ${SRC_PATH_XLA} - -RUN pip --disable-pip-version-check install ${SRC_PATH_JAX}/dist/*.whl && \ - pip --disable-pip-version-check install -e ${SRC_PATH_JAX} && \ - rm -rf ~/.cache/pip/ +COPY --from=jax-builder ${SRC_PATH_JAX} ${SRC_PATH_JAX} +COPY --from=jax-builder ${SRC_PATH_XLA} ${SRC_PATH_XLA} +ADD build-jax.sh local_cuda_arch test-jax.sh /usr/local/bin/ -# Install software stack in JAX ecosystem -# Made this optional since tensorstore cannot build on Ubuntu 20.04 + ARM -RUN { pip install flax || true; } && rm -rf ~/.cache/* +RUN mkdir -p /opt/pip-tools.d +RUN <> /opt/pip-tools.d/requirements-jax.in +echo "$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/requirements-jax.in +echo "flax" >> /opt/pip-tools.d/requirements-jax.in +EOF # TODO: properly configure entrypoint -# COPY entrypoint.d/ /opt/nvidia/entrypoint.d/ -############################################################################### -## Build 'devel' image with build scripts and git metadata -############################################################################### +# ############################################################################### +# ## Build 'devel' image with build scripts and git metadata +# ############################################################################### -FROM runtime-image as devel-image -ARG SRC_PATH_JAX -ARG SRC_PATH_XLA - -ADD build-jax.sh local_cuda_arch test-jax.sh /usr/local/bin/ +FROM pre-install as assemble -COPY --from=jax-builder ${SRC_PATH_JAX}/.git ${SRC_PATH_JAX}/.git -COPY --from=jax-builder ${SRC_PATH_XLA}/.git 
${SRC_PATH_XLA}/.git +ADD pip-finalize.sh /usr/local/bin +RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index 0bf63b291..4ef1afde7 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -6,7 +6,7 @@ ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest FROM ${BASE_IMAGE} -ADD install-t5x.sh /usr/local/bin +ADD get-t5x.sh /usr/local/bin ADD install-flax.sh /usr/local/bin ADD install-te.sh /usr/local/bin diff --git a/.github/container/install-flax.sh b/.github/container/get-flax.sh similarity index 100% rename from .github/container/install-flax.sh rename to .github/container/get-flax.sh diff --git a/.github/container/install-t5x.sh b/.github/container/get-t5x.sh similarity index 100% rename from .github/container/install-t5x.sh rename to .github/container/get-t5x.sh From 068aab9df08b56b23542de4d42d1c0a68d96846f Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Mon, 23 Oct 2023 08:41:37 +0000 Subject: [PATCH 002/146] fix typo --- .github/container/Dockerfile.jax | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 7ecba4070..df1b8f6db 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -63,9 +63,9 @@ EOF # TODO: properly configure entrypoint -# ############################################################################### -# ## Build 'devel' image with build scripts and git metadata -# ############################################################################### +############################################################################### +## Build 'devel' image with build scripts and git metadata +############################################################################### FROM pre-install as assemble From fb7cf0be1a3e733a1cbb035d11d7c476a1f4b5de Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Tue, 24 Oct 2023 05:43:09 +0000 Subject: [PATCH 003/146] wip 
--- .github/container/Dockerfile.base | 3 + .github/container/Dockerfile.jax | 30 +++-- .github/container/Dockerfile.pax.amd64 | 1 - .github/container/Dockerfile.pax.arm64 | 1 - .github/container/Dockerfile.t5x | 45 +++---- .github/container/get-flax.sh | 81 ------------ .github/container/get-source.sh | 101 +++++++++++++++ .github/container/pip-finalize.sh | 7 + .github/workflows/_build_jax.yaml | 169 ++++++++++++------------- .github/workflows/_build_t5x.yaml | 58 +++++++-- .github/workflows/_sandbox.yaml | 110 ++++++++++------ 11 files changed, 353 insertions(+), 253 deletions(-) delete mode 100755 .github/container/get-flax.sh create mode 100755 .github/container/get-source.sh create mode 100755 .github/container/pip-finalize.sh diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index 6060cb39c..887fc555e 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -1,4 +1,5 @@ ARG BASE_IMAGE=nvidia/cuda:12.2.0-devel-ubuntu22.04 + FROM ${BASE_IMAGE} ############################################################################### @@ -24,6 +25,8 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* RUN pip install --upgrade --no-cache-dir pip pip-tools && rm -rf ~/.cache/* +ADD --chmod=777 get-source.sh /usr/local/bin +ADD --chmod=777 pip-finalize.sh /usr/local/bin ############################################################################### ## Install cuDNN diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index df1b8f6db..fd8bcc597 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -1,10 +1,14 @@ ARG BASE_IMAGE=ghcr.io/nvidia/jax-toolbox:base ARG REPO_JAX="https://github.com/google/jax.git" ARG REPO_XLA="https://github.com/openxla/xla.git" +ARG REPO_TE="https://github.com/NVIDIA/TransformerEngine.git" ARG REF_JAX=main ARG REF_XLA=main +ARG REF_TE=main ARG SRC_PATH_JAX=/opt/jax-source ARG 
SRC_PATH_XLA=/opt/xla-source +ARG SRC_PATH_TE=/opt/transformer-engine + ARG BAZEL_CACHE=/tmp ARG BUILD_DATE @@ -35,11 +39,11 @@ RUN build-jax.sh \ --clean ############################################################################### -## Copy wheels and source dirs into pre-installation image +## Pack jaxlib wheel and various source dirs into a pre-installation image ############################################################################### ARG BASE_IMAGE -FROM ${BASE_IMAGE} as pre-install +FROM ${BASE_IMAGE} as staging ARG SRC_PATH_JAX ARG SRC_PATH_XLA ARG BUILD_DATE @@ -55,19 +59,25 @@ COPY --from=jax-builder ${SRC_PATH_XLA} ${SRC_PATH_XLA} ADD build-jax.sh local_cuda_arch test-jax.sh /usr/local/bin/ RUN mkdir -p /opt/pip-tools.d -RUN <> /opt/pip-tools.d/requirements-jax.in -echo "$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/requirements-jax.in -echo "flax" >> /opt/pip-tools.d/requirements-jax.in -EOF +RUN echo "-e ${SRC_PATH_JAX}" >> /opt/pip-tools.d/manifest.jax +RUN echo "$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax + +## Flax +RUN get-source.sh -f https://github.com/google/flax.git -r v0.7.4 -d /opt/flax -o /opt/pip-tools.d/manifest.flax + +## Transformer engine +ARG REPO_TE +ARG REF_TE +ARG SRC_PATH_TE +ENV NVTE_FRAMEWORK=jax +RUN get-source.sh -f ${REPO_TE} -r ${REF_TE} -d ${SRC_PATH_TE} -o /opt/pip-tools.d/manifest.te # TODO: properly configure entrypoint ############################################################################### -## Build 'devel' image with build scripts and git metadata +## Install primary packages and transitive dependencies ############################################################################### -FROM pre-install as assemble +FROM staging as final -ADD pip-finalize.sh /usr/local/bin RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 index e8ae18291..b581ea716 100644 --- a/.github/container/Dockerfile.pax.amd64 +++ 
b/.github/container/Dockerfile.pax.amd64 @@ -10,7 +10,6 @@ ADD install-pax.sh /usr/local/bin ADD install-flax.sh /usr/local/bin ADD install-te.sh /usr/local/bin -ENV NVTE_FRAMEWORK=jax ARG REPO_PAXML=https://github.com/google/paxml.git ARG REPO_PRAXIS=https://github.com/google/praxis.git ARG REF_PAXML=main diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index 5d55bf2a5..cb0583e49 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -32,7 +32,6 @@ RUN /opt/install_lingvo_aarch64.sh ADD install-pax.sh /usr/local/bin RUN install-pax.sh -ENV NVTE_FRAMEWORK=jax ADD install-te.sh /usr/local/bin RUN install-te.sh # Lingvo has pinned TF to 2.13, so we need to downgrade the pydantic version so that its diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index 4ef1afde7..30d4dacb4 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -1,33 +1,26 @@ # syntax=docker/dockerfile:1-labs -############################################################################### -## T5X -############################################################################### ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest -FROM ${BASE_IMAGE} - -ADD get-t5x.sh /usr/local/bin -ADD install-flax.sh /usr/local/bin -ADD install-te.sh /usr/local/bin - -ENV NVTE_FRAMEWORK=jax ARG REPO_T5X=https://github.com/google-research/t5x.git ARG REF_T5X=main -ARG REPO_TE=https://github.com/NVIDIA/TransformerEngine.git -ARG REF_TE=main -RUN <<"EOF" bash -ex -install-t5x.sh --defer --from ${REPO_T5X} --ref ${REF_T5X} -install-te.sh --defer --from ${REPO_TE} --ref ${REF_TE} - -if [[ -f /opt/requirements-defer.txt ]]; then - pip install -r /opt/requirements-defer.txt -fi -if [[ -f /opt/cleanup.sh ]]; then - bash -ex /opt/cleanup.sh -fi - -# Note: Install after t5x installation b/c t5x installs flax from source -install-flax.sh -EOF +ARG SRC_PATH_T5X=/opt/t5x + 
+############################################################################### +## Download source and add auxiliary scripts +############################################################################### + +FROM ${BASE_IMAGE} as staging + +ARG REPO_T5X +ARG REF_T5X +RUN get-source.sh -f ${REF_T5X} -r ${REF_T5X} -d ${SRC_PATH_T5X} -o /opt/pip-tools.d/manifest.t5x ADD test-t5x.sh /usr/local/bin + +############################################################################### +## Install accumulated packages from the base image and the previous stage +############################################################################### + +FROM staging as final + +RUN pip-finalize.sh diff --git a/.github/container/get-flax.sh b/.github/container/get-flax.sh deleted file mode 100755 index 30802e0d4..000000000 --- a/.github/container/get-flax.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - -## Parse command-line arguments - -usage() { - echo "Usage: $0 [OPTION]..." - echo ' --defer When passed, will defer the installation of the main package. Can be installed afterwards with `pip install -r requirements-defer.txt` and any deferred cleanup commands can be run with `bash cleanup.sh`' - echo " -d, --dir=PATH Path to store flax source. Defaults to /opt/flax" - echo " -f, --from=URL URL of the flax repo. Defaults to https://github.com/google/flax.git" - echo " -h, --help Print usage." - echo " -r, --ref=REF Git commit hash or tag name that specifies the version of flax to install. Defaults to HEAD." - exit $1 -} - -args=$(getopt -o d:f:hr: --long defer,dir:,from:,help,ref: -- "$@") -if [[ $? 
-ne 0 ]]; then - exit 1 -fi - -eval set -- "$args" -while [ : ]; do - case "$1" in - --defer) - DEFER=true - shift - ;; - -d | --dir) - INSTALL_DIR="$2" - shift 2 - ;; - -f | --from) - FLAX_REPO="$2" - shift 2 - ;; - -h | --help) - usage - ;; - -r | --ref) - FLAX_REF="$2" - shift 2 - ;; - --) - shift; - break - ;; - esac -done - -if [[ $# -ge 1 ]]; then - echo "Un-recognized argument: $*" && echo - usage 1 -fi - -## Set default arguments if not provided via command-line - -DEFER=${DEFER:-false} -FLAX_REF="${FLAX_REF:-HEAD}" -FLAX_REPO="${FLAX_REPO:-https://github.com/google/flax.git}" -INSTALL_DIR="${INSTALL_DIR:-/opt/flax}" - -echo "Installing flax $FLAX_REF from $FLAX_REPO to $INSTALL_DIR" - -maybe_defer_pip_install() { - if [[ "$DEFER" = true ]]; then - echo "Deferring installation of 'pip install $*'" - echo "$*" >> /opt/requirements-defer.txt - else - pip install $@ - fi -} - -set -ex - -## Install flax - -git clone ${FLAX_REPO} ${INSTALL_DIR} -cd ${INSTALL_DIR} -git checkout ${FLAX_REF} -# We currently require installing editable (-e) to build a distribution since -# we edit the source in place and do not re-install -maybe_defer_pip_install -e ${INSTALL_DIR} \ No newline at end of file diff --git a/.github/container/get-source.sh b/.github/container/get-source.sh new file mode 100755 index 000000000..9dfb483a4 --- /dev/null +++ b/.github/container/get-source.sh @@ -0,0 +1,101 @@ +#!/bin/bash +## Fetch a Python package from a git repo and write the pip-tools input manifest to stdout +## Example: +## get-source.sh -f https://github.com/google/flax.git -r main -d /opt/flax +## Output: +## -e /opt/flax + +## Parse command-line arguments + +usage() { + echo "Usage: $0 [OPTION]..." + echo " -d, --dir=PATH [Required] Local path to check out the source code." + echo " -f, --from=URL [Required] URL of the source repo." + echo " -h, --help Print usage." + echo " -i, --install Install the package immediately using pip install." 
+ echo " -o, --output File to write pip manifests. Defaults to stdout" + echo " -r, --ref=REF Git commit SHA, branch name, or tag name to checkout. Uses default branch if not specified." + echo + exit $1 +} + +args=$(getopt -o d:f:hio:r: --long dir:,from:,help,install,output:,ref: -- "$@") +if [[ $? -ne 0 ]]; then + exit 1 +fi + +## Set default arguments + +GIT_REPO="" +GIT_REF="${GIT_REF:-HEAD}" +INSTALL=${INSTALL:-0} +INSTALL_DIR="" +OUTPUT_FILE="/dev/stdout" + +eval set -- "$args" +while [ : ]; do + case "$1" in + -d | --dir) + INSTALL_DIR="$2" + shift 2 + ;; + -f | --from) + GIT_REPO="$2" + shift 2 + ;; + -h | --help) + usage + ;; + -i | --install) + INSTALL=true + shift + ;; + -o | --output) + OUTPUT_FILE="$2" + shift 2 + ;; + -r | --ref) + GIT_REF="$2" + shift 2 + ;; + --) + shift; + break + ;; + esac +done + +if [[ $# -ge 1 ]]; then + echo "Un-recognized argument: $*" && echo + usage 1 +fi + +if [[ ! -n "${GIT_REPO}" ]]; then + echo "Source repository not speicified." && echo + usage 1 +fi + +if [[ ! -n "${INSTALL_DIR}" ]]; then + echo "Check out destination not specified." 
&& echo + usage 1 +fi + +## check out the source + +echo "Fetching $GIT_REPO#$GIT_REF to $INSTALL_DIR" + +set -ex + +git clone --depth 1 ${GIT_REPO} ${INSTALL_DIR} +pushd ${INSTALL_DIR} +git checkout ${GIT_REF} +git submodule init +git submodule update --recursive +popd + +if (( INSTALL == 1 )); then + pip install -e ${INSTALL_DIR} +else + echo "Writing to $OUTPUT_FILE:" + echo "-e ${INSTALL_DIR}" | tee -a $OUTPUT_FILE +fi diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh new file mode 100755 index 000000000..63880bdd0 --- /dev/null +++ b/.github/container/pip-finalize.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +pip-compile /opt/pip-tools.d/*.in -o /opt/pip-tools.d/requirements.txt + +pip-sync /opt/pip-tools.d/requirements.txt + +rm -rf ~/.cache/* diff --git a/.github/workflows/_build_jax.yaml b/.github/workflows/_build_jax.yaml index 311336a36..bb93115f5 100644 --- a/.github/workflows/_build_jax.yaml +++ b/.github/workflows/_build_jax.yaml @@ -3,6 +3,10 @@ name: ~build JAX container on: workflow_call: inputs: + ARCHITECTURE: + type: string + description: 'CPU architecture to build the image for, e.g. 
amd64, arm64' + required: true BASE_IMAGE: type: string description: 'Base docker image that provides CUDA and Python:' @@ -44,9 +48,12 @@ on: required: false default: 'badge-jax-build' outputs: - DOCKER_TAGS: - description: "Tags of the image built" - value: ${{ jobs.merge.outputs.DOCKER_TAGS }} + DOCKER_TAG_FINAL: + description: "Tags of the complete image built" + value: ${{ jobs.build.outputs.DOCKER_TAG_FINAL }} + DOCKER_TAG_STAGING: + description: "Tags of the 'staging' image built" + value: ${{ jobs.build.outputs.DOCKER_TAG_STAGING }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -59,13 +66,12 @@ permissions: jobs: build: - strategy: - fail-fast: false - matrix: - PLATFORM: [amd64, arm64] - runs-on: [self-hosted, "${{ matrix.PLATFORM }}", large] + runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", large] env: - BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ matrix.PLATFORM }}.json + BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json + outputs: + DOCKER_TAG_FINAL: ${{ steps.meta-staging.outputs.tags }} + DOCKER_TAG_STAGING: ${{ steps.meta-final.outputs.tags }} steps: - name: Print environment variables run: env @@ -95,8 +101,14 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Set docker metadata - id: meta + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + with: + driver-opts: | + image=moby/buildkit:v0.12.1 + + - name: Set docker metadata - staging + id: meta-staging uses: docker/metadata-action@v4 with: images: | @@ -104,26 +116,56 @@ jobs: flavor: | latest=false tags: | - type=raw,value=${{ github.run_id }}-jax-${{ matrix.PLATFORM }} + type=raw,value=${{ github.run_id }}-jax-${{ inputs.ARCHITECTURE }}-staging labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + - name: Build staging image + id: build-staging + uses: docker/build-push-action@v4 with: - 
driver-opts: | - image=moby/buildkit:v0.12.1 + context: .github/container + push: true + file: .github/container/Dockerfile.jax + platforms: linux/${{ inputs.ARCHITECTURE }} + target: staging + tags: ${{ steps.meta-staging.outputs.tags }} + labels: ${{ steps.meta-staging.outputs.labels }} + ssh: default + secret-files: | + "SSH_KNOWN_HOSTS=${{ steps.ssh-known-hosts.outputs.FILE }}" + build-args: | + BASE_IMAGE=${{ inputs.BASE_IMAGE }} + BAZEL_CACHE=${{ vars.BAZEL_REMOTE_CACHE_URL }} + BUILD_DATE=${{ inputs.BUILD_DATE }} + REPO_JAX=${{ inputs.REPO_JAX }} + REPO_XLA=${{ inputs.REPO_XLA }} + REF_JAX=${{ inputs.REF_JAX }} + REF_XLA=${{ inputs.REF_XLA }} - - name: Build docker images - id: build + - name: Set docker metadata - final + id: meta-final + uses: docker/metadata-action@v4 + with: + images: | + ${{ env.UPLD_IMAGE }} + flavor: | + latest=false + tags: | + type=raw,value=${{ github.run_id }}-jax-${{ inputs.ARCHITECTURE }} + labels: + org.opencontainers.image.created=${{ inputs.BUILD_DATE }} + + - name: Build final image + id: build-final uses: docker/build-push-action@v4 with: context: .github/container push: true file: .github/container/Dockerfile.jax - platforms: linux/${{ matrix.PLATFORM }} - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} + platforms: linux/${{ inputs.ARCHITECTURE }} + tags: ${{ steps.meta-final.outputs.tags }} + labels: ${{ steps.meta-final.outputs.labels }} ssh: default secret-files: | "SSH_KNOWN_HOSTS=${{ steps.ssh-known-hosts.outputs.FILE }}" @@ -143,10 +185,10 @@ jobs: # bring in utility functions source .github/workflows/scripts/to_json.sh - badge_label='JAX ${{ matrix.PLATFORM }} build' - tags="${{ steps.meta.outputs.tags }}" - digest="${{ steps.build.outputs.digest }}" - outcome="${{ steps.build.outcome }}" + badge_label='JAX ${{ inputs.ARCHITECTURE }} build' + tags="${{ steps.meta-final.outputs.tags }}" + digest="${{ steps.build-final.outputs.digest }}" + outcome="${{ steps.build-final.outcome }}" if 
[[ ${outcome} == "success" ]]; then badge_message="pass" @@ -170,72 +212,25 @@ jobs: to_json schemaVersion label message color \ > ${{ env.BADGE_FILENAME_FULL }} - # Temporary workaround until the following issues are solved: - # https://github.com/orgs/community/discussions/17245 - # https://github.com/actions/runner/pull/2477 - # https://github.com/orgs/community/discussions/26639 - - name: Save image name as text file - shell: bash -x -e {0} - run: | - echo "${{ steps.meta.outputs.tags }}" >> image-name.txt - - - name: Upload image name file as artifact - uses: actions/upload-artifact@v3 - with: - name: image-name-jax-${{ matrix.PLATFORM }} - path: image-name.txt + # # Temporary workaround until the following issues are solved: + # # https://github.com/orgs/community/discussions/17245 + # # https://github.com/actions/runner/pull/2477 + # # https://github.com/orgs/community/discussions/26639 + # - name: Save image name as text file + # shell: bash -x -e {0} + # run: | + # echo "${{ steps.meta-final.outputs.tags }}" >> image-name.txt + + # - name: Upload image name file as artifact + # uses: actions/upload-artifact@v3 + # with: + # name: image-name-jax-${{ matrix.PLATFORM }} + # path: image-name.txt - name: Upload sitrep and badge uses: actions/upload-artifact@v3 with: - name: ${{ inputs.ARTIFACT_NAME }}-${{ matrix.PLATFORM }} + name: ${{ inputs.ARTIFACT_NAME }}-${{ inputs.ARCHITECTURE }} path: | sitrep.json ${{ env.BADGE_FILENAME_FULL }} - - merge: - runs-on: ubuntu-latest - needs: build - outputs: - DOCKER_TAGS: ${{ steps.meta.outputs.tags }} - steps: - # TODO: currently downloading all artifacts of the entire workflow - # Revise when this request is fulfilled: - # https://github.com/actions/download-artifact/issues/214 - - name: Download image name files into separate folders - uses: actions/download-artifact@v3 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - 
password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set docker metadata - id: meta - uses: docker/metadata-action@v4 - with: - images: | - ${{ env.UPLD_IMAGE }} - flavor: | - latest=false - tags: | - type=raw,value=${{ github.run_id }}-jax-multiarch - labels: - org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - - name: Combine images into a single multi-arch image - shell: bash -x -e {0} - run: | - docker manifest create ${{ steps.meta.outputs.tags }} $( - for IMAGE in $(cat image-name-jax-*/image-name.txt); do - REPO=$(echo $IMAGE | cut -d: -f1) - DIGEST=$( - docker manifest inspect $IMAGE |\ - jq -r '.manifests[] | select(.platform.os == "linux") | .digest' - ) - echo $REPO@${DIGEST} - done - ) - docker manifest push ${{ steps.meta.outputs.tags }} diff --git a/.github/workflows/_build_t5x.yaml b/.github/workflows/_build_t5x.yaml index ed0c1a628..19cadcf45 100644 --- a/.github/workflows/_build_t5x.yaml +++ b/.github/workflows/_build_t5x.yaml @@ -3,6 +3,10 @@ name: ~build T5X container on: workflow_call: inputs: + ARCHITECTURE: + type: string + description: 'CPU architecture to build the image for, e.g. 
amd64, arm64' + required: true BASE_IMAGE: type: string description: 'Base docker image that provides JAX' @@ -51,7 +55,7 @@ jobs: build: outputs: DOCKER_TAGS: ${{ steps.meta.outputs.tags }} - runs-on: [self-hosted, x86, small] + runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", small] steps: - name: Print environment variables run: env @@ -66,8 +70,14 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Set docker metadata - id: meta + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + with: + driver-opts: | + image=moby/buildkit:v0.12.1 + + - name: Set docker metadata - staging + id: meta-staging uses: docker/metadata-action@v4 with: images: | @@ -75,24 +85,50 @@ jobs: flavor: | latest=false tags: | - type=raw,value=${{ github.run_id }}-upstream-t5x + type=raw,value=${{ github.run_id }}-upstream-t5x-${{ inputs.ARCHITECTURE }}-staging labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + - name: Build staging image + uses: docker/build-push-action@v4 with: - driver-opts: | - image=moby/buildkit:v0.10.6 + context: .github/container + push: true + file: .github/container/Dockerfile.t5x + platforms: linux/${{ inputs.ARCHITECTURE }} + target: staging + tags: ${{ steps.meta-staging.outputs.tags }} + labels: ${{ steps.meta-staging.outputs.labels }} + build-args: | + BASE_IMAGE=${{ inputs.BASE_IMAGE }} + BUILD_DATE=${{ inputs.BUILD_DATE }} + REPO_T5X=${{ inputs.REPO_T5X }} + REF_T5X=${{ inputs.REF_T5X }} + REPO_TE=${{ inputs.REPO_TE }} + REF_TE=${{ inputs.REF_TE }} + + - name: Set docker metadata - final + id: meta-final + uses: docker/metadata-action@v4 + with: + images: | + ${{ env.UPLD_IMAGE }} + flavor: | + latest=false + tags: | + type=raw,value=${{ github.run_id }}-upstream-t5x-${{ inputs.ARCHITECTURE }} + labels: + org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Build docker images + - name: Build 
final image uses: docker/build-push-action@v4 with: context: .github/container push: true file: .github/container/Dockerfile.t5x - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} + platforms: linux/${{ inputs.ARCHITECTURE }} + tags: ${{ steps.meta-staging.outputs.tags }} + labels: ${{ steps.meta-staging.outputs.labels }} build-args: | BASE_IMAGE=${{ inputs.BASE_IMAGE }} BUILD_DATE=${{ inputs.BUILD_DATE }} diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 3dd5b1f66..4b0a4d6bc 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -1,41 +1,79 @@ name: "~Sandbox" on: - workflow_dispatch: + push: + +permissions: + contents: read # to fetch code + actions: write # to cancel previous workflows + packages: write # to upload container jobs: - sandbox: - runs-on: ubuntu-22.04 - steps: - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Print usage - run: | - cat << EOF - This is an empty workflow file located in the main branch of your - repository. It serves as a testing ground for new GitHub Actions on - development branches before merging them to the main branch. By - defining and overloading this workflow on your development branch, - you can test new actions without affecting your main branch, ensuring - a smooth integration process once the changes are ready to be merged. - - Usage: - - 1. In your development branch, modify the sandbox.yml workflow file - to include the new actions you want to test. Make sure to commit - the changes to the development branch. - 2. Navigate to the 'Actions' tab in your repository, select the - '~Sandbox' workflow, and choose your development branch from the - branch dropdown menu. Click on 'Run workflow' to trigger the - workflow on your development branch. - 3. 
Once you have tested and verified the new actions in the Sandbox - workflow, you can incorporate them into your main workflow(s) and - merge the development branch into the main branch. Remember to - revert the changes to the sandbox.yml file in the main branch to - keep it empty for future testing. - EOF + + build-base: + uses: ./.github/workflows/_build_base.yaml + secrets: inherit + + build-jax: + needs: [build-base] + uses: ./.github/workflows/_build_jax.yaml + with: + ARCHITECTURE: amd64 + BASE_IMAGE: ${{ needs.build-base.outputs.IMAGE }} + secrets: inherit + + build-t5x: + needs: [build-jax] + uses: ./.github/workflows/_build_t5x.yaml + with: + ARCHITECTURE: amd64 + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} + secrets: inherit + + + # merge: + # runs-on: ubuntu-latest + # needs: build + # outputs: + # DOCKER_TAGS: ${{ steps.meta.outputs.tags }} + # steps: + # # TODO: currently downloading all artifacts of the entire workflow + # # Revise when this request is fulfilled: + # # https://github.com/actions/download-artifact/issues/214 + # - name: Download image name files into separate folders + # uses: actions/download-artifact@v3 + + # - name: Login to GitHub Container Registry + # uses: docker/login-action@v2 + # with: + # registry: ghcr.io + # username: ${{ github.repository_owner }} + # password: ${{ secrets.GITHUB_TOKEN }} + + # - name: Set docker metadata + # id: meta + # uses: docker/metadata-action@v4 + # with: + # images: | + # ${{ env.UPLD_IMAGE }} + # flavor: | + # latest=false + # tags: | + # type=raw,value=${{ github.run_id }}-jax-multiarch + # labels: + # org.opencontainers.image.created=${{ inputs.BUILD_DATE }} + + # - name: Combine images into a single multi-arch image + # shell: bash -x -e {0} + # run: | + # docker manifest create ${{ steps.meta.outputs.tags }} $( + # for IMAGE in $(cat image-name-jax-*/image-name.txt); do + # REPO=$(echo $IMAGE | cut -d: -f1) + # DIGEST=$( + # docker manifest inspect $IMAGE |\ + # jq -r 
'.manifests[] | select(.platform.os == "linux") | .digest' + # ) + # echo $REPO@${DIGEST} + # done + # ) + # docker manifest push ${{ steps.meta.outputs.tags }} From 4b0c406d6abd4f40121dac102d3ad9f32f627439 Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Tue, 24 Oct 2023 05:53:02 +0000 Subject: [PATCH 004/146] wip --- .github/workflows/_build_base.yaml | 136 +++++++++++++++-------------- .github/workflows/_sandbox.yaml | 4 +- 2 files changed, 72 insertions(+), 68 deletions(-) diff --git a/.github/workflows/_build_base.yaml b/.github/workflows/_build_base.yaml index dc35b653d..b790f157b 100644 --- a/.github/workflows/_build_base.yaml +++ b/.github/workflows/_build_base.yaml @@ -3,6 +3,10 @@ name: ~build CUDA+Python base container on: workflow_call: inputs: + ARCHITECTURE: + type: string + description: 'CPU architecture to build the image for, e.g. amd64, arm64' + required: true BASE_IMAGE: type: string description: 'Base CUDA image, e.g. nvidia/cuda:X.Y.Z-devel-ubuntu22.04' @@ -14,9 +18,9 @@ on: required: false default: 'NOT SPECIFIED' outputs: - DOCKER_TAGS: - description: "Tags of the image built" - value: ${{ jobs.merge.outputs.DOCKER_TAGS }} + DOCKER_TAG: + description: "Tag of the image built" + value: ${{ jobs.build.outputs.DOCKER_TAG }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -29,11 +33,9 @@ permissions: jobs: build: - strategy: - fail-fast: false - matrix: - PLATFORM: [amd64, arm64] - runs-on: [self-hosted, "${{ matrix.PLATFORM }}", small] + runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", small] + outputs: + DOCKER_TAG: ${{ steps.meta.outputs.tags }} steps: - name: Print environment variables run: env @@ -57,7 +59,7 @@ jobs: flavor: | latest=false tags: | - type=raw,value=${{ github.run_id }}-base-${{ matrix.PLATFORM }} + type=raw,value=${{ github.run_id }}-base-${{ inputs.ARCHITECTURE }} labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} @@ -80,61 +82,61 @@ jobs: BUILD_DATE=${{ inputs.BUILD_DATE }} ${{ inputs.BASE_IMAGE 
!= 'latest' && format('BASE_IMAGE={0}', inputs.BASE_IMAGE) }} - # Temporary workaround until the following issues are solved: - # https://github.com/orgs/community/discussions/17245 - # https://github.com/actions/runner/pull/2477 - # https://github.com/orgs/community/discussions/26639 - - name: Save image name as text file - shell: bash -x -e {0} - run: | - echo "${{ steps.meta.outputs.tags }}" >> image-name.txt - - - name: Upload image name file as artifact - uses: actions/upload-artifact@v3 - with: - name: image-name-base-${{ matrix.PLATFORM }} - path: image-name.txt - - merge: - runs-on: ubuntu-latest - needs: build - outputs: - DOCKER_TAGS: ${{ steps.meta.outputs.tags }} - steps: - - name: Download image name files into separate folders - uses: actions/download-artifact@v3 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set docker metadata - id: meta - uses: docker/metadata-action@v4 - with: - images: | - ${{ env.UPLD_IMAGE }} - flavor: | - latest=false - tags: | - type=raw,value=${{ github.run_id }}-base-multiarch - labels: - org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - - name: Combine images into a single multi-arch image - shell: bash -x -e {0} - run: | - docker manifest create ${{ steps.meta.outputs.tags }} $( - for IMAGE in $(cat image-name-base-*/image-name.txt); do - REPO=$(echo $IMAGE | cut -d: -f1) - DIGEST=$( - docker manifest inspect $IMAGE |\ - jq -r '.manifests[] | select(.platform.os == "linux") | .digest' - ) - echo $REPO@${DIGEST} - done - ) - docker manifest push ${{ steps.meta.outputs.tags }} + # # Temporary workaround until the following issues are solved: + # # https://github.com/orgs/community/discussions/17245 + # # https://github.com/actions/runner/pull/2477 + # # https://github.com/orgs/community/discussions/26639 + # - name: Save image name as text file + # shell: bash 
-x -e {0} + # run: | + # echo "${{ steps.meta.outputs.tags }}" >> image-name.txt + + # - name: Upload image name file as artifact + # uses: actions/upload-artifact@v3 + # with: + # name: image-name-base-${{ matrix.PLATFORM }} + # path: image-name.txt + + # merge: + # runs-on: ubuntu-latest + # needs: build + # outputs: + # DOCKER_TAGS: ${{ steps.meta.outputs.tags }} + # steps: + # - name: Download image name files into separate folders + # uses: actions/download-artifact@v3 + + # - name: Login to GitHub Container Registry + # uses: docker/login-action@v2 + # with: + # registry: ghcr.io + # username: ${{ github.repository_owner }} + # password: ${{ secrets.GITHUB_TOKEN }} + + # - name: Set docker metadata + # id: meta + # uses: docker/metadata-action@v4 + # with: + # images: | + # ${{ env.UPLD_IMAGE }} + # flavor: | + # latest=false + # tags: | + # type=raw,value=${{ github.run_id }}-base-multiarch + # labels: + # org.opencontainers.image.created=${{ inputs.BUILD_DATE }} + + # - name: Combine images into a single multi-arch image + # shell: bash -x -e {0} + # run: | + # docker manifest create ${{ steps.meta.outputs.tags }} $( + # for IMAGE in $(cat image-name-base-*/image-name.txt); do + # REPO=$(echo $IMAGE | cut -d: -f1) + # DIGEST=$( + # docker manifest inspect $IMAGE |\ + # jq -r '.manifests[] | select(.platform.os == "linux") | .digest' + # ) + # echo $REPO@${DIGEST} + # done + # ) + # docker manifest push ${{ steps.meta.outputs.tags }} diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 4b0a4d6bc..d49ba584f 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -12,6 +12,8 @@ jobs: build-base: uses: ./.github/workflows/_build_base.yaml + with: + ARCHITECTURE: amd64 secrets: inherit build-jax: @@ -19,7 +21,7 @@ jobs: uses: ./.github/workflows/_build_jax.yaml with: ARCHITECTURE: amd64 - BASE_IMAGE: ${{ needs.build-base.outputs.IMAGE }} + BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} 
secrets: inherit build-t5x: From 40444504077984a5e6df2b63fc49f91103f3c879 Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Tue, 24 Oct 2023 05:55:10 +0000 Subject: [PATCH 005/146] wip --- .github/workflows/_build_base.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_build_base.yaml b/.github/workflows/_build_base.yaml index b790f157b..572ba2b62 100644 --- a/.github/workflows/_build_base.yaml +++ b/.github/workflows/_build_base.yaml @@ -75,7 +75,7 @@ jobs: context: .github/container push: true file: .github/container/Dockerfile.base - platforms: linux/${{ matrix.PLATFORM }} + platforms: linux/${{ inputs.ARCHITECTURE }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} build-args: | @@ -94,7 +94,7 @@ jobs: # - name: Upload image name file as artifact # uses: actions/upload-artifact@v3 # with: - # name: image-name-base-${{ matrix.PLATFORM }} + # name: image-name-base-${{ inputs.ARCHITECTURE }} # path: image-name.txt # merge: From 38da8a170453ebcaa6d2418a18912af02b6c61f7 Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Tue, 24 Oct 2023 06:06:08 +0000 Subject: [PATCH 006/146] wip --- .github/container/Dockerfile.jax | 4 ++-- .github/workflows/_build_base.yaml | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index fd8bcc597..32bae519a 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -54,8 +54,8 @@ ENV CUDA_DEVICE_MAX_CONNECTIONS=1 ENV NCCL_IB_SL=1 ENV NCCL_NVLS_ENABLE=0 -COPY --from=jax-builder ${SRC_PATH_JAX} ${SRC_PATH_JAX} -COPY --from=jax-builder ${SRC_PATH_XLA} ${SRC_PATH_XLA} +COPY --from=builder ${SRC_PATH_JAX} ${SRC_PATH_JAX} +COPY --from=builder ${SRC_PATH_XLA} ${SRC_PATH_XLA} ADD build-jax.sh local_cuda_arch test-jax.sh /usr/local/bin/ RUN mkdir -p /opt/pip-tools.d diff --git a/.github/workflows/_build_base.yaml b/.github/workflows/_build_base.yaml 
index 572ba2b62..4e23ef1cb 100644 --- a/.github/workflows/_build_base.yaml +++ b/.github/workflows/_build_base.yaml @@ -50,6 +50,12 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + with: + driver-opts: | + image=moby/buildkit:v0.12.1 + - name: Set docker metadata id: meta uses: docker/metadata-action@v4 @@ -63,12 +69,6 @@ jobs: labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - with: - driver-opts: | - image=moby/buildkit:v0.12.1 - - name: Build docker images uses: docker/build-push-action@v4 with: From 9ad94c87475b466249ee803bfa5308c361890b1d Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Tue, 24 Oct 2023 07:07:54 +0000 Subject: [PATCH 007/146] wip --- .github/workflows/_sandbox.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index d49ba584f..f85e53901 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -22,6 +22,8 @@ jobs: with: ARCHITECTURE: amd64 BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} + REF_JAX: jax-v0.4.19 + REF_XLA: ecb73da4b7b2e3b54aa6d6b7f08a5c662bb19c6e secrets: inherit build-t5x: From 1aac03a9e4cc29c23764ddfb11d7379d2e33e02a Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Wed, 25 Oct 2023 05:21:19 +0000 Subject: [PATCH 008/146] use full clone --- .github/container/get-source.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/get-source.sh b/.github/container/get-source.sh index 9dfb483a4..53ea3f418 100755 --- a/.github/container/get-source.sh +++ b/.github/container/get-source.sh @@ -86,7 +86,7 @@ echo "Fetching $GIT_REPO#$GIT_REF to $INSTALL_DIR" set -ex -git clone --depth 1 ${GIT_REPO} ${INSTALL_DIR} +git clone ${GIT_REPO} ${INSTALL_DIR} pushd ${INSTALL_DIR} git checkout ${GIT_REF} git submodule 
init From 42ec9bb1a0ebe489f6ebe430c80f894cfe459b9f Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Wed, 25 Oct 2023 05:38:30 +0000 Subject: [PATCH 009/146] update pip-tools script --- .github/container/pip-finalize.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh index 63880bdd0..fdb21fb0b 100755 --- a/.github/container/pip-finalize.sh +++ b/.github/container/pip-finalize.sh @@ -1,6 +1,8 @@ #!/bin/bash -pip-compile /opt/pip-tools.d/*.in -o /opt/pip-tools.d/requirements.txt +set -ex -o pipefail + +pip-compile $(ls /opt/pip-tools.d/*.in) -o /opt/pip-tools.d/requirements.txt pip-sync /opt/pip-tools.d/requirements.txt From 2f02023d3fddbbfc20a34bbed757b81fc81cd910 Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Wed, 25 Oct 2023 06:10:37 +0000 Subject: [PATCH 010/146] update pip-tools script --- .github/container/Dockerfile.base | 1 + .github/container/get-source.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index 887fc555e..e02e57c31 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -25,6 +25,7 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* RUN pip install --upgrade --no-cache-dir pip pip-tools && rm -rf ~/.cache/* +RUN mkdir -p /opt/pip-tools.d ADD --chmod=777 get-source.sh /usr/local/bin ADD --chmod=777 pip-finalize.sh /usr/local/bin diff --git a/.github/container/get-source.sh b/.github/container/get-source.sh index 53ea3f418..695df93f3 100755 --- a/.github/container/get-source.sh +++ b/.github/container/get-source.sh @@ -84,7 +84,7 @@ fi echo "Fetching $GIT_REPO#$GIT_REF to $INSTALL_DIR" -set -ex +set -ex -o pipefail git clone ${GIT_REPO} ${INSTALL_DIR} pushd ${INSTALL_DIR} From ad12e78c178f1b6ff5334c09343a9794c05d6626 Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Wed, 25 Oct 2023 06:16:11 +0000 Subject: 
[PATCH 011/146] update pip-tools script --- .github/container/pip-finalize.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh index fdb21fb0b..220d40507 100755 --- a/.github/container/pip-finalize.sh +++ b/.github/container/pip-finalize.sh @@ -2,7 +2,7 @@ set -ex -o pipefail -pip-compile $(ls /opt/pip-tools.d/*.in) -o /opt/pip-tools.d/requirements.txt +pip-compile $(ls /opt/pip-tools.d/manifest.*) -o /opt/pip-tools.d/requirements.txt pip-sync /opt/pip-tools.d/requirements.txt From bdc34c8154b8aed1c1206a1b18304d72f43c845a Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Wed, 25 Oct 2023 06:47:25 +0000 Subject: [PATCH 012/146] fix t5x dockerfile --- .github/container/Dockerfile.t5x | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index 30d4dacb4..d7ee96ee8 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -13,7 +13,7 @@ FROM ${BASE_IMAGE} as staging ARG REPO_T5X ARG REF_T5X -RUN get-source.sh -f ${REF_T5X} -r ${REF_T5X} -d ${SRC_PATH_T5X} -o /opt/pip-tools.d/manifest.t5x +RUN get-source.sh -f ${REPO_T5X} -r ${REF_T5X} -d ${SRC_PATH_T5X} -o /opt/pip-tools.d/manifest.t5x ADD test-t5x.sh /usr/local/bin From a5c478ec741c63bc6594f338192cc1840d48027b Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Wed, 25 Oct 2023 07:02:41 +0000 Subject: [PATCH 013/146] fix t5x dockerfile --- .github/container/Dockerfile.t5x | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index d7ee96ee8..6cb12374f 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -13,6 +13,7 @@ FROM ${BASE_IMAGE} as staging ARG REPO_T5X ARG REF_T5X +ARG SRC_PATH_T5X RUN get-source.sh -f ${REPO_T5X} -r ${REF_T5X} -d ${SRC_PATH_T5X} -o /opt/pip-tools.d/manifest.t5x ADD test-t5x.sh /usr/local/bin From 
b4fd2d820e8e68e3635636e37ba3507e66d6d486 Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Wed, 25 Oct 2023 20:11:49 +0000 Subject: [PATCH 014/146] test flax hack --- .github/container/pip-finalize.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh index 220d40507..73e886b3a 100755 --- a/.github/container/pip-finalize.sh +++ b/.github/container/pip-finalize.sh @@ -2,6 +2,8 @@ set -ex -o pipefail +sed -i "s|flax @ git+https://github.com/google/flax#egg=flax||g" + pip-compile $(ls /opt/pip-tools.d/manifest.*) -o /opt/pip-tools.d/requirements.txt pip-sync /opt/pip-tools.d/requirements.txt From fe6e3d80e70dcb73dbebe79853fba471bbce7858 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Wed, 25 Oct 2023 21:22:47 +0100 Subject: [PATCH 015/146] flax hack --- .github/container/pip-finalize.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh index 73e886b3a..1ec6bbe1b 100755 --- a/.github/container/pip-finalize.sh +++ b/.github/container/pip-finalize.sh @@ -2,7 +2,7 @@ set -ex -o pipefail -sed -i "s|flax @ git+https://github.com/google/flax#egg=flax||g" +sed -i "s|flax @ git+https://github.com/google/flax#egg=flax||g" /opt/pip-tools.d/manifest.* pip-compile $(ls /opt/pip-tools.d/manifest.*) -o /opt/pip-tools.d/requirements.txt From ad83f219d79aa1861b6d9fc8fa10dce0689cc96f Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Thu, 2 Nov 2023 18:26:54 +0000 Subject: [PATCH 016/146] hack for git top of tree Flax dependency --- .github/container/Dockerfile.jax | 5 ++++- .github/container/pip-finalize.sh | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 32bae519a..495bc7b3b 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -63,7 +63,10 @@ RUN echo "-e ${SRC_PATH_JAX}" >> 
/opt/pip-tools.d/manifest.jax RUN echo "$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax ## Flax -RUN get-source.sh -f https://github.com/google/flax.git -r v0.7.4 -d /opt/flax -o /opt/pip-tools.d/manifest.flax +# RUN get-source.sh -f https://github.com/google/flax.git -r v0.7.5 -d /opt/flax -o /opt/pip-tools.d/manifest.flax +## Temporary WAR Part 1 (Part 2 in pip-finalize.sh) due to https://github.com/pypa/pip/issues/12380 +RUN get-source.sh -f https://github.com/google/flax.git -r v0.7.5 -d /opt/flax -o /dev/null +RUN echo "flax @ git+https://github.com/google/flax.git" > /opt/pip-tools.d/manifest.flax ## Transformer engine ARG REPO_TE diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh index 1ec6bbe1b..4fe3f1566 100755 --- a/.github/container/pip-finalize.sh +++ b/.github/container/pip-finalize.sh @@ -6,6 +6,8 @@ sed -i "s|flax @ git+https://github.com/google/flax#egg=flax||g" /opt/pip-tools. pip-compile $(ls /opt/pip-tools.d/manifest.*) -o /opt/pip-tools.d/requirements.txt +sed -i "s|flax @ git+https://github.com/google/flax.git|-e /opt/flax|g" /opt/pip-tools.d/requirements.txt + pip-sync /opt/pip-tools.d/requirements.txt rm -rf ~/.cache/* From b7e1a6cd5842bc9fe95d86cf03325ec8f2f56d29 Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Fri, 3 Nov 2023 17:43:51 +0000 Subject: [PATCH 017/146] update URL req --- .github/container/Dockerfile.base | 6 ++++-- .github/container/Dockerfile.jax | 5 +---- .github/container/Dockerfile.t5x | 3 +-- .github/container/pip-finalize.sh | 6 +----- 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index e02e57c31..f42bb9ef7 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -26,8 +26,10 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* RUN pip install --upgrade --no-cache-dir pip pip-tools && rm -rf ~/.cache/* RUN mkdir -p /opt/pip-tools.d -ADD --chmod=777 
get-source.sh /usr/local/bin -ADD --chmod=777 pip-finalize.sh /usr/local/bin +ADD --chmod=777 \ + get-source.sh \ + pip-finalize.sh \ + /usr/local/bin/ ############################################################################### ## Install cuDNN diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 495bc7b3b..64a7bb763 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -63,10 +63,7 @@ RUN echo "-e ${SRC_PATH_JAX}" >> /opt/pip-tools.d/manifest.jax RUN echo "$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax ## Flax -# RUN get-source.sh -f https://github.com/google/flax.git -r v0.7.5 -d /opt/flax -o /opt/pip-tools.d/manifest.flax -## Temporary WAR Part 1 (Part 2 in pip-finalize.sh) due to https://github.com/pypa/pip/issues/12380 -RUN get-source.sh -f https://github.com/google/flax.git -r v0.7.5 -d /opt/flax -o /dev/null -RUN echo "flax @ git+https://github.com/google/flax.git" > /opt/pip-tools.d/manifest.flax +RUN echo "flax @ git+https://github.com/google/flax.git@v0.7.5#egg=flax" > /opt/pip-tools.d/manifest.flax ## Transformer engine ARG REPO_TE diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index 6cb12374f..326c1524a 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -3,7 +3,6 @@ ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest ARG REPO_T5X=https://github.com/google-research/t5x.git ARG REF_T5X=main -ARG SRC_PATH_T5X=/opt/t5x ############################################################################### ## Download source and add auxiliary scripts @@ -14,7 +13,7 @@ FROM ${BASE_IMAGE} as staging ARG REPO_T5X ARG REF_T5X ARG SRC_PATH_T5X -RUN get-source.sh -f ${REPO_T5X} -r ${REF_T5X} -d ${SRC_PATH_T5X} -o /opt/pip-tools.d/manifest.t5x +RUN echo "t5x @ git+${REPO_T5X}@${REF_T5X}#egg=t5x" > /opt/pip-tools.d/manifest.t5x ADD test-t5x.sh /usr/local/bin diff --git a/.github/container/pip-finalize.sh 
b/.github/container/pip-finalize.sh index 4fe3f1566..d8f469c75 100755 --- a/.github/container/pip-finalize.sh +++ b/.github/container/pip-finalize.sh @@ -2,12 +2,8 @@ set -ex -o pipefail -sed -i "s|flax @ git+https://github.com/google/flax#egg=flax||g" /opt/pip-tools.d/manifest.* - pip-compile $(ls /opt/pip-tools.d/manifest.*) -o /opt/pip-tools.d/requirements.txt -sed -i "s|flax @ git+https://github.com/google/flax.git|-e /opt/flax|g" /opt/pip-tools.d/requirements.txt - -pip-sync /opt/pip-tools.d/requirements.txt +pip install --src /opt -r /opt/pip-tools.d/requirements.txt rm -rf ~/.cache/* From 0dd9617878263c506479e93e837f83dddf50aa92 Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Fri, 3 Nov 2023 18:18:30 +0000 Subject: [PATCH 018/146] update --- .github/workflows/_sandbox.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index f85e53901..7be9fb344 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -22,8 +22,8 @@ jobs: with: ARCHITECTURE: amd64 BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} - REF_JAX: jax-v0.4.19 - REF_XLA: ecb73da4b7b2e3b54aa6d6b7f08a5c662bb19c6e + REF_JAX: jax-v0.4.20 + REF_XLA: ca31652cdbeb6ea187589dea546ff8019274f8b2 secrets: inherit build-t5x: From dc44fe44ee10a72f6c3aefa6b2e3cd0458a36320 Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Fri, 3 Nov 2023 20:30:00 +0000 Subject: [PATCH 019/146] editability --- .github/container/Dockerfile.jax | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 64a7bb763..814ab4e0c 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -59,11 +59,11 @@ COPY --from=builder ${SRC_PATH_XLA} ${SRC_PATH_XLA} ADD build-jax.sh local_cuda_arch test-jax.sh /usr/local/bin/ RUN mkdir -p /opt/pip-tools.d -RUN echo "-e ${SRC_PATH_JAX}" >> 
/opt/pip-tools.d/manifest.jax -RUN echo "$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax +RUN echo "-e jax @ file://${SRC_PATH_JAX}" >> /opt/pip-tools.d/manifest.jax +RUN echo "jaxlib @ $(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax ## Flax -RUN echo "flax @ git+https://github.com/google/flax.git@v0.7.5#egg=flax" > /opt/pip-tools.d/manifest.flax +RUN echo "-e flax @ git+https://github.com/google/flax.git@v0.7.5#egg=flax" > /opt/pip-tools.d/manifest.flax ## Transformer engine ARG REPO_TE From acf2acf04b8d3eb639ce2a42c10d3b29a790070b Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Fri, 3 Nov 2023 20:31:32 +0000 Subject: [PATCH 020/146] editability --- .github/container/Dockerfile.jax | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 814ab4e0c..f8c1fb861 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -63,7 +63,7 @@ RUN echo "-e jax @ file://${SRC_PATH_JAX}" >> /opt/pip-tools.d/man RUN echo "jaxlib @ $(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax ## Flax -RUN echo "-e flax @ git+https://github.com/google/flax.git@v0.7.5#egg=flax" > /opt/pip-tools.d/manifest.flax +RUN echo "-e flax @ git+https://github.com/google/flax@v0.7.5#egg=flax" > /opt/pip-tools.d/manifest.flax ## Transformer engine ARG REPO_TE From 027cd63d232079a41d03390f57f5d2e597993c79 Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Fri, 3 Nov 2023 20:48:40 +0000 Subject: [PATCH 021/146] editability --- .github/container/Dockerfile.jax | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index f8c1fb861..015ae254d 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -59,11 +59,11 @@ COPY --from=builder ${SRC_PATH_XLA} ${SRC_PATH_XLA} ADD build-jax.sh local_cuda_arch test-jax.sh /usr/local/bin/ RUN 
mkdir -p /opt/pip-tools.d -RUN echo "-e jax @ file://${SRC_PATH_JAX}" >> /opt/pip-tools.d/manifest.jax +RUN echo "-e file://${SRC_PATH_JAX}" >> /opt/pip-tools.d/manifest.jax RUN echo "jaxlib @ $(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax ## Flax -RUN echo "-e flax @ git+https://github.com/google/flax@v0.7.5#egg=flax" > /opt/pip-tools.d/manifest.flax +RUN echo "-e git+https://github.com/google/flax@v0.7.5#egg=flax" > /opt/pip-tools.d/manifest.flax ## Transformer engine ARG REPO_TE From 7c10ffa757004ce065f3b2a6cbfeca9aa8ccb040 Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Fri, 3 Nov 2023 20:49:22 +0000 Subject: [PATCH 022/146] editability --- .github/container/Dockerfile.jax | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 015ae254d..bc76e2831 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -60,7 +60,7 @@ ADD build-jax.sh local_cuda_arch test-jax.sh /usr/local/bin/ RUN mkdir -p /opt/pip-tools.d RUN echo "-e file://${SRC_PATH_JAX}" >> /opt/pip-tools.d/manifest.jax -RUN echo "jaxlib @ $(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax +RUN echo "$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax ## Flax RUN echo "-e git+https://github.com/google/flax@v0.7.5#egg=flax" > /opt/pip-tools.d/manifest.flax From f1046d21e616dc42ac804c30d16d1ad4b571a5e5 Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Fri, 3 Nov 2023 20:51:55 +0000 Subject: [PATCH 023/146] editability --- .github/container/pip-finalize.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh index d8f469c75..5e700bf63 100755 --- a/.github/container/pip-finalize.sh +++ b/.github/container/pip-finalize.sh @@ -2,8 +2,10 @@ set -ex -o pipefail -pip-compile $(ls /opt/pip-tools.d/manifest.*) -o /opt/pip-tools.d/requirements.txt +pushd 
/opt/pip-tools.d -pip install --src /opt -r /opt/pip-tools.d/requirements.txt +pip-compile $(ls manifest.*) -o requirements.txt + +pip-sync --pip-args '--src /opt' requirements.txt rm -rf ~/.cache/* From bbf2c2126a8d4191b5b82164318b86bfce9d87a6 Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Fri, 3 Nov 2023 22:14:40 +0000 Subject: [PATCH 024/146] wip --- .github/container/Dockerfile.jax | 12 +++++++++--- .github/container/Dockerfile.t5x | 4 +++- .github/container/get-source.sh | 22 +++++++++++----------- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index bc76e2831..afdaf61d4 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -1,12 +1,15 @@ ARG BASE_IMAGE=ghcr.io/nvidia/jax-toolbox:base ARG REPO_JAX="https://github.com/google/jax.git" ARG REPO_XLA="https://github.com/openxla/xla.git" +ARG REPO_FLAX="https://github.com/google/flax.git" ARG REPO_TE="https://github.com/NVIDIA/TransformerEngine.git" ARG REF_JAX=main ARG REF_XLA=main +ARG REF_FLAX=main ARG REF_TE=main ARG SRC_PATH_JAX=/opt/jax-source ARG SRC_PATH_XLA=/opt/xla-source +ARG SRC_PATH_FLAX=/opt/flax ARG SRC_PATH_TE=/opt/transformer-engine ARG BAZEL_CACHE=/tmp @@ -60,17 +63,20 @@ ADD build-jax.sh local_cuda_arch test-jax.sh /usr/local/bin/ RUN mkdir -p /opt/pip-tools.d RUN echo "-e file://${SRC_PATH_JAX}" >> /opt/pip-tools.d/manifest.jax -RUN echo "$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax +RUN echo "jaxlib @ file://$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax ## Flax -RUN echo "-e git+https://github.com/google/flax@v0.7.5#egg=flax" > /opt/pip-tools.d/manifest.flax +ARG REPO_FLAX +ARG REF_FLAX +ARG SRC_PATH_FLAX +RUN get-source.sh -f ${REPO_FLAX} -r ${REF_FLAX} -d ${SRC_PATH_FLAX} -m /opt/pip-tools.d/manifest.flax ## Transformer engine ARG REPO_TE ARG REF_TE ARG SRC_PATH_TE ENV NVTE_FRAMEWORK=jax -RUN get-source.sh -f ${REPO_TE} -r 
${REF_TE} -d ${SRC_PATH_TE} -o /opt/pip-tools.d/manifest.te +RUN get-source.sh -f ${REPO_TE} -r ${REF_TE} -d ${SRC_PATH_TE} -m /opt/pip-tools.d/manifest.te # TODO: properly configure entrypoint diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index 326c1524a..b4472bd69 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -13,7 +13,9 @@ FROM ${BASE_IMAGE} as staging ARG REPO_T5X ARG REF_T5X ARG SRC_PATH_T5X -RUN echo "t5x @ git+${REPO_T5X}@${REF_T5X}#egg=t5x" > /opt/pip-tools.d/manifest.t5x +RUN get-source.sh -f ${REPO_T5X} -r ${REF_T5X} -d ${SRC_PATH_T5X} -m /opt/pip-tools.d/manifest.t5x +# remove head-of-tree specs from select dependencies +RUN sed -i "s| @ git+https://github.com/google/flax#egg=flax||g" ${SRC_PATH_T5X}/setup.py ADD test-t5x.sh /usr/local/bin diff --git a/.github/container/get-source.sh b/.github/container/get-source.sh index 695df93f3..069abee0b 100755 --- a/.github/container/get-source.sh +++ b/.github/container/get-source.sh @@ -9,17 +9,17 @@ usage() { echo "Usage: $0 [OPTION]..." - echo " -d, --dir=PATH [Required] Local path to check out the source code." - echo " -f, --from=URL [Required] URL of the source repo." + echo " -d, --dir PATH [Required] Local path to check out the source code." + echo " -f, --from URL [Required] URL of the source repo." echo " -h, --help Print usage." echo " -i, --install Install the package immediately using pip install." - echo " -o, --output File to write pip manifests. Defaults to stdout" - echo " -r, --ref=REF Git commit SHA, branch name, or tag name to checkout. Uses default branch if not specified." + echo " -m, --manifest FILE Create a pip manifest file if specified" + echo " -r, --ref REF Git commit SHA, branch name, or tag name to checkout. Uses default branch if not specified." 
echo exit $1 } -args=$(getopt -o d:f:hio:r: --long dir:,from:,help,install,output:,ref: -- "$@") +args=$(getopt -o d:f:hi:m:r: --long dir:,from:,help,install,manifest:,ref: -- "$@") if [[ $? -ne 0 ]]; then exit 1 fi @@ -30,7 +30,7 @@ GIT_REPO="" GIT_REF="${GIT_REF:-HEAD}" INSTALL=${INSTALL:-0} INSTALL_DIR="" -OUTPUT_FILE="/dev/stdout" +MANIFEST_FILE="" eval set -- "$args" while [ : ]; do @@ -50,8 +50,8 @@ while [ : ]; do INSTALL=true shift ;; - -o | --output) - OUTPUT_FILE="$2" + -m | --manifest) + MANIFEST_FILE="$2" shift 2 ;; -r | --ref) @@ -95,7 +95,7 @@ popd if (( INSTALL == 1 )); then pip install -e ${INSTALL_DIR} -else - echo "Writing to $OUTPUT_FILE:" - echo "-e ${INSTALL_DIR}" | tee -a $OUTPUT_FILE +elif [[ -n "${MANIFEST_FILE}" ]]; then + echo "Writing to ${MANIFEST_FILE}:" + echo "-e file://${INSTALL_DIR}" | tee -a ${MANIFEST_FILE} fi From cd0d5d1213e96f694095c155fab3b2fd339f7169 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Mon, 6 Nov 2023 06:13:46 +0000 Subject: [PATCH 025/146] wip --- .github/container/Dockerfile.base | 4 ++++ .github/container/Dockerfile.jax | 6 ++++-- .github/container/Dockerfile.t5x | 14 ++++++++++++-- .github/container/install_lingvo_aarch64.sh | 2 -- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index f42bb9ef7..89d46edbb 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -24,6 +24,10 @@ RUN apt-get update && \ && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +RUN <> /opt/pip-tools.d/manifest.jax -RUN echo "jaxlib @ file://$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax +RUN <> /opt/pip-tools.d/manifest.jax +echo "jaxlib @ file://$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax +EOF ## Flax ARG REPO_FLAX diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index b4472bd69..3a8d76409 100644 --- a/.github/container/Dockerfile.t5x 
+++ b/.github/container/Dockerfile.t5x @@ -3,6 +3,7 @@ ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest ARG REPO_T5X=https://github.com/google-research/t5x.git ARG REF_T5X=main +ARG SRC_PATH_T5X=/opt/t5x ############################################################################### ## Download source and add auxiliary scripts @@ -13,9 +14,18 @@ FROM ${BASE_IMAGE} as staging ARG REPO_T5X ARG REF_T5X ARG SRC_PATH_T5X -RUN get-source.sh -f ${REPO_T5X} -r ${REF_T5X} -d ${SRC_PATH_T5X} -m /opt/pip-tools.d/manifest.t5x +RUN < Date: Mon, 6 Nov 2023 06:21:02 +0000 Subject: [PATCH 026/146] wip --- .github/container/Dockerfile.base | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index 89d46edbb..04497ba5d 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -25,8 +25,8 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* RUN < Date: Mon, 6 Nov 2023 07:41:59 +0000 Subject: [PATCH 027/146] fix shell --- .github/container/Dockerfile.t5x | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index 3a8d76409..a66d866f6 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -17,7 +17,7 @@ ARG SRC_PATH_T5X RUN < Date: Mon, 6 Nov 2023 08:18:48 +0000 Subject: [PATCH 028/146] fix arg order --- .github/container/pip-finalize.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh index 5e700bf63..0b9de9526 100755 --- a/.github/container/pip-finalize.sh +++ b/.github/container/pip-finalize.sh @@ -4,7 +4,7 @@ set -ex -o pipefail pushd /opt/pip-tools.d -pip-compile $(ls manifest.*) -o requirements.txt +pip-compile -o requirements.txt $(ls manifest.*) pip-sync --pip-args '--src /opt' requirements.txt From cc66ce65e0f78f2f2b02816d66263a8eb46bca0b Mon Sep 17 
00:00:00 2001 From: Yu-Hang Tang Date: Mon, 6 Nov 2023 08:23:22 +0000 Subject: [PATCH 029/146] remove standalone TE build --- .github/container/Dockerfile.te | 12 ---- .github/container/install-te.sh | 85 ------------------------- .github/workflows/ci.yaml | 14 +--- .github/workflows/nightly-te-build.yaml | 64 ------------------- .github/workflows/nightly-te-test.yaml | 6 +- 5 files changed, 5 insertions(+), 176 deletions(-) delete mode 100644 .github/container/Dockerfile.te delete mode 100755 .github/container/install-te.sh delete mode 100644 .github/workflows/nightly-te-build.yaml diff --git a/.github/container/Dockerfile.te b/.github/container/Dockerfile.te deleted file mode 100644 index ffa2c9761..000000000 --- a/.github/container/Dockerfile.te +++ /dev/null @@ -1,12 +0,0 @@ -############################################################################### -## Transformer Engine -############################################################################### - -ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest -FROM ${BASE_IMAGE} -ARG REPO_TE="https://github.com/NVIDIA/TransformerEngine.git" -ARG REF_TE=main -ARG SRC_PATH_TE=/opt/transformer-engine - -ADD install-te.sh /usr/local/bin -RUN install-te.sh --from=${REPO_TE} --ref=${REF_TE} --dir=${SRC_PATH_TE} \ No newline at end of file diff --git a/.github/container/install-te.sh b/.github/container/install-te.sh deleted file mode 100755 index cfa78ff8f..000000000 --- a/.github/container/install-te.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash - -## Parse command-line arguments - -usage() { - echo "Usage: $0 [OPTION]..." - echo ' --defer When passed, will defer the installation of the main package. Can be installed afterwards with `pip install -r requirements-defer.txt` and any deferred cleanup commands can be run with `bash cleanup.sh`' - echo " -d, --dir=PATH Path to store TE source. Defaults to /opt/transformer-engine" - echo " -f, --from=URL URL of the TE repo. 
Defaults to https://github.com/NVIDIA/TransformerEngine.git" - echo " -h, --help Print usage." - echo " -r, --ref=REF Git commit hash or tag name that specifies the version of TE to install. Defaults to HEAD." - exit $1 -} - -args=$(getopt -o d:f:hr: --long defer,dir:,from:,help,ref: -- "$@") -if [[ $? -ne 0 ]]; then - exit 1 -fi - -eval set -- "$args" -while [ : ]; do - case "$1" in - --defer) - DEFER=true - shift - ;; - -d | --dir) - INSTALL_DIR="$2" - shift 2 - ;; - -f | --from) - TE_REPO="$2" - shift 2 - ;; - -h | --help) - usage - ;; - -r | --ref) - TE_REF="$2" - shift 2 - ;; - --) - shift; - break - ;; - esac -done - -if [[ $# -ge 1 ]]; then - echo "Un-recognized argument: $*" && echo - usage 1 -fi - -## Set default arguments if not provided via command-line - -DEFER=${DEFER:-false} -TE_REF="${TE_REF:-HEAD}" -TE_REPO="${TE_REPO:-https://github.com/NVIDIA/TransformerEngine.git}" -INSTALL_DIR="${INSTALL_DIR:-/opt/transformer-engine}" - -echo "Installing TE $TE_REF from $TE_REPO to $INSTALL_DIR" - -maybe_defer_pip_install() { - if [[ "$DEFER" = true ]]; then - echo "Deferring installation of 'pip install $*'" - echo "$*" >> /opt/requirements-defer.txt - else - pip install $@ - fi -} - -set -ex - -## Install dependencies - -pip install --no-cache-dir pybind11 ninja packaging - -## Install TE - -git clone ${TE_REPO} ${INSTALL_DIR} -cd ${INSTALL_DIR} -git checkout ${TE_REF} -git submodule init -git submodule update --recursive -maybe_defer_pip_install -e ${INSTALL_DIR} diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a75dfac8b..a3f50b6e2 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -117,14 +117,6 @@ jobs: REF_JAX: ${{ needs.metadata.outputs.REF_JAX }} REPO_XLA: ${{ needs.metadata.outputs.REPO_XLA }} REF_XLA: ${{ needs.metadata.outputs.REF_XLA }} - secrets: inherit - - build-te: - needs: [metadata, build-jax] - uses: ./.github/workflows/_build_te.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} 
- BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} REPO_TE: ${{ needs.metadata.outputs.REPO_TE }} REF_TE: ${{ needs.metadata.outputs.REF_TE }} secrets: inherit @@ -170,8 +162,7 @@ jobs: secrets: inherit build-summary: - needs: [build-base, build-jax, build-te, build-t5x, build-pax, build-rosetta-t5x, build-rosetta-pax] - # needs: [build-base, build-jax, build-te, build-t5x, build-pax, build-pax-aarch64, build-rosetta-t5x, build-rosetta-pax] + needs: [build-base, build-jax, build-t5x, build-pax, build-rosetta-t5x, build-rosetta-pax] if: always() runs-on: ubuntu-22.04 steps: @@ -185,7 +176,6 @@ jobs: | ------------ | -------------------------------------------------- | | Base | ${{ needs.build-base.outputs.DOCKER_TAGS }} | | JAX | ${{ needs.build-jax.outputs.DOCKER_TAGS }} | - | JAX-TE | ${{ needs.build-te.outputs.DOCKER_TAGS }} | | T5X | ${{ needs.build-t5x.outputs.DOCKER_TAGS }} | | PAX | ${{ needs.build-pax.outputs.DOCKER_TAGS }} | | ROSETTA(t5x) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | @@ -208,7 +198,7 @@ jobs: needs: build-te uses: ./.github/workflows/_test_te.yaml with: - JAX_TE_IMAGE: ${{ needs.build-te.outputs.DOCKER_TAGS }} + JAX_TE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} secrets: inherit test-t5x: needs: build-t5x diff --git a/.github/workflows/nightly-te-build.yaml b/.github/workflows/nightly-te-build.yaml deleted file mode 100644 index 3fecd1067..000000000 --- a/.github/workflows/nightly-te-build.yaml +++ /dev/null @@ -1,64 +0,0 @@ -name: Nightly Transformer Engine build - -on: - workflow_run: - workflows: [Nightly JAX build] - types: [completed] - branches: [main] - workflow_dispatch: - inputs: - PUBLISH: - type: boolean - description: Publish dated images and update the 'latest' tag? 
- default: false - required: false - -env: - TARGET: jax-te - DOCKER_REGISTRY: ghcr.io/nvidia - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container - -jobs: - - metadata: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' - runs-on: ubuntu-22.04 - outputs: - BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} - steps: - - name: Set build date - id: date - shell: bash -x -e {0} - run: | - BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - - build: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' - needs: metadata - uses: ./.github/workflows/_build_te.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - secrets: inherit - - publish: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build] - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} - TARGET_IMAGE: jax-te - TARGET_TAGS: | - type=raw,value=latest,priority=1000 - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - - if-upstream-failed: - runs-on: ubuntu-latest - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch' - steps: - - run: echo 'Upstream workflow failed, aborting run' && exit 1 \ No newline at end of file diff --git a/.github/workflows/nightly-te-test.yaml b/.github/workflows/nightly-te-test.yaml index c030af044..182cdf641 100644 --- a/.github/workflows/nightly-te-test.yaml +++ b/.github/workflows/nightly-te-test.yaml @@ -2,7 +2,7 @@ name: Nightly 
Transformer Engine test on: workflow_run: - workflows: [Nightly Transformer Engine build] + workflows: [Nightly JAX build] types: [completed] branches: [main] workflow_dispatch: @@ -11,7 +11,7 @@ on: type: string description: 'JAX-TE image build by NVIDIA/JAX-Toolbox' required: true - default: 'ghcr.io/nvidia/jax-te:latest' + default: 'ghcr.io/nvidia/jax:latest' PUBLISH: type: boolean description: Update status badge? @@ -24,7 +24,7 @@ permissions: packages: write # to upload container env: - DEFAULT_JAX_TE_IMAGE: 'ghcr.io/nvidia/jax-te:latest' + DEFAULT_JAX_TE_IMAGE: 'ghcr.io/nvidia/jax:latest' jobs: From fe8708ad083411dc0750c2bf802e9171f0c5c202 Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Mon, 6 Nov 2023 08:47:37 +0000 Subject: [PATCH 030/146] build TE wheel in JAX --- .github/container/Dockerfile.jax | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index a1fbee54b..e9ff0a0cf 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -7,10 +7,10 @@ ARG REF_JAX=main ARG REF_XLA=main ARG REF_FLAX=main ARG REF_TE=main -ARG SRC_PATH_JAX=/opt/jax-source +ARG SRC_PATH_JAX=/opt/jax ARG SRC_PATH_XLA=/opt/xla-source ARG SRC_PATH_FLAX=/opt/flax -ARG SRC_PATH_TE=/opt/transformer-engine +ARG SRC_PATH_TE=/opt/transformer-engine-source ARG BAZEL_CACHE=/tmp ARG BUILD_DATE @@ -78,7 +78,11 @@ ARG REPO_TE ARG REF_TE ARG SRC_PATH_TE ENV NVTE_FRAMEWORK=jax -RUN get-source.sh -f ${REPO_TE} -r ${REF_TE} -d ${SRC_PATH_TE} -m /opt/pip-tools.d/manifest.te +RUN <> /opt/pip-tools.d/manifest.te +EOF # TODO: properly configure entrypoint From b1e332e250801f6fe4c0199199cbf2e72512c07c Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Mon, 6 Nov 2023 08:53:11 +0000 Subject: [PATCH 031/146] pax wip --- .github/container/Dockerfile.pax.amd64 | 4 +- .github/container/Dockerfile.t5x | 5 +- .github/container/get-t5x.sh | 106 ------------------------- 3 files changed, 5 
insertions(+), 110 deletions(-) delete mode 100755 .github/container/get-t5x.sh diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 index b581ea716..1f33397c9 100644 --- a/.github/container/Dockerfile.pax.amd64 +++ b/.github/container/Dockerfile.pax.amd64 @@ -7,13 +7,13 @@ ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest FROM ${BASE_IMAGE} ADD install-pax.sh /usr/local/bin -ADD install-flax.sh /usr/local/bin -ADD install-te.sh /usr/local/bin ARG REPO_PAXML=https://github.com/google/paxml.git ARG REPO_PRAXIS=https://github.com/google/praxis.git ARG REF_PAXML=main ARG REF_PRAXIS=main +ARG SRC_PATH_PAXML=/opt/paxml +ARG SRC_PATH_PRAXIS=/opt/praxis RUN <<"EOF" bash -ex install-pax.sh --defer --from_paxml ${REPO_PAXML} --from_praxis ${REPO_PRAXIS} --ref_paxml ${REF_PAXML} --ref_praxis ${REF_PRAXIS} install-flax.sh --defer diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index a66d866f6..8f84b3518 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -14,10 +14,10 @@ FROM ${BASE_IMAGE} as staging ARG REPO_T5X ARG REF_T5X ARG SRC_PATH_T5X -RUN <> /opt/cleanup.sh - else - $@ - fi -} - -maybe_defer_pip_install() { - if [[ "$DEFER" = true ]]; then - echo "Deferring installation of 'pip install $*'" - echo "$*" >> /opt/requirements-defer.txt - else - pip install $@ - fi -} - -set -ex - -## Install dependencies - -apt-get update -apt-get install -y \ - build-essential \ - cmake \ - clang \ - git - -## Install T5X - -T5X_INSTALLED_DIR=${INSTALL_DIR}/t5x - -git clone ${T5X_REPO} ${T5X_INSTALLED_DIR} -cd ${T5X_INSTALLED_DIR} -git checkout ${T5X_REF} -# We currently require installing editable (-e) to build a distribution since -# we edit the source in place and do not re-install -maybe_defer_pip_install -e ${T5X_INSTALLED_DIR}[gpu] - -maybe_defer_cleanup apt-get autoremove -y -maybe_defer_cleanup apt-get clean -maybe_defer_cleanup rm -rf /var/lib/apt/lists/* -maybe_defer_cleanup rm 
-rf ~/.cache/pip/ From 62ca85b912d9ff0d5dd6de656738acbb13935874 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Mon, 6 Nov 2023 09:01:58 +0000 Subject: [PATCH 032/146] add pax build --- .github/container/Dockerfile.pax.amd64 | 57 +++++++++++++++++--------- .github/workflows/_sandbox.yaml | 7 ++++ 2 files changed, 44 insertions(+), 20 deletions(-) diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 index 1f33397c9..10776e1e1 100644 --- a/.github/container/Dockerfile.pax.amd64 +++ b/.github/container/Dockerfile.pax.amd64 @@ -1,32 +1,49 @@ # syntax=docker/dockerfile:1-labs -############################################################################### -## Pax -############################################################################### ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest -FROM ${BASE_IMAGE} - -ADD install-pax.sh /usr/local/bin - ARG REPO_PAXML=https://github.com/google/paxml.git ARG REPO_PRAXIS=https://github.com/google/praxis.git ARG REF_PAXML=main ARG REF_PRAXIS=main ARG SRC_PATH_PAXML=/opt/paxml ARG SRC_PATH_PRAXIS=/opt/praxis -RUN <<"EOF" bash -ex -install-pax.sh --defer --from_paxml ${REPO_PAXML} --from_praxis ${REPO_PRAXIS} --ref_paxml ${REF_PAXML} --ref_praxis ${REF_PRAXIS} -install-flax.sh --defer -install-te.sh --defer - -if [[ -f /opt/requirements-defer.txt ]]; then - # SKIP_HEAD_INSTALLS avoids having to install jax from Github source so that - # we do not overwrite the jax that was already installed. 
- SKIP_HEAD_INSTALLS=true pip install -r /opt/requirements-defer.txt -fi -if [[ -f /opt/cleanup.sh ]]; then - bash -ex /opt/cleanup.sh -fi + +############################################################################### +## Download source and add auxiliary scripts +############################################################################### + +FROM ${BASE_IMAGE} as staging +ARG REPO_PAXML +ARG REPO_PRAXIS +ARG REF_PAXML +ARG REF_PRAXIS +ARG SRC_PATH_PAXML +ARG SRC_PATH_PRAXIS + +RUN < Date: Mon, 6 Nov 2023 09:03:19 +0000 Subject: [PATCH 033/146] add pax build --- .github/workflows/_sandbox.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 4b17d83fb..7d703e143 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -38,7 +38,6 @@ jobs: needs: [build-jax] uses: ./.github/workflows/_build_pax.yaml with: - ARCHITECTURE: amd64 BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} secrets: inherit From ff6ec2aad90ac9a34a61b4dbf2d639f2af3a9f5a Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Mon, 6 Nov 2023 09:03:51 +0000 Subject: [PATCH 034/146] fix CI --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a3f50b6e2..51d3bce25 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -195,7 +195,7 @@ jobs: secrets: inherit test-te: - needs: build-te + needs: build-jax uses: ./.github/workflows/_test_te.yaml with: JAX_TE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} From 67df9b88656a47c11ccde3a822fdc637921b5273 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Mon, 6 Nov 2023 09:43:51 +0000 Subject: [PATCH 035/146] debug pax build --- .github/container/Dockerfile.pax.amd64 | 4 +-- .github/workflows/_sandbox.yaml | 47 +++++++++++++------------- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git 
a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 index 10776e1e1..fcd5c08d9 100644 --- a/.github/container/Dockerfile.pax.amd64 +++ b/.github/container/Dockerfile.pax.amd64 @@ -24,8 +24,8 @@ RUN < Date: Mon, 6 Nov 2023 09:48:24 +0000 Subject: [PATCH 036/146] debug pax build --- .github/container/Dockerfile.pax.amd64 | 31 ++++++++++++++++---------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 index fcd5c08d9..8d9c49864 100644 --- a/.github/container/Dockerfile.pax.amd64 +++ b/.github/container/Dockerfile.pax.amd64 @@ -24,18 +24,25 @@ RUN < Date: Mon, 6 Nov 2023 20:12:09 +0000 Subject: [PATCH 037/146] debug pax build --- .github/container/Dockerfile.pax.amd64 | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 index 8d9c49864..af93d58af 100644 --- a/.github/container/Dockerfile.pax.amd64 +++ b/.github/container/Dockerfile.pax.amd64 @@ -25,24 +25,27 @@ get-source.sh -f ${REPO_PAXML} -r ${REF_PAXML} -d ${SRC_PATH_PAXML} -m /opt/p get-source.sh -f ${REPO_PRAXIS} -r ${REF_PRAXIS} -d ${SRC_PATH_PRAXIS} -m /opt/pip-tools.d/manifest.praxis pushd ${SRC_PATH_PAXML} -sed -i "s| @ git+https://github.com/google/flax||g" setup.py -sed -i "s| @ git+https://github.com/google/jax||g" setup.py +sed -i "s| @ git+https://github.com/google/flax||g" requirements.in +sed -i "s| @ git+https://github.com/google/jax||g" requirements.in if git diff --quiet; then echo "head-of-tree specs no longer present in select dependencies" exit 1 else git commit -a -m "remove head-of-tree specs from select dependencies" fi +popd pushd ${SRC_PATH_PRAXIS} -sed -i "s| @ git+https://github.com/google/flax||g" setup.py -sed -i "s| @ git+https://github.com/google/jax||g" setup.py +sed -i "s| @ git+https://github.com/google/flax||g" requirements.in +sed -i "s| @ 
git+https://github.com/google/jax||g" requirements.in if git diff --quiet; then echo "head-of-tree specs no longer present in select dependencies" exit 1 else git commit -a -m "remove head-of-tree specs from select dependencies" fi +popd + EOF ADD test-pax.sh /usr/local/bin From e8f87d2a22c42479109b1bbe53f8c60b15d6a788 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Mon, 6 Nov 2023 21:15:31 +0000 Subject: [PATCH 038/146] fix EOF --- .github/container/Dockerfile.base | 2 +- .github/container/Dockerfile.jax | 8 +++--- .github/container/Dockerfile.pax.amd64 | 36 ++++++++++---------------- .github/container/Dockerfile.t5x | 6 ++--- 4 files changed, 21 insertions(+), 31 deletions(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index 04497ba5d..fe719ee24 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -24,7 +24,7 @@ RUN apt-get update && \ && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN <> /opt/pip-tools.d/manifest.jax echo "jaxlib @ file://$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax EOF @@ -73,15 +73,15 @@ ARG REF_FLAX ARG SRC_PATH_FLAX RUN get-source.sh -f ${REPO_FLAX} -r ${REF_FLAX} -d ${SRC_PATH_FLAX} -m /opt/pip-tools.d/manifest.flax -## Transformer engine +## Transformer engine: check out source and build wheel ARG REPO_TE ARG REF_TE ARG SRC_PATH_TE ENV NVTE_FRAMEWORK=jax -RUN <> /opt/pip-tools.d/manifest.te +echo "transformer-engine @ file://$(ls ${SRC_PATH_TE}/dist/*.whl)" >> /opt/pip-tools.d/manifest.te EOF # TODO: properly configure entrypoint diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 index af93d58af..7bc867d94 100644 --- a/.github/container/Dockerfile.pax.amd64 +++ b/.github/container/Dockerfile.pax.amd64 @@ -20,32 +20,22 @@ ARG REF_PRAXIS ARG SRC_PATH_PAXML ARG SRC_PATH_PRAXIS -RUN < Date: Tue, 7 Nov 2023 21:51:36 +0000 Subject: [PATCH 039/146] redesign workflow --- 
.github/container/Dockerfile.pax.arm64 | 62 ++++++--- .github/workflows/_build_jax.yaml | 6 +- .github/workflows/_build_pax.yaml | 172 +++++++++++++++---------- .github/workflows/_build_t5x.yaml | 68 +++++++++- .github/workflows/_sandbox.yaml | 49 +++---- 5 files changed, 234 insertions(+), 123 deletions(-) diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index cb0583e49..5834c78f8 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -1,10 +1,24 @@ # syntax=docker/dockerfile:1-labs + +ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest +ARG REPO_PAXML=https://github.com/google/paxml.git +ARG REPO_PRAXIS=https://github.com/google/praxis.git +ARG REF_PAXML=main +ARG REF_PRAXIS=main +ARG SRC_PATH_PAXML=/opt/paxml +ARG SRC_PATH_PRAXIS=/opt/praxis + ############################################################################### ## Pax for AArch64 ############################################################################### -ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest -FROM ${BASE_IMAGE} +FROM ${BASE_IMAGE} as staging +ARG REPO_PAXML +ARG REPO_PRAXIS +ARG REF_PAXML +ARG REF_PRAXIS +ARG SRC_PATH_PAXML +ARG SRC_PATH_PRAXIS # We need to build some packages from source, bring some dependencies. RUN apt-get update && \ @@ -19,26 +33,14 @@ RUN apt-get update && \ && \ apt-get autoremove -y && apt-get clean && rm -rf /var/lib/apt/lists - RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-arm64 -O /usr/bin/bazel && \ chmod a+x /usr/bin/bazel - # Lingvo ADD install_lingvo_aarch64.sh /opt/ ADD lingvo.patch /opt/ RUN /opt/install_lingvo_aarch64.sh -ADD install-pax.sh /usr/local/bin -RUN install-pax.sh - -ADD install-te.sh /usr/local/bin -RUN install-te.sh -# Lingvo has pinned TF to 2.13, so we need to downgrade the pydantic version so that its -# transitive dependency on typing-extensions satisfies TF 2.13's req of typing-extensions>=3.6.6,<4.6.0. 
-# This version of pydantic is the latest version that satisfies the typing-extensions requirement -RUN pip install pydantic==1.10.13 - # Install T5 now, Pip will build the wheel from source, it needs Rust. RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > /tmp/rustup.sh && \ echo "be3535b3033ff5e0ecc4d589a35d3656f681332f860c5fd6684859970165ddcc /tmp/rustup.sh" | sha256sum --check && \ @@ -54,11 +56,31 @@ RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > /tmp/rustup.sh & rm /root/.bashrc.save && \ rm -Rf /root/.cache /tmp/* +# paxml + praxis +RUN <<"EOF" bash -ex +get-source.sh -f ${REPO_PAXML} -r ${REF_PAXML} -d ${SRC_PATH_PAXML} -m /opt/pip-tools.d/manifest.paxml +get-source.sh -f ${REPO_PRAXIS} -r ${REF_PRAXIS} -d ${SRC_PATH_PRAXIS} -m /opt/pip-tools.d/manifest.praxis + +for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do + pushd ${src} + sed -i "s| @ git+https://github.com/google/flax||g" requirements.in + sed -i "s| @ git+https://github.com/google/jax||g" requirements.in + if git diff --quiet; then + echo "URL specs no longer present in select dependencies for ${src}" + exit 1 + else + git commit -a -m "remove URL specs from select dependencies for ${src}" + fi + popd +done +EOF + ADD test-pax.sh /usr/local/bin -# TODO: Utilize these build-args and use them when installing pax -# ARG REPO_PAXML=https://github.com/google/paxml.git -# ARG REPO_PRAXIS=https://github.com/google/praxis.git -# ARG REF_PAXML=main -# ARG REF_PRAXIS=main -# install-pax.sh --defer --from_paxml ${REPO_PAXML} --from_praxis ${REPO_PRAXIS} --ref_paxml ${REF_PAXML} --ref_praxis ${REF_PRAXIS} +############################################################################### +## Install accumulated packages from the base image and the previous stage +############################################################################### + +FROM staging as final + +RUN pip-finalize.sh diff --git a/.github/workflows/_build_jax.yaml b/.github/workflows/_build_jax.yaml index 
bb93115f5..e6a40942a 100644 --- a/.github/workflows/_build_jax.yaml +++ b/.github/workflows/_build_jax.yaml @@ -193,11 +193,11 @@ jobs: if [[ ${outcome} == "success" ]]; then badge_message="pass" badge_color=brightgreen - summary="JAX build on ${{ matrix.PLATFORM }}: $badge_message" + summary="JAX build on ${{ inputs.ARCHITECTURE }}: $badge_message" else badge_message="fail" badge_color=red - summary="JAX build on ${{ matrix.PLATFORM }}: $badge_message" + summary="JAX build on ${{ inputs.ARCHITECTURE }}: $badge_message" fi to_json \ @@ -224,7 +224,7 @@ jobs: # - name: Upload image name file as artifact # uses: actions/upload-artifact@v3 # with: - # name: image-name-jax-${{ matrix.PLATFORM }} + # name: image-name-jax-${{ inputs.ARCHITECTURE }} # path: image-name.txt - name: Upload sitrep and badge diff --git a/.github/workflows/_build_pax.yaml b/.github/workflows/_build_pax.yaml index 84012afa8..9491ac7d0 100644 --- a/.github/workflows/_build_pax.yaml +++ b/.github/workflows/_build_pax.yaml @@ -3,6 +3,10 @@ name: ~build Pax container on: workflow_call: inputs: + ARCHITECTURE: + type: string + description: 'CPU architecture to build the image for, e.g. 
amd64, arm64' + required: true BASE_IMAGE: type: string description: 'Base docker image that provides JAX' @@ -33,10 +37,23 @@ on: description: Git commit, tag, or branch for Praxis required: false default: main + ARTIFACT_NAME: + type: string + description: 'Name of the artifact zip file' + required: false + default: 'artifact-pax-build' + BADGE_FILENAME: + type: string + description: 'Name of the endpoint JSON file for shields.io badge' + required: false + default: 'badge-pax-build' outputs: - DOCKER_TAGS: - description: "Tags of the image built" - value: ${{ jobs.merge.outputs.DOCKER_TAGS }} + DOCKER_TAG_FINAL: + description: "Tags of the complete image built" + value: ${{ jobs.build.outputs.DOCKER_TAG_FINAL }} + DOCKER_TAG_STAGING: + description: "Tags of the 'staging' image built" + value: ${{ jobs.build.outputs.DOCKER_TAG_STAGING }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -49,11 +66,12 @@ permissions: jobs: build: - strategy: - fail-fast: false - matrix: - PLATFORM: [amd64, arm64] - runs-on: [self-hosted, "${{ matrix.PLATFORM }}", small] + runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", small] + env: + BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json + outputs: + DOCKER_TAG_FINAL: ${{ steps.meta-staging.outputs.tags }} + DOCKER_TAG_STAGING: ${{ steps.meta-final.outputs.tags }} steps: - name: Print environment variables run: env @@ -68,8 +86,14 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Set docker metadata - id: meta + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + with: + driver-opts: | + image=moby/buildkit:v0.12.1 + + - name: Set docker metadata - staging + id: meta-staging uses: docker/metadata-action@v4 with: images: | @@ -77,25 +101,20 @@ jobs: flavor: | latest=false tags: | - type=raw,value=${{ github.run_id }}-upstream-pax-${{ matrix.PLATFORM }} + type=raw,value=${{ github.run_id }}-upstream-pax-${{ inputs.ARCHITECTURE 
}}-staging labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - with: - driver-opts: | - image=moby/buildkit:v0.12.1 - - - name: Build docker images + - name: Build staging image uses: docker/build-push-action@v4 with: context: .github/container push: true - file: .github/container/Dockerfile.pax.${{ matrix.PLATFORM }} - platforms: linux/${{ matrix.PLATFORM }} - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} + file: .github/container/Dockerfile.pax.${{ inputs.ARCHITECTURE }} + platforms: linux/${{ inputs.ARCHITECTURE }} + target: staging + tags: ${{ steps.meta-staging.outputs.tags }} + labels: ${{ steps.meta-staging.outputs.labels }} build-args: | BASE_IMAGE=${{ inputs.BASE_IMAGE }} BUILD_DATE=${{ inputs.BUILD_DATE }} @@ -104,42 +123,8 @@ jobs: REF_PAXML=${{ inputs.REF_PAXML }} REF_PRAXIS=${{ inputs.REF_PRAXIS }} - # Temporary workaround until the following issues are solved: - # https://github.com/orgs/community/discussions/17245 - # https://github.com/actions/runner/pull/2477 - # https://github.com/orgs/community/discussions/26639 - - name: Save image name as text file - shell: bash -x -e {0} - run: | - echo "${{ steps.meta.outputs.tags }}" >> image-name.txt - - - name: Upload image name file as artifact - uses: actions/upload-artifact@v3 - with: - name: image-name-upstream-pax-${{ matrix.PLATFORM }} - path: image-name.txt - - merge: - runs-on: ubuntu-latest - needs: build - outputs: - DOCKER_TAGS: ${{ steps.meta.outputs.tags }} - steps: - # TODO: currently downloading all artifacts of the entire workflow - # Revise when this request is fulfilled: - # https://github.com/actions/download-artifact/issues/214 - - name: Download image name files into separate folders - uses: actions/download-artifact@v3 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - 
password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set docker metadata - id: meta + - name: Set docker metadata - final + id: meta-final uses: docker/metadata-action@v4 with: images: | @@ -147,21 +132,66 @@ jobs: flavor: | latest=false tags: | - type=raw,value=${{ github.run_id }}-upstream-pax-multiarch + type=raw,value=${{ github.run_id }}-upstream-pax-${{ inputs.ARCHITECTURE }} labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Combine images into a single multi-arch image + - name: Build final image + uses: docker/build-push-action@v4 + with: + context: .github/container + push: true + file: .github/container/Dockerfile.pax.${{ inputs.ARCHITECTURE }} + platforms: linux/${{ inputs.ARCHITECTURE }} + target: final + tags: ${{ steps.meta-staging.outputs.tags }} + labels: ${{ steps.meta-staging.outputs.labels }} + build-args: | + BASE_IMAGE=${{ inputs.BASE_IMAGE }} + BUILD_DATE=${{ inputs.BUILD_DATE }} + REPO_PAXML=${{ inputs.REPO_PAXML }} + REPO_PRAXIS=${{ inputs.REPO_PRAXIS }} + REF_PAXML=${{ inputs.REF_PAXML }} + REF_PRAXIS=${{ inputs.REF_PRAXIS }} + + - name: Generate sitrep + if: success() || failure() shell: bash -x -e {0} run: | - docker manifest create ${{ steps.meta.outputs.tags }} $( - for IMAGE in $(cat image-name-upstream-pax-*/image-name.txt); do - REPO=$(echo $IMAGE | cut -d: -f1) - DIGEST=$( - docker manifest inspect $IMAGE |\ - jq -r '.manifests[] | select(.platform.os == "linux") | .digest' - ) - echo $REPO@${DIGEST} - done - ) - docker manifest push ${{ steps.meta.outputs.tags }} + # bring in utility functions + source .github/workflows/scripts/to_json.sh + + badge_label='PAX ${{ inputs.ARCHITECTURE }} build' + tags="${{ steps.meta-final.outputs.tags }}" + digest="${{ steps.build-final.outputs.digest }}" + outcome="${{ steps.build-final.outcome }}" + + if [[ ${outcome} == "success" ]]; then + badge_message="pass" + badge_color=brightgreen + summary="PAX build on ${{ inputs.ARCHITECTURE }}: $badge_message" + else + 
badge_message="fail" + badge_color=red + summary="PAX build on ${{ inputs.ARCHITECTURE }}: $badge_message" + fi + + to_json \ + summary \ + badge_label tags digest outcome \ + > sitrep.json + + schemaVersion=1 \ + label="${badge_label}" \ + message="${badge_message}" \ + color="${badge_color}" \ + to_json schemaVersion label message color \ + > ${{ env.BADGE_FILENAME_FULL }} + + - name: Upload sitrep and badge + uses: actions/upload-artifact@v3 + with: + name: ${{ inputs.ARTIFACT_NAME }}-${{ inputs.ARCHITECTURE }} + path: | + sitrep.json + ${{ env.BADGE_FILENAME_FULL }} diff --git a/.github/workflows/_build_t5x.yaml b/.github/workflows/_build_t5x.yaml index 19cadcf45..bc2879a45 100644 --- a/.github/workflows/_build_t5x.yaml +++ b/.github/workflows/_build_t5x.yaml @@ -37,10 +37,23 @@ on: description: Git commit, tag, or branch for TE required: false default: main + ARTIFACT_NAME: + type: string + description: 'Name of the artifact zip file' + required: false + default: 'artifact-t5x-build' + BADGE_FILENAME: + type: string + description: 'Name of the endpoint JSON file for shields.io badge' + required: false + default: 'badge-t5x-build' outputs: - DOCKER_TAGS: - description: "Tags of the image built" - value: ${{ jobs.build.outputs.DOCKER_TAGS }} + DOCKER_TAG_FINAL: + description: "Tags of the complete image built" + value: ${{ jobs.build.outputs.DOCKER_TAG_FINAL }} + DOCKER_TAG_STAGING: + description: "Tags of the 'staging' image built" + value: ${{ jobs.build.outputs.DOCKER_TAG_STAGING }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -53,9 +66,12 @@ permissions: jobs: build: - outputs: - DOCKER_TAGS: ${{ steps.meta.outputs.tags }} runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", small] + env: + BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json + outputs: + DOCKER_TAG_FINAL: ${{ steps.meta-staging.outputs.tags }} + DOCKER_TAG_STAGING: ${{ steps.meta-final.outputs.tags }} steps: - name: Print environment variables run: 
env @@ -136,3 +152,45 @@ jobs: REF_T5X=${{ inputs.REF_T5X }} REPO_TE=${{ inputs.REPO_TE }} REF_TE=${{ inputs.REF_TE }} + + - name: Generate sitrep + if: success() || failure() + shell: bash -x -e {0} + run: | + # bring in utility functions + source .github/workflows/scripts/to_json.sh + + badge_label='T5X ${{ inputs.ARCHITECTURE }} build' + tags="${{ steps.meta-final.outputs.tags }}" + digest="${{ steps.build-final.outputs.digest }}" + outcome="${{ steps.build-final.outcome }}" + + if [[ ${outcome} == "success" ]]; then + badge_message="pass" + badge_color=brightgreen + summary="T5X build on ${{ inputs.ARCHITECTURE }}: $badge_message" + else + badge_message="fail" + badge_color=red + summary="T5X build on ${{ inputs.ARCHITECTURE }}: $badge_message" + fi + + to_json \ + summary \ + badge_label tags digest outcome \ + > sitrep.json + + schemaVersion=1 \ + label="${badge_label}" \ + message="${badge_message}" \ + color="${badge_color}" \ + to_json schemaVersion label message color \ + > ${{ env.BADGE_FILENAME_FULL }} + + - name: Upload sitrep and badge + uses: actions/upload-artifact@v3 + with: + name: ${{ inputs.ARTIFACT_NAME }}-${{ inputs.ARCHITECTURE }} + path: | + sitrep.json + ${{ env.BADGE_FILENAME_FULL }} diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index defce51cb..024d92463 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -10,36 +10,37 @@ permissions: jobs: - # build-base: - # uses: ./.github/workflows/_build_base.yaml - # with: - # ARCHITECTURE: amd64 - # secrets: inherit + build-base: + uses: ./.github/workflows/_build_base.yaml + with: + ARCHITECTURE: amd64 + secrets: inherit - # build-jax: - # needs: [build-base] - # uses: ./.github/workflows/_build_jax.yaml - # with: - # ARCHITECTURE: amd64 - # BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} - # REF_JAX: jax-v0.4.20 - # REF_XLA: ca31652cdbeb6ea187589dea546ff8019274f8b2 - # secrets: inherit + build-jax: + needs: [build-base] + 
uses: ./.github/workflows/_build_jax.yaml + with: + ARCHITECTURE: amd64 + BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} + REF_JAX: jax-v0.4.20 + REF_XLA: ca31652cdbeb6ea187589dea546ff8019274f8b2 + secrets: inherit - # build-t5x: - # needs: [build-jax] - # uses: ./.github/workflows/_build_t5x.yaml - # with: - # ARCHITECTURE: amd64 - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} - # secrets: inherit + build-t5x: + needs: [build-jax] + uses: ./.github/workflows/_build_t5x.yaml + with: + ARCHITECTURE: amd64 + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} + secrets: inherit build-pax: - # needs: [build-jax] + needs: [build-jax] uses: ./.github/workflows/_build_pax.yaml with: - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} - BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6768482418-jax-amd64-staging + ARCHITECTURE: amd64 + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} + # BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6768482418-jax-amd64-staging secrets: inherit # merge: From e2c34b4770b5ffa19e43485cd561d298bd6c4686 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Tue, 7 Nov 2023 22:32:17 +0000 Subject: [PATCH 040/146] fix job step id --- .github/workflows/_build_jax.yaml | 28 ++++++++++++++-------------- .github/workflows/_build_pax.yaml | 24 +++++++++++++----------- .github/workflows/_build_t5x.yaml | 24 +++++++++++++----------- .github/workflows/_sandbox.yaml | 9 +++++++++ 4 files changed, 49 insertions(+), 36 deletions(-) diff --git a/.github/workflows/_build_jax.yaml b/.github/workflows/_build_jax.yaml index e6a40942a..7b457f445 100644 --- a/.github/workflows/_build_jax.yaml +++ b/.github/workflows/_build_jax.yaml @@ -70,8 +70,8 @@ jobs: env: BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json outputs: - DOCKER_TAG_FINAL: ${{ steps.meta-staging.outputs.tags }} - DOCKER_TAG_STAGING: ${{ steps.meta-final.outputs.tags }} + DOCKER_TAG_FINAL: ${{ 
steps.staging-metadata.outputs.tags }} + DOCKER_TAG_STAGING: ${{ steps.final-metadata.outputs.tags }} steps: - name: Print environment variables run: env @@ -108,7 +108,7 @@ jobs: image=moby/buildkit:v0.12.1 - name: Set docker metadata - staging - id: meta-staging + id: staging-metadata uses: docker/metadata-action@v4 with: images: | @@ -121,7 +121,7 @@ jobs: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - name: Build staging image - id: build-staging + id: staging-build uses: docker/build-push-action@v4 with: context: .github/container @@ -129,8 +129,8 @@ jobs: file: .github/container/Dockerfile.jax platforms: linux/${{ inputs.ARCHITECTURE }} target: staging - tags: ${{ steps.meta-staging.outputs.tags }} - labels: ${{ steps.meta-staging.outputs.labels }} + tags: ${{ steps.staging-metadata.outputs.tags }} + labels: ${{ steps.staging-metadata.outputs.labels }} ssh: default secret-files: | "SSH_KNOWN_HOSTS=${{ steps.ssh-known-hosts.outputs.FILE }}" @@ -144,7 +144,7 @@ jobs: REF_XLA=${{ inputs.REF_XLA }} - name: Set docker metadata - final - id: meta-final + id: final-metadata uses: docker/metadata-action@v4 with: images: | @@ -157,15 +157,15 @@ jobs: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - name: Build final image - id: build-final + id: final-build uses: docker/build-push-action@v4 with: context: .github/container push: true file: .github/container/Dockerfile.jax platforms: linux/${{ inputs.ARCHITECTURE }} - tags: ${{ steps.meta-final.outputs.tags }} - labels: ${{ steps.meta-final.outputs.labels }} + tags: ${{ steps.final-metadata.outputs.tags }} + labels: ${{ steps.final-metadata.outputs.labels }} ssh: default secret-files: | "SSH_KNOWN_HOSTS=${{ steps.ssh-known-hosts.outputs.FILE }}" @@ -186,9 +186,9 @@ jobs: source .github/workflows/scripts/to_json.sh badge_label='JAX ${{ inputs.ARCHITECTURE }} build' - tags="${{ steps.meta-final.outputs.tags }}" - digest="${{ steps.build-final.outputs.digest }}" - outcome="${{ 
steps.build-final.outcome }}" + tags="${{ steps.final-metadata.outputs.tags }}" + digest="${{ steps.final-build.outputs.digest }}" + outcome="${{ steps.final-build.outcome }}" if [[ ${outcome} == "success" ]]; then badge_message="pass" @@ -219,7 +219,7 @@ jobs: # - name: Save image name as text file # shell: bash -x -e {0} # run: | - # echo "${{ steps.meta-final.outputs.tags }}" >> image-name.txt + # echo "${{ steps.final-metadata.outputs.tags }}" >> image-name.txt # - name: Upload image name file as artifact # uses: actions/upload-artifact@v3 diff --git a/.github/workflows/_build_pax.yaml b/.github/workflows/_build_pax.yaml index 9491ac7d0..27b1c2e6c 100644 --- a/.github/workflows/_build_pax.yaml +++ b/.github/workflows/_build_pax.yaml @@ -70,8 +70,8 @@ jobs: env: BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json outputs: - DOCKER_TAG_FINAL: ${{ steps.meta-staging.outputs.tags }} - DOCKER_TAG_STAGING: ${{ steps.meta-final.outputs.tags }} + DOCKER_TAG_FINAL: ${{ steps.staging-metadata.outputs.tags }} + DOCKER_TAG_STAGING: ${{ steps.final-metadata.outputs.tags }} steps: - name: Print environment variables run: env @@ -93,7 +93,7 @@ jobs: image=moby/buildkit:v0.12.1 - name: Set docker metadata - staging - id: meta-staging + id: staging-metadata uses: docker/metadata-action@v4 with: images: | @@ -106,6 +106,7 @@ jobs: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - name: Build staging image + id: staging-build uses: docker/build-push-action@v4 with: context: .github/container @@ -113,8 +114,8 @@ jobs: file: .github/container/Dockerfile.pax.${{ inputs.ARCHITECTURE }} platforms: linux/${{ inputs.ARCHITECTURE }} target: staging - tags: ${{ steps.meta-staging.outputs.tags }} - labels: ${{ steps.meta-staging.outputs.labels }} + tags: ${{ steps.staging-metadata.outputs.tags }} + labels: ${{ steps.staging-metadata.outputs.labels }} build-args: | BASE_IMAGE=${{ inputs.BASE_IMAGE }} BUILD_DATE=${{ inputs.BUILD_DATE }} @@ -124,7 
+125,7 @@ jobs: REF_PRAXIS=${{ inputs.REF_PRAXIS }} - name: Set docker metadata - final - id: meta-final + id: final-metadata uses: docker/metadata-action@v4 with: images: | @@ -137,6 +138,7 @@ jobs: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - name: Build final image + id: final-build uses: docker/build-push-action@v4 with: context: .github/container @@ -144,8 +146,8 @@ jobs: file: .github/container/Dockerfile.pax.${{ inputs.ARCHITECTURE }} platforms: linux/${{ inputs.ARCHITECTURE }} target: final - tags: ${{ steps.meta-staging.outputs.tags }} - labels: ${{ steps.meta-staging.outputs.labels }} + tags: ${{ steps.final-metadata.outputs.tags }} + labels: ${{ steps.final-metadata.outputs.labels }} build-args: | BASE_IMAGE=${{ inputs.BASE_IMAGE }} BUILD_DATE=${{ inputs.BUILD_DATE }} @@ -162,9 +164,9 @@ jobs: source .github/workflows/scripts/to_json.sh badge_label='PAX ${{ inputs.ARCHITECTURE }} build' - tags="${{ steps.meta-final.outputs.tags }}" - digest="${{ steps.build-final.outputs.digest }}" - outcome="${{ steps.build-final.outcome }}" + tags="${{ steps.final-metadata.outputs.tags }}" + digest="${{ steps.final-build.outputs.digest }}" + outcome="${{ steps.final-build.outcome }}" if [[ ${outcome} == "success" ]]; then badge_message="pass" diff --git a/.github/workflows/_build_t5x.yaml b/.github/workflows/_build_t5x.yaml index bc2879a45..e65bfc847 100644 --- a/.github/workflows/_build_t5x.yaml +++ b/.github/workflows/_build_t5x.yaml @@ -70,8 +70,8 @@ jobs: env: BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json outputs: - DOCKER_TAG_FINAL: ${{ steps.meta-staging.outputs.tags }} - DOCKER_TAG_STAGING: ${{ steps.meta-final.outputs.tags }} + DOCKER_TAG_FINAL: ${{ steps.staging-metadata.outputs.tags }} + DOCKER_TAG_STAGING: ${{ steps.final-metadata.outputs.tags }} steps: - name: Print environment variables run: env @@ -93,7 +93,7 @@ jobs: image=moby/buildkit:v0.12.1 - name: Set docker metadata - staging - id: meta-staging + 
id: staging-metadata uses: docker/metadata-action@v4 with: images: | @@ -106,6 +106,7 @@ jobs: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - name: Build staging image + id: staging-build uses: docker/build-push-action@v4 with: context: .github/container @@ -113,8 +114,8 @@ jobs: file: .github/container/Dockerfile.t5x platforms: linux/${{ inputs.ARCHITECTURE }} target: staging - tags: ${{ steps.meta-staging.outputs.tags }} - labels: ${{ steps.meta-staging.outputs.labels }} + tags: ${{ steps.staging-metadata.outputs.tags }} + labels: ${{ steps.staging-metadata.outputs.labels }} build-args: | BASE_IMAGE=${{ inputs.BASE_IMAGE }} BUILD_DATE=${{ inputs.BUILD_DATE }} @@ -124,7 +125,7 @@ jobs: REF_TE=${{ inputs.REF_TE }} - name: Set docker metadata - final - id: meta-final + id: final-metadata uses: docker/metadata-action@v4 with: images: | @@ -137,14 +138,15 @@ jobs: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - name: Build final image + id: final-build uses: docker/build-push-action@v4 with: context: .github/container push: true file: .github/container/Dockerfile.t5x platforms: linux/${{ inputs.ARCHITECTURE }} - tags: ${{ steps.meta-staging.outputs.tags }} - labels: ${{ steps.meta-staging.outputs.labels }} + tags: ${{ steps.final-metadata.outputs.tags }} + labels: ${{ steps.final-metadata.outputs.labels }} build-args: | BASE_IMAGE=${{ inputs.BASE_IMAGE }} BUILD_DATE=${{ inputs.BUILD_DATE }} @@ -161,9 +163,9 @@ jobs: source .github/workflows/scripts/to_json.sh badge_label='T5X ${{ inputs.ARCHITECTURE }} build' - tags="${{ steps.meta-final.outputs.tags }}" - digest="${{ steps.build-final.outputs.digest }}" - outcome="${{ steps.build-final.outcome }}" + tags="${{ steps.final-metadata.outputs.tags }}" + digest="${{ steps.final-build.outputs.digest }}" + outcome="${{ steps.final-build.outcome }}" if [[ ${outcome} == "success" ]]; then badge_message="pass" diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 
024d92463..95db25792 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -43,6 +43,15 @@ jobs: # BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6768482418-jax-amd64-staging secrets: inherit + finalize: + if: always() + # TODO: use dynamic matrix to make dependencies self-updating + needs: [build-jax, build-t5x, build-pax] + uses: ./.github/workflows/_finalize.yaml + with: + PUBLISH_BADGE: false + secrets: inherit + # merge: # runs-on: ubuntu-latest # needs: build From 7ee441b3f2c19ead1aebc87d5c0f10550de846a6 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Tue, 7 Nov 2023 22:34:08 +0000 Subject: [PATCH 041/146] arm64 build --- .github/workflows/_sandbox.yaml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 95db25792..0b2b9d03e 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -8,19 +8,22 @@ permissions: actions: write # to cancel previous workflows packages: write # to upload container +env: + ARCHITECTURE: arm64 + jobs: build-base: uses: ./.github/workflows/_build_base.yaml with: - ARCHITECTURE: amd64 + ARCHITECTURE: ${{ env.ARCHITECTURE }} secrets: inherit build-jax: needs: [build-base] uses: ./.github/workflows/_build_jax.yaml with: - ARCHITECTURE: amd64 + ARCHITECTURE: ${{ env.ARCHITECTURE }} BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} REF_JAX: jax-v0.4.20 REF_XLA: ca31652cdbeb6ea187589dea546ff8019274f8b2 @@ -30,7 +33,7 @@ jobs: needs: [build-jax] uses: ./.github/workflows/_build_t5x.yaml with: - ARCHITECTURE: amd64 + ARCHITECTURE: ${{ env.ARCHITECTURE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} secrets: inherit @@ -38,7 +41,7 @@ jobs: needs: [build-jax] uses: ./.github/workflows/_build_pax.yaml with: - ARCHITECTURE: amd64 + ARCHITECTURE: ${{ env.ARCHITECTURE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} # BASE_IMAGE: 
ghcr.io/nvidia/jax-toolbox-internal:6768482418-jax-amd64-staging secrets: inherit From 4f4d909eea1e4d76a2456b7390d8b4153c0a9846 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Tue, 7 Nov 2023 22:43:51 +0000 Subject: [PATCH 042/146] arm64 build --- .github/workflows/_sandbox.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 0b2b9d03e..51b62e100 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -16,14 +16,14 @@ jobs: build-base: uses: ./.github/workflows/_build_base.yaml with: - ARCHITECTURE: ${{ env.ARCHITECTURE }} + ARCHITECTURE: $ARCHITECTURE secrets: inherit build-jax: needs: [build-base] uses: ./.github/workflows/_build_jax.yaml with: - ARCHITECTURE: ${{ env.ARCHITECTURE }} + ARCHITECTURE: $ARCHITECTURE BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} REF_JAX: jax-v0.4.20 REF_XLA: ca31652cdbeb6ea187589dea546ff8019274f8b2 @@ -33,7 +33,7 @@ jobs: needs: [build-jax] uses: ./.github/workflows/_build_t5x.yaml with: - ARCHITECTURE: ${{ env.ARCHITECTURE }} + ARCHITECTURE: $ARCHITECTURE BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} secrets: inherit @@ -41,7 +41,7 @@ jobs: needs: [build-jax] uses: ./.github/workflows/_build_pax.yaml with: - ARCHITECTURE: ${{ env.ARCHITECTURE }} + ARCHITECTURE: $ARCHITECTURE BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} # BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6768482418-jax-amd64-staging secrets: inherit From b77724437b3558ceada011ba96be2f698d25d2ab Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Wed, 8 Nov 2023 00:23:11 +0000 Subject: [PATCH 043/146] arm64 build --- .github/workflows/_sandbox.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 51b62e100..fe6f7d179 100644 --- a/.github/workflows/_sandbox.yaml +++ 
b/.github/workflows/_sandbox.yaml @@ -16,14 +16,14 @@ jobs: build-base: uses: ./.github/workflows/_build_base.yaml with: - ARCHITECTURE: $ARCHITECTURE + ARCHITECTURE: "${{ env.ARCHITECTURE }}" secrets: inherit build-jax: needs: [build-base] uses: ./.github/workflows/_build_jax.yaml with: - ARCHITECTURE: $ARCHITECTURE + ARCHITECTURE: "${{ env.ARCHITECTURE }}" BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} REF_JAX: jax-v0.4.20 REF_XLA: ca31652cdbeb6ea187589dea546ff8019274f8b2 @@ -33,7 +33,7 @@ jobs: needs: [build-jax] uses: ./.github/workflows/_build_t5x.yaml with: - ARCHITECTURE: $ARCHITECTURE + ARCHITECTURE: "${{ env.ARCHITECTURE }}" BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} secrets: inherit @@ -41,7 +41,7 @@ jobs: needs: [build-jax] uses: ./.github/workflows/_build_pax.yaml with: - ARCHITECTURE: $ARCHITECTURE + ARCHITECTURE: "${{ env.ARCHITECTURE }}" BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} # BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6768482418-jax-amd64-staging secrets: inherit From c4c22afe7deab3b824670e32be8dbb90ef44ad3e Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Wed, 8 Nov 2023 00:23:58 +0000 Subject: [PATCH 044/146] arm64 build --- .github/workflows/_sandbox.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index fe6f7d179..e52727fa1 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -16,14 +16,14 @@ jobs: build-base: uses: ./.github/workflows/_build_base.yaml with: - ARCHITECTURE: "${{ env.ARCHITECTURE }}" + ARCHITECTURE: arm64 secrets: inherit build-jax: needs: [build-base] uses: ./.github/workflows/_build_jax.yaml with: - ARCHITECTURE: "${{ env.ARCHITECTURE }}" + ARCHITECTURE: arm64 BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} REF_JAX: jax-v0.4.20 REF_XLA: ca31652cdbeb6ea187589dea546ff8019274f8b2 @@ -33,7 +33,7 @@ jobs: needs: [build-jax] uses: 
./.github/workflows/_build_t5x.yaml with: - ARCHITECTURE: "${{ env.ARCHITECTURE }}" + ARCHITECTURE: arm64 BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} secrets: inherit @@ -41,7 +41,7 @@ jobs: needs: [build-jax] uses: ./.github/workflows/_build_pax.yaml with: - ARCHITECTURE: "${{ env.ARCHITECTURE }}" + ARCHITECTURE: arm64 BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} # BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6768482418-jax-amd64-staging secrets: inherit From 736246a1ad1a91f0d44aa020c93e443504040dfd Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Wed, 8 Nov 2023 00:31:52 +0000 Subject: [PATCH 045/146] add sitrep to base build --- .github/workflows/_build_base.yaml | 112 ++++++++++++++--------------- .github/workflows/_sandbox.yaml | 44 ++++++++++++ 2 files changed, 98 insertions(+), 58 deletions(-) diff --git a/.github/workflows/_build_base.yaml b/.github/workflows/_build_base.yaml index 4e23ef1cb..a01655973 100644 --- a/.github/workflows/_build_base.yaml +++ b/.github/workflows/_build_base.yaml @@ -17,6 +17,16 @@ on: description: "Build date in YYYY-MM-DD format" required: false default: 'NOT SPECIFIED' + ARTIFACT_NAME: + type: string + description: 'Name of the artifact zip file' + required: false + default: 'artifact-base-build' + BADGE_FILENAME: + type: string + description: 'Name of the endpoint JSON file for shields.io badge' + required: false + default: 'badge-base-build' outputs: DOCKER_TAG: description: "Tag of the image built" @@ -34,6 +44,8 @@ jobs: build: runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", small] + env: + BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json outputs: DOCKER_TAG: ${{ steps.meta.outputs.tags }} steps: @@ -70,6 +82,7 @@ jobs: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - name: Build docker images + id: build uses: docker/build-push-action@v4 with: context: .github/container @@ -82,61 +95,44 @@ jobs: BUILD_DATE=${{ inputs.BUILD_DATE 
}} ${{ inputs.BASE_IMAGE != 'latest' && format('BASE_IMAGE={0}', inputs.BASE_IMAGE) }} - # # Temporary workaround until the following issues are solved: - # # https://github.com/orgs/community/discussions/17245 - # # https://github.com/actions/runner/pull/2477 - # # https://github.com/orgs/community/discussions/26639 - # - name: Save image name as text file - # shell: bash -x -e {0} - # run: | - # echo "${{ steps.meta.outputs.tags }}" >> image-name.txt - - # - name: Upload image name file as artifact - # uses: actions/upload-artifact@v3 - # with: - # name: image-name-base-${{ inputs.ARCHITECTURE }} - # path: image-name.txt - - # merge: - # runs-on: ubuntu-latest - # needs: build - # outputs: - # DOCKER_TAGS: ${{ steps.meta.outputs.tags }} - # steps: - # - name: Download image name files into separate folders - # uses: actions/download-artifact@v3 - - # - name: Login to GitHub Container Registry - # uses: docker/login-action@v2 - # with: - # registry: ghcr.io - # username: ${{ github.repository_owner }} - # password: ${{ secrets.GITHUB_TOKEN }} - - # - name: Set docker metadata - # id: meta - # uses: docker/metadata-action@v4 - # with: - # images: | - # ${{ env.UPLD_IMAGE }} - # flavor: | - # latest=false - # tags: | - # type=raw,value=${{ github.run_id }}-base-multiarch - # labels: - # org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - # - name: Combine images into a single multi-arch image - # shell: bash -x -e {0} - # run: | - # docker manifest create ${{ steps.meta.outputs.tags }} $( - # for IMAGE in $(cat image-name-base-*/image-name.txt); do - # REPO=$(echo $IMAGE | cut -d: -f1) - # DIGEST=$( - # docker manifest inspect $IMAGE |\ - # jq -r '.manifests[] | select(.platform.os == "linux") | .digest' - # ) - # echo $REPO@${DIGEST} - # done - # ) - # docker manifest push ${{ steps.meta.outputs.tags }} + - name: Generate sitrep + if: success() || failure() + shell: bash -x -e {0} + run: | + # bring in utility functions + source 
.github/workflows/scripts/to_json.sh + + badge_label='Base image ${{ inputs.ARCHITECTURE }} build' + tags="${{ steps.meta.outputs.tags }}" + digest="${{ steps.build.outputs.digest }}" + outcome="${{ steps.build.outcome }}" + + if [[ ${outcome} == "success" ]]; then + badge_message="pass" + badge_color=brightgreen + summary="Base image build on ${{ inputs.ARCHITECTURE }}: $badge_message" + else + badge_message="fail" + badge_color=red + summary="Base image build on ${{ inputs.ARCHITECTURE }}: $badge_message" + fi + + to_json \ + summary \ + badge_label tags digest outcome \ + > sitrep.json + + schemaVersion=1 \ + label="${badge_label}" \ + message="${badge_message}" \ + color="${badge_color}" \ + to_json schemaVersion label message color \ + > ${{ env.BADGE_FILENAME_FULL }} + + - name: Upload sitrep and badge + uses: actions/upload-artifact@v3 + with: + name: ${{ inputs.ARTIFACT_NAME }}-${{ inputs.ARCHITECTURE }} + path: | + sitrep.json + ${{ env.BADGE_FILENAME_FULL }} diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index e52727fa1..f380570d3 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -101,3 +101,47 @@ jobs: # done # ) # docker manifest push ${{ steps.meta.outputs.tags }} + + # merge: + # runs-on: ubuntu-latest + # needs: build + # outputs: + # DOCKER_TAGS: ${{ steps.meta.outputs.tags }} + # steps: + # - name: Download image name files into separate folders + # uses: actions/download-artifact@v3 + + # - name: Login to GitHub Container Registry + # uses: docker/login-action@v2 + # with: + # registry: ghcr.io + # username: ${{ github.repository_owner }} + # password: ${{ secrets.GITHUB_TOKEN }} + + # - name: Set docker metadata + # id: meta + # uses: docker/metadata-action@v4 + # with: + # images: | + # ${{ env.UPLD_IMAGE }} + # flavor: | + # latest=false + # tags: | + # type=raw,value=${{ github.run_id }}-base-multiarch + # labels: + # org.opencontainers.image.created=${{ inputs.BUILD_DATE }} + 
+ # - name: Combine images into a single multi-arch image + # shell: bash -x -e {0} + # run: | + # docker manifest create ${{ steps.meta.outputs.tags }} $( + # for IMAGE in $(cat image-name-base-*/image-name.txt); do + # REPO=$(echo $IMAGE | cut -d: -f1) + # DIGEST=$( + # docker manifest inspect $IMAGE |\ + # jq -r '.manifests[] | select(.platform.os == "linux") | .digest' + # ) + # echo $REPO@${DIGEST} + # done + # ) + # docker manifest push ${{ steps.meta.outputs.tags }} From bc4b6db7f6cfc76f29aaa4072886cce43f7f1305 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Wed, 8 Nov 2023 04:38:52 +0000 Subject: [PATCH 046/146] lingvo --- .github/container/Dockerfile.pax.arm64 | 4 +- .github/container/install_lingvo_aarch64.sh | 4 +- .github/workflows/_sandbox.yaml | 49 +++++++++++---------- .github/workflows/_summary.yaml | 10 +++++ 4 files changed, 40 insertions(+), 27 deletions(-) diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index 5834c78f8..6abe3d1c5 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -58,8 +58,8 @@ RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > /tmp/rustup.sh & # paxml + praxis RUN <<"EOF" bash -ex -get-source.sh -f ${REPO_PAXML} -r ${REF_PAXML} -d ${SRC_PATH_PAXML} -m /opt/pip-tools.d/manifest.paxml -get-source.sh -f ${REPO_PRAXIS} -r ${REF_PRAXIS} -d ${SRC_PATH_PRAXIS} -m /opt/pip-tools.d/manifest.praxis +get-source.sh -f ${REPO_PAXML} -r ${REF_PAXML} -d ${SRC_PATH_PAXML} -m /opt/pip-tools.d/manifest.pax +get-source.sh -f ${REPO_PRAXIS} -r ${REF_PRAXIS} -d ${SRC_PATH_PRAXIS} -m /opt/pip-tools.d/manifest.pax for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do pushd ${src} diff --git a/.github/container/install_lingvo_aarch64.sh b/.github/container/install_lingvo_aarch64.sh index ffc928b7f..b4f93c784 100755 --- a/.github/container/install_lingvo_aarch64.sh +++ b/.github/container/install_lingvo_aarch64.sh @@ -45,7 +45,9 @@ pip 
install patchelf # running the tests entirely by uncommentin the following line. # SKIP_TEST=1 PYTHON_MINOR_VERSION=10 pip_package/build.sh -pip install /tmp/lingvo/dist/lingvo*linux_aarch64.whl +# pip install /tmp/lingvo/dist/lingvo*linux_aarch64.whl +cp /tmp/lingvo/dist/lingvo*linux_aarch64.whl /opt +echo "lingvo @ file://$(ls /opt/lingvo*linux_aarch64.whl)" >> /opt/pip-tools.d/manifest.pax popd rm -Rf *lingvo* rm -Rf /root/.cache diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index f380570d3..bb63319c4 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -13,36 +13,37 @@ env: jobs: - build-base: - uses: ./.github/workflows/_build_base.yaml - with: - ARCHITECTURE: arm64 - secrets: inherit - - build-jax: - needs: [build-base] - uses: ./.github/workflows/_build_jax.yaml - with: - ARCHITECTURE: arm64 - BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} - REF_JAX: jax-v0.4.20 - REF_XLA: ca31652cdbeb6ea187589dea546ff8019274f8b2 - secrets: inherit - - build-t5x: - needs: [build-jax] - uses: ./.github/workflows/_build_t5x.yaml - with: - ARCHITECTURE: arm64 - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} - secrets: inherit + # build-base: + # uses: ./.github/workflows/_build_base.yaml + # with: + # ARCHITECTURE: arm64 + # secrets: inherit + + # build-jax: + # needs: [build-base] + # uses: ./.github/workflows/_build_jax.yaml + # with: + # ARCHITECTURE: arm64 + # BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} + # REF_JAX: jax-v0.4.20 + # REF_XLA: ca31652cdbeb6ea187589dea546ff8019274f8b2 + # secrets: inherit + + # build-t5x: + # needs: [build-jax] + # uses: ./.github/workflows/_build_t5x.yaml + # with: + # ARCHITECTURE: arm64 + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} + # secrets: inherit build-pax: needs: [build-jax] uses: ./.github/workflows/_build_pax.yaml with: ARCHITECTURE: arm64 - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} + # 
BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} + BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6792001145-jax-arm64-staging # BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6768482418-jax-amd64-staging secrets: inherit diff --git a/.github/workflows/_summary.yaml b/.github/workflows/_summary.yaml index d0f453d0c..645c105d1 100644 --- a/.github/workflows/_summary.yaml +++ b/.github/workflows/_summary.yaml @@ -16,3 +16,13 @@ jobs: find -name "sitrep.json" | while read -s f; do cat "$f" | jq -r '.summary' | tee -a $GITHUB_STEP_SUMMARY done + + # - name: Concatenate all sitreps + # shell: bash -x -e {0} + # run: | + # # combine all sitreps files into a single file, where each sitrep json sits + # # in a field named by the folder that contained it + + # find -name "sitrep.json" | while read -s f; do + # echo "$(dirname $f): $(cat $f)," >> + # done From ce1cf9459fc5eeda5386874feb69069cc85e8319 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Wed, 8 Nov 2023 04:44:07 +0000 Subject: [PATCH 047/146] lingvo --- .github/workflows/_sandbox.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index bb63319c4..f63c5fe38 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -38,7 +38,7 @@ jobs: # secrets: inherit build-pax: - needs: [build-jax] + # needs: [build-jax] uses: ./.github/workflows/_build_pax.yaml with: ARCHITECTURE: arm64 From 9ac736784079b13d2422c2d72b524f5cafabd210 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Wed, 8 Nov 2023 04:44:50 +0000 Subject: [PATCH 048/146] lingvo --- .github/workflows/_sandbox.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index f63c5fe38..49b470eca 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -47,14 +47,14 @@ jobs: # BASE_IMAGE: 
ghcr.io/nvidia/jax-toolbox-internal:6768482418-jax-amd64-staging secrets: inherit - finalize: - if: always() - # TODO: use dynamic matrix to make dependencies self-updating - needs: [build-jax, build-t5x, build-pax] - uses: ./.github/workflows/_finalize.yaml - with: - PUBLISH_BADGE: false - secrets: inherit + # finalize: + # if: always() + # # TODO: use dynamic matrix to make dependencies self-updating + # needs: [build-jax, build-t5x, build-pax] + # uses: ./.github/workflows/_finalize.yaml + # with: + # PUBLISH_BADGE: false + # secrets: inherit # merge: # runs-on: ubuntu-latest From 44d302639433fe345f62f98390ef430461cf087e Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Wed, 8 Nov 2023 23:38:19 +0000 Subject: [PATCH 049/146] refactor pax arm64 build --- .github/container/Dockerfile.pax.arm64 | 114 +++++++++++++++----- .github/container/install-pax.sh | 106 +----------------- .github/container/install_lingvo_aarch64.sh | 20 ++-- 3 files changed, 101 insertions(+), 139 deletions(-) diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index 6abe3d1c5..4751d91b2 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -9,16 +9,11 @@ ARG SRC_PATH_PAXML=/opt/paxml ARG SRC_PATH_PRAXIS=/opt/praxis ############################################################################### -## Pax for AArch64 +## build dependencies ############################################################################### -FROM ${BASE_IMAGE} as staging -ARG REPO_PAXML -ARG REPO_PRAXIS -ARG REF_PAXML -ARG REF_PRAXIS -ARG SRC_PATH_PAXML -ARG SRC_PATH_PRAXIS +ARG BASE_IMAGE +FROM ${BASE_IMAGE} as wheel-builder # We need to build some packages from source, bring some dependencies. 
RUN apt-get update && \ @@ -36,28 +31,89 @@ RUN apt-get update && \ RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-arm64 -O /usr/bin/bazel && \ chmod a+x /usr/bin/bazel -# Lingvo -ADD install_lingvo_aarch64.sh /opt/ -ADD lingvo.patch /opt/ -RUN /opt/install_lingvo_aarch64.sh - -# Install T5 now, Pip will build the wheel from source, it needs Rust. -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > /tmp/rustup.sh && \ - echo "be3535b3033ff5e0ecc4d589a35d3656f681332f860c5fd6684859970165ddcc /tmp/rustup.sh" | sha256sum --check && \ - bash /tmp/rustup.sh -y && \ - export PATH=$PATH:/root/.cargo/bin && \ - pip install t5 && \ - rm -Rf /root/.cargo /root/.rustup && \ - mv /root/.profile /root/.profile.save && \ - grep -v cargo /root/.profile.save > /root/.profile && \ - rm /root/.profile.save && \ - mv /root/.bashrc /root/.bashrc.save && \ - grep -v cargo /root/.bashrc.save > /root/.bashrc && \ - rm /root/.bashrc.save && \ - rm -Rf /root/.cache /tmp/* +RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 protobuf==3.20 patchelf + +# build lingvo +RUN <<"EOT" bash -exu + +set -o pipefail + +INSTALL_DIR="${INSTALL_DIR:-/opt}" +LINGVO_REF="${LINGVO_REF:-HEAD}" +LINGVO_REPO="${LINGVO_REPO:-https://github.com/tensorflow/lingvo.git}" +LINGVO_INSTALLED_DIR=${INSTALL_DIR}/lingvo + +git clone ${LINGVO_REPO} ${LINGVO_INSTALLED_DIR} +# get-source.sh -f ${LINGVO_REPO} -r ${LINGVO_REF} -d ${LINGVO_INSTALLED_DIR} + + +pushd ${LINGVO_INSTALLED_DIR} +git fetch origin pull/329/head:pr329 +git cherry-pick --allow-empty pr329 + +# Disable 2 flaky tests here +patch -p1 <> /opt/pip-tools.d/manifest.pax + +COPY --from=wheel-builder /opt/tensorflow-text.whl /opt/ +RUN echo "tensorflow-text @ file:///opt/tensorflow-text.whl" >> /opt/pip-tools.d/manifest.pax # paxml + praxis -RUN <<"EOF" bash -ex +RUN <<"EOT" bash -ex +echo "tensorflow==2.13.0" >> /opt/pip-tools.d/manifest.pax +echo "tensorflow_datasets==4.9.2" >> 
/opt/pip-tools.d/manifest.pax +echo "auditwheel" >> /opt/pip-tools.d/manifest.pax + get-source.sh -f ${REPO_PAXML} -r ${REF_PAXML} -d ${SRC_PATH_PAXML} -m /opt/pip-tools.d/manifest.pax get-source.sh -f ${REPO_PRAXIS} -r ${REF_PRAXIS} -d ${SRC_PATH_PRAXIS} -m /opt/pip-tools.d/manifest.pax @@ -73,7 +129,7 @@ for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do fi popd done -EOF +EOT ADD test-pax.sh /usr/local/bin diff --git a/.github/container/install-pax.sh b/.github/container/install-pax.sh index 083f4ce59..f03ad790b 100755 --- a/.github/container/install-pax.sh +++ b/.github/container/install-pax.sh @@ -1,101 +1,3 @@ -#!/bin/bash -exu - -## Parse command-line arguments - -usage() { - echo "Usage: $0 [OPTION]..." - echo ' --defer When passed, will defer the installation of the main package. Can be installed afterwards with `pip install -r requirements-defer.txt` and any deferred cleanup commands can be run with `bash cleanup.sh`' - echo " -d, --dir=PATH Path to store Pax source. Defaults to /opt" - echo " --from_paxml=URL URL of the Paxml repo. Defaults to https://github.com/google/paxml.git" - echo " --from_praxis=URL URL of the Praxis repo. Defaults to https://github.com/google/praxis.git" - echo " -h, --help Print usage." - echo " --ref_paxml=REF Git commit hash or tag name that specifies the version of Paxml to install. Defaults to HEAD." - echo " --ref_praxis=REF Git commit hash or tag name that specifies the version of Praxis to install. Defaults to HEAD." - exit $1 -} - -args=$(getopt -o d:h --long defer,dir:,from_paxml:,from_praxis:,help,ref_paxml:,ref_praxis: -- "$@") -if [[ $? 
-ne 0 ]]; then - exit 1 -fi - -eval set -- "$args" -while [ : ]; do - case "$1" in - --defer) - DEFER=true - shift - ;; - -d | --dir) - INSTALL_DIR="$2" - shift 2 - ;; - --from_paxml) - PAXML_REPO="$2" - shift 2 - ;; - --from_praxis) - PRAXIS_REPO="$2" - shift 2 - ;; - -h | --help) - usage - ;; - --ref_paxml) - PAXML_REF="$2" - shift 2 - ;; - --ref_praxis) - PRAXIS_REF="$2" - shift 2 - ;; - --) - shift; - break - ;; - esac -done - -if [[ $# -ge 1 ]]; then - echo "Un-recognized argument: $*" && echo - usage 1 -fi - -## Set default arguments if not provided via command-line - -DEFER=${DEFER:-false} -PAXML_REF="${PAXML_REF:-HEAD}" -PAXML_REPO="${PAXML_REPO:-https://github.com/google/paxml.git}" -PRAXIS_REF="${PRAXIS_REF:-HEAD}" -PRAXIS_REPO="${PRAXIS_REPO:-https://github.com/google/praxis.git}" -INSTALL_DIR="${INSTALL_DIR:-/opt}" - -echo "Installing Paxml $PAXML_REF from $PAXML_REPO and $PRAXIS_REF from $PRAXIS_REPO to $INSTALL_DIR" - -maybe_defer_cleanup() { - if [[ "$DEFER" = true ]]; then - echo "# Cleanup from: $0" - echo "$*" >> /opt/cleanup.sh - else - $@ - fi -} - -maybe_defer_pip_install() { - if [[ "$DEFER" = true ]]; then - echo "Deferring installation of 'pip install $*'" - for arg in $@; do - if [[ $arg == "-e" ]]; then - echo -n "$arg " >>/opt/requirements-defer.txt - else - echo "$arg" >> /opt/requirements-defer.txt - fi - done - else - pip install $@ - fi -} - set -ex ## Install Praxis @@ -106,8 +8,8 @@ pushd ${PRAXIS_INSTALLED_DIR} git checkout ${PRAXIS_REF} if [[ $(uname -m) == "aarch64" ]]; then # These dependencies are broken on ARM64 right now, we handle them separately - sed -i 's/^tensorflow/#tensorflow/' praxis/pip_package/requirements.txt requirements.in - sed -i 's/^lingvo/#lingvo/' praxis/pip_package/requirements.txt requirements.in + # sed -i 's/^tensorflow/#tensorflow/' praxis/pip_package/requirements.txt requirements.in + # sed -i 's/^lingvo/#lingvo/' praxis/pip_package/requirements.txt requirements.in sed -i 
's/^scikit-learn/#scikit-learn/' praxis/pip_package/requirements.txt requirements.in fi popd @@ -121,8 +23,8 @@ git checkout ${PAXML_REF} if [[ $(uname -m) == "aarch64" ]]; then # These dependencies are broken on ARM64 right now, we handle them separately pip install chex==0.1.7 - sed -i 's/^tensorflow/#tensorflow/' paxml/pip_package/requirements.txt requirements.in - sed -i 's/^lingvo/#lingvo/' paxml/pip_package/requirements.txt requirements.in + # sed -i 's/^tensorflow/#tensorflow/' paxml/pip_package/requirements.txt requirements.in + # sed -i 's/^lingvo/#lingvo/' paxml/pip_package/requirements.txt requirements.in sed -i 's/^scikit-learn/#scikit-learn/' paxml/pip_package/requirements.txt requirements.in sed -i 's/^t5/#t5/' paxml/pip_package/requirements.txt requirements.in sed -i 's/^jax/#jax/' paxml/pip_package/requirements.txt requirements.in diff --git a/.github/container/install_lingvo_aarch64.sh b/.github/container/install_lingvo_aarch64.sh index b4f93c784..84c36ae23 100755 --- a/.github/container/install_lingvo_aarch64.sh +++ b/.github/container/install_lingvo_aarch64.sh @@ -6,15 +6,16 @@ LINGVO_REPO="${LINGVO_REPO:-https://github.com/tensorflow/lingvo.git}" ## Install tensorflow-text cd ${INSTALL_DIR} -pip install tensorflow_datasets==4.9.2 # force a recent version to have latest protobuf dep -pip install auditwheel -pip install tensorflow==2.13.0 +# pip install tensorflow_datasets==4.9.2 # force a recent version to have latest protobuf dep +# pip install auditwheel +# pip install tensorflow==2.13.0 git clone http://github.com/tensorflow/text.git pushd text git checkout v2.13.0 ./oss_scripts/run_build.sh -find * | grep '.whl$' -pip install ./tensorflow_text-*.whl +echo "tensorflow-text @ file://$PWD/$(ls *.whl)" >> /opt/pip-tools.d/manifest.pax +# find * | grep '.whl$' +# pip install ./tensorflow_text-*.whl popd rm -Rf text @@ -37,9 +38,12 @@ patch -p1 < /opt/lingvo.patch sed -i 's/tensorflow=/#tensorflow=/' docker/dev.requirements.txt sed -i 
's/tensorflow-text=/#tensorflow-text=/' docker/dev.requirements.txt sed -i 's/dataclasses=/#dataclasses=/' docker/dev.requirements.txt -pip install -r docker/dev.requirements.txt -pip install protobuf==3.20 -pip install patchelf +# pip install -r docker/dev.requirements.txt +# pip install protobuf==3.20 +# pip install patchelf +echo "-r $PWD/docker/dev.requirements.txt" >> /opt/pip-tools.d/manifest.pax +echo "-r protobuf==3.20" >> /opt/pip-tools.d/manifest.pax +echo "-r patchelf" >> /opt/pip-tools.d/manifest.pax # Some tests are flaky right now (see the patch abovbe), if needed we can skip # running the tests entirely by uncommentin the following line. From 5c47fcbf6f54636f503a6c7699dc122f6846bdf5 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Wed, 8 Nov 2023 23:46:00 +0000 Subject: [PATCH 050/146] refactor pax arm64 build wip --- .github/container/Dockerfile.pax.arm64 | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index 4751d91b2..977be3950 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -31,7 +31,13 @@ RUN apt-get update && \ RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-arm64 -O /usr/bin/bazel && \ chmod a+x /usr/bin/bazel -RUN pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 protobuf==3.20 patchelf +RUN pip install \ + tensorflow_datasets==4.9.2 \ + auditwheel \ + tensorflow==2.13.0 + + # protobuf==3.20 \ + # patchelf # build lingvo RUN <<"EOT" bash -exu @@ -46,7 +52,6 @@ LINGVO_INSTALLED_DIR=${INSTALL_DIR}/lingvo git clone ${LINGVO_REPO} ${LINGVO_INSTALLED_DIR} # get-source.sh -f ${LINGVO_REPO} -r ${LINGVO_REF} -d ${LINGVO_INSTALLED_DIR} - pushd ${LINGVO_INSTALLED_DIR} git fetch origin pull/329/head:pr329 git cherry-pick --allow-empty pr329 From 139e539ed4d313448aba656254893257d44b9d5e Mon Sep 17 00:00:00 2001 From: Yu-Hang 
'Maxin' Tang Date: Wed, 8 Nov 2023 23:54:11 +0000 Subject: [PATCH 051/146] refactor pax arm64 build wip --- .github/container/Dockerfile.pax.arm64 | 38 ++++++++++++-------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index 977be3950..d9bb6eaea 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -31,19 +31,26 @@ RUN apt-get update && \ RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-arm64 -O /usr/bin/bazel && \ chmod a+x /usr/bin/bazel -RUN pip install \ - tensorflow_datasets==4.9.2 \ - auditwheel \ - tensorflow==2.13.0 +FROM wheel-builder as tftext-builder - # protobuf==3.20 \ - # patchelf +# build tensorflow-text 2.13.0 from source +RUN <<"EOT" bash -exu +set -o pipefail +pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 +mkdir -p /opt +pushd /opt +git clone http://github.com/tensorflow/text.git +pushd text +git checkout v2.13.0 +./oss_scripts/run_build.sh +popd +EOT + +FROM wheel-builder as lingvo-builder # build lingvo RUN <<"EOT" bash -exu - set -o pipefail - INSTALL_DIR="${INSTALL_DIR:-/opt}" LINGVO_REF="${LINGVO_REF:-HEAD}" LINGVO_REPO="${LINGVO_REPO:-https://github.com/tensorflow/lingvo.git}" @@ -76,7 +83,7 @@ EOF sed -i 's/tensorflow=/#tensorflow=/' docker/dev.requirements.txt sed -i 's/tensorflow-text=/#tensorflow-text=/' docker/dev.requirements.txt sed -i 's/dataclasses=/#dataclasses=/' docker/dev.requirements.txt -pip install -r docker/dev.requirements.txt +pip install -r docker/dev.requirements.txt protobuf==3.20 patchelf # Some tests are flaky right now (see the patch abovbe), if needed we can skip # running the tests entirely by uncommentin the following line. 
@@ -85,15 +92,6 @@ PYTHON_MINOR_VERSION=10 pip_package/build.sh cp /tmp/lingvo/dist/lingvo*linux_aarch64.whl /opt/lingvo.whl EOT -# build tensorflow-text 2.13.0 from source -RUN <<"EOT" bash -exu -git clone http://github.com/tensorflow/text.git -pushd text -git checkout v2.13.0 -./oss_scripts/run_build.sh -popd -EOT - ############################################################################### ## Pax for AArch64 ############################################################################### @@ -107,10 +105,10 @@ ARG REF_PRAXIS ARG SRC_PATH_PAXML ARG SRC_PATH_PRAXIS -COPY --from=wheel-builder /opt/lingvo.whl /opt/ +COPY --from=lingvo-builder /opt/lingvo.whl /opt/ RUN echo "lingvo @ file:///opt/lingvo.whl" >> /opt/pip-tools.d/manifest.pax -COPY --from=wheel-builder /opt/tensorflow-text.whl /opt/ +COPY --from=tftext-builder /opt/tensorflow-text.whl /opt/ RUN echo "tensorflow-text @ file:///opt/tensorflow-text.whl" >> /opt/pip-tools.d/manifest.pax # paxml + praxis From 5e2a5ad04f4926b7b31d441a25d721d10030260b Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Wed, 8 Nov 2023 23:55:09 +0000 Subject: [PATCH 052/146] refactor pax arm64 build wip --- .github/container/Dockerfile.pax.arm64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index d9bb6eaea..c5935f426 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -64,7 +64,7 @@ git fetch origin pull/329/head:pr329 git cherry-pick --allow-empty pr329 # Disable 2 flaky tests here -patch -p1 < Date: Thu, 9 Nov 2023 00:12:17 +0000 Subject: [PATCH 053/146] refactor pax arm64 build wip --- .github/container/Dockerfile.pax.arm64 | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index c5935f426..a18e3cdfd 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ 
b/.github/container/Dockerfile.pax.arm64 @@ -83,7 +83,9 @@ EOF sed -i 's/tensorflow=/#tensorflow=/' docker/dev.requirements.txt sed -i 's/tensorflow-text=/#tensorflow-text=/' docker/dev.requirements.txt sed -i 's/dataclasses=/#dataclasses=/' docker/dev.requirements.txt -pip install -r docker/dev.requirements.txt protobuf==3.20 patchelf +sed -i 's/protobuf==3.20.3/protobuf==3.20/' docker/dev.requirements.txt +sed -i 's/patchelf==0.17.2.1/patchelf/' docker/dev.requirements.txt +pip install -r docker/dev.requirements.txt # Some tests are flaky right now (see the patch abovbe), if needed we can skip # running the tests entirely by uncommentin the following line. From 095926496e1fbf463e2057427ee2728ee8031e9f Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 06:34:15 +0000 Subject: [PATCH 054/146] pax arm64 --- .github/container/Dockerfile.pax.arm64 | 102 ++++++++++++-------- .github/container/install_lingvo_aarch64.sh | 57 ----------- .github/container/lingvo.patch | 13 --- 3 files changed, 64 insertions(+), 108 deletions(-) delete mode 100755 .github/container/install_lingvo_aarch64.sh delete mode 100644 .github/container/lingvo.patch diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index a18e3cdfd..bea1b555f 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -9,57 +9,64 @@ ARG SRC_PATH_PAXML=/opt/paxml ARG SRC_PATH_PRAXIS=/opt/praxis ############################################################################### -## build dependencies +## build tensorflow-text and lingvo, which do not have working arm64 pip wheels ############################################################################### ARG BASE_IMAGE FROM ${BASE_IMAGE} as wheel-builder # We need to build some packages from source, bring some dependencies. 
-RUN apt-get update && \ - apt-get update && \ - apt-get install -y \ - bat \ - curl \ - git \ - gnupg \ - rsync \ - liblzma-dev \ - && \ - apt-get autoremove -y && apt-get clean && rm -rf /var/lib/apt/lists +RUN <> /opt/pip-tools.d/manifest.pax +RUN <> /opt/pip-tools.d/manifest.pax -COPY --from=tftext-builder /opt/tensorflow-text.whl /opt/ -RUN echo "tensorflow-text @ file:///opt/tensorflow-text.whl" >> /opt/pip-tools.d/manifest.pax +COPY --from=tftext-builder /opt/tensorflow-text/tensorflow_text*.whl /opt/ +RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/manifest.pax # paxml + praxis RUN <<"EOT" bash -ex echo "tensorflow==2.13.0" >> /opt/pip-tools.d/manifest.pax echo "tensorflow_datasets==4.9.2" >> /opt/pip-tools.d/manifest.pax +echo "chex==0.1.7" >> /opt/pip-tools.d/manifest.pax echo "auditwheel" >> /opt/pip-tools.d/manifest.pax get-source.sh -f ${REPO_PAXML} -r ${REF_PAXML} -d ${SRC_PATH_PAXML} -m /opt/pip-tools.d/manifest.pax @@ -124,13 +138,25 @@ get-source.sh -f ${REPO_PRAXIS} -r ${REF_PRAXIS} -d ${SRC_PATH_PRAXIS} -m /opt/p for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do pushd ${src} - sed -i "s| @ git+https://github.com/google/flax||g" requirements.in - sed -i "s| @ git+https://github.com/google/jax||g" requirements.in + + for pattern in \ + "s| @ git+https://github.com/google/flax||g" \ + "s| @ git+https://github.com/google/jax||g" \ + "s|^tensorflow|#tensorflow|" \ + "s|^lingvo|#lingvo|" \ + "s|^scikit-learn|#scikit-learn|" \ + "s|^t5|#t5|" \ + "s|^protobuf|#protobuf|" \ + "s|^numpy|#numpy|" \ + ; do + sed -i "${pattern}" */pip_package/requirements.txt requirements.in + done + if git diff --quiet; then - echo "URL specs no longer present in select dependencies for ${src}" + echo "broken dependencies no longer present in ${src}" exit 1 else - git commit -a -m "remove URL specs from select dependencies for ${src}" + git commit -a -m "remove broken dependencies from ${src}" fi popd done diff --git 
a/.github/container/install_lingvo_aarch64.sh b/.github/container/install_lingvo_aarch64.sh deleted file mode 100755 index 84c36ae23..000000000 --- a/.github/container/install_lingvo_aarch64.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -exu -set -o pipefail -INSTALL_DIR="${INSTALL_DIR:-/opt}" -LINGVO_REF="${LINGVO_REF:-HEAD}" -LINGVO_REPO="${LINGVO_REPO:-https://github.com/tensorflow/lingvo.git}" - -## Install tensorflow-text -cd ${INSTALL_DIR} -# pip install tensorflow_datasets==4.9.2 # force a recent version to have latest protobuf dep -# pip install auditwheel -# pip install tensorflow==2.13.0 -git clone http://github.com/tensorflow/text.git -pushd text -git checkout v2.13.0 -./oss_scripts/run_build.sh -echo "tensorflow-text @ file://$PWD/$(ls *.whl)" >> /opt/pip-tools.d/manifest.pax -# find * | grep '.whl$' -# pip install ./tensorflow_text-*.whl -popd -rm -Rf text - -## Install lingvo -LINGVO_INSTALLED_DIR=${INSTALL_DIR}/lingvo - -[[ -d lingvo ]] || git clone ${LINGVO_REPO} ${LINGVO_INSTALLED_DIR} - -pushd ${LINGVO_INSTALLED_DIR} -# Local patches, two PR waiting to be merged + one custom patch -# git fetch origin pull/326/head:pr326 ## merged upstream -# git fetch origin pull/328/head:pr328 ## merged upstream -git fetch origin pull/329/head:pr329 -# git cherry-pick pr326 pr328 pr329 ## pr326, pr328 merged -git cherry-pick --allow-empty pr329 - -# Disable 2 flaky tests here -patch -p1 < /opt/lingvo.patch - -sed -i 's/tensorflow=/#tensorflow=/' docker/dev.requirements.txt -sed -i 's/tensorflow-text=/#tensorflow-text=/' docker/dev.requirements.txt -sed -i 's/dataclasses=/#dataclasses=/' docker/dev.requirements.txt -# pip install -r docker/dev.requirements.txt -# pip install protobuf==3.20 -# pip install patchelf -echo "-r $PWD/docker/dev.requirements.txt" >> /opt/pip-tools.d/manifest.pax -echo "-r protobuf==3.20" >> /opt/pip-tools.d/manifest.pax -echo "-r patchelf" >> /opt/pip-tools.d/manifest.pax - -# Some tests are flaky right now (see the patch abovbe), if 
needed we can skip -# running the tests entirely by uncommentin the following line. -# SKIP_TEST=1 -PYTHON_MINOR_VERSION=10 pip_package/build.sh -# pip install /tmp/lingvo/dist/lingvo*linux_aarch64.whl -cp /tmp/lingvo/dist/lingvo*linux_aarch64.whl /opt -echo "lingvo @ file://$(ls /opt/lingvo*linux_aarch64.whl)" >> /opt/pip-tools.d/manifest.pax -popd -rm -Rf *lingvo* -rm -Rf /root/.cache diff --git a/.github/container/lingvo.patch b/.github/container/lingvo.patch deleted file mode 100644 index c4184a09f..000000000 --- a/.github/container/lingvo.patch +++ /dev/null @@ -1,13 +0,0 @@ -diff --git a/pip_package/build.sh b/pip_package/build.sh -index ef62c432e..659e78956 100755 ---- a/pip_package/build.sh -+++ b/pip_package/build.sh -@@ -89,7 +89,7 @@ bazel clean - bazel build $@ ... - if ! [[ $SKIP_TESTS ]]; then - # Just test the core for the purposes of the pip package. -- bazel test $@ lingvo/core/... -+ bazel test $@ lingvo/core/... -- -//lingvo/tasks/mt:model_test -//lingvo/core:saver_test - fi - - DST_DIR="/tmp/lingvo/dist" From 314db99da3b4a83a2276641c71c0cb8e4d75b9a4 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 08:37:31 +0000 Subject: [PATCH 055/146] redesign CI --- .github/workflows/_ci_amd64.yaml | 201 +++++++++++++++++++++ .github/workflows/_ci_arm64.yaml | 160 ++++++++++++++++ .github/workflows/ci.yaml | 182 ++----------------- .github/workflows/cuda-121-jax-pin.yaml | 190 ------------------- .github/workflows/cuda-122-jax-pin.yaml | 189 ------------------- .github/workflows/pax-cuda-121.yaml | 185 ------------------- .github/workflows/scripts/parse_git_src.sh | 8 + 7 files changed, 388 insertions(+), 727 deletions(-) create mode 100644 .github/workflows/_ci_amd64.yaml create mode 100644 .github/workflows/_ci_arm64.yaml delete mode 100644 .github/workflows/cuda-121-jax-pin.yaml delete mode 100644 .github/workflows/cuda-122-jax-pin.yaml delete mode 100644 .github/workflows/pax-cuda-121.yaml create mode 100644 
.github/workflows/scripts/parse_git_src.sh diff --git a/.github/workflows/_ci_amd64.yaml b/.github/workflows/_ci_amd64.yaml new file mode 100644 index 000000000..e4b1bf6c1 --- /dev/null +++ b/.github/workflows/_ci_amd64.yaml @@ -0,0 +1,201 @@ +name: CI (amd64) + +on: + workflow_call: + inputs: + CUDA_IMAGE: + type: string + required: true + SRC_JAX: + type: string + required: true + SRC_XLA: + type: string + required: true + SRC_TE: + type: string + required: true + SRC_T5X: + type: string + required: true + SRC_PAXML: + type: string + required: true + SRC_PRAXIS: + type: string + required: true + +permissions: + contents: read # to fetch code + actions: write # to cancel previous workflows + packages: write # to upload container + +jobs: + + metadata: + runs-on: ubuntu-22.04 + outputs: + BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} + REPO_JAX: ${{ steps.parse-inputs.outputs.REPO_JAX }} + REF_JAX: ${{ steps.parse-inputs.outputs.REF_JAX }} + REPO_XLA: ${{ steps.parse-inputs.outputs.REPO_XLA }} + REF_XLA: ${{ steps.parse-inputs.outputs.REF_XLA }} + REPO_TE: ${{ steps.parse-inputs.outputs.REPO_TE }} + REF_TE: ${{ steps.parse-inputs.outputs.REF_TE }} + REPO_T5X: ${{ steps.parse-inputs.outputs.REPO_T5X }} + REF_T5X: ${{ steps.parse-inputs.outputs.REF_T5X }} + REPO_PAXML: ${{ steps.parse-inputs.outputs.REPO_PAXML }} + REF_PAXML: ${{ steps.parse-inputs.outputs.REF_PAXML }} + REPO_PRAXIS: ${{ steps.parse-inputs.outputs.REPO_PRAXIS }} + REF_PRAXIS: ${{ steps.parse-inputs.outputs.REF_PRAXIS }} + steps: + - name: Set build date + id: date + shell: bash -x -e {0} + run: | + BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') + echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT + + - name: split input "repo#ref" into repo and ref parts + id: parse-inputs + shell: bash -x -e {0} + run: | + source .github/workflows/scripts/parse_git_src.sh + + # default values are for `pull_request` event types + parse_git_src JAX "${{ inputs.SRC_JAX }}" + parse_git_src XLA "${{ inputs.SRC_XLA 
}}" + parse_git_src TE "${{ inputs.SRC_TE }}" + parse_git_src T5X "${{ inputs.SRC_T5X }}" + parse_git_src PAXML "${{ inputs.SRC_PAXML }}" + parse_git_src PRAXIS "${{ inputs.SRC_PRAXIS }}" + + build-base: + needs: metadata + uses: ./.github/workflows/_build_base.yaml + with: + ARCHITECTURE: amd64 + BASE_IMAGE: ${{ inputs.CUDA_IMAGE || 'latest' }} + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + secrets: inherit + + build-jax: + needs: [metadata, build-base] + uses: ./.github/workflows/_build_jax.yaml + with: + ARCHITECTURE: amd64 + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAGS }} + REPO_JAX: ${{ needs.metadata.outputs.REPO_JAX }} + REF_JAX: ${{ needs.metadata.outputs.REF_JAX }} + REPO_XLA: ${{ needs.metadata.outputs.REPO_XLA }} + REF_XLA: ${{ needs.metadata.outputs.REF_XLA }} + REPO_TE: ${{ needs.metadata.outputs.REPO_TE }} + REF_TE: ${{ needs.metadata.outputs.REF_TE }} + secrets: inherit + + build-t5x: + needs: [metadata, build-jax] + uses: ./.github/workflows/_build_t5x.yaml + with: + ARCHITECTURE: amd64 + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} + REPO_T5X: ${{ needs.metadata.outputs.REPO_T5X }} + REF_T5X: ${{ needs.metadata.outputs.REF_T5X }} + secrets: inherit + + build-pax: + needs: [metadata, build-jax] + uses: ./.github/workflows/_build_pax.yaml + with: + ARCHITECTURE: amd64 + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} + REPO_PAXML: ${{ needs.metadata.outputs.REPO_PAXML }} + REF_PAXML: ${{ needs.metadata.outputs.REF_PAXML }} + REPO_PRAXIS: ${{ needs.metadata.outputs.REPO_PRAXIS }} + REF_PRAXIS: ${{ needs.metadata.outputs.REF_PRAXIS }} + secrets: inherit + + # build-rosetta-t5x: + # uses: ./.github/workflows/_build_rosetta.yaml + # needs: [metadata, build-t5x] + # with: + # BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + # BASE_IMAGE: 
${{ needs.build-t5x.outputs.DOCKER_TAG_STAGING }} + # BASE_LIBRARY: t5x + # secrets: inherit + + # build-rosetta-pax: + # uses: ./.github/workflows/_build_rosetta.yaml + # needs: [metadata, build-pax] + # with: + # BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_STAGING }} + # BASE_LIBRARY: pax + # secrets: inherit + + build-summary: + # needs: [build-base, build-jax, build-t5x, build-pax, build-rosetta-t5x, build-rosetta-pax] + needs: [build-base, build-jax, build-t5x, build-pax] + if: always() + runs-on: ubuntu-22.04 + steps: + - name: Generate job summary for container build + shell: bash -x -e {0} + run: | + cat > $GITHUB_STEP_SUMMARY << EOF + # Images created + + | Image | Link | + | ------------ | -------------------------------------------------- | + | Base | ${{ needs.build-base.outputs.DOCKER_TAGS }} | + | JAX | ${{ needs.build-jax.outputs.DOCKER_TAGS }} | + | T5X | ${{ needs.build-t5x.outputs.DOCKER_TAGS }} | + | PAX | ${{ needs.build-pax.outputs.DOCKER_TAGS }} | + EOF + + # | ROSETTA(t5x) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | + # | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | + + # test-distribution: + # needs: metadata + # uses: ./.github/workflows/_test_distribution.yaml + # secrets: inherit + + test-jax: + needs: build-jax + uses: ./.github/workflows/_test_jax.yaml + with: + JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-te: + needs: build-jax + uses: ./.github/workflows/_test_te.yaml + with: + JAX_TE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-t5x: + needs: build-t5x + uses: ./.github/workflows/_test_t5x.yaml + with: + T5X_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-pax: + needs: build-pax + uses: ./.github/workflows/_test_pax.yaml + with: + PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + # test-vit: + 
# needs: build-rosetta-t5x + # uses: ./.github/workflows/_test_vit.yaml + # with: + # ROSETTA_T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} + # secrets: inherit + \ No newline at end of file diff --git a/.github/workflows/_ci_arm64.yaml b/.github/workflows/_ci_arm64.yaml new file mode 100644 index 000000000..520bffae5 --- /dev/null +++ b/.github/workflows/_ci_arm64.yaml @@ -0,0 +1,160 @@ +name: CI (arm64) + +on: + workflow_call: + inputs: + CUDA_IMAGE: + type: string + required: true + SRC_JAX: + type: string + required: true + SRC_XLA: + type: string + required: true + SRC_TE: + type: string + required: true + SRC_T5X: + type: string + required: true + SRC_PAXML: + type: string + required: true + SRC_PRAXIS: + type: string + required: true + +permissions: + contents: read # to fetch code + actions: write # to cancel previous workflows + packages: write # to upload container + +jobs: + + metadata: + runs-on: ubuntu-22.04 + outputs: + BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} + REPO_JAX: ${{ steps.parse-inputs.outputs.REPO_JAX }} + REF_JAX: ${{ steps.parse-inputs.outputs.REF_JAX }} + REPO_XLA: ${{ steps.parse-inputs.outputs.REPO_XLA }} + REF_XLA: ${{ steps.parse-inputs.outputs.REF_XLA }} + REPO_TE: ${{ steps.parse-inputs.outputs.REPO_TE }} + REF_TE: ${{ steps.parse-inputs.outputs.REF_TE }} + REPO_T5X: ${{ steps.parse-inputs.outputs.REPO_T5X }} + REF_T5X: ${{ steps.parse-inputs.outputs.REF_T5X }} + REPO_PAXML: ${{ steps.parse-inputs.outputs.REPO_PAXML }} + REF_PAXML: ${{ steps.parse-inputs.outputs.REF_PAXML }} + REPO_PRAXIS: ${{ steps.parse-inputs.outputs.REPO_PRAXIS }} + REF_PRAXIS: ${{ steps.parse-inputs.outputs.REF_PRAXIS }} + steps: + - name: Set build date + id: date + shell: bash -x -e {0} + run: | + BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') + echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT + + - name: split input "repo#ref" into repo and ref parts + id: parse-inputs + shell: bash -x -e {0} + run: | + source 
.github/workflows/scripts/parse_git_src.sh + + # default values are for `pull_request` event types + parse_git_src JAX "${{ inputs.SRC_JAX }}" + parse_git_src XLA "${{ inputs.SRC_XLA }}" + parse_git_src TE "${{ inputs.SRC_TE }}" + parse_git_src T5X "${{ inputs.SRC_T5X }}" + parse_git_src PAXML "${{ inputs.SRC_PAXML }}" + parse_git_src PRAXIS "${{ inputs.SRC_PRAXIS }}" + + build-base: + needs: metadata + uses: ./.github/workflows/_build_base.yaml + with: + ARCHITECTURE: arm64 + BASE_IMAGE: ${{ inputs.CUDA_IMAGE || 'latest' }} + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + secrets: inherit + + build-jax: + needs: [metadata, build-base] + uses: ./.github/workflows/_build_jax.yaml + with: + ARCHITECTURE: arm64 + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAGS }} + REPO_JAX: ${{ needs.metadata.outputs.REPO_JAX }} + REF_JAX: ${{ needs.metadata.outputs.REF_JAX }} + REPO_XLA: ${{ needs.metadata.outputs.REPO_XLA }} + REF_XLA: ${{ needs.metadata.outputs.REF_XLA }} + REPO_TE: ${{ needs.metadata.outputs.REPO_TE }} + REF_TE: ${{ needs.metadata.outputs.REF_TE }} + secrets: inherit + + build-t5x: + needs: [metadata, build-jax] + uses: ./.github/workflows/_build_t5x.yaml + with: + ARCHITECTURE: arm64 + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} + REPO_T5X: ${{ needs.metadata.outputs.REPO_T5X }} + REF_T5X: ${{ needs.metadata.outputs.REF_T5X }} + secrets: inherit + + build-pax: + needs: [metadata, build-jax] + uses: ./.github/workflows/_build_pax.yaml + with: + ARCHITECTURE: arm64 + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} + REPO_PAXML: ${{ needs.metadata.outputs.REPO_PAXML }} + REF_PAXML: ${{ needs.metadata.outputs.REF_PAXML }} + REPO_PRAXIS: ${{ needs.metadata.outputs.REPO_PRAXIS }} + REF_PRAXIS: ${{ needs.metadata.outputs.REF_PRAXIS }} + secrets: inherit + + # 
build-rosetta-t5x: + # uses: ./.github/workflows/_build_rosetta.yaml + # needs: [metadata, build-t5x] + # with: + # BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAG_STAGING }} + # BASE_LIBRARY: t5x + # secrets: inherit + + # build-rosetta-pax: + # uses: ./.github/workflows/_build_rosetta.yaml + # needs: [metadata, build-pax] + # with: + # BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_STAGING }} + # BASE_LIBRARY: pax + # secrets: inherit + + build-summary: + # needs: [build-base, build-jax, build-t5x, build-pax, build-rosetta-t5x, build-rosetta-pax] + needs: [build-base, build-jax, build-t5x, build-pax] + if: always() + runs-on: ubuntu-22.04 + steps: + - name: Generate job summary for container build + shell: bash -x -e {0} + run: | + cat > $GITHUB_STEP_SUMMARY << EOF + # Images created + + | Image | Link | + | ------------ | -------------------------------------------------- | + | Base | ${{ needs.build-base.outputs.DOCKER_TAGS }} | + | JAX | ${{ needs.build-jax.outputs.DOCKER_TAGS }} | + | T5X | ${{ needs.build-t5x.outputs.DOCKER_TAGS }} | + | PAX | ${{ needs.build-pax.outputs.DOCKER_TAGS }} | + EOF + + # | ROSETTA(t5x) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | + # | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 51d3bce25..40324de0d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -53,174 +53,30 @@ permissions: jobs: - metadata: - runs-on: ubuntu-22.04 - outputs: - BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} - REPO_JAX: ${{ steps.parse-inputs.outputs.REPO_JAX }} - REF_JAX: ${{ steps.parse-inputs.outputs.REF_JAX }} - REPO_XLA: ${{ steps.parse-inputs.outputs.REPO_XLA }} - REF_XLA: ${{ steps.parse-inputs.outputs.REF_XLA }} - REPO_TE: ${{ steps.parse-inputs.outputs.REPO_TE }} - REF_TE: ${{ 
steps.parse-inputs.outputs.REF_TE }} - REPO_T5X: ${{ steps.parse-inputs.outputs.REPO_T5X }} - REF_T5X: ${{ steps.parse-inputs.outputs.REF_T5X }} - REPO_PAXML: ${{ steps.parse-inputs.outputs.REPO_PAXML }} - REF_PAXML: ${{ steps.parse-inputs.outputs.REF_PAXML }} - REPO_PRAXIS: ${{ steps.parse-inputs.outputs.REPO_PRAXIS }} - REF_PRAXIS: ${{ steps.parse-inputs.outputs.REF_PRAXIS }} - steps: - - name: Set build date - id: date - shell: bash -x -e {0} - run: | - BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - - - name: Parse inputs - id: parse-inputs - shell: bash -x -e {0} - run: | - # split input in the format of repo#ref into repo and ref parts - parse_git_src() { - PACKAGE=$1 - INPUT="$2" - DEFAULT="$3" - SRC="${INPUT:-${DEFAULT}}" - echo "REPO_${PACKAGE}=$(echo "${SRC}" | cut -f1 -d#)" >> $GITHUB_OUTPUT - echo "REF_${PACKAGE}=$(echo "${SRC}" | cut -f2 -d#)" >> $GITHUB_OUTPUT - } - - # default values are for `pull_request`` event types - parse_git_src JAX "${{ inputs.SRC_JAX }}" "https://github.com/google/jax.git#main" - parse_git_src XLA "${{ inputs.SRC_XLA }}" "https://github.com/openxla/xla.git#main" - parse_git_src TE "${{ inputs.SRC_TE }}" "https://github.com/NVIDIA/TransformerEngine.git#main" - parse_git_src T5X "${{ inputs.SRC_T5X }}" "https://github.com/google-research/t5x.git#main" - parse_git_src PAXML "${{ inputs.SRC_PAXML }}" "https://github.com/google/paxml.git#main" - parse_git_src PRAXIS "${{ inputs.SRC_PRAXIS }}" "https://github.com/google/praxis.git#main" - - build-base: - needs: metadata - uses: ./.github/workflows/_build_base.yaml - with: - BASE_IMAGE: ${{ inputs.CUDA_IMAGE || 'latest' }} - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - secrets: inherit - - build-jax: - needs: [metadata, build-base] - uses: ./.github/workflows/_build_jax.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAGS }} - REPO_JAX: ${{ 
needs.metadata.outputs.REPO_JAX }} - REF_JAX: ${{ needs.metadata.outputs.REF_JAX }} - REPO_XLA: ${{ needs.metadata.outputs.REPO_XLA }} - REF_XLA: ${{ needs.metadata.outputs.REF_XLA }} - REPO_TE: ${{ needs.metadata.outputs.REPO_TE }} - REF_TE: ${{ needs.metadata.outputs.REF_TE }} - secrets: inherit - - build-t5x: - needs: [metadata, build-jax] - uses: ./.github/workflows/_build_t5x.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} - REPO_T5X: ${{ needs.metadata.outputs.REPO_T5X }} - REF_T5X: ${{ needs.metadata.outputs.REF_T5X }} - secrets: inherit - - build-pax: - needs: [metadata, build-jax] - uses: ./.github/workflows/_build_pax.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} - REPO_PAXML: ${{ needs.metadata.outputs.REPO_PAXML }} - REF_PAXML: ${{ needs.metadata.outputs.REF_PAXML }} - REPO_PRAXIS: ${{ needs.metadata.outputs.REPO_PRAXIS }} - REF_PRAXIS: ${{ needs.metadata.outputs.REF_PRAXIS }} - secrets: inherit - - build-rosetta-t5x: - uses: ./.github/workflows/_build_rosetta.yaml - needs: [metadata, build-t5x] - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }} - BASE_LIBRARY: t5x - secrets: inherit - - build-rosetta-pax: - uses: ./.github/workflows/_build_rosetta.yaml - needs: [metadata, build-pax] + amd64: + uses: ./.github/workflows/_ci_amd64.yaml with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} - BASE_LIBRARY: pax + CUDA_IMAGE: ${{ inputs.CUDA_IMAGE || 'latest' }} + SRC_JAX: ${{ inputs.SRC_JAX || "https://github.com/google/jax.git#main" }} + SRC_XLA: ${{ inputs.SRC_XLA || "https://github.com/openxla/xla.git#main"}} + SRC_TE: ${{ inputs.SRC_TE || "https://github.com/NVIDIA/TransformerEngine.git#main"}} + SRC_T5X: ${{ inputs.SRC_T5X || 
"https://github.com/google-research/t5x.git#main"}} + SRC_PAXML: ${{ inputs.SRC_PAXML || "https://github.com/google/paxml.git#main"}} + SRC_PRAXIS: ${{ inputs.SRC_PRAXIS || "https://github.com/google/praxis.git#main"}} secrets: inherit - - build-summary: - needs: [build-base, build-jax, build-t5x, build-pax, build-rosetta-t5x, build-rosetta-pax] - if: always() - runs-on: ubuntu-22.04 - steps: - - name: Generate job summary for container build - shell: bash -x -e {0} - run: | - cat > $GITHUB_STEP_SUMMARY << EOF - # Images created - - | Image | Link | - | ------------ | -------------------------------------------------- | - | Base | ${{ needs.build-base.outputs.DOCKER_TAGS }} | - | JAX | ${{ needs.build-jax.outputs.DOCKER_TAGS }} | - | T5X | ${{ needs.build-t5x.outputs.DOCKER_TAGS }} | - | PAX | ${{ needs.build-pax.outputs.DOCKER_TAGS }} | - | ROSETTA(t5x) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | - | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | - EOF - - test-distribution: - needs: metadata - uses: ./.github/workflows/_test_distribution.yaml - secrets: inherit - - test-jax: - needs: build-jax - uses: ./.github/workflows/_test_jax.yaml - with: - JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} - secrets: inherit - - test-te: - needs: build-jax - uses: ./.github/workflows/_test_te.yaml - with: - JAX_TE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} - secrets: inherit - test-t5x: - needs: build-t5x - uses: ./.github/workflows/_test_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }} - secrets: inherit - - test-pax: - needs: build-pax - uses: ./.github/workflows/_test_pax.yaml + + arm64: + uses: ./.github/workflows/_ci_arm64.yaml with: - PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} + CUDA_IMAGE: ${{ inputs.CUDA_IMAGE || 'latest' }} + SRC_JAX: ${{ inputs.SRC_JAX || "https://github.com/google/jax.git#main" }} + SRC_XLA: ${{ inputs.SRC_XLA || "https://github.com/openxla/xla.git#main"}} + SRC_TE: ${{ 
inputs.SRC_TE || "https://github.com/NVIDIA/TransformerEngine.git#main"}} + SRC_T5X: ${{ inputs.SRC_T5X || "https://github.com/google-research/t5x.git#main"}} + SRC_PAXML: ${{ inputs.SRC_PAXML || "https://github.com/google/paxml.git#main"}} + SRC_PRAXIS: ${{ inputs.SRC_PRAXIS || "https://github.com/google/praxis.git#main"}} secrets: inherit - test-vit: - needs: build-rosetta-t5x - uses: ./.github/workflows/_test_vit.yaml - with: - ROSETTA_T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} - secrets: inherit - finalize: if: always() # TODO: use dynamic matrix to make dependencies self-updating diff --git a/.github/workflows/cuda-121-jax-pin.yaml b/.github/workflows/cuda-121-jax-pin.yaml deleted file mode 100644 index 829d0ae9d..000000000 --- a/.github/workflows/cuda-121-jax-pin.yaml +++ /dev/null @@ -1,190 +0,0 @@ -name: Nightly Containers on CUDA 12.1 (JAX pinned) - -on: - schedule: - - cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC - workflow_dispatch: - inputs: - JAX_BASE_IMAGE: - type: string - description: 'Base Multiarch JAX Image' - default: 'ghcr.io/nvidia/jax-toolbox-internal:6473019396-jax-multiarch' - required: true - REPO_T5X: - type: string - description: URL of T5X repository to check out - required: false - default: "https://github.com/nvjax-svc-0/t5x.git" - REF_T5X: - type: string - description: Git commit, tag, or branch for T5X - required: false - default: unpin-tfds-gpu-extra - REPO_TE: - type: string - description: URL of TE repository to check out - required: false - default: "https://github.com/NVIDIA/TransformerEngine.git" - REF_TE: - type: string - description: Git commit, tag, or branch for TE - required: false - default: v0.13 - PUBLISH: - type: boolean - description: Publish dated images and update the 'latest' tag? 
- default: false - required: false - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container - -env: - DEFAULT_JAX_BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6473019396-jax-multiarch - DEFAULT_REPO_T5X: https://github.com/nvjax-svc-0/t5x.git - DEFAULT_REF_T5X: unpin-tfds-gpu-extra - DEFAULT_REPO_TE: https://github.com/NVIDIA/TransformerEngine.git - DEFAULT_REF_TE: v0.13 - -jobs: - - metadata: - runs-on: ubuntu-22.04 - outputs: - BUILD_DATE: ${{ steps.meta.outputs.BUILD_DATE }} - JAX_BASE_IMAGE: ${{ steps.meta.outputs.JAX_BASE_IMAGE}} - REPO_T5X: ${{ steps.meta.outputs.REPO_T5X }} - REF_T5X: ${{ steps.meta.outputs.REF_T5X }} - REPO_TE: ${{ steps.meta.outputs.REPO_TE }} - REF_TE: ${{ steps.meta.outputs.REF_TE }} - steps: - - name: Set build date and base image - id: meta - shell: bash -x -e {0} - run: | - BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - if [[ -z "${{ inputs.JAX_BASE_IMAGE }}" ]]; then - echo "JAX_BASE_IMAGE=${{ env.DEFAULT_JAX_BASE_IMAGE }}" >> $GITHUB_OUTPUT - else - echo "JAX_BASE_IMAGE=${{ inputs.JAX_BASE_IMAGE }}" >> $GITHUB_OUTPUT - fi - if [[ -z "${{ inputs.REPO_T5X }}" ]]; then - echo "REPO_T5X=${{ env.DEFAULT_REPO_T5X }}" >> $GITHUB_OUTPUT - else - echo "REPO_T5X=${{ inputs.REPO_T5X }}" >> $GITHUB_OUTPUT - fi - if [[ -z "${{ inputs.REF_T5X }}" ]]; then - echo "REF_T5X=${{ env.DEFAULT_REF_T5X }}" >> $GITHUB_OUTPUT - else - echo "REF_T5X=${{ inputs.REF_T5X }}" >> $GITHUB_OUTPUT - fi - if [[ -z "${{ inputs.REPO_TE }}" ]]; then - echo "REPO_TE=${{ env.DEFAULT_REPO_TE }}" >> $GITHUB_OUTPUT - else - echo "REPO_TE=${{ inputs.REPO_TE }}" >> $GITHUB_OUTPUT - fi - if [[ -z "${{ inputs.REF_TE }}" ]]; then - echo "REF_TE=${{ env.DEFAULT_REF_TE }}" >> $GITHUB_OUTPUT - 
else - echo "REF_TE=${{ inputs.REF_TE }}" >> $GITHUB_OUTPUT - fi - - build-pax: - needs: [metadata] - uses: ./.github/workflows/_build_pax.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.metadata.outputs.JAX_BASE_IMAGE }} - secrets: inherit - - build-rosetta-pax: - uses: ./.github/workflows/_build_rosetta.yaml - needs: [metadata, build-pax] - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} - BASE_LIBRARY: pax - PLATFORMS: '["amd64"]' - secrets: inherit - - build-t5x: - needs: [metadata] - uses: ./.github/workflows/_build_t5x.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.metadata.outputs.JAX_BASE_IMAGE }} - REPO_T5X: ${{ needs.metadata.outputs.REPO_T5X }} - REF_T5X: ${{ needs.metadata.outputs.REF_T5X }} - REPO_TE: ${{ needs.metadata.outputs.REPO_TE }} - REF_TE: ${{ needs.metadata.outputs.REF_TE }} - secrets: inherit - - build-rosetta-t5x: - uses: ./.github/workflows/_build_rosetta.yaml - needs: [metadata, build-t5x] - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }} - BASE_LIBRARY: t5x - PLATFORMS: '["amd64"]' - secrets: inherit - - build-summary: - needs: [metadata, build-t5x, build-rosetta-t5x, build-pax, build-rosetta-pax] - if: always() - runs-on: ubuntu-22.04 - steps: - - name: Generate job summary for container build - shell: bash -x -e {0} - run: | - cat > $GITHUB_STEP_SUMMARY << EOF - # Images created - - | Image | Link | - | ------------ | -------------------------------------------------- | - | JAX (input) | ${{ needs.metadata.outputs.JAX_BASE_IMAGE }} | - | T5X | ${{ needs.build-t5x.outputs.DOCKER_TAGS }} | - | ROSETTA(T5X) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | - | PAX | ${{ needs.build-pax.outputs.DOCKER_TAGS }} | - | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | - EOF - - test-jax: - needs: 
metadata - uses: ./.github/workflows/_test_jax.yaml - with: - JAX_IMAGE: ${{ needs.metadata.outputs.JAX_BASE_IMAGE }} - secrets: inherit - - test-pax: - needs: build-pax - uses: ./.github/workflows/_test_pax.yaml - with: - PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} - secrets: inherit - - test-t5x: - needs: build-t5x - uses: ./.github/workflows/_test_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }} - secrets: inherit - - # TODO(terry): This is missing the rosetta tests which can only be added - # After a fix for the TB log collision is pushed. - - finalize: - if: always() - # TODO: use dynamic matrix to make dependencies self-updating - needs: [build-summary, test-jax, test-pax] - uses: ./.github/workflows/_finalize.yaml - with: - PUBLISH_BADGE: false - secrets: inherit diff --git a/.github/workflows/cuda-122-jax-pin.yaml b/.github/workflows/cuda-122-jax-pin.yaml deleted file mode 100644 index 3ea4f053b..000000000 --- a/.github/workflows/cuda-122-jax-pin.yaml +++ /dev/null @@ -1,189 +0,0 @@ -name: Nightly Containers on CUDA 12.2 (JAX pinned) - -on: - schedule: - - cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC - workflow_dispatch: - inputs: - JAX_BASE_IMAGE: - type: string - description: 'Base Multiarch JAX Image' - default: 'ghcr.io/nvidia/jax-toolbox-internal:6475553977-jax-multiarch' - required: true - REPO_T5X: - type: string - description: URL of T5X repository to check out - required: false - default: "https://github.com/nvjax-svc-0/t5x.git" - REF_T5X: - type: string - description: Git commit, tag, or branch for T5X - required: false - default: unpin-tfds-gpu-extra - REPO_TE: - type: string - description: URL of TE repository to check out - required: false - default: "https://github.com/NVIDIA/TransformerEngine.git" - REF_TE: - type: string - description: Git commit, tag, or branch for TE - required: false - default: v0.13 - PUBLISH: - type: boolean - description: Publish dated images and update the 'latest' tag? 
- default: false - required: false - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container - -env: - DEFAULT_JAX_BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6475553977-jax-multiarch - DEFAULT_REPO_T5X: https://github.com/nvjax-svc-0/t5x.git - DEFAULT_REF_T5X: unpin-tfds-gpu-extra - DEFAULT_REPO_TE: https://github.com/NVIDIA/TransformerEngine.git - DEFAULT_REF_TE: v0.13 - -jobs: - - metadata: - runs-on: ubuntu-22.04 - outputs: - BUILD_DATE: ${{ steps.meta.outputs.BUILD_DATE }} - JAX_BASE_IMAGE: ${{ steps.meta.outputs.JAX_BASE_IMAGE}} - REPO_T5X: ${{ steps.meta.outputs.REPO_T5X }} - REF_T5X: ${{ steps.meta.outputs.REF_T5X }} - REPO_TE: ${{ steps.meta.outputs.REPO_TE }} - REF_TE: ${{ steps.meta.outputs.REF_TE }} - steps: - - name: Set build date and base image - id: meta - shell: bash -x -e {0} - run: | - BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - if [[ -z "${{ inputs.JAX_BASE_IMAGE }}" ]]; then - echo "JAX_BASE_IMAGE=${{ env.DEFAULT_JAX_BASE_IMAGE }}" >> $GITHUB_OUTPUT - else - echo "JAX_BASE_IMAGE=${{ inputs.JAX_BASE_IMAGE }}" >> $GITHUB_OUTPUT - fi - if [[ -z "${{ inputs.REPO_T5X }}" ]]; then - echo "REPO_T5X=${{ env.DEFAULT_REPO_T5X }}" >> $GITHUB_OUTPUT - else - echo "REPO_T5X=${{ inputs.REPO_T5X }}" >> $GITHUB_OUTPUT - fi - if [[ -z "${{ inputs.REF_T5X }}" ]]; then - echo "REF_T5X=${{ env.DEFAULT_REF_T5X }}" >> $GITHUB_OUTPUT - else - echo "REF_T5X=${{ inputs.REF_T5X }}" >> $GITHUB_OUTPUT - fi - if [[ -z "${{ inputs.REPO_TE }}" ]]; then - echo "REPO_TE=${{ env.DEFAULT_REPO_TE }}" >> $GITHUB_OUTPUT - else - echo "REPO_TE=${{ inputs.REPO_TE }}" >> $GITHUB_OUTPUT - fi - if [[ -z "${{ inputs.REF_TE }}" ]]; then - echo "REF_TE=${{ env.DEFAULT_REF_TE }}" >> $GITHUB_OUTPUT - 
else - echo "REF_TE=${{ inputs.REF_TE }}" >> $GITHUB_OUTPUT - fi - - build-pax: - needs: [metadata] - uses: ./.github/workflows/_build_pax.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.metadata.outputs.JAX_BASE_IMAGE }} - secrets: inherit - - build-rosetta-pax: - uses: ./.github/workflows/_build_rosetta.yaml - needs: [metadata, build-pax] - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} - BASE_LIBRARY: pax - secrets: inherit - - build-t5x: - needs: [metadata] - uses: ./.github/workflows/_build_t5x.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.metadata.outputs.JAX_BASE_IMAGE }} - REPO_T5X: ${{ needs.metadata.outputs.REPO_T5X }} - REF_T5X: ${{ needs.metadata.outputs.REF_T5X }} - REPO_TE: ${{ needs.metadata.outputs.REPO_TE }} - REF_TE: ${{ needs.metadata.outputs.REF_TE }} - secrets: inherit - - build-rosetta-t5x: - uses: ./.github/workflows/_build_rosetta.yaml - needs: [metadata, build-t5x] - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }} - BASE_LIBRARY: t5x - PLATFORMS: '["amd64"]' - secrets: inherit - - build-summary: - needs: [metadata, build-t5x, build-rosetta-t5x, build-pax, build-rosetta-pax] - if: always() - runs-on: ubuntu-22.04 - steps: - - name: Generate job summary for container build - shell: bash -x -e {0} - run: | - cat > $GITHUB_STEP_SUMMARY << EOF - # Images created - - | Image | Link | - | ------------ | -------------------------------------------------- | - | JAX (input) | ${{ needs.metadata.outputs.JAX_BASE_IMAGE }} | - | T5X | ${{ needs.build-t5x.outputs.DOCKER_TAGS }} | - | ROSETTA(T5X) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | - | PAX | ${{ needs.build-pax.outputs.DOCKER_TAGS }} | - | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | - EOF - - test-jax: - needs: metadata - uses: 
./.github/workflows/_test_jax.yaml - with: - JAX_IMAGE: ${{ needs.metadata.outputs.JAX_BASE_IMAGE }} - secrets: inherit - - test-pax: - needs: build-pax - uses: ./.github/workflows/_test_pax.yaml - with: - PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} - secrets: inherit - - test-t5x: - needs: build-t5x - uses: ./.github/workflows/_test_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }} - secrets: inherit - - # TODO(terry): This is missing the rosetta tests which can only be added - # After a fix for the TB log collision is pushed. - - finalize: - if: always() - # TODO: use dynamic matrix to make dependencies self-updating - needs: [build-summary, test-jax, test-pax] - uses: ./.github/workflows/_finalize.yaml - with: - PUBLISH_BADGE: false - secrets: inherit diff --git a/.github/workflows/pax-cuda-121.yaml b/.github/workflows/pax-cuda-121.yaml deleted file mode 100644 index c468872fe..000000000 --- a/.github/workflows/pax-cuda-121.yaml +++ /dev/null @@ -1,185 +0,0 @@ -name: Nightly Containers on CUDA 12.1 - -on: - schedule: - - cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC - workflow_dispatch: - inputs: - PUBLISH: - type: boolean - description: Publish dated images and update the 'latest' tag? 
- default: false - required: false - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container - -jobs: - - metadata: - runs-on: ubuntu-22.04 - outputs: - BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} - steps: - - name: Set build date - id: date - shell: bash -x -e {0} - run: | - BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - - build-base: - needs: metadata - uses: ./.github/workflows/_build_base.yaml - with: - BASE_IMAGE: 'nvidia/cuda:12.1.1-devel-ubuntu22.04' - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - secrets: inherit - - build-jax: - needs: [metadata, build-base] - uses: ./.github/workflows/_build_jax.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAGS }} - secrets: inherit - - build-pax: - needs: [metadata, build-jax] - uses: ./.github/workflows/_build_pax.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} - secrets: inherit - - build-rosetta-pax: - uses: ./.github/workflows/_build_rosetta.yaml - needs: [metadata, build-pax] - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} - BASE_LIBRARY: pax - PLATFORMS: '["amd64"]' - secrets: inherit - - build-t5x: - needs: [metadata, build-jax] - uses: ./.github/workflows/_build_t5x.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} - secrets: inherit - - build-rosetta-t5x: - uses: ./.github/workflows/_build_rosetta.yaml - needs: [metadata, build-t5x] - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ 
needs.build-t5x.outputs.DOCKER_TAGS }} - BASE_LIBRARY: t5x - PLATFORMS: '["amd64"]' - secrets: inherit - - build-summary: - needs: [build-base, build-jax, build-t5x, build-rosetta-t5x, build-pax, build-rosetta-pax] - if: always() - runs-on: ubuntu-22.04 - steps: - - name: Generate job summary for container build - shell: bash -x -e {0} - run: | - cat > $GITHUB_STEP_SUMMARY << EOF - # Images created - - | Image | Link | - | ------------ | -------------------------------------------------- | - | Base | ${{ needs.build-base.outputs.DOCKER_TAGS }} | - | JAX | ${{ needs.build-jax.outputs.DOCKER_TAGS }} | - | T5X | ${{ needs.build-t5x.outputs.DOCKER_TAGS }} | - | ROSETTA(T5X) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | - | PAX | ${{ needs.build-pax.outputs.DOCKER_TAGS }} | - | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | - EOF - - publish-upstream-pax: - if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build-pax] - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} - TARGET_IMAGE: upstream-pax - TARGET_TAGS: | - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }}-cuda-12.1,priority=900 - - publish-pax: - if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build-rosetta-pax] - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} - TARGET_IMAGE: pax - TARGET_TAGS: | - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }}-cuda-12.1,priority=900 - - publish-upstream-t5x: - if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build-t5x] - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit - with: - SOURCE_IMAGE: ${{ 
needs.build-t5x.outputs.DOCKER_TAGS }} - TARGET_IMAGE: upstream-t5x - TARGET_TAGS: | - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }}-cuda-12.1,priority=900 - - publish-t5x: - if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build-rosetta-t5x] - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} - TARGET_IMAGE: t5x - TARGET_TAGS: | - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }}-cuda-12.1,priority=900 - - test-jax: - needs: build-jax - uses: ./.github/workflows/_test_jax.yaml - with: - JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} - secrets: inherit - - test-pax: - needs: build-pax - uses: ./.github/workflows/_test_pax.yaml - with: - PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} - secrets: inherit - - test-t5x: - needs: build-t5x - uses: ./.github/workflows/_test_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }} - secrets: inherit - - # TODO(terry): This is missing the rosetta tests which can only be added - # After a fix for the TB log collision is pushed. 
- - finalize: - if: always() - # TODO: use dynamic matrix to make dependencies self-updating - needs: [build-summary, test-jax, test-pax] - uses: ./.github/workflows/_finalize.yaml - with: - PUBLISH_BADGE: false - secrets: inherit diff --git a/.github/workflows/scripts/parse_git_src.sh b/.github/workflows/scripts/parse_git_src.sh new file mode 100644 index 000000000..16e95cdf9 --- /dev/null +++ b/.github/workflows/scripts/parse_git_src.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +parse_git_src() { + PACKAGE=$1 + SRC="$2" + echo "REPO_${PACKAGE}=$(echo "${SRC}" | cut -f1 -d#)" >> $GITHUB_OUTPUT + echo "REF_${PACKAGE}=$(echo "${SRC}" | cut -f2 -d#)" >> $GITHUB_OUTPUT +} \ No newline at end of file From 8943e9fd860effbb52742474643c8caea6498e9a Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 08:44:42 +0000 Subject: [PATCH 056/146] redesign CI --- .github/workflows/_ci_amd64.yaml | 36 ++++++++++++-------------------- .github/workflows/_ci_arm64.yaml | 36 ++++++++++++-------------------- .github/workflows/ci.yaml | 26 +++++++++++++++++++++++ 3 files changed, 52 insertions(+), 46 deletions(-) diff --git a/.github/workflows/_ci_amd64.yaml b/.github/workflows/_ci_amd64.yaml index e4b1bf6c1..8b985dcf6 100644 --- a/.github/workflows/_ci_amd64.yaml +++ b/.github/workflows/_ci_amd64.yaml @@ -24,6 +24,19 @@ on: SRC_PRAXIS: type: string required: true + outputs: + TAG_BASE: + description: "Tags of the base image built" + value: ${{ jobs.build-base.outputs.DOCKER_TAGS }} + TAG_JAX: + description: "Tags of the JAX image built" + value: ${{ jobs.build-jax.outputs.DOCKER_TAG_FINAL }} + TAG_T5X: + description: "Tags of the T5X image built" + value: ${{ jobs.build-t5x.outputs.DOCKER_TAG_FINAL }} + TAG_PAX: + description: "Tags of the PAX image built" + value: ${{ jobs.build-pax.outputs.DOCKER_TAG_FINAL }} permissions: contents: read # to fetch code @@ -136,29 +149,6 @@ jobs: # BASE_LIBRARY: pax # secrets: inherit - build-summary: - # needs: [build-base, build-jax, 
build-t5x, build-pax, build-rosetta-t5x, build-rosetta-pax] - needs: [build-base, build-jax, build-t5x, build-pax] - if: always() - runs-on: ubuntu-22.04 - steps: - - name: Generate job summary for container build - shell: bash -x -e {0} - run: | - cat > $GITHUB_STEP_SUMMARY << EOF - # Images created - - | Image | Link | - | ------------ | -------------------------------------------------- | - | Base | ${{ needs.build-base.outputs.DOCKER_TAGS }} | - | JAX | ${{ needs.build-jax.outputs.DOCKER_TAGS }} | - | T5X | ${{ needs.build-t5x.outputs.DOCKER_TAGS }} | - | PAX | ${{ needs.build-pax.outputs.DOCKER_TAGS }} | - EOF - - # | ROSETTA(t5x) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | - # | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | - # test-distribution: # needs: metadata # uses: ./.github/workflows/_test_distribution.yaml diff --git a/.github/workflows/_ci_arm64.yaml b/.github/workflows/_ci_arm64.yaml index 520bffae5..1bc17e956 100644 --- a/.github/workflows/_ci_arm64.yaml +++ b/.github/workflows/_ci_arm64.yaml @@ -24,6 +24,19 @@ on: SRC_PRAXIS: type: string required: true + outputs: + TAG_BASE: + description: "Tags of the base image built" + value: ${{ jobs.build-base.outputs.DOCKER_TAGS }} + TAG_JAX: + description: "Tags of the JAX image built" + value: ${{ jobs.build-jax.outputs.DOCKER_TAG_FINAL }} + TAG_T5X: + description: "Tags of the T5X image built" + value: ${{ jobs.build-t5x.outputs.DOCKER_TAG_FINAL }} + TAG_PAX: + description: "Tags of the PAX image built" + value: ${{ jobs.build-pax.outputs.DOCKER_TAG_FINAL }} permissions: contents: read # to fetch code @@ -135,26 +148,3 @@ jobs: # BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_STAGING }} # BASE_LIBRARY: pax # secrets: inherit - - build-summary: - # needs: [build-base, build-jax, build-t5x, build-pax, build-rosetta-t5x, build-rosetta-pax] - needs: [build-base, build-jax, build-t5x, build-pax] - if: always() - runs-on: ubuntu-22.04 - steps: - - name: Generate job 
summary for container build - shell: bash -x -e {0} - run: | - cat > $GITHUB_STEP_SUMMARY << EOF - # Images created - - | Image | Link | - | ------------ | -------------------------------------------------- | - | Base | ${{ needs.build-base.outputs.DOCKER_TAGS }} | - | JAX | ${{ needs.build-jax.outputs.DOCKER_TAGS }} | - | T5X | ${{ needs.build-t5x.outputs.DOCKER_TAGS }} | - | PAX | ${{ needs.build-pax.outputs.DOCKER_TAGS }} | - EOF - - # | ROSETTA(t5x) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | - # | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 40324de0d..357c462c0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -77,6 +77,32 @@ jobs: SRC_PRAXIS: ${{ inputs.SRC_PRAXIS || "https://github.com/google/praxis.git#main"}} secrets: inherit + build-summary: + needs: [amd64, arm64] + if: always() + runs-on: ubuntu-22.04 + steps: + - name: Generate job summary for container build + shell: bash -x -e {0} + run: | + cat > $GITHUB_STEP_SUMMARY << EOF + # Images created + + | Image | Link | + | ------------ | -------------------------------------------------- | + | Base | ${{ needs.amd64.outputs.TAG_BASE }} | + | | ${{ needs.arm64.outputs.TAG_BASE }} | + | JAX | ${{ needs.amd64.outputs.TAG_JAX }} | + | | ${{ needs.arm64.outputs.TAG_JAX }} | + | T5X | ${{ needs.amd64.outputs.TAG_T5X }} | + | | ${{ needs.arm64.outputs.TAG_T5X }} | + | PAX | ${{ needs.amd64.outputs.TAG_PAX }} | + | | ${{ needs.arm64.outputs.TAG_PAX }} | + EOF + + # | ROSETTA(t5x) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | + # | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | + finalize: if: always() # TODO: use dynamic matrix to make dependencies self-updating From 94115baa6eca9117e77dd470f520b31d87d2c2de Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 08:50:44 +0000 Subject: [PATCH 057/146] refactor CI --- 
.github/workflows/_finalize.yaml | 88 ++++++++++++++++++++++++++-- .github/workflows/_summary.yaml | 28 --------- .github/workflows/_upload_badge.yaml | 77 ------------------------ 3 files changed, 84 insertions(+), 109 deletions(-) delete mode 100644 .github/workflows/_summary.yaml delete mode 100644 .github/workflows/_upload_badge.yaml diff --git a/.github/workflows/_finalize.yaml b/.github/workflows/_finalize.yaml index b77e93074..eacd212f2 100644 --- a/.github/workflows/_finalize.yaml +++ b/.github/workflows/_finalize.yaml @@ -11,12 +11,92 @@ on: jobs: upload-badge: - uses: ./.github/workflows/_upload_badge.yaml - secrets: inherit + runs-on: ubuntu-22.04 + env: + # Name/bash regex for shields.io endpoint JSON files + BADGE_FILES: '*badge*.json' + outputs: + GIST_ID: ${{ steps.extract-id.outputs.GIST_ID }} + steps: + - name: Download artifacts specified by input + uses: actions/download-artifact@v3 + + - name: Collect all badge files to temporary folder + id: collect + shell: bash -x -e {0} + run: | + workdir=$(mktemp -d) + find -name "${BADGE_FILES}" | while read -s f; do + cp "$f" $workdir + done + echo "WORKDIR=$workdir" >> $GITHUB_OUTPUT + + - name: Upload badge files to gist + id: upload + uses: actions/github-script@v6 + with: + github-token: ${{ secrets.NVJAX_GIST_TOKEN }} + script: | + const currentDateTime = new Date().toISOString(); + const gistDescription = + `Badge endpoint files from Workflow: ${{ github.workflow }}, ` + + `Run ID: ${{ github.run_id }}, ` + + `Repository: ${{ github.repository }}, ` + + `Event: ${{ github.event_name }}, ` + + `Created: ${currentDateTime}`; + + const fs = require('fs').promises; + const workdir = '${{ steps.collect.outputs.WORKDIR }}' + const files = await fs.readdir(workdir); + + gist = await github.rest.gists.create({ + description: gistDescription, + public: false, + files: Object.fromEntries( + await Promise.all( + files.map( + async filename => { + const content = await fs.readFile(`${workdir}/${filename}`, 
'utf8'); + return [filename, { content }]; + } + ) + ) + ) + }); + + console.log(gist) + + return gist.data.id; + + - name: Return Gist ID + id: extract-id + shell: bash -x -e {0} + run: | + GIST_ID="${{ steps.upload.outputs.result }}" + echo "GIST_ID=${GIST_ID//\"/}" >> $GITHUB_OUTPUT report: - uses: ./.github/workflows/_summary.yaml - secrets: inherit + runs-on: ubuntu-22.04 + steps: + - name: Download artifacts + uses: actions/download-artifact@v3 + + - name: Write output to step summary + shell: bash -x -e {0} + run: | + find -name "sitrep.json" | while read -s f; do + cat "$f" | jq -r '.summary' | tee -a $GITHUB_STEP_SUMMARY + done + + # - name: Concatenate all sitreps + # shell: bash -x -e {0} + # run: | + # # combine all sitreps files into a single file, where each sitrep json sits + # # in a field named by the folder that contained it + + # find -name "sitrep.json" | while read -s f; do + # echo "$(dirname $f): $(cat $f)," >> + # done publish-badge: needs: [upload-badge] diff --git a/.github/workflows/_summary.yaml b/.github/workflows/_summary.yaml deleted file mode 100644 index 645c105d1..000000000 --- a/.github/workflows/_summary.yaml +++ /dev/null @@ -1,28 +0,0 @@ -name: ~create summary for an entire workflow run - -on: - workflow_call: - -jobs: - action: - runs-on: ubuntu-22.04 - steps: - - name: Download artifacts - uses: actions/download-artifact@v3 - - - name: Write output to step summary - shell: bash -x -e {0} - run: | - find -name "sitrep.json" | while read -s f; do - cat "$f" | jq -r '.summary' | tee -a $GITHUB_STEP_SUMMARY - done - - # - name: Concatenate all sitreps - # shell: bash -x -e {0} - # run: | - # # combine all sitreps files into a single file, where each sitrep json sits - # # in a field named by the folder that contained it - - # find -name "sitrep.json" | while read -s f; do - # echo "$(dirname $f): $(cat $f)," >> - # done diff --git a/.github/workflows/_upload_badge.yaml b/.github/workflows/_upload_badge.yaml deleted file mode 
100644 index 936bdcae7..000000000 --- a/.github/workflows/_upload_badge.yaml +++ /dev/null @@ -1,77 +0,0 @@ -name: ~upload shields.io endpoint json files as a GitHub Gist - -on: - workflow_call: - inputs: - BADGE_FILES: - type: string - description: 'Name/bash regex for shields.io endpoint JSON files' - default: '*badge*.json' - required: false - outputs: - GIST_ID: - description: 'Id of the created Gist' - value: ${{ jobs.action.outputs.GIST_ID }} - -jobs: - action: - runs-on: ubuntu-22.04 - outputs: - GIST_ID: ${{ steps.extract-id.outputs.GIST_ID }} - steps: - - name: Download artifacts specified by input - uses: actions/download-artifact@v3 - - - name: Collect all badge files to temporary folder - id: collect - shell: bash -x -e {0} - run: | - workdir=$(mktemp -d) - find -name "${{ inputs.BADGE_FILES }}" | while read -s f; do - cp "$f" $workdir - done - echo "WORKDIR=$workdir" >> $GITHUB_OUTPUT - - - name: Upload badge files to gist - id: upload - uses: actions/github-script@v6 - with: - github-token: ${{ secrets.NVJAX_GIST_TOKEN }} - script: | - const currentDateTime = new Date().toISOString(); - const gistDescription = - `Badge endpoint files from Workflow: ${{ github.workflow }}, ` + - `Run ID: ${{ github.run_id }}, ` + - `Repository: ${{ github.repository }}, ` + - `Event: ${{ github.event_name }}, ` + - `Created: ${currentDateTime}`; - - const fs = require('fs').promises; - const workdir = '${{ steps.collect.outputs.WORKDIR }}' - const files = await fs.readdir(workdir); - - gist = await github.rest.gists.create({ - description: gistDescription, - public: false, - files: Object.fromEntries( - await Promise.all( - files.map( - async filename => { - const content = await fs.readFile(`${workdir}/${filename}`, 'utf8'); - return [filename, { content }]; - } - ) - ) - ) - }); - - console.log(gist) - - return gist.data.id; - - - name: Return Gist ID - id: extract-id - shell: bash -x -e {0} - run: | - GIST_ID="${{ steps.upload.outputs.result }}" - echo 
"GIST_ID=${GIST_ID//\"/}" >> $GITHUB_OUTPUT From ee11851b53c6de837c3f3b98361f418e7baec4f1 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 08:51:51 +0000 Subject: [PATCH 058/146] refactor CI --- .github/workflows/ci.yaml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 357c462c0..a95481817 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -57,24 +57,24 @@ jobs: uses: ./.github/workflows/_ci_amd64.yaml with: CUDA_IMAGE: ${{ inputs.CUDA_IMAGE || 'latest' }} - SRC_JAX: ${{ inputs.SRC_JAX || "https://github.com/google/jax.git#main" }} - SRC_XLA: ${{ inputs.SRC_XLA || "https://github.com/openxla/xla.git#main"}} - SRC_TE: ${{ inputs.SRC_TE || "https://github.com/NVIDIA/TransformerEngine.git#main"}} - SRC_T5X: ${{ inputs.SRC_T5X || "https://github.com/google-research/t5x.git#main"}} - SRC_PAXML: ${{ inputs.SRC_PAXML || "https://github.com/google/paxml.git#main"}} - SRC_PRAXIS: ${{ inputs.SRC_PRAXIS || "https://github.com/google/praxis.git#main"}} + SRC_JAX: ${{ inputs.SRC_JAX || 'https://github.com/google/jax.git#main' }} + SRC_XLA: ${{ inputs.SRC_XLA || 'https://github.com/openxla/xla.git#main'}} + SRC_TE: ${{ inputs.SRC_TE || 'https://github.com/NVIDIA/TransformerEngine.git#main'}} + SRC_T5X: ${{ inputs.SRC_T5X || 'https://github.com/google-research/t5x.git#main'}} + SRC_PAXML: ${{ inputs.SRC_PAXML || 'https://github.com/google/paxml.git#main'}} + SRC_PRAXIS: ${{ inputs.SRC_PRAXIS || 'https://github.com/google/praxis.git#main'}} secrets: inherit arm64: uses: ./.github/workflows/_ci_arm64.yaml with: CUDA_IMAGE: ${{ inputs.CUDA_IMAGE || 'latest' }} - SRC_JAX: ${{ inputs.SRC_JAX || "https://github.com/google/jax.git#main" }} - SRC_XLA: ${{ inputs.SRC_XLA || "https://github.com/openxla/xla.git#main"}} - SRC_TE: ${{ inputs.SRC_TE || "https://github.com/NVIDIA/TransformerEngine.git#main"}} - SRC_T5X: ${{ inputs.SRC_T5X || 
"https://github.com/google-research/t5x.git#main"}} - SRC_PAXML: ${{ inputs.SRC_PAXML || "https://github.com/google/paxml.git#main"}} - SRC_PRAXIS: ${{ inputs.SRC_PRAXIS || "https://github.com/google/praxis.git#main"}} + SRC_JAX: ${{ inputs.SRC_JAX || 'https://github.com/google/jax.git#main' }} + SRC_XLA: ${{ inputs.SRC_XLA || 'https://github.com/openxla/xla.git#main'}} + SRC_TE: ${{ inputs.SRC_TE || 'https://github.com/NVIDIA/TransformerEngine.git#main'}} + SRC_T5X: ${{ inputs.SRC_T5X || 'https://github.com/google-research/t5x.git#main'}} + SRC_PAXML: ${{ inputs.SRC_PAXML || 'https://github.com/google/paxml.git#main'}} + SRC_PRAXIS: ${{ inputs.SRC_PRAXIS || 'https://github.com/google/praxis.git#main'}} secrets: inherit build-summary: From 6b6fc928a974f7904f695860b37c7e874327ac3a Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 08:52:41 +0000 Subject: [PATCH 059/146] refactor CI --- .github/workflows/ci.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a95481817..65ba92b7c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -104,9 +104,8 @@ jobs: # | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | finalize: + needs: [amd64, arm64] if: always() - # TODO: use dynamic matrix to make dependencies self-updating - needs: [build-summary, test-distribution, test-jax, test-te, test-t5x, test-pax] uses: ./.github/workflows/_finalize.yaml with: PUBLISH_BADGE: false From cf66cfdbb58ff94819370ad0bf8c461b02e0a222 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 08:55:47 +0000 Subject: [PATCH 060/146] refactor CI --- .github/workflows/_finalize.yaml | 24 ++++++++++++++++++++++++ .github/workflows/ci.yaml | 26 -------------------------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/.github/workflows/_finalize.yaml b/.github/workflows/_finalize.yaml index eacd212f2..5844af63c 100644 --- 
a/.github/workflows/_finalize.yaml +++ b/.github/workflows/_finalize.yaml @@ -10,6 +10,30 @@ on: required: false jobs: + # show-containers: + # runs-on: ubuntu-22.04 + # steps: + # - name: Generate job summary for container build + # shell: bash -x -e {0} + # run: | + # cat > $GITHUB_STEP_SUMMARY << EOF + # # Images created + + # | Image | Link | + # | ------------ | -------------------------------------------------- | + # | Base | ${{ needs.amd64.outputs.TAG_BASE }} | + # | | ${{ needs.arm64.outputs.TAG_BASE }} | + # | JAX | ${{ needs.amd64.outputs.TAG_JAX }} | + # | | ${{ needs.arm64.outputs.TAG_JAX }} | + # | T5X | ${{ needs.amd64.outputs.TAG_T5X }} | + # | | ${{ needs.arm64.outputs.TAG_T5X }} | + # | PAX | ${{ needs.amd64.outputs.TAG_PAX }} | + # | | ${{ needs.arm64.outputs.TAG_PAX }} | + # EOF + + # # | ROSETTA(t5x) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | + # # | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | + upload-badge: runs-on: ubuntu-22.04 env: diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 65ba92b7c..30457c4a2 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -77,32 +77,6 @@ jobs: SRC_PRAXIS: ${{ inputs.SRC_PRAXIS || 'https://github.com/google/praxis.git#main'}} secrets: inherit - build-summary: - needs: [amd64, arm64] - if: always() - runs-on: ubuntu-22.04 - steps: - - name: Generate job summary for container build - shell: bash -x -e {0} - run: | - cat > $GITHUB_STEP_SUMMARY << EOF - # Images created - - | Image | Link | - | ------------ | -------------------------------------------------- | - | Base | ${{ needs.amd64.outputs.TAG_BASE }} | - | | ${{ needs.arm64.outputs.TAG_BASE }} | - | JAX | ${{ needs.amd64.outputs.TAG_JAX }} | - | | ${{ needs.arm64.outputs.TAG_JAX }} | - | T5X | ${{ needs.amd64.outputs.TAG_T5X }} | - | | ${{ needs.arm64.outputs.TAG_T5X }} | - | PAX | ${{ needs.amd64.outputs.TAG_PAX }} | - | | ${{ needs.arm64.outputs.TAG_PAX }} | - EOF - - # | 
ROSETTA(t5x) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | - # | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | - finalize: needs: [amd64, arm64] if: always() From 9fd45032cb12a9ff0bd1b033ce6226368c6659e3 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 08:58:52 +0000 Subject: [PATCH 061/146] refactor CI --- .github/workflows/_build_jax.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/_build_jax.yaml b/.github/workflows/_build_jax.yaml index 7b457f445..e395f6633 100644 --- a/.github/workflows/_build_jax.yaml +++ b/.github/workflows/_build_jax.yaml @@ -37,6 +37,16 @@ on: description: Git commit, tag, or branch for XLA required: false default: main + REPO_TE: + type: string + description: URL of transformer engine repository to check out + required: false + default: "https://github.com/openxla/xla.git" + REF_TE: + type: string + description: Git commit, tag, or branch for XLA + required: false + default: main ARTIFACT_NAME: type: string description: 'Name of the artifact zip file' @@ -140,8 +150,10 @@ jobs: BUILD_DATE=${{ inputs.BUILD_DATE }} REPO_JAX=${{ inputs.REPO_JAX }} REPO_XLA=${{ inputs.REPO_XLA }} + REPO_TE=${{ inputs.REPO_TE }} REF_JAX=${{ inputs.REF_JAX }} REF_XLA=${{ inputs.REF_XLA }} + REF_TE=${{ inputs.REF_TE }} - name: Set docker metadata - final id: final-metadata From f2e80a1344309777ca67a6a7524a0faf296cc263 Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Thu, 9 Nov 2023 01:11:19 -0800 Subject: [PATCH 062/146] file permission --- .github/workflows/scripts/parse_git_src.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 .github/workflows/scripts/parse_git_src.sh diff --git a/.github/workflows/scripts/parse_git_src.sh b/.github/workflows/scripts/parse_git_src.sh old mode 100644 new mode 100755 From 7db2f84f1c3e2a687a011389a230163a7832a0a8 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 09:15:23 +0000 
Subject: [PATCH 063/146] refactor CI --- .github/workflows/_ci_amd64.yaml | 3 +++ .github/workflows/_ci_arm64.yaml | 3 +++ 2 files changed, 6 insertions(+) diff --git a/.github/workflows/_ci_amd64.yaml b/.github/workflows/_ci_amd64.yaml index 8b985dcf6..1da5f3508 100644 --- a/.github/workflows/_ci_amd64.yaml +++ b/.github/workflows/_ci_amd64.yaml @@ -62,6 +62,9 @@ jobs: REPO_PRAXIS: ${{ steps.parse-inputs.outputs.REPO_PRAXIS }} REF_PRAXIS: ${{ steps.parse-inputs.outputs.REF_PRAXIS }} steps: + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v3 + - name: Set build date id: date shell: bash -x -e {0} diff --git a/.github/workflows/_ci_arm64.yaml b/.github/workflows/_ci_arm64.yaml index 1bc17e956..a686dc4f0 100644 --- a/.github/workflows/_ci_arm64.yaml +++ b/.github/workflows/_ci_arm64.yaml @@ -62,6 +62,9 @@ jobs: REPO_PRAXIS: ${{ steps.parse-inputs.outputs.REPO_PRAXIS }} REF_PRAXIS: ${{ steps.parse-inputs.outputs.REF_PRAXIS }} steps: + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v3 + - name: Set build date id: date shell: bash -x -e {0} From 7090349501211344d7065bd97b5ddbcb7605b31c Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 09:19:09 +0000 Subject: [PATCH 064/146] refactor CI --- .github/workflows/_build_base.yaml | 4 +- .github/workflows/_build_jax.yaml | 6 +- .github/workflows/_build_pax.yaml | 6 +- .github/workflows/_build_t5x.yaml | 6 +- .github/workflows/_build_te.yaml | 90 ------------------------------ 5 files changed, 11 insertions(+), 101 deletions(-) delete mode 100644 .github/workflows/_build_te.yaml diff --git a/.github/workflows/_build_base.yaml b/.github/workflows/_build_base.yaml index a01655973..db5a2bb01 100644 --- a/.github/workflows/_build_base.yaml +++ b/.github/workflows/_build_base.yaml @@ -30,7 +30,7 @@ on: outputs: DOCKER_TAG: description: "Tag of the image built" - value: ${{ jobs.build.outputs.DOCKER_TAG }} + value: ${{ 
jobs.build-base.outputs.DOCKER_TAG }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -42,7 +42,7 @@ permissions: jobs: - build: + build-base: runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", small] env: BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json diff --git a/.github/workflows/_build_jax.yaml b/.github/workflows/_build_jax.yaml index e395f6633..8d36374c5 100644 --- a/.github/workflows/_build_jax.yaml +++ b/.github/workflows/_build_jax.yaml @@ -60,10 +60,10 @@ on: outputs: DOCKER_TAG_FINAL: description: "Tags of the complete image built" - value: ${{ jobs.build.outputs.DOCKER_TAG_FINAL }} + value: ${{ jobs.build-jax.outputs.DOCKER_TAG_FINAL }} DOCKER_TAG_STAGING: description: "Tags of the 'staging' image built" - value: ${{ jobs.build.outputs.DOCKER_TAG_STAGING }} + value: ${{ jobs.build-jax.outputs.DOCKER_TAG_STAGING }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -75,7 +75,7 @@ permissions: jobs: - build: + build-jax: runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", large] env: BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json diff --git a/.github/workflows/_build_pax.yaml b/.github/workflows/_build_pax.yaml index 27b1c2e6c..e97c92f39 100644 --- a/.github/workflows/_build_pax.yaml +++ b/.github/workflows/_build_pax.yaml @@ -50,10 +50,10 @@ on: outputs: DOCKER_TAG_FINAL: description: "Tags of the complete image built" - value: ${{ jobs.build.outputs.DOCKER_TAG_FINAL }} + value: ${{ jobs.build-pax.outputs.DOCKER_TAG_FINAL }} DOCKER_TAG_STAGING: description: "Tags of the 'staging' image built" - value: ${{ jobs.build.outputs.DOCKER_TAG_STAGING }} + value: ${{ jobs.build-pax.outputs.DOCKER_TAG_STAGING }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -65,7 +65,7 @@ permissions: jobs: - build: + build-pax: runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", small] env: BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json diff --git 
a/.github/workflows/_build_t5x.yaml b/.github/workflows/_build_t5x.yaml index e65bfc847..a2202567a 100644 --- a/.github/workflows/_build_t5x.yaml +++ b/.github/workflows/_build_t5x.yaml @@ -50,10 +50,10 @@ on: outputs: DOCKER_TAG_FINAL: description: "Tags of the complete image built" - value: ${{ jobs.build.outputs.DOCKER_TAG_FINAL }} + value: ${{ jobs.build-t5x.outputs.DOCKER_TAG_FINAL }} DOCKER_TAG_STAGING: description: "Tags of the 'staging' image built" - value: ${{ jobs.build.outputs.DOCKER_TAG_STAGING }} + value: ${{ jobs.build-t5x.outputs.DOCKER_TAG_STAGING }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -65,7 +65,7 @@ permissions: jobs: - build: + build-t5x: runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", small] env: BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json diff --git a/.github/workflows/_build_te.yaml b/.github/workflows/_build_te.yaml deleted file mode 100644 index d80f334ef..000000000 --- a/.github/workflows/_build_te.yaml +++ /dev/null @@ -1,90 +0,0 @@ -name: ~build Transformer Engine container - -on: - workflow_call: - inputs: - BASE_IMAGE: - type: string - description: 'Base docker image that provides JAX' - required: false - default: ghcr.io/nvidia/jax:latest - BUILD_DATE: - type: string - description: "Build date in YYYY-MM-DD format" - required: false - default: 'NOT SPECIFIED' - REPO_TE: - type: string - description: URL of TE repository to check out - required: false - default: "https://github.com/NVIDIA/TransformerEngine.git" - REF_TE: - type: string - description: Git commit, tag, or branch for TE - required: false - default: main - outputs: - DOCKER_TAGS: - description: "Tags of the image built" - value: ${{ jobs.build.outputs.DOCKER_TAGS }} - -env: - UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container - -jobs: - - build: - outputs: - DOCKER_TAGS: ${{ 
steps.meta.outputs.tags }} - runs-on: [self-hosted, x86, small] - steps: - - name: Print environment variables - run: env - - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v3 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set docker metadata - id: meta - uses: docker/metadata-action@v4 - with: - images: | - ${{ env.UPLD_IMAGE }} - flavor: | - latest=false - tags: | - type=raw,value=${{ github.run_id }}-te - labels: - org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - with: - driver-opts: | - image=moby/buildkit:v0.10.6 - - - name: Build docker images - uses: docker/build-push-action@v4 - with: - context: .github/container - push: true - file: .github/container/Dockerfile.te - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - build-args: | - BASE_IMAGE=${{ inputs.BASE_IMAGE }} - BUILD_DATE=${{ inputs.BUILD_DATE }} - REPO_TE=${{ inputs.REPO_TE }} - REF_TE=${{ inputs.REF_TE }} \ No newline at end of file From 6900d864c125ff1d3e9197daf2225385ad47057b Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 09:22:01 +0000 Subject: [PATCH 065/146] refactor CI --- .github/workflows/_test_jax.yaml | 2 +- .github/workflows/_test_pax.yaml | 8 ++++---- .github/workflows/_test_t5x.yaml | 10 +++++----- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/_test_jax.yaml b/.github/workflows/_test_jax.yaml index c7e26eec1..02f4de9a0 100644 --- a/.github/workflows/_test_jax.yaml +++ b/.github/workflows/_test_jax.yaml @@ -28,7 +28,7 @@ jobs: TIME: "01:00:00" secrets: inherit - unit-test: + jax-unit-test: strategy: fail-fast: false matrix: diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index 
b70e3c87a..80307f82b 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -1,4 +1,4 @@ -name: ~test Pax, MGMN +name: ~test Pax, multi-node on: workflow_call: @@ -20,7 +20,7 @@ on: jobs: - multi-gpu-multi-node: + pax-multi-node: strategy: matrix: PARALLEL_CONFIG: @@ -158,7 +158,7 @@ jobs: path: output/* metrics: - needs: multi-gpu-multi-node + needs: pax-multi-node runs-on: ubuntu-22.04 steps: @@ -196,7 +196,7 @@ jobs: publish-test: - needs: [multi-gpu-multi-node, metrics] + needs: [pax-multi-node, metrics] uses: ./.github/workflows/_publish_badge.yaml if: ( always() ) secrets: inherit diff --git a/.github/workflows/_test_t5x.yaml b/.github/workflows/_test_t5x.yaml index 4a4f9ab21..cb9d7d6b9 100644 --- a/.github/workflows/_test_t5x.yaml +++ b/.github/workflows/_test_t5x.yaml @@ -1,4 +1,4 @@ -name: ~test T5X, MGMN +name: ~test T5X, multi-node on: workflow_call: @@ -25,7 +25,7 @@ on: jobs: - single-process-multi-device: + t5x-multi-gpu: strategy: matrix: N_GPU: [1, 2, 4, 8] @@ -144,7 +144,7 @@ jobs: name: ${{ steps.meta.outputs.JOB_NAME }} path: output/* - multi-gpu-multi-node: + t5x-multi-node: strategy: matrix: N_GPU: [1, 2, 4, 8] @@ -269,7 +269,7 @@ jobs: path: output/* metrics: - needs: [multi-gpu-multi-node, single-process-multi-device] + needs: [t5x-multi-node, t5x-multi-gpu] runs-on: ubuntu-22.04 steps: @@ -307,7 +307,7 @@ jobs: publish-test: - needs: [multi-gpu-multi-node, single-process-multi-device, metrics] + needs: [t5x-multi-node, t5x-multi-gpu, metrics] uses: ./.github/workflows/_publish_badge.yaml if: ( always() ) secrets: inherit From 06327c1c932b9d55fd3337e7801def9289c54891 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 09:24:22 +0000 Subject: [PATCH 066/146] refactor CI --- .github/workflows/_copy_gist.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_copy_gist.yaml b/.github/workflows/_copy_gist.yaml index 492d9fd70..fa2c09845 100644 --- 
a/.github/workflows/_copy_gist.yaml +++ b/.github/workflows/_copy_gist.yaml @@ -18,7 +18,7 @@ on: default: '.*' jobs: - action: + copy-gist: runs-on: ubuntu-22.04 steps: - name: copy badge to primary Gist From 618a3f59cc0df1e65f6ccf768a00162e888e0274 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 09:25:30 +0000 Subject: [PATCH 067/146] refactor CI --- .github/workflows/_runner_ondemand_slurm.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_runner_ondemand_slurm.yaml b/.github/workflows/_runner_ondemand_slurm.yaml index 507a67139..1c676310a 100644 --- a/.github/workflows/_runner_ondemand_slurm.yaml +++ b/.github/workflows/_runner_ondemand_slurm.yaml @@ -19,7 +19,7 @@ on: jobs: - launch: + launch-slurm-runner: runs-on: ubuntu-latest steps: - name: Print environment variables From c659d3ceabf63c913b00d585d12c33dad67ec783 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 09:41:52 +0000 Subject: [PATCH 068/146] refactor CI --- .github/workflows/_ci_amd64.yaml | 2 +- .github/workflows/_ci_arm64.yaml | 2 +- .github/workflows/nightly-jax-build.yaml | 64 ++++++++++++++++++++---- 3 files changed, 55 insertions(+), 13 deletions(-) diff --git a/.github/workflows/_ci_amd64.yaml b/.github/workflows/_ci_amd64.yaml index 1da5f3508..4e2923e4f 100644 --- a/.github/workflows/_ci_amd64.yaml +++ b/.github/workflows/_ci_amd64.yaml @@ -101,7 +101,7 @@ jobs: with: ARCHITECTURE: amd64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAGS }} + BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} REPO_JAX: ${{ needs.metadata.outputs.REPO_JAX }} REF_JAX: ${{ needs.metadata.outputs.REF_JAX }} REPO_XLA: ${{ needs.metadata.outputs.REPO_XLA }} diff --git a/.github/workflows/_ci_arm64.yaml b/.github/workflows/_ci_arm64.yaml index a686dc4f0..1515bd298 100644 --- a/.github/workflows/_ci_arm64.yaml +++ b/.github/workflows/_ci_arm64.yaml @@ -101,7 +101,7 
@@ jobs: with: ARCHITECTURE: arm64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAGS }} + BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} REPO_JAX: ${{ needs.metadata.outputs.REPO_JAX }} REF_JAX: ${{ needs.metadata.outputs.REF_JAX }} REPO_XLA: ${{ needs.metadata.outputs.REPO_XLA }} diff --git a/.github/workflows/nightly-jax-build.yaml b/.github/workflows/nightly-jax-build.yaml index fb39e8650..c03ebbaa1 100644 --- a/.github/workflows/nightly-jax-build.yaml +++ b/.github/workflows/nightly-jax-build.yaml @@ -37,28 +37,70 @@ jobs: run: | echo "PUBLISH=${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) }}" >> $GITHUB_OUTPUT - build: + build-amd64: needs: metadata uses: ./.github/workflows/_build_jax.yaml with: + ARCHITECTURE: amd64 + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + secrets: inherit + + build-arm64: + needs: metadata + uses: ./.github/workflows/_build_jax.yaml + with: + ARCHITECTURE: arm64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} secrets: inherit publish: if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build] - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} - TARGET_IMAGE: jax - TARGET_TAGS: | - type=raw,value=latest,priority=1000 - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 + runs-on: ubuntu-latest + needs: [metadata, build-amd64, build-arm64] + env: + UPLD_IMAGE: 'ghcr.io/nvidia/jax' + steps: + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set docker metadata + id: meta + uses: docker/metadata-action@v4 + with: + images: | + ${{ env.UPLD_IMAGE }} + flavor: | + latest=false + tags: | + 
type=raw,value=latest,priority=1000 + type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 + labels: + org.opencontainers.image.created=${{ needs.metadata.outputs.BUILD_DATE }} + + - name: Combine images into a single multi-arch image + shell: bash -x -e {0} + run: | + for tag in ${{ steps.meta.outputs.tags }}; do + docker manifest create ${tag} $( + for IMAGE in ${{ needs.build-amd64.outputs.DOCKER_TAG_FINAL }} ${{ needs.build-arm64.outputs.DOCKER_TAG_FINAL }}; do + REPO=$(echo $IMAGE | cut -d: -f1) + DIGEST=$( + docker manifest inspect $IMAGE |\ + jq -r '.manifests[] | select(.platform.os == "linux") | .digest' + ) + echo $REPO@${DIGEST} + done + ) + docker manifest push ${tag} + done finalize: if: always() - needs: [metadata, build] + needs: [metadata, build-amd64, build-arm64] uses: ./.github/workflows/_finalize.yaml with: PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }} From 8395604c32383c73658a02a401ec1a246a06dbbb Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 09:43:22 +0000 Subject: [PATCH 069/146] refactor CI --- .github/workflows/nightly-jax-build.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/nightly-jax-build.yaml b/.github/workflows/nightly-jax-build.yaml index c03ebbaa1..623f5707e 100644 --- a/.github/workflows/nightly-jax-build.yaml +++ b/.github/workflows/nightly-jax-build.yaml @@ -37,7 +37,7 @@ jobs: run: | echo "PUBLISH=${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) }}" >> $GITHUB_OUTPUT - build-amd64: + amd64: needs: metadata uses: ./.github/workflows/_build_jax.yaml with: @@ -45,7 +45,7 @@ jobs: BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} secrets: inherit - build-arm64: + arm64: needs: metadata uses: ./.github/workflows/_build_jax.yaml with: @@ -56,7 +56,7 @@ jobs: publish: if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && 
inputs.PUBLISH) runs-on: ubuntu-latest - needs: [metadata, build-amd64, build-arm64] + needs: [metadata, amd64, arm64] env: UPLD_IMAGE: 'ghcr.io/nvidia/jax' steps: @@ -86,7 +86,7 @@ jobs: run: | for tag in ${{ steps.meta.outputs.tags }}; do docker manifest create ${tag} $( - for IMAGE in ${{ needs.build-amd64.outputs.DOCKER_TAG_FINAL }} ${{ needs.build-arm64.outputs.DOCKER_TAG_FINAL }}; do + for IMAGE in ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }}; do REPO=$(echo $IMAGE | cut -d: -f1) DIGEST=$( docker manifest inspect $IMAGE |\ @@ -100,7 +100,7 @@ jobs: finalize: if: always() - needs: [metadata, build-amd64, build-arm64] + needs: [metadata, amd64, arm64] uses: ./.github/workflows/_finalize.yaml with: PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }} From 2ab0cc9cef944c6d381c2f1b7e19691d15ec48dd Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 09:47:30 +0000 Subject: [PATCH 070/146] refactor CI --- .github/workflows/nightly-jax-build.yaml | 8 +-- .github/workflows/weekly-base-build.yaml | 68 ++++++++++++++++++++---- 2 files changed, 63 insertions(+), 13 deletions(-) diff --git a/.github/workflows/nightly-jax-build.yaml b/.github/workflows/nightly-jax-build.yaml index 623f5707e..f50707358 100644 --- a/.github/workflows/nightly-jax-build.yaml +++ b/.github/workflows/nightly-jax-build.yaml @@ -16,6 +16,10 @@ permissions: actions: write # to cancel previous workflows packages: write # to upload container +env: + DOCKER_REGISTRY: ghcr.io/nvidia + DOCKER_IMAGE: jax + jobs: metadata: @@ -57,8 +61,6 @@ jobs: if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) runs-on: ubuntu-latest needs: [metadata, amd64, arm64] - env: - UPLD_IMAGE: 'ghcr.io/nvidia/jax' steps: - name: Login to GitHub Container Registry uses: docker/login-action@v2 @@ -72,7 +74,7 @@ jobs: uses: docker/metadata-action@v4 with: images: | - ${{ env.UPLD_IMAGE }} + ${{ 
env.DOCKER_REGISTRY }}/${{ env.DOCKER_IMAGE }} flavor: | latest=false tags: | diff --git a/.github/workflows/weekly-base-build.yaml b/.github/workflows/weekly-base-build.yaml index fad8d74f4..877d5fb95 100644 --- a/.github/workflows/weekly-base-build.yaml +++ b/.github/workflows/weekly-base-build.yaml @@ -12,8 +12,8 @@ on: required: false env: - TARGET: jax-toolbox DOCKER_REGISTRY: ghcr.io/nvidia + DOCKER_IMAGE: jax-toolbox permissions: contents: read # to fetch code @@ -34,21 +34,69 @@ jobs: BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - build: + amd64: needs: metadata uses: ./.github/workflows/_build_base.yaml with: + ARCHITECTURE: amd64 + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + secrets: inherit + + arm64: + needs: metadata + uses: ./.github/workflows/_build_base.yaml + with: + ARCHITECTURE: arm64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} secrets: inherit publish: if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build] - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit + runs-on: ubuntu-latest + needs: [metadata, amd64, arm64] + steps: + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set docker metadata + id: meta + uses: docker/metadata-action@v4 + with: + images: | + ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_IMAGE }} + flavor: | + latest=false + tags: | + type=raw,value=base,priority=1000 + type=raw,value=base-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 + labels: + org.opencontainers.image.created=${{ needs.metadata.outputs.BUILD_DATE }} + + - name: Combine images into a single multi-arch image + shell: bash -x -e {0} + run: | + for tag in ${{ steps.meta.outputs.tags }}; do + docker manifest create ${tag} $( + for IMAGE in ${{ 
needs.amd64.outputs.DOCKER_TAG_FINAL }} ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }}; do + REPO=$(echo $IMAGE | cut -d: -f1) + DIGEST=$( + docker manifest inspect $IMAGE |\ + jq -r '.manifests[] | select(.platform.os == "linux") | .digest' + ) + echo $REPO@${DIGEST} + done + ) + docker manifest push ${tag} + done + + finalize: + if: always() + needs: [metadata, amd64, arm64] + uses: ./.github/workflows/_finalize.yaml with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} - TARGET_IMAGE: jax-toolbox - TARGET_TAGS: | - type=raw,value=base,priority=1000 - type=raw,value=base-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 \ No newline at end of file + PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }} + secrets: inherit From 69c17fc2079208826045af9c81d5a955aa0ba67b Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 09:48:29 +0000 Subject: [PATCH 071/146] refactor CI --- .github/workflows/_sandbox.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 49b470eca..c5cba7537 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -1,7 +1,8 @@ name: "~Sandbox" on: - push: + workflow_dispatch: + # push: permissions: contents: read # to fetch code From d9400d332551f94c97dfa74510d1390f9820ead8 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 09:56:45 +0000 Subject: [PATCH 072/146] refactor CI --- .github/workflows/weekly-base-build.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/weekly-base-build.yaml b/.github/workflows/weekly-base-build.yaml index 877d5fb95..5dd643dd0 100644 --- a/.github/workflows/weekly-base-build.yaml +++ b/.github/workflows/weekly-base-build.yaml @@ -79,9 +79,9 @@ jobs: - name: Combine images into a single multi-arch image shell: bash -x -e {0} run: | - for tag in ${{ steps.meta.outputs.tags }}; do + for tag in $(echo 
"${{ steps.meta.outputs.tags }}"); do docker manifest create ${tag} $( - for IMAGE in ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }}; do + for IMAGE in ${{ needs.amd64.outputs.DOCKER_TAG }} ${{ needs.arm64.outputs.DOCKER_TAG }}; do REPO=$(echo $IMAGE | cut -d: -f1) DIGEST=$( docker manifest inspect $IMAGE |\ From 33dc9ac1d353bd021b1f79df1dfcb728c1489095 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 10:10:14 +0000 Subject: [PATCH 073/146] refactor CI --- .github/workflows/nightly-jax-build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly-jax-build.yaml b/.github/workflows/nightly-jax-build.yaml index f50707358..18ef61fec 100644 --- a/.github/workflows/nightly-jax-build.yaml +++ b/.github/workflows/nightly-jax-build.yaml @@ -86,7 +86,7 @@ jobs: - name: Combine images into a single multi-arch image shell: bash -x -e {0} run: | - for tag in ${{ steps.meta.outputs.tags }}; do + for tag in $(echo "${{ steps.meta.outputs.tags }}"); do docker manifest create ${tag} $( for IMAGE in ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }}; do REPO=$(echo $IMAGE | cut -d: -f1) From 4fc18dc0c00b2eff06a4b2a20943e1f4645258db Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 14:28:56 +0000 Subject: [PATCH 074/146] fix output tag order --- .github/workflows/_build_jax.yaml | 10 +++++----- .github/workflows/_build_pax.yaml | 10 +++++----- .github/workflows/_build_t5x.yaml | 10 +++++----- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/_build_jax.yaml b/.github/workflows/_build_jax.yaml index 8d36374c5..3f72d3e22 100644 --- a/.github/workflows/_build_jax.yaml +++ b/.github/workflows/_build_jax.yaml @@ -58,12 +58,12 @@ on: required: false default: 'badge-jax-build' outputs: - DOCKER_TAG_FINAL: - description: "Tags of the complete image built" - value: ${{ 
jobs.build-jax.outputs.DOCKER_TAG_FINAL }} DOCKER_TAG_STAGING: description: "Tags of the 'staging' image built" value: ${{ jobs.build-jax.outputs.DOCKER_TAG_STAGING }} + DOCKER_TAG_FINAL: + description: "Tags of the complete image built" + value: ${{ jobs.build-jax.outputs.DOCKER_TAG_FINAL }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -80,8 +80,8 @@ jobs: env: BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json outputs: - DOCKER_TAG_FINAL: ${{ steps.staging-metadata.outputs.tags }} - DOCKER_TAG_STAGING: ${{ steps.final-metadata.outputs.tags }} + DOCKER_TAG_STAGING: ${{ steps.staging-metadata.outputs.tags }} + DOCKER_TAG_FINAL: ${{ steps.final-metadata.outputs.tags }} steps: - name: Print environment variables run: env diff --git a/.github/workflows/_build_pax.yaml b/.github/workflows/_build_pax.yaml index e97c92f39..489a194fb 100644 --- a/.github/workflows/_build_pax.yaml +++ b/.github/workflows/_build_pax.yaml @@ -48,12 +48,12 @@ on: required: false default: 'badge-pax-build' outputs: - DOCKER_TAG_FINAL: - description: "Tags of the complete image built" - value: ${{ jobs.build-pax.outputs.DOCKER_TAG_FINAL }} DOCKER_TAG_STAGING: description: "Tags of the 'staging' image built" value: ${{ jobs.build-pax.outputs.DOCKER_TAG_STAGING }} + DOCKER_TAG_FINAL: + description: "Tags of the complete image built" + value: ${{ jobs.build-pax.outputs.DOCKER_TAG_FINAL }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -70,8 +70,8 @@ jobs: env: BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json outputs: - DOCKER_TAG_FINAL: ${{ steps.staging-metadata.outputs.tags }} - DOCKER_TAG_STAGING: ${{ steps.final-metadata.outputs.tags }} + DOCKER_TAG_STAGING: ${{ steps.staging-metadata.outputs.tags }} + DOCKER_TAG_FINAL: ${{ steps.final-metadata.outputs.tags }} steps: - name: Print environment variables run: env diff --git a/.github/workflows/_build_t5x.yaml b/.github/workflows/_build_t5x.yaml index 
a2202567a..b761bf54d 100644 --- a/.github/workflows/_build_t5x.yaml +++ b/.github/workflows/_build_t5x.yaml @@ -48,12 +48,12 @@ on: required: false default: 'badge-t5x-build' outputs: - DOCKER_TAG_FINAL: - description: "Tags of the complete image built" - value: ${{ jobs.build-t5x.outputs.DOCKER_TAG_FINAL }} DOCKER_TAG_STAGING: description: "Tags of the 'staging' image built" value: ${{ jobs.build-t5x.outputs.DOCKER_TAG_STAGING }} + DOCKER_TAG_FINAL: + description: "Tags of the complete image built" + value: ${{ jobs.build-t5x.outputs.DOCKER_TAG_FINAL }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -70,8 +70,8 @@ jobs: env: BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json outputs: - DOCKER_TAG_FINAL: ${{ steps.staging-metadata.outputs.tags }} - DOCKER_TAG_STAGING: ${{ steps.final-metadata.outputs.tags }} + DOCKER_TAG_STAGING: ${{ steps.staging-metadata.outputs.tags }} + DOCKER_TAG_FINAL: ${{ steps.final-metadata.outputs.tags }} steps: - name: Print environment variables run: env From b806a5af8e3285f8e79c4f5d21ecbadee35586f9 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 15:11:17 +0000 Subject: [PATCH 075/146] t5x arm64 build not ready yet --- .github/workflows/_ci_arm64.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/_ci_arm64.yaml b/.github/workflows/_ci_arm64.yaml index 1515bd298..d1c1f25d5 100644 --- a/.github/workflows/_ci_arm64.yaml +++ b/.github/workflows/_ci_arm64.yaml @@ -110,16 +110,16 @@ jobs: REF_TE: ${{ needs.metadata.outputs.REF_TE }} secrets: inherit - build-t5x: - needs: [metadata, build-jax] - uses: ./.github/workflows/_build_t5x.yaml - with: - ARCHITECTURE: arm64 - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} - REPO_T5X: ${{ needs.metadata.outputs.REPO_T5X }} - REF_T5X: ${{ needs.metadata.outputs.REF_T5X }} - secrets: inherit + # build-t5x: + # 
needs: [metadata, build-jax] + # uses: ./.github/workflows/_build_t5x.yaml + # with: + # ARCHITECTURE: arm64 + # BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} + # REPO_T5X: ${{ needs.metadata.outputs.REPO_T5X }} + # REF_T5X: ${{ needs.metadata.outputs.REF_T5X }} + # secrets: inherit build-pax: needs: [metadata, build-jax] From 8391f743ae6ccda15f1719f69ea7d251bb501db9 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 15:28:42 +0000 Subject: [PATCH 076/146] nightly T5X build --- .github/workflows/nightly-t5x-build.yaml | 101 +++++++++++++++++++---- 1 file changed, 84 insertions(+), 17 deletions(-) diff --git a/.github/workflows/nightly-t5x-build.yaml b/.github/workflows/nightly-t5x-build.yaml index 2a6eda333..ea48f900c 100644 --- a/.github/workflows/nightly-t5x-build.yaml +++ b/.github/workflows/nightly-t5x-build.yaml @@ -18,14 +18,37 @@ permissions: actions: write # to cancel previous workflows packages: write # to upload container +env: + DOCKER_REGISTRY: ghcr.io/nvidia + DOCKER_IMAGE: upstream-t5x + jobs: metadata: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' runs-on: ubuntu-22.04 outputs: + PUBLISH: ${{ steps.if-publish.outputs.PUBLISH }} BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} steps: + - name: Check if the triggering workflow failed + id: if-upstream-failed + shell: bash -x -e {0} + run: | + echo "UPSTREAM_FAILED=${{ github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT + + - name: Cancel workflow if upstream workflow did not success + if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} + uses: styfle/cancel-workflow-action@0.12.0 + + - name: Determine if the resulting container should be 'published' + id: if-publish + shell: bash -x -e {0} + run: + # A container should be published if: + # 1) the workflow is triggered by 
workflow_dispatch and the PUBLISH input is true, or + # 2) the workflow is triggered by workflow_run (i.e., a nightly build) + echo "PUBLISH=${{ github.event_name == 'workflow_run' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) }}" >> $GITHUB_OUTPUT + - name: Set build date id: date shell: bash -x -e {0} @@ -33,28 +56,72 @@ jobs: BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - build: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + amd64: needs: metadata uses: ./.github/workflows/_build_t5x.yaml with: + ARCHITECTURE: amd64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} secrets: inherit - publish: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build] - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} - TARGET_IMAGE: upstream-t5x - TARGET_TAGS: | - type=raw,value=latest,priority=1000 - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 + arm64: + needs: metadata + runs-on: ubuntu-22.04 + outputs: + DOCKER_TAG_FINAL: '' + steps: + - name: Generate placeholder warning + shell: bash -x -e {0} + run: | + echo "WARNING: arm64 build is not yet supported" >> $GITHUB_OUTPUT - if-upstream-failed: + publish: + needs: [metadata, amd64, arm64] + if: ${{ needs.metadata.outputs.PUBLISH == 'true' }} runs-on: ubuntu-latest - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch' steps: - - run: echo 'Upstream workflow failed, aborting run' && exit 1 + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} 
+ password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set docker metadata + id: meta + uses: docker/metadata-action@v4 + with: + images: | + ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_IMAGE }} + flavor: | + latest=false + tags: | + type=raw,value=latest,priority=1000 + type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 + labels: + org.opencontainers.image.created=${{ needs.metadata.outputs.BUILD_DATE }} + + - name: Combine images into a single multi-arch image + shell: bash -x -e {0} + run: | + for tag in $(echo "${{ steps.meta.outputs.tags }}"); do + docker manifest create ${tag} $( + for IMAGE in ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }}; do + REPO=$(echo $IMAGE | cut -d: -f1) + DIGEST=$( + docker manifest inspect $IMAGE |\ + jq -r '.manifests[] | select(.platform.os == "linux") | .digest' + ) + echo $REPO@${DIGEST} + done + ) + docker manifest push ${tag} + done + + finalize: + if: always() + needs: [metadata, amd64, arm64] + uses: ./.github/workflows/_finalize.yaml + with: + PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }} + secrets: inherit From 984a19a892eba121560d6f4d9874d2e5c7de345b Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 15:42:12 +0000 Subject: [PATCH 077/146] nightly T5X build --- .github/workflows/nightly-t5x-build.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nightly-t5x-build.yaml b/.github/workflows/nightly-t5x-build.yaml index ea48f900c..e1150343d 100644 --- a/.github/workflows/nightly-t5x-build.yaml +++ b/.github/workflows/nightly-t5x-build.yaml @@ -36,9 +36,19 @@ jobs: run: | echo "UPSTREAM_FAILED=${{ github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT + # - name: Cancel workflow if upstream workflow did not success + # if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} + # uses: styfle/cancel-workflow-action@0.12.0 + - name: Cancel 
workflow if upstream workflow did not success if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} - uses: styfle/cancel-workflow-action@0.12.0 + run: | + # call the GitHub API to cancel the workflow + curl \ + -X POST \ + -H "Accept: application/vnd.github.v3+json" \ + -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + https://api.github.com/repos/${{ github.repository }}/actions/workflows/${{ github.event.workflow_run.workflow_id }}/cancel - name: Determine if the resulting container should be 'published' id: if-publish From 715f62cecff0e87d7d3e6cd785d5b441c68bec92 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 19:08:07 +0000 Subject: [PATCH 078/146] fix TE/T5X bug --- .github/container/Dockerfile.jax | 1 + .github/workflows/_build_jax.yaml | 2 +- .github/workflows/nightly-t5x-build.yaml | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index badeb61ec..3415d63f1 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -79,6 +79,7 @@ ARG REF_TE ARG SRC_PATH_TE ENV NVTE_FRAMEWORK=jax RUN <<"EOF" bash -ex +set -o pipefail get-source.sh -f ${REPO_TE} -r ${REF_TE} -d ${SRC_PATH_TE} cd ${SRC_PATH_TE} && python setup.py bdist_wheel && rm -rf build echo "transformer-engine @ file://$(ls ${SRC_PATH_TE}/dist/*.whl)" >> /opt/pip-tools.d/manifest.te diff --git a/.github/workflows/_build_jax.yaml b/.github/workflows/_build_jax.yaml index 3f72d3e22..7cb0daede 100644 --- a/.github/workflows/_build_jax.yaml +++ b/.github/workflows/_build_jax.yaml @@ -41,7 +41,7 @@ on: type: string description: URL of transformer engine repository to check out required: false - default: "https://github.com/openxla/xla.git" + default: "https://github.com/NVIDIA/TransformerEngine.git" REF_TE: type: string description: Git commit, tag, or branch for XLA diff --git a/.github/workflows/nightly-t5x-build.yaml 
b/.github/workflows/nightly-t5x-build.yaml index e1150343d..222a0e467 100644 --- a/.github/workflows/nightly-t5x-build.yaml +++ b/.github/workflows/nightly-t5x-build.yaml @@ -83,7 +83,7 @@ jobs: - name: Generate placeholder warning shell: bash -x -e {0} run: | - echo "WARNING: arm64 build is not yet supported" >> $GITHUB_OUTPUT + echo "WARNING: arm64 build is not yet supported" publish: needs: [metadata, amd64, arm64] From 7f06cb87fc498c49a29b891d88bd1552658496b5 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 22:36:37 +0000 Subject: [PATCH 079/146] add TE examples and tests to wheel --- .github/container/Dockerfile.jax | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 3415d63f1..21b5b8128 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -78,10 +78,15 @@ ARG REPO_TE ARG REF_TE ARG SRC_PATH_TE ENV NVTE_FRAMEWORK=jax +ENV SRC_PATH_TE=${SRC_PATH_TE} RUN <<"EOF" bash -ex set -o pipefail get-source.sh -f ${REPO_TE} -r ${REF_TE} -d ${SRC_PATH_TE} -cd ${SRC_PATH_TE} && python setup.py bdist_wheel && rm -rf build +pushd ${SRC_PATH_TE} +git remote add yhtang https://github.com/yhtang/TransformerEngine.git +git fetch yhtang +git merge yhtang/yhtang-add-nspkgs +python setup.py bdist_wheel && rm -rf build echo "transformer-engine @ file://$(ls ${SRC_PATH_TE}/dist/*.whl)" >> /opt/pip-tools.d/manifest.te EOF From 92b6d0a0eb9028281a72a79006a2b8f9d06b417d Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 22:41:48 +0000 Subject: [PATCH 080/146] allow TE parallel build --- .github/container/Dockerfile.jax | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 21b5b8128..21dff8779 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -81,6 +81,7 @@ ENV NVTE_FRAMEWORK=jax ENV SRC_PATH_TE=${SRC_PATH_TE} 
RUN <<"EOF" bash -ex set -o pipefail +pip install ninja && rm -rf ~/.cache/pip get-source.sh -f ${REPO_TE} -r ${REF_TE} -d ${SRC_PATH_TE} pushd ${SRC_PATH_TE} git remote add yhtang https://github.com/yhtang/TransformerEngine.git From c4f5b84f83555e88b0736d811e59b46d5355d99e Mon Sep 17 00:00:00 2001 From: Yu-Hang Tang Date: Thu, 9 Nov 2023 15:12:21 -0800 Subject: [PATCH 081/146] jax publish --- .github/workflows/_publish_container.yaml | 56 +++++++++---------- .github/workflows/nightly-jax-build.yaml | 67 ++++++++--------------- 2 files changed, 49 insertions(+), 74 deletions(-) diff --git a/.github/workflows/_publish_container.yaml b/.github/workflows/_publish_container.yaml index 40340cb6f..ca0ada8af 100644 --- a/.github/workflows/_publish_container.yaml +++ b/.github/workflows/_publish_container.yaml @@ -15,11 +15,11 @@ on: type: string description: 'Target docker tags in docker/metadata-action format:' required: true - EXPOSE_SINGLE_ARCH_IMAGES: - type: boolean - description: 'Also expose single-arch images:' - required: false - default: true + # EXPOSE_SINGLE_ARCH_IMAGES: + # type: boolean + # description: 'Also expose single-arch images:' + # required: false + # default: false outputs: DOCKER_TAGS: description: "Tags of the image published" @@ -85,30 +85,26 @@ jobs: docker buildx imagetools create --tag $tag ${{ steps.get-manifests.outputs.manifests }} done - - name: Skopeo Login to GitHub Container Registry - run: | - echo ${{ secrets.GITHUB_TOKEN }} | skopeo login --authfile - ghcr.io - - - name: Create single-arch images - if: ${{ inputs.EXPOSE_SINGLE_ARCH_IMAGES }} - shell: bash -x -e {0} - run: | - # Create new manifest list from extracted manifests - for manifest in ${{ steps.get-manifests.outputs.manifests }}; do - os=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.os') - arch=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.architecture') - for tag in $(echo "${{ steps.meta.outputs.tags }}"); do - skopeo copy 
--format v2s2 docker://$manifest docker://$tag-${os}-${arch} - done - done + # - name: Create single-arch images + # if: ${{ inputs.EXPOSE_SINGLE_ARCH_IMAGES }} + # shell: bash -x -e {0} + # run: | + # # Create new manifest list from extracted manifests + # for manifest in ${{ steps.get-manifests.outputs.manifests }}; do + # os=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.os') + # arch=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.architecture') + # for tag in $(echo "${{ steps.meta.outputs.tags }}"); do + # skopeo copy --format v2s2 docker://$manifest docker://$tag-${os}-${arch} + # done + # done - - name: Generate outputs and artifacts - shell: bash -x -e {0} - run: | - echo "${{ steps.meta.outputs.tags }}" > image-tags-${{ inputs.TARGET_IMAGE }}.txt + # - name: Generate outputs and artifacts + # shell: bash -x -e {0} + # run: | + # echo "${{ steps.meta.outputs.tags }}" > image-tags-${{ inputs.TARGET_IMAGE }}.txt - - name: Upload image tags as artifacts - uses: actions/upload-artifact@v3 - with: - name: image-tags-${{ inputs.TARGET_IMAGE }} - path: image-tags-${{ inputs.TARGET_IMAGE }}.txt + # - name: Upload image tags as artifacts + # uses: actions/upload-artifact@v3 + # with: + # name: image-tags-${{ inputs.TARGET_IMAGE }} + # path: image-tags-${{ inputs.TARGET_IMAGE }}.txt diff --git a/.github/workflows/nightly-jax-build.yaml b/.github/workflows/nightly-jax-build.yaml index 18ef61fec..a904578f6 100644 --- a/.github/workflows/nightly-jax-build.yaml +++ b/.github/workflows/nightly-jax-build.yaml @@ -16,10 +16,6 @@ permissions: actions: write # to cancel previous workflows packages: write # to upload container -env: - DOCKER_REGISTRY: ghcr.io/nvidia - DOCKER_IMAGE: jax - jobs: metadata: @@ -57,48 +53,31 @@ jobs: BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} secrets: inherit - publish: - if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - runs-on: ubuntu-latest 
+ publish-staging: needs: [metadata, amd64, arm64] - steps: - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set docker metadata - id: meta - uses: docker/metadata-action@v4 - with: - images: | - ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_IMAGE }} - flavor: | - latest=false - tags: | - type=raw,value=latest,priority=1000 - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - labels: - org.opencontainers.image.created=${{ needs.metadata.outputs.BUILD_DATE }} + if: needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml + with: + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_STAGING }} + ${{ needs.amd64.outputs.DOCKER_TAG_STAGING }} + TARGET_IMAGE: jax-toolbox + TARGET_TAGS: | + type=raw,value=jax-staging-latest,priority=1000 + type=raw,value=jax-staging-nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - - name: Combine images into a single multi-arch image - shell: bash -x -e {0} - run: | - for tag in $(echo "${{ steps.meta.outputs.tags }}"); do - docker manifest create ${tag} $( - for IMAGE in ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }}; do - REPO=$(echo $IMAGE | cut -d: -f1) - DIGEST=$( - docker manifest inspect $IMAGE |\ - jq -r '.manifests[] | select(.platform.os == "linux") | .digest' - ) - echo $REPO@${DIGEST} - done - ) - docker manifest push ${tag} - done + publish-final: + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml + with: + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} + ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} + TARGET_IMAGE: jax + TARGET_TAGS: | + type=raw,value=latest,priority=1000 + type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 finalize: if: 
always() From bcfd0e4239a47aea943c106a93406b55a00e150d Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 23:34:16 +0000 Subject: [PATCH 082/146] rename staging to mealkit --- .github/container/Dockerfile.jax | 4 ++-- .github/container/Dockerfile.pax.amd64 | 4 ++-- .github/container/Dockerfile.pax.arm64 | 4 ++-- .github/container/Dockerfile.t5x | 4 ++-- .github/workflows/_build_jax.yaml | 24 ++++++++++++------------ .github/workflows/_build_pax.yaml | 24 ++++++++++++------------ .github/workflows/_build_t5x.yaml | 24 ++++++++++++------------ .github/workflows/_ci_amd64.yaml | 8 ++++---- .github/workflows/_ci_arm64.yaml | 8 ++++---- .github/workflows/_sandbox.yaml | 8 ++++---- .github/workflows/nightly-jax-build.yaml | 6 +++--- 11 files changed, 59 insertions(+), 59 deletions(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 21dff8779..fa3fd24e5 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -46,7 +46,7 @@ RUN build-jax.sh \ ############################################################################### ARG BASE_IMAGE -FROM ${BASE_IMAGE} as staging +FROM ${BASE_IMAGE} as mealkit ARG SRC_PATH_JAX ARG SRC_PATH_XLA ARG BUILD_DATE @@ -97,6 +97,6 @@ EOF ## Install primary packages and transitive dependencies ############################################################################### -FROM staging as final +FROM mealkit as final RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 index 7bc867d94..26b0411ae 100644 --- a/.github/container/Dockerfile.pax.amd64 +++ b/.github/container/Dockerfile.pax.amd64 @@ -12,7 +12,7 @@ ARG SRC_PATH_PRAXIS=/opt/praxis ## Download source and add auxiliary scripts ############################################################################### -FROM ${BASE_IMAGE} as staging +FROM ${BASE_IMAGE} as mealkit ARG REPO_PAXML ARG REPO_PRAXIS ARG REF_PAXML @@ -44,6 +44,6 @@ ADD 
test-pax.sh /usr/local/bin ## Install accumulated packages from the base image and the previous stage ############################################################################### -FROM staging as final +FROM mealkit as final RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index bea1b555f..a4b281208 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -104,7 +104,7 @@ EOT ############################################################################### ARG BASE_IMAGE -FROM ${BASE_IMAGE} as staging +FROM ${BASE_IMAGE} as mealkit ARG REPO_PAXML ARG REPO_PRAXIS ARG REF_PAXML @@ -168,6 +168,6 @@ ADD test-pax.sh /usr/local/bin ## Install accumulated packages from the base image and the previous stage ############################################################################### -FROM staging as final +FROM mealkit as final RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index 8dd495b5b..cfdcaec47 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -9,7 +9,7 @@ ARG SRC_PATH_T5X=/opt/t5x ## Download source and add auxiliary scripts ############################################################################### -FROM ${BASE_IMAGE} as staging +FROM ${BASE_IMAGE} as mealkit ARG REPO_T5X ARG REF_T5X @@ -34,6 +34,6 @@ ADD test-t5x.sh /usr/local/bin ## Install accumulated packages from the base image and the previous stage ############################################################################### -FROM staging as final +FROM mealkit as final RUN pip-finalize.sh diff --git a/.github/workflows/_build_jax.yaml b/.github/workflows/_build_jax.yaml index 7cb0daede..205c5da2d 100644 --- a/.github/workflows/_build_jax.yaml +++ b/.github/workflows/_build_jax.yaml @@ -58,9 +58,9 @@ on: required: false default: 'badge-jax-build' outputs: - DOCKER_TAG_STAGING: - description: "Tags of 
the 'staging' image built" - value: ${{ jobs.build-jax.outputs.DOCKER_TAG_STAGING }} + DOCKER_TAG_MEALKIT: + description: "Tags of the 'mealkit' image built" + value: ${{ jobs.build-jax.outputs.DOCKER_TAG_MEALKIT }} DOCKER_TAG_FINAL: description: "Tags of the complete image built" value: ${{ jobs.build-jax.outputs.DOCKER_TAG_FINAL }} @@ -80,7 +80,7 @@ jobs: env: BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json outputs: - DOCKER_TAG_STAGING: ${{ steps.staging-metadata.outputs.tags }} + DOCKER_TAG_MEALKIT: ${{ steps.mealkit-metadata.outputs.tags }} DOCKER_TAG_FINAL: ${{ steps.final-metadata.outputs.tags }} steps: - name: Print environment variables @@ -117,8 +117,8 @@ jobs: driver-opts: | image=moby/buildkit:v0.12.1 - - name: Set docker metadata - staging - id: staging-metadata + - name: Set docker metadata - mealkit + id: mealkit-metadata uses: docker/metadata-action@v4 with: images: | @@ -126,21 +126,21 @@ jobs: flavor: | latest=false tags: | - type=raw,value=${{ github.run_id }}-jax-${{ inputs.ARCHITECTURE }}-staging + type=raw,value=${{ github.run_id }}-jax-${{ inputs.ARCHITECTURE }}-mealkit labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Build staging image - id: staging-build + - name: Build mealkit image + id: mealkit-build uses: docker/build-push-action@v4 with: context: .github/container push: true file: .github/container/Dockerfile.jax platforms: linux/${{ inputs.ARCHITECTURE }} - target: staging - tags: ${{ steps.staging-metadata.outputs.tags }} - labels: ${{ steps.staging-metadata.outputs.labels }} + target: mealkit + tags: ${{ steps.mealkit-metadata.outputs.tags }} + labels: ${{ steps.mealkit-metadata.outputs.labels }} ssh: default secret-files: | "SSH_KNOWN_HOSTS=${{ steps.ssh-known-hosts.outputs.FILE }}" diff --git a/.github/workflows/_build_pax.yaml b/.github/workflows/_build_pax.yaml index 489a194fb..0abaa9644 100644 --- a/.github/workflows/_build_pax.yaml +++ 
b/.github/workflows/_build_pax.yaml @@ -48,9 +48,9 @@ on: required: false default: 'badge-pax-build' outputs: - DOCKER_TAG_STAGING: - description: "Tags of the 'staging' image built" - value: ${{ jobs.build-pax.outputs.DOCKER_TAG_STAGING }} + DOCKER_TAG_MEALKIT: + description: "Tags of the 'mealkit' image built" + value: ${{ jobs.build-pax.outputs.DOCKER_TAG_MEALKIT }} DOCKER_TAG_FINAL: description: "Tags of the complete image built" value: ${{ jobs.build-pax.outputs.DOCKER_TAG_FINAL }} @@ -70,7 +70,7 @@ jobs: env: BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json outputs: - DOCKER_TAG_STAGING: ${{ steps.staging-metadata.outputs.tags }} + DOCKER_TAG_MEALKIT: ${{ steps.mealkit-metadata.outputs.tags }} DOCKER_TAG_FINAL: ${{ steps.final-metadata.outputs.tags }} steps: - name: Print environment variables @@ -92,8 +92,8 @@ jobs: driver-opts: | image=moby/buildkit:v0.12.1 - - name: Set docker metadata - staging - id: staging-metadata + - name: Set docker metadata - mealkit + id: mealkit-metadata uses: docker/metadata-action@v4 with: images: | @@ -101,21 +101,21 @@ jobs: flavor: | latest=false tags: | - type=raw,value=${{ github.run_id }}-upstream-pax-${{ inputs.ARCHITECTURE }}-staging + type=raw,value=${{ github.run_id }}-upstream-pax-${{ inputs.ARCHITECTURE }}-mealkit labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Build staging image - id: staging-build + - name: Build mealkit image + id: mealkit-build uses: docker/build-push-action@v4 with: context: .github/container push: true file: .github/container/Dockerfile.pax.${{ inputs.ARCHITECTURE }} platforms: linux/${{ inputs.ARCHITECTURE }} - target: staging - tags: ${{ steps.staging-metadata.outputs.tags }} - labels: ${{ steps.staging-metadata.outputs.labels }} + target: mealkit + tags: ${{ steps.mealkit-metadata.outputs.tags }} + labels: ${{ steps.mealkit-metadata.outputs.labels }} build-args: | BASE_IMAGE=${{ inputs.BASE_IMAGE }} BUILD_DATE=${{ 
inputs.BUILD_DATE }} diff --git a/.github/workflows/_build_t5x.yaml b/.github/workflows/_build_t5x.yaml index b761bf54d..1e3a961cb 100644 --- a/.github/workflows/_build_t5x.yaml +++ b/.github/workflows/_build_t5x.yaml @@ -48,9 +48,9 @@ on: required: false default: 'badge-t5x-build' outputs: - DOCKER_TAG_STAGING: - description: "Tags of the 'staging' image built" - value: ${{ jobs.build-t5x.outputs.DOCKER_TAG_STAGING }} + DOCKER_TAG_MEALKIT: + description: "Tags of the 'mealkit' image built" + value: ${{ jobs.build-t5x.outputs.DOCKER_TAG_MEALKIT }} DOCKER_TAG_FINAL: description: "Tags of the complete image built" value: ${{ jobs.build-t5x.outputs.DOCKER_TAG_FINAL }} @@ -70,7 +70,7 @@ jobs: env: BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json outputs: - DOCKER_TAG_STAGING: ${{ steps.staging-metadata.outputs.tags }} + DOCKER_TAG_MEALKIT: ${{ steps.mealkit-metadata.outputs.tags }} DOCKER_TAG_FINAL: ${{ steps.final-metadata.outputs.tags }} steps: - name: Print environment variables @@ -92,8 +92,8 @@ jobs: driver-opts: | image=moby/buildkit:v0.12.1 - - name: Set docker metadata - staging - id: staging-metadata + - name: Set docker metadata - mealkit + id: mealkit-metadata uses: docker/metadata-action@v4 with: images: | @@ -101,21 +101,21 @@ jobs: flavor: | latest=false tags: | - type=raw,value=${{ github.run_id }}-upstream-t5x-${{ inputs.ARCHITECTURE }}-staging + type=raw,value=${{ github.run_id }}-upstream-t5x-${{ inputs.ARCHITECTURE }}-mealkit labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Build staging image - id: staging-build + - name: Build mealkit image + id: mealkit-build uses: docker/build-push-action@v4 with: context: .github/container push: true file: .github/container/Dockerfile.t5x platforms: linux/${{ inputs.ARCHITECTURE }} - target: staging - tags: ${{ steps.staging-metadata.outputs.tags }} - labels: ${{ steps.staging-metadata.outputs.labels }} + target: mealkit + tags: ${{ 
steps.mealkit-metadata.outputs.tags }} + labels: ${{ steps.mealkit-metadata.outputs.labels }} build-args: | BASE_IMAGE=${{ inputs.BASE_IMAGE }} BUILD_DATE=${{ inputs.BUILD_DATE }} diff --git a/.github/workflows/_ci_amd64.yaml b/.github/workflows/_ci_amd64.yaml index 4e2923e4f..e4c2402d6 100644 --- a/.github/workflows/_ci_amd64.yaml +++ b/.github/workflows/_ci_amd64.yaml @@ -116,7 +116,7 @@ jobs: with: ARCHITECTURE: amd64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} REPO_T5X: ${{ needs.metadata.outputs.REPO_T5X }} REF_T5X: ${{ needs.metadata.outputs.REF_T5X }} secrets: inherit @@ -127,7 +127,7 @@ jobs: with: ARCHITECTURE: amd64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} REPO_PAXML: ${{ needs.metadata.outputs.REPO_PAXML }} REF_PAXML: ${{ needs.metadata.outputs.REF_PAXML }} REPO_PRAXIS: ${{ needs.metadata.outputs.REPO_PRAXIS }} @@ -139,7 +139,7 @@ jobs: # needs: [metadata, build-t5x] # with: # BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAG_STAGING }} + # BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAG_MEALKIT }} # BASE_LIBRARY: t5x # secrets: inherit @@ -148,7 +148,7 @@ jobs: # needs: [metadata, build-pax] # with: # BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_STAGING }} + # BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_MEALKIT }} # BASE_LIBRARY: pax # secrets: inherit diff --git a/.github/workflows/_ci_arm64.yaml b/.github/workflows/_ci_arm64.yaml index d1c1f25d5..2a778cb15 100644 --- a/.github/workflows/_ci_arm64.yaml +++ b/.github/workflows/_ci_arm64.yaml @@ -116,7 +116,7 @@ jobs: # with: # ARCHITECTURE: arm64 # BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - 
# BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} # REPO_T5X: ${{ needs.metadata.outputs.REPO_T5X }} # REF_T5X: ${{ needs.metadata.outputs.REF_T5X }} # secrets: inherit @@ -127,7 +127,7 @@ jobs: with: ARCHITECTURE: arm64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} REPO_PAXML: ${{ needs.metadata.outputs.REPO_PAXML }} REF_PAXML: ${{ needs.metadata.outputs.REF_PAXML }} REPO_PRAXIS: ${{ needs.metadata.outputs.REPO_PRAXIS }} @@ -139,7 +139,7 @@ jobs: # needs: [metadata, build-t5x] # with: # BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAG_STAGING }} + # BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAG_MEALKIT }} # BASE_LIBRARY: t5x # secrets: inherit @@ -148,6 +148,6 @@ jobs: # needs: [metadata, build-pax] # with: # BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_STAGING }} + # BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_MEALKIT }} # BASE_LIBRARY: pax # secrets: inherit diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index c5cba7537..6dd970030 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -35,7 +35,7 @@ jobs: # uses: ./.github/workflows/_build_t5x.yaml # with: # ARCHITECTURE: arm64 - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} # secrets: inherit build-pax: @@ -43,9 +43,9 @@ jobs: uses: ./.github/workflows/_build_pax.yaml with: ARCHITECTURE: arm64 - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_STAGING }} - BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6792001145-jax-arm64-staging - # BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6768482418-jax-amd64-staging + # 
BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6792001145-jax-arm64-mealkit + # BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6768482418-jax-amd64-mealkit secrets: inherit # finalize: diff --git a/.github/workflows/nightly-jax-build.yaml b/.github/workflows/nightly-jax-build.yaml index a904578f6..280ffbdae 100644 --- a/.github/workflows/nightly-jax-build.yaml +++ b/.github/workflows/nightly-jax-build.yaml @@ -53,7 +53,7 @@ jobs: BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} secrets: inherit - publish-staging: + publish-mealkit: needs: [metadata, amd64, arm64] if: needs.metadata.outputs.PUBLISH == 'true' uses: ./.github/workflows/_publish_container.yaml @@ -63,8 +63,8 @@ jobs: ${{ needs.amd64.outputs.DOCKER_TAG_STAGING }} TARGET_IMAGE: jax-toolbox TARGET_TAGS: | - type=raw,value=jax-staging-latest,priority=1000 - type=raw,value=jax-staging-nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 + type=raw,value=jax-mealkit-latest,priority=1000 + type=raw,value=jax-mealkit-nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 publish-final: needs: [metadata, amd64, arm64] From 12a2fd68911b6a26a9e8d52b78c75ebb5322fde6 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 23:44:39 +0000 Subject: [PATCH 083/146] fix nightly --- .github/workflows/_build_pax.yaml | 2 +- .github/workflows/_build_t5x.yaml | 2 +- .github/workflows/nightly-jax-build.yaml | 4 ++-- .github/workflows/nightly-t5x-build.yaml | 30 ++++++++++++++++++++---- 4 files changed, 30 insertions(+), 8 deletions(-) diff --git a/.github/workflows/_build_pax.yaml b/.github/workflows/_build_pax.yaml index 0abaa9644..33c95a2a0 100644 --- a/.github/workflows/_build_pax.yaml +++ b/.github/workflows/_build_pax.yaml @@ -11,7 +11,7 @@ on: type: string description: 'Base docker image that provides JAX' required: false - default: ghcr.io/nvidia/jax:latest + default: ghcr.io/nvidia/jax-toolbox:jax-mealkit-latest 
BUILD_DATE: type: string description: "Build date in YYYY-MM-DD format" diff --git a/.github/workflows/_build_t5x.yaml b/.github/workflows/_build_t5x.yaml index 1e3a961cb..b1e7e53c4 100644 --- a/.github/workflows/_build_t5x.yaml +++ b/.github/workflows/_build_t5x.yaml @@ -11,7 +11,7 @@ on: type: string description: 'Base docker image that provides JAX' required: false - default: ghcr.io/nvidia/jax:latest + default: ghcr.io/nvidia/jax-toolbox:jax-mealkit-latest BUILD_DATE: type: string description: "Build date in YYYY-MM-DD format" diff --git a/.github/workflows/nightly-jax-build.yaml b/.github/workflows/nightly-jax-build.yaml index 280ffbdae..85ec89beb 100644 --- a/.github/workflows/nightly-jax-build.yaml +++ b/.github/workflows/nightly-jax-build.yaml @@ -59,8 +59,8 @@ jobs: uses: ./.github/workflows/_publish_container.yaml with: SOURCE_IMAGE: | - ${{ needs.amd64.outputs.DOCKER_TAG_STAGING }} - ${{ needs.amd64.outputs.DOCKER_TAG_STAGING }} + ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} + ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} TARGET_IMAGE: jax-toolbox TARGET_TAGS: | type=raw,value=jax-mealkit-latest,priority=1000 diff --git a/.github/workflows/nightly-t5x-build.yaml b/.github/workflows/nightly-t5x-build.yaml index 222a0e467..3ff2615d4 100644 --- a/.github/workflows/nightly-t5x-build.yaml +++ b/.github/workflows/nightly-t5x-build.yaml @@ -18,10 +18,6 @@ permissions: actions: write # to cancel previous workflows packages: write # to upload container -env: - DOCKER_REGISTRY: ghcr.io/nvidia - DOCKER_IMAGE: upstream-t5x - jobs: metadata: @@ -85,6 +81,32 @@ jobs: run: | echo "WARNING: arm64 build is not yet supported" + publish-mealkit: + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml + with: + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} + ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} + TARGET_IMAGE: jax-toolbox + TARGET_TAGS: | + 
type=raw,value=upstream-t5x-mealkit-latest,priority=1000 + type=raw,value=upstream-t5x-mealkit-nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 + + publish-final: + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml + with: + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} + ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} + TARGET_IMAGE: upstream-t5x + TARGET_TAGS: | + type=raw,value=latest,priority=1000 + type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 + publish: needs: [metadata, amd64, arm64] if: ${{ needs.metadata.outputs.PUBLISH == 'true' }} From 1925b14515cec1340ad4e23264fa8ceb4dd190ec Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Thu, 9 Nov 2023 23:47:41 +0000 Subject: [PATCH 084/146] fix nightly --- .github/workflows/nightly-pax-build.yaml | 79 ++++++++++++++++++++---- .github/workflows/nightly-t5x-build.yaml | 43 ------------- 2 files changed, 66 insertions(+), 56 deletions(-) diff --git a/.github/workflows/nightly-pax-build.yaml b/.github/workflows/nightly-pax-build.yaml index 98224e98a..1a6e093d5 100644 --- a/.github/workflows/nightly-pax-build.yaml +++ b/.github/workflows/nightly-pax-build.yaml @@ -21,11 +21,40 @@ permissions: jobs: metadata: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' runs-on: ubuntu-22.04 outputs: + PUBLISH: ${{ steps.if-publish.outputs.PUBLISH }} BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} steps: + - name: Check if the triggering workflow failed + id: if-upstream-failed + shell: bash -x -e {0} + run: | + echo "UPSTREAM_FAILED=${{ github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT + + # - name: Cancel workflow if upstream workflow did not success + # if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} + # uses: styfle/cancel-workflow-action@0.12.0 + + - 
name: Cancel workflow if upstream workflow did not success + if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} + run: | + # call the GitHub API to cancel the workflow + curl \ + -X POST \ + -H "Accept: application/vnd.github.v3+json" \ + -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + https://api.github.com/repos/${{ github.repository }}/actions/workflows/${{ github.event.workflow_run.workflow_id }}/cancel + + - name: Determine if the resulting container should be 'published' + id: if-publish + shell: bash -x -e {0} + run: + # A container should be published if: + # 1) the workflow is triggered by workflow_dispatch and the PUBLISH input is true, or + # 2) the workflow is triggered by workflow_run (i.e., a nightly build) + echo "PUBLISH=${{ github.event_name == 'workflow_run' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) }}" >> $GITHUB_OUTPUT + - name: Set build date id: date shell: bash -x -e {0} @@ -33,28 +62,52 @@ jobs: BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - build: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + amd64: needs: metadata uses: ./.github/workflows/_build_pax.yaml with: + ARCHITECTURE: amd64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} secrets: inherit - publish: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build] - uses: ./.github/workflows/_publish_container.yaml + arm64: + needs: metadata + uses: ./.github/workflows/_build_pax.yaml + with: + ARCHITECTURE: arm64 + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} secrets: inherit + + publish-mealkit: + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml with: - SOURCE_IMAGE: 
${{ needs.build.outputs.DOCKER_TAGS }} + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} + ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} + TARGET_IMAGE: jax-toolbox + TARGET_TAGS: | + type=raw,value=upstream-pax-mealkit-latest,priority=1000 + type=raw,value=upstream-pax-mealkit-nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 + + publish-final: + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml + with: + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} + ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} TARGET_IMAGE: upstream-pax TARGET_TAGS: | type=raw,value=latest,priority=1000 type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - on-upstream-failure: - runs-on: ubuntu-latest - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch' - steps: - - run: echo 'Upstream workflow failed, aborting run' && exit 1 + finalize: + if: always() + needs: [metadata, amd64, arm64] + uses: ./.github/workflows/_finalize.yaml + with: + PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }} + secrets: inherit diff --git a/.github/workflows/nightly-t5x-build.yaml b/.github/workflows/nightly-t5x-build.yaml index 3ff2615d4..b1fca9be2 100644 --- a/.github/workflows/nightly-t5x-build.yaml +++ b/.github/workflows/nightly-t5x-build.yaml @@ -107,49 +107,6 @@ jobs: type=raw,value=latest,priority=1000 type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - publish: - needs: [metadata, amd64, arm64] - if: ${{ needs.metadata.outputs.PUBLISH == 'true' }} - runs-on: ubuntu-latest - steps: - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set docker metadata - id: meta - uses: 
docker/metadata-action@v4 - with: - images: | - ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_IMAGE }} - flavor: | - latest=false - tags: | - type=raw,value=latest,priority=1000 - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - labels: - org.opencontainers.image.created=${{ needs.metadata.outputs.BUILD_DATE }} - - - name: Combine images into a single multi-arch image - shell: bash -x -e {0} - run: | - for tag in $(echo "${{ steps.meta.outputs.tags }}"); do - docker manifest create ${tag} $( - for IMAGE in ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }}; do - REPO=$(echo $IMAGE | cut -d: -f1) - DIGEST=$( - docker manifest inspect $IMAGE |\ - jq -r '.manifests[] | select(.platform.os == "linux") | .digest' - ) - echo $REPO@${DIGEST} - done - ) - docker manifest push ${tag} - done - finalize: if: always() needs: [metadata, amd64, arm64] From 6e71c648253be15a874ce36da1c1ffc8afc91dbe Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Fri, 10 Nov 2023 06:52:26 +0000 Subject: [PATCH 085/146] bug fix --- .github/workflows/_publish_container.yaml | 51 +++++++++++++---------- .github/workflows/_sandbox.yaml | 29 +++++++++---- .github/workflows/nightly-jax-build.yaml | 4 +- .github/workflows/nightly-pax-build.yaml | 16 ++----- .github/workflows/nightly-t5x-build.yaml | 16 ++----- 5 files changed, 57 insertions(+), 59 deletions(-) diff --git a/.github/workflows/_publish_container.yaml b/.github/workflows/_publish_container.yaml index ca0ada8af..6f0fb652e 100644 --- a/.github/workflows/_publish_container.yaml +++ b/.github/workflows/_publish_container.yaml @@ -53,29 +53,36 @@ jobs: id: get-manifests shell: bash -x -e {0} run: | - SOURCE_REPO=$(echo ${{ inputs.SOURCE_IMAGE }} | cut -d: -f1) - MEDIA_TYPE=$(docker manifest inspect ${{ inputs.SOURCE_IMAGE }} | jq -r '.mediaType') - case "$MEDIA_TYPE" in - # OCI image index - "application/vnd.oci.image.index.v1+json") - MANIFESTS=$( - docker manifest inspect 
${{ inputs.SOURCE_IMAGE }} |\ - jq -r '.manifests[] | select(.platform.os != "unknown") | .digest' |\ - xargs -I{} echo ${SOURCE_REPO}@{} |\ - tr '\n' ' ' - ) - ;; - # Docker image manifest - "application/vnd.docker.distribution.manifest.v2+json") - MANIFESTS=${{ inputs.SOURCE_IMAGE }} - ;; - *) - echo "Unknown media type: $MEDIA_TYPE" - exit 1 - ;; - esac + manifests="" + for src_img in $(echo ${{ inputs.SOURCE_IMAGE }} | tr '\n' ' '); do + repo=$(echo $src_img | cut -d: -f1) + media_type=$(docker manifest inspect $src_img | jq -r '.mediaType') + case "$media_type" in + + # OCI image index + "application/vnd.oci.image.index.v1+json") + manifest=$( + docker manifest inspect ${src_img} |\ + jq -r '.manifests[] | select(.platform.os != "unknown") | .digest' |\ + xargs -I{} echo ${repo}@{} |\ + tr '\n' ' ' + ) + ;; - echo "manifests=$MANIFESTS" >> $GITHUB_OUTPUT + # Docker image manifest + "application/vnd.docker.distribution.manifest.v2+json") + manifest=${src_img} + ;; + + *) + echo "Unknown media type: $MEDIA_TYPE" + exit 1 + ;; + esac + manifests="$manifests $manifest" + done + + echo "manifests=$manifests" >> $GITHUB_OUTPUT - name: Create multi-arch images id: multi-arch diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 6dd970030..f4fd8874f 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -14,6 +14,17 @@ env: jobs: + publish-mealkit: + uses: ./.github/workflows/_publish_container.yaml + with: + SOURCE_IMAGE: | + ghcr.io/nvidia/jax-toolbox-internal:6818759132-jax-amd64-mealkit + ghcr.io/nvidia/jax-toolbox-internal:6818759132-jax-arm64-mealkit + TARGET_IMAGE: jax-toolbox-internal + TARGET_TAGS: | + type=raw,value=test-mealkit-latest,priority=1000 + type=raw,value=test-mealkit-nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 + # build-base: # uses: ./.github/workflows/_build_base.yaml # with: @@ -38,15 +49,15 @@ jobs: # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} # 
secrets: inherit - build-pax: - # needs: [build-jax] - uses: ./.github/workflows/_build_pax.yaml - with: - ARCHITECTURE: arm64 - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6792001145-jax-arm64-mealkit - # BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6768482418-jax-amd64-mealkit - secrets: inherit + # build-pax: + # # needs: [build-jax] + # uses: ./.github/workflows/_build_pax.yaml + # with: + # ARCHITECTURE: arm64 + # # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6792001145-jax-arm64-mealkit + # # BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6768482418-jax-amd64-mealkit + # secrets: inherit # finalize: # if: always() diff --git a/.github/workflows/nightly-jax-build.yaml b/.github/workflows/nightly-jax-build.yaml index 85ec89beb..25e54062b 100644 --- a/.github/workflows/nightly-jax-build.yaml +++ b/.github/workflows/nightly-jax-build.yaml @@ -60,7 +60,7 @@ jobs: with: SOURCE_IMAGE: | ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} - ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} + ${{ needs.arm64.outputs.DOCKER_TAG_MEALKIT }} TARGET_IMAGE: jax-toolbox TARGET_TAGS: | type=raw,value=jax-mealkit-latest,priority=1000 @@ -73,7 +73,7 @@ jobs: with: SOURCE_IMAGE: | ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} - ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} + ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }} TARGET_IMAGE: jax TARGET_TAGS: | type=raw,value=latest,priority=1000 diff --git a/.github/workflows/nightly-pax-build.yaml b/.github/workflows/nightly-pax-build.yaml index 1a6e093d5..ab35b31b0 100644 --- a/.github/workflows/nightly-pax-build.yaml +++ b/.github/workflows/nightly-pax-build.yaml @@ -32,19 +32,9 @@ jobs: run: | echo "UPSTREAM_FAILED=${{ github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT - # - name: Cancel workflow if upstream workflow did not success - # if: ${{ 
steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} - # uses: styfle/cancel-workflow-action@0.12.0 - - name: Cancel workflow if upstream workflow did not success if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} - run: | - # call the GitHub API to cancel the workflow - curl \ - -X POST \ - -H "Accept: application/vnd.github.v3+json" \ - -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ - https://api.github.com/repos/${{ github.repository }}/actions/workflows/${{ github.event.workflow_run.workflow_id }}/cancel + uses: styfle/cancel-workflow-action@0.12.0 - name: Determine if the resulting container should be 'published' id: if-publish @@ -85,7 +75,7 @@ jobs: with: SOURCE_IMAGE: | ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} - ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} + ${{ needs.arm64.outputs.DOCKER_TAG_MEALKIT }} TARGET_IMAGE: jax-toolbox TARGET_TAGS: | type=raw,value=upstream-pax-mealkit-latest,priority=1000 @@ -98,7 +88,7 @@ jobs: with: SOURCE_IMAGE: | ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} - ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} + ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }} TARGET_IMAGE: upstream-pax TARGET_TAGS: | type=raw,value=latest,priority=1000 diff --git a/.github/workflows/nightly-t5x-build.yaml b/.github/workflows/nightly-t5x-build.yaml index b1fca9be2..b91f60e87 100644 --- a/.github/workflows/nightly-t5x-build.yaml +++ b/.github/workflows/nightly-t5x-build.yaml @@ -32,19 +32,9 @@ jobs: run: | echo "UPSTREAM_FAILED=${{ github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT - # - name: Cancel workflow if upstream workflow did not success - # if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} - # uses: styfle/cancel-workflow-action@0.12.0 - - name: Cancel workflow if upstream workflow did not success if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} - run: | - # call the GitHub API to cancel the workflow - curl \ - -X POST \ - -H "Accept: 
application/vnd.github.v3+json" \ - -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ - https://api.github.com/repos/${{ github.repository }}/actions/workflows/${{ github.event.workflow_run.workflow_id }}/cancel + uses: styfle/cancel-workflow-action@0.12.0 - name: Determine if the resulting container should be 'published' id: if-publish @@ -88,7 +78,7 @@ jobs: with: SOURCE_IMAGE: | ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} - ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} + ${{ needs.arm64.outputs.DOCKER_TAG_MEALKIT }} TARGET_IMAGE: jax-toolbox TARGET_TAGS: | type=raw,value=upstream-t5x-mealkit-latest,priority=1000 @@ -101,7 +91,7 @@ jobs: with: SOURCE_IMAGE: | ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} - ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} + ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }} TARGET_IMAGE: upstream-t5x TARGET_TAGS: | type=raw,value=latest,priority=1000 From adb10da32312a831c3fd90b3ecf0bc28186e60c9 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Fri, 10 Nov 2023 06:55:02 +0000 Subject: [PATCH 086/146] bug fix --- .github/workflows/_publish_container.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_publish_container.yaml b/.github/workflows/_publish_container.yaml index 6f0fb652e..f0aa38e6f 100644 --- a/.github/workflows/_publish_container.yaml +++ b/.github/workflows/_publish_container.yaml @@ -54,7 +54,7 @@ jobs: shell: bash -x -e {0} run: | manifests="" - for src_img in $(echo ${{ inputs.SOURCE_IMAGE }} | tr '\n' ' '); do + for src_img in $(echo "${{ inputs.SOURCE_IMAGE }}" | tr '\n' ' '); do repo=$(echo $src_img | cut -d: -f1) media_type=$(docker manifest inspect $src_img | jq -r '.mediaType') case "$media_type" in From b956b308441942634fe1bcda1cb3bf6d589ebcd4 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Fri, 10 Nov 2023 07:08:11 +0000 Subject: [PATCH 087/146] bug fix --- .github/container/Dockerfile.t5x | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index cfdcaec47..93386278d 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -15,7 +15,9 @@ ARG REPO_T5X ARG REF_T5X ARG SRC_PATH_T5X RUN <<"EOF" bash -ex -get-source.sh -f ${REPO_T5X} -r ${REF_T5X} -d ${SRC_PATH_T5X} -m /opt/pip-tools.d/manifest.t5x +get-source.sh -f ${REPO_T5X} -r ${REF_T5X} -d ${SRC_PATH_T5X} +echo "-e file://${SRC_PATH_T5X}[gpu]" >> /opt/pip-tools.d/manifest.t5x + # remove head-of-tree specs from select dependencies pushd ${SRC_PATH_T5X} sed -i "s| @ git+https://github.com/google/flax#egg=flax||g" setup.py From c727607c8646e4ceb33740f83ffa87a8c99e411a Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Fri, 10 Nov 2023 07:28:20 +0000 Subject: [PATCH 088/146] fix --- .github/workflows/_build_pax.yaml | 2 +- .github/workflows/_build_t5x.yaml | 2 +- .github/workflows/nightly-jax-build.yaml | 6 +++--- .github/workflows/nightly-pax-build.yaml | 6 +++--- .github/workflows/nightly-t5x-build.yaml | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/_build_pax.yaml b/.github/workflows/_build_pax.yaml index 33c95a2a0..62bc175ad 100644 --- a/.github/workflows/_build_pax.yaml +++ b/.github/workflows/_build_pax.yaml @@ -11,7 +11,7 @@ on: type: string description: 'Base docker image that provides JAX' required: false - default: ghcr.io/nvidia/jax-toolbox:jax-mealkit-latest + default: ghcr.io/nvidia/jax:mealkit BUILD_DATE: type: string description: "Build date in YYYY-MM-DD format" diff --git a/.github/workflows/_build_t5x.yaml b/.github/workflows/_build_t5x.yaml index b1e7e53c4..606840b71 100644 --- a/.github/workflows/_build_t5x.yaml +++ b/.github/workflows/_build_t5x.yaml @@ -11,7 +11,7 @@ on: type: string description: 'Base docker image that provides JAX' required: false - default: ghcr.io/nvidia/jax-toolbox:jax-mealkit-latest + default: ghcr.io/nvidia/jax:mealkit BUILD_DATE: type: string description: 
"Build date in YYYY-MM-DD format" diff --git a/.github/workflows/nightly-jax-build.yaml b/.github/workflows/nightly-jax-build.yaml index 25e54062b..043286678 100644 --- a/.github/workflows/nightly-jax-build.yaml +++ b/.github/workflows/nightly-jax-build.yaml @@ -61,10 +61,10 @@ jobs: SOURCE_IMAGE: | ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} ${{ needs.arm64.outputs.DOCKER_TAG_MEALKIT }} - TARGET_IMAGE: jax-toolbox + TARGET_IMAGE: jax TARGET_TAGS: | - type=raw,value=jax-mealkit-latest,priority=1000 - type=raw,value=jax-mealkit-nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 + type=raw,value=mealkit,priority=500 + type=raw,value=mealkit-${{ needs.metadata.outputs.BUILD_DATE }},priority=500 publish-final: needs: [metadata, amd64, arm64] diff --git a/.github/workflows/nightly-pax-build.yaml b/.github/workflows/nightly-pax-build.yaml index ab35b31b0..6ed1938d0 100644 --- a/.github/workflows/nightly-pax-build.yaml +++ b/.github/workflows/nightly-pax-build.yaml @@ -76,10 +76,10 @@ jobs: SOURCE_IMAGE: | ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} ${{ needs.arm64.outputs.DOCKER_TAG_MEALKIT }} - TARGET_IMAGE: jax-toolbox + TARGET_IMAGE: upstream-pax TARGET_TAGS: | - type=raw,value=upstream-pax-mealkit-latest,priority=1000 - type=raw,value=upstream-pax-mealkit-nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 + type=raw,value=mealkit,priority=500 + type=raw,value=mealkit-${{ needs.metadata.outputs.BUILD_DATE }},priority=500 publish-final: needs: [metadata, amd64, arm64] diff --git a/.github/workflows/nightly-t5x-build.yaml b/.github/workflows/nightly-t5x-build.yaml index b91f60e87..46b2f55b9 100644 --- a/.github/workflows/nightly-t5x-build.yaml +++ b/.github/workflows/nightly-t5x-build.yaml @@ -79,10 +79,10 @@ jobs: SOURCE_IMAGE: | ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} ${{ needs.arm64.outputs.DOCKER_TAG_MEALKIT }} - TARGET_IMAGE: jax-toolbox + TARGET_IMAGE: upstream-t5x TARGET_TAGS: | - 
type=raw,value=upstream-t5x-mealkit-latest,priority=1000 - type=raw,value=upstream-t5x-mealkit-nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 + type=raw,value=mealkit,priority=500 + type=raw,value=mealkit-${{ needs.metadata.outputs.BUILD_DATE }},priority=500 publish-final: needs: [metadata, amd64, arm64] From d450ceb7a1d1a7bd3a8771c06094bbe3a5c704b6 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Fri, 10 Nov 2023 08:08:29 +0000 Subject: [PATCH 089/146] fix TE test --- .github/container/Dockerfile.jax | 5 +- .github/workflows/_sandbox.yaml | 155 ++++--------------------------- .github/workflows/_test_te.yaml | 44 +++------ 3 files changed, 31 insertions(+), 173 deletions(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index fa3fd24e5..4b6526a6e 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -84,11 +84,8 @@ set -o pipefail pip install ninja && rm -rf ~/.cache/pip get-source.sh -f ${REPO_TE} -r ${REF_TE} -d ${SRC_PATH_TE} pushd ${SRC_PATH_TE} -git remote add yhtang https://github.com/yhtang/TransformerEngine.git -git fetch yhtang -git merge yhtang/yhtang-add-nspkgs python setup.py bdist_wheel && rm -rf build -echo "transformer-engine @ file://$(ls ${SRC_PATH_TE}/dist/*.whl)" >> /opt/pip-tools.d/manifest.te +echo "transformer-engine @ file://$(ls ${SRC_PATH_TE}/dist/*.whl)[test]" >> /opt/pip-tools.d/manifest.te EOF # TODO: properly configure entrypoint diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index f4fd8874f..37539fc90 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -1,63 +1,35 @@ name: "~Sandbox" on: - workflow_dispatch: - # push: + # workflow_dispatch: + push: permissions: contents: read # to fetch code actions: write # to cancel previous workflows packages: write # to upload container -env: - ARCHITECTURE: arm64 - jobs: - publish-mealkit: - uses: ./.github/workflows/_publish_container.yaml + 
build-jax: + uses: ./.github/workflows/_build_jax.yaml with: - SOURCE_IMAGE: | - ghcr.io/nvidia/jax-toolbox-internal:6818759132-jax-amd64-mealkit - ghcr.io/nvidia/jax-toolbox-internal:6818759132-jax-arm64-mealkit - TARGET_IMAGE: jax-toolbox-internal - TARGET_TAGS: | - type=raw,value=test-mealkit-latest,priority=1000 - type=raw,value=test-mealkit-nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 + ARCHITECTURE: amd64 + secrets: inherit - # build-base: - # uses: ./.github/workflows/_build_base.yaml - # with: - # ARCHITECTURE: arm64 - # secrets: inherit - - # build-jax: - # needs: [build-base] - # uses: ./.github/workflows/_build_jax.yaml - # with: - # ARCHITECTURE: arm64 - # BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} - # REF_JAX: jax-v0.4.20 - # REF_XLA: ca31652cdbeb6ea187589dea546ff8019274f8b2 - # secrets: inherit + build-pax: + needs: [build-jax] + uses: ./.github/workflows/_build_pax.yaml + with: + ARCHITECTURE: amd64 + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + secrets: inherit - # build-t5x: - # needs: [build-jax] - # uses: ./.github/workflows/_build_t5x.yaml - # with: - # ARCHITECTURE: arm64 - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # secrets: inherit - - # build-pax: - # # needs: [build-jax] - # uses: ./.github/workflows/_build_pax.yaml - # with: - # ARCHITECTURE: arm64 - # # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6792001145-jax-arm64-mealkit - # # BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6768482418-jax-amd64-mealkit - # secrets: inherit + test-te: + needs: [build-pax] + uses: ./.github/workflows/_test_te.yaml + with: + TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} # finalize: # if: always() @@ -67,94 +39,3 @@ jobs: # with: # PUBLISH_BADGE: false # secrets: inherit - - # merge: - # runs-on: ubuntu-latest - # needs: build - # outputs: - # DOCKER_TAGS: ${{ steps.meta.outputs.tags }} - # steps: - # # 
TODO: currently downloading all artifacts of the entire workflow - # # Revise when this request is fulfilled: - # # https://github.com/actions/download-artifact/issues/214 - # - name: Download image name files into separate folders - # uses: actions/download-artifact@v3 - - # - name: Login to GitHub Container Registry - # uses: docker/login-action@v2 - # with: - # registry: ghcr.io - # username: ${{ github.repository_owner }} - # password: ${{ secrets.GITHUB_TOKEN }} - - # - name: Set docker metadata - # id: meta - # uses: docker/metadata-action@v4 - # with: - # images: | - # ${{ env.UPLD_IMAGE }} - # flavor: | - # latest=false - # tags: | - # type=raw,value=${{ github.run_id }}-jax-multiarch - # labels: - # org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - # - name: Combine images into a single multi-arch image - # shell: bash -x -e {0} - # run: | - # docker manifest create ${{ steps.meta.outputs.tags }} $( - # for IMAGE in $(cat image-name-jax-*/image-name.txt); do - # REPO=$(echo $IMAGE | cut -d: -f1) - # DIGEST=$( - # docker manifest inspect $IMAGE |\ - # jq -r '.manifests[] | select(.platform.os == "linux") | .digest' - # ) - # echo $REPO@${DIGEST} - # done - # ) - # docker manifest push ${{ steps.meta.outputs.tags }} - - # merge: - # runs-on: ubuntu-latest - # needs: build - # outputs: - # DOCKER_TAGS: ${{ steps.meta.outputs.tags }} - # steps: - # - name: Download image name files into separate folders - # uses: actions/download-artifact@v3 - - # - name: Login to GitHub Container Registry - # uses: docker/login-action@v2 - # with: - # registry: ghcr.io - # username: ${{ github.repository_owner }} - # password: ${{ secrets.GITHUB_TOKEN }} - - # - name: Set docker metadata - # id: meta - # uses: docker/metadata-action@v4 - # with: - # images: | - # ${{ env.UPLD_IMAGE }} - # flavor: | - # latest=false - # tags: | - # type=raw,value=${{ github.run_id }}-base-multiarch - # labels: - # org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - # - 
name: Combine images into a single multi-arch image - # shell: bash -x -e {0} - # run: | - # docker manifest create ${{ steps.meta.outputs.tags }} $( - # for IMAGE in $(cat image-name-base-*/image-name.txt); do - # REPO=$(echo $IMAGE | cut -d: -f1) - # DIGEST=$( - # docker manifest inspect $IMAGE |\ - # jq -r '.manifests[] | select(.platform.os == "linux") | .digest' - # ) - # echo $REPO@${DIGEST} - # done - # ) - # docker manifest push ${{ steps.meta.outputs.tags }} diff --git a/.github/workflows/_test_te.yaml b/.github/workflows/_test_te.yaml index 3f7d571b9..1e2276b20 100644 --- a/.github/workflows/_test_te.yaml +++ b/.github/workflows/_test_te.yaml @@ -1,14 +1,13 @@ name: ~test TransformerEngine on: - # Called from another workflow workflow_call: inputs: - JAX_TE_IMAGE: + TE_IMAGE: type: string - description: 'JAX-TE image build by NVIDIA/JAX-Toolbox' + description: 'JAX+TE+PAXML image' required: true - default: 'ghcr.io/nvidia/jax-te:latest' + default: 'ghcr.io/nvidia/upstream-pax:latest' outputs: UNIT_TEST_ARTIFACT_NAME: description: 'Name of the unit test artifact for downstream workflows' @@ -40,39 +39,20 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Pull JAX-TE image + - name: Pull TE image shell: bash -x -e {0} run: | - docker pull ${{ inputs.JAX_TE_IMAGE }} - docker tag ${{ inputs.JAX_TE_IMAGE }} jax:te + docker pull ${{ inputs.TE_IMAGE }} + docker tag ${{ inputs.TE_IMAGE }} te:local - - name: Run JAX-TE unit tests with docker - shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log jax:te bash -x /cmd.sh + - name: Run TE unit tests with docker + shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log te:local bash -x /cmd.sh run: | - # TE test reqs - TEST_REQS="pytest-reportlog" - TE_PATH=$(dirname $(python -c "import transformer_engine as te; print(*te.__path__)")) - ## WAR: BEGIN - # This installation step is temporary and should be removed and replaced with - # $ NVTE_FRAMEWORK=jax pip install -e 
${TE_PATH}[test] $TEST_REQS - # when praxis no longer uses jax at github head as a requirement. Praxis requirements - # are defined here: https://github.com/google/praxis/blob/main/requirements.in - - # After https://github.com/google/praxis/pull/20 was merged, SKIP_HEAD_INSTALLS was introduced - # as an environment variable to allow skipping of head installs like jax/fiddle that overrode - # the jax package that we already installed in our base image. Once SKIP_HEAD_INSTALLS is - # set, the user must specify the head installs manually to ensure they are respected by pip's - # dependency resolver. This is brittle since new head installs may be missed, so they must be - # manually added to TEST_REQS below. Praxis is still installed from head as opposed to pypi - # because no wheel exists with this feature yet. - if ! pip show praxis >/dev/null 2>&1; then - TEST_REQS+=" fiddle git+https://github.com/google/praxis" - fi - SKIP_HEAD_INSTALLS=1 NVTE_FRAMEWORK=jax pip install -e ${TE_PATH}[test] $TEST_REQS - ## WAR: END - pytest --report-log=/log/report.jsonl ${TE_PATH}/tests/jax || true + pip install pytest-reportlog + pytest --report-log=/log/report.jsonl ${SRC_PATH_TE}/tests/jax - name: Upload unit test json logs + if: success() || failure() uses: actions/upload-artifact@v3 with: name: ${{ env.UNIT_TEST_ARTIFACT_NAME }} @@ -111,7 +91,7 @@ jobs: id: meta shell: bash -x -e {0} run: | - PYXIS_IMAGE_NAME=${{ inputs.JAX_TE_IMAGE }} + PYXIS_IMAGE_NAME=${{ inputs.TE_IMAGE }} PYXIS_IMAGE_NAME=${PYXIS_IMAGE_NAME/ghcr.io\//ghcr.io#} TEST_CASE_NAME=1P${{ matrix.N_GPU }}G JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME} From fc2c6e6c22ae1ea0e5eb80249a5280e3880328c0 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Fri, 10 Nov 2023 08:28:44 +0000 Subject: [PATCH 090/146] fix pax test --- .github/container/Dockerfile.pax.amd64 | 6 ++++-- .github/container/Dockerfile.pax.arm64 | 6 ++++-- .github/workflows/_sandbox.yaml | 20 ++++++++++---------- 3 files changed, 18 insertions(+), 
14 deletions(-) diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 index 26b0411ae..15d2d5f2b 100644 --- a/.github/container/Dockerfile.pax.amd64 +++ b/.github/container/Dockerfile.pax.amd64 @@ -21,8 +21,10 @@ ARG SRC_PATH_PAXML ARG SRC_PATH_PRAXIS RUN <<"EOF" bash -ex -get-source.sh -f ${REPO_PAXML} -r ${REF_PAXML} -d ${SRC_PATH_PAXML} -m /opt/pip-tools.d/manifest.paxml -get-source.sh -f ${REPO_PRAXIS} -r ${REF_PRAXIS} -d ${SRC_PATH_PRAXIS} -m /opt/pip-tools.d/manifest.praxis +get-source.sh -f ${REPO_PAXML} -r ${REF_PAXML} -d ${SRC_PATH_PAXML} +get-source.sh -f ${REPO_PRAXIS} -r ${REF_PRAXIS} -d ${SRC_PATH_PRAXIS} +echo "-e file://${SRC_PATH_PAXML}[gpu]" >> /opt/pip-tools.d/manifest.pax +echo "-e file://${SRC_PATH_PRAXIS}" >> /opt/pip-tools.d/manifest.pax for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do pushd ${src} diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index a4b281208..573051a34 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -133,8 +133,10 @@ echo "tensorflow_datasets==4.9.2" >> /opt/pip-tools.d/manifest.pax echo "chex==0.1.7" >> /opt/pip-tools.d/manifest.pax echo "auditwheel" >> /opt/pip-tools.d/manifest.pax -get-source.sh -f ${REPO_PAXML} -r ${REF_PAXML} -d ${SRC_PATH_PAXML} -m /opt/pip-tools.d/manifest.pax -get-source.sh -f ${REPO_PRAXIS} -r ${REF_PRAXIS} -d ${SRC_PATH_PRAXIS} -m /opt/pip-tools.d/manifest.pax +get-source.sh -f ${REPO_PAXML} -r ${REF_PAXML} -d ${SRC_PATH_PAXML} +get-source.sh -f ${REPO_PRAXIS} -r ${REF_PRAXIS} -d ${SRC_PATH_PRAXIS} +echo "-e file://${SRC_PATH_PAXML}[gpu]" >> /opt/pip-tools.d/manifest.pax +echo "-e file://${SRC_PATH_PRAXIS}" >> /opt/pip-tools.d/manifest.pax for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do pushd ${src} diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 37539fc90..fd169dca1 100644 --- a/.github/workflows/_sandbox.yaml +++ 
b/.github/workflows/_sandbox.yaml @@ -11,25 +11,25 @@ permissions: jobs: - build-jax: - uses: ./.github/workflows/_build_jax.yaml - with: - ARCHITECTURE: amd64 - secrets: inherit + # build-jax: + # uses: ./.github/workflows/_build_jax.yaml + # with: + # ARCHITECTURE: amd64 + # secrets: inherit build-pax: - needs: [build-jax] + # needs: [build-jax] uses: ./.github/workflows/_build_pax.yaml with: ARCHITECTURE: amd64 - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} secrets: inherit - test-te: + test-pax: needs: [build-pax] - uses: ./.github/workflows/_test_te.yaml + uses: ./.github/workflows/_test_pax.yaml with: - TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} + PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} # finalize: # if: always() From f9c6cd3158cdb1cb8c52e490f0f38cef57df5985 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Fri, 10 Nov 2023 08:33:25 +0000 Subject: [PATCH 091/146] fix TE test --- .github/container/Dockerfile.jax | 2 +- .github/workflows/_sandbox.yaml | 22 ++++++++++++++-------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 4b6526a6e..f77668308 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -85,7 +85,7 @@ pip install ninja && rm -rf ~/.cache/pip get-source.sh -f ${REPO_TE} -r ${REF_TE} -d ${SRC_PATH_TE} pushd ${SRC_PATH_TE} python setup.py bdist_wheel && rm -rf build -echo "transformer-engine @ file://$(ls ${SRC_PATH_TE}/dist/*.whl)[test]" >> /opt/pip-tools.d/manifest.te +echo "transformer-engine[test] @ file://$(ls ${SRC_PATH_TE}/dist/*.whl)" >> /opt/pip-tools.d/manifest.te EOF # TODO: properly configure entrypoint diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index fd169dca1..6b5d93093 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ 
-11,11 +11,11 @@ permissions: jobs: - # build-jax: - # uses: ./.github/workflows/_build_jax.yaml - # with: - # ARCHITECTURE: amd64 - # secrets: inherit + build-jax: + uses: ./.github/workflows/_build_jax.yaml + with: + ARCHITECTURE: amd64 + secrets: inherit build-pax: # needs: [build-jax] @@ -25,11 +25,17 @@ jobs: # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} secrets: inherit - test-pax: + test-te: needs: [build-pax] - uses: ./.github/workflows/_test_pax.yaml + uses: ./.github/workflows/_test_te.yaml with: - PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} + TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} + + # test-pax: + # needs: [build-pax] + # uses: ./.github/workflows/_test_pax.yaml + # with: + # PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} # finalize: # if: always() From 7230589bc8e616428ab0d3cf6eb50c55943bdf5a Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Fri, 10 Nov 2023 08:40:10 +0000 Subject: [PATCH 092/146] merge CI yaml --- .../workflows/{_ci_amd64.yaml => _ci.yaml} | 17 +- .github/workflows/_ci_arm64.yaml | 153 ------------------ .github/workflows/ci.yaml | 6 +- 3 files changed, 17 insertions(+), 159 deletions(-) rename .github/workflows/{_ci_amd64.yaml => _ci.yaml} (91%) delete mode 100644 .github/workflows/_ci_arm64.yaml diff --git a/.github/workflows/_ci_amd64.yaml b/.github/workflows/_ci.yaml similarity index 91% rename from .github/workflows/_ci_amd64.yaml rename to .github/workflows/_ci.yaml index e4c2402d6..7c6048110 100644 --- a/.github/workflows/_ci_amd64.yaml +++ b/.github/workflows/_ci.yaml @@ -1,8 +1,12 @@ -name: CI (amd64) +name: CI +run-name: CI-${{ inputs.ARCHITECTURE }} on: workflow_call: inputs: + ARCHITECTURE: + type: string + required: true CUDA_IMAGE: type: string required: true @@ -90,7 +94,7 @@ jobs: needs: metadata uses: ./.github/workflows/_build_base.yaml with: - ARCHITECTURE: amd64 + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} BASE_IMAGE: ${{ inputs.CUDA_IMAGE || 
'latest' }} BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} secrets: inherit @@ -99,7 +103,7 @@ jobs: needs: [metadata, build-base] uses: ./.github/workflows/_build_jax.yaml with: - ARCHITECTURE: amd64 + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} REPO_JAX: ${{ needs.metadata.outputs.REPO_JAX }} @@ -112,6 +116,7 @@ jobs: build-t5x: needs: [metadata, build-jax] + if: inputs.ARCHITECTURE == 'amd64' # T5X arm64 build is wip in PR 252 uses: ./.github/workflows/_build_t5x.yaml with: ARCHITECTURE: amd64 @@ -125,7 +130,7 @@ jobs: needs: [metadata, build-jax] uses: ./.github/workflows/_build_pax.yaml with: - ARCHITECTURE: amd64 + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} REPO_PAXML: ${{ needs.metadata.outputs.REPO_PAXML }} @@ -159,6 +164,7 @@ jobs: test-jax: needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a uses: ./.github/workflows/_test_jax.yaml with: JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} @@ -166,6 +172,7 @@ jobs: test-te: needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a uses: ./.github/workflows/_test_te.yaml with: JAX_TE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} @@ -173,6 +180,7 @@ jobs: test-t5x: needs: build-t5x + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a uses: ./.github/workflows/_test_t5x.yaml with: T5X_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAG_FINAL }} @@ -180,6 +188,7 @@ jobs: test-pax: needs: build-pax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a uses: ./.github/workflows/_test_pax.yaml with: PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} diff --git a/.github/workflows/_ci_arm64.yaml b/.github/workflows/_ci_arm64.yaml deleted file mode 100644 index 2a778cb15..000000000 --- a/.github/workflows/_ci_arm64.yaml +++ /dev/null 
@@ -1,153 +0,0 @@ -name: CI (arm64) - -on: - workflow_call: - inputs: - CUDA_IMAGE: - type: string - required: true - SRC_JAX: - type: string - required: true - SRC_XLA: - type: string - required: true - SRC_TE: - type: string - required: true - SRC_T5X: - type: string - required: true - SRC_PAXML: - type: string - required: true - SRC_PRAXIS: - type: string - required: true - outputs: - TAG_BASE: - description: "Tags of the base image built" - value: ${{ jobs.build-base.outputs.DOCKER_TAGS }} - TAG_JAX: - description: "Tags of the JAX image built" - value: ${{ jobs.build-jax.outputs.DOCKER_TAG_FINAL }} - TAG_T5X: - description: "Tags of the T5X image built" - value: ${{ jobs.build-t5x.outputs.DOCKER_TAG_FINAL }} - TAG_PAX: - description: "Tags of the PAX image built" - value: ${{ jobs.build-pax.outputs.DOCKER_TAG_FINAL }} - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container - -jobs: - - metadata: - runs-on: ubuntu-22.04 - outputs: - BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} - REPO_JAX: ${{ steps.parse-inputs.outputs.REPO_JAX }} - REF_JAX: ${{ steps.parse-inputs.outputs.REF_JAX }} - REPO_XLA: ${{ steps.parse-inputs.outputs.REPO_XLA }} - REF_XLA: ${{ steps.parse-inputs.outputs.REF_XLA }} - REPO_TE: ${{ steps.parse-inputs.outputs.REPO_TE }} - REF_TE: ${{ steps.parse-inputs.outputs.REF_TE }} - REPO_T5X: ${{ steps.parse-inputs.outputs.REPO_T5X }} - REF_T5X: ${{ steps.parse-inputs.outputs.REF_T5X }} - REPO_PAXML: ${{ steps.parse-inputs.outputs.REPO_PAXML }} - REF_PAXML: ${{ steps.parse-inputs.outputs.REF_PAXML }} - REPO_PRAXIS: ${{ steps.parse-inputs.outputs.REPO_PRAXIS }} - REF_PRAXIS: ${{ steps.parse-inputs.outputs.REF_PRAXIS }} - steps: - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v3 - - - name: Set build date - id: date - shell: bash -x -e {0} - run: | - BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') - echo 
"BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - - - name: split input "repo#ref" into repo and ref parts - id: parse-inputs - shell: bash -x -e {0} - run: | - source .github/workflows/scripts/parse_git_src.sh - - # default values are for `pull_request` event types - parse_git_src JAX "${{ inputs.SRC_JAX }}" - parse_git_src XLA "${{ inputs.SRC_XLA }}" - parse_git_src TE "${{ inputs.SRC_TE }}" - parse_git_src T5X "${{ inputs.SRC_T5X }}" - parse_git_src PAXML "${{ inputs.SRC_PAXML }}" - parse_git_src PRAXIS "${{ inputs.SRC_PRAXIS }}" - - build-base: - needs: metadata - uses: ./.github/workflows/_build_base.yaml - with: - ARCHITECTURE: arm64 - BASE_IMAGE: ${{ inputs.CUDA_IMAGE || 'latest' }} - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - secrets: inherit - - build-jax: - needs: [metadata, build-base] - uses: ./.github/workflows/_build_jax.yaml - with: - ARCHITECTURE: arm64 - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} - REPO_JAX: ${{ needs.metadata.outputs.REPO_JAX }} - REF_JAX: ${{ needs.metadata.outputs.REF_JAX }} - REPO_XLA: ${{ needs.metadata.outputs.REPO_XLA }} - REF_XLA: ${{ needs.metadata.outputs.REF_XLA }} - REPO_TE: ${{ needs.metadata.outputs.REPO_TE }} - REF_TE: ${{ needs.metadata.outputs.REF_TE }} - secrets: inherit - - # build-t5x: - # needs: [metadata, build-jax] - # uses: ./.github/workflows/_build_t5x.yaml - # with: - # ARCHITECTURE: arm64 - # BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # REPO_T5X: ${{ needs.metadata.outputs.REPO_T5X }} - # REF_T5X: ${{ needs.metadata.outputs.REF_T5X }} - # secrets: inherit - - build-pax: - needs: [metadata, build-jax] - uses: ./.github/workflows/_build_pax.yaml - with: - ARCHITECTURE: arm64 - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - REPO_PAXML: ${{ needs.metadata.outputs.REPO_PAXML }} - 
REF_PAXML: ${{ needs.metadata.outputs.REF_PAXML }} - REPO_PRAXIS: ${{ needs.metadata.outputs.REPO_PRAXIS }} - REF_PRAXIS: ${{ needs.metadata.outputs.REF_PRAXIS }} - secrets: inherit - - # build-rosetta-t5x: - # uses: ./.github/workflows/_build_rosetta.yaml - # needs: [metadata, build-t5x] - # with: - # BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAG_MEALKIT }} - # BASE_LIBRARY: t5x - # secrets: inherit - - # build-rosetta-pax: - # uses: ./.github/workflows/_build_rosetta.yaml - # needs: [metadata, build-pax] - # with: - # BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_MEALKIT }} - # BASE_LIBRARY: pax - # secrets: inherit diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 30457c4a2..35d5ddfc6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -54,8 +54,9 @@ permissions: jobs: amd64: - uses: ./.github/workflows/_ci_amd64.yaml + uses: ./.github/workflows/_ci.yaml with: + ARCHITECTURE: amd64 CUDA_IMAGE: ${{ inputs.CUDA_IMAGE || 'latest' }} SRC_JAX: ${{ inputs.SRC_JAX || 'https://github.com/google/jax.git#main' }} SRC_XLA: ${{ inputs.SRC_XLA || 'https://github.com/openxla/xla.git#main'}} @@ -66,8 +67,9 @@ jobs: secrets: inherit arm64: - uses: ./.github/workflows/_ci_arm64.yaml + uses: ./.github/workflows/_ci.yaml with: + ARCHITECTURE: arm64 CUDA_IMAGE: ${{ inputs.CUDA_IMAGE || 'latest' }} SRC_JAX: ${{ inputs.SRC_JAX || 'https://github.com/google/jax.git#main' }} SRC_XLA: ${{ inputs.SRC_XLA || 'https://github.com/openxla/xla.git#main'}} From 429ae4d74f10e09498d306836be24e9c0a8793bc Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Fri, 10 Nov 2023 08:40:51 +0000 Subject: [PATCH 093/146] fix arg --- .github/workflows/_ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 7c6048110..7b845b1c6 100644 --- 
a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -175,7 +175,7 @@ jobs: if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a uses: ./.github/workflows/_test_te.yaml with: - JAX_TE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + TE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} secrets: inherit test-t5x: From de32e4f7855f0352623cc3502a157874316622b5 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Fri, 10 Nov 2023 08:45:13 +0000 Subject: [PATCH 094/146] rerun TE/PAX test --- .github/workflows/_sandbox.yaml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 6b5d93093..2e2a7c57a 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -18,11 +18,11 @@ jobs: secrets: inherit build-pax: - # needs: [build-jax] + needs: [build-jax] uses: ./.github/workflows/_build_pax.yaml with: ARCHITECTURE: amd64 - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} secrets: inherit test-te: @@ -30,12 +30,14 @@ jobs: uses: ./.github/workflows/_test_te.yaml with: TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit - # test-pax: - # needs: [build-pax] - # uses: ./.github/workflows/_test_pax.yaml - # with: - # PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} + test-pax: + needs: [build-pax] + uses: ./.github/workflows/_test_pax.yaml + with: + PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit # finalize: # if: always() From ab73f6bf974e2a507f3dd65bd0341f0895cf2c35 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Fri, 10 Nov 2023 09:45:20 +0000 Subject: [PATCH 095/146] fix TE multi-device test --- .github/workflows/_sandbox.yaml | 14 +++++++------- .github/workflows/_test_te.yaml | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git 
a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 38b4ac55c..68bf653d9 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -2,7 +2,7 @@ name: "~Sandbox" on: # workflow_dispatch: - # push: + push: permissions: contents: read # to fetch code @@ -32,12 +32,12 @@ jobs: TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} secrets: inherit - test-pax: - needs: [build-pax] - uses: ./.github/workflows/_test_pax.yaml - with: - PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-pax: + # needs: [build-pax] + # uses: ./.github/workflows/_test_pax.yaml + # with: + # PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit # finalize: # if: always() diff --git a/.github/workflows/_test_te.yaml b/.github/workflows/_test_te.yaml index 1e2276b20..cc4b92de4 100644 --- a/.github/workflows/_test_te.yaml +++ b/.github/workflows/_test_te.yaml @@ -124,7 +124,7 @@ jobs: --container-entrypoint \ bash -e -x -c 'nvidia-smi pip install pytest pytest-reportlog cuda-python - cd \$(dirname \$(python -c "import transformer_engine as te; print(*te.__path__)"))/examples/jax/encoder + cd \${SRC_PATH_TE}/examples/jax/encoder pip install -r requirements.txt pytest --report-log=/output/$(basename ${{ steps.meta.outputs.PYTEST_LOG_FILE }}) \ test_single_gpu_encoder.py \ From 9eb97e80c6024e8b50262bd65b739626d24b4fe1 Mon Sep 17 00:00:00 2001 From: Yu-Hang 'Maxin' Tang Date: Fri, 10 Nov 2023 16:43:50 +0000 Subject: [PATCH 096/146] fix lzma build issue --- .github/container/Dockerfile.base | 6 ++++++ .github/container/Dockerfile.pax.arm64 | 15 --------------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index fe719ee24..682bca310 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -18,8 +18,14 @@ RUN apt-get update && \ git \ lld \ vim \ + bat \ + curl \ + git \ + 
gnupg \ + rsync \ python-is-python3 \ python3-pip \ + liblzma-dev \ wget \ && \ apt-get clean && \ diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index 573051a34..7a20e975e 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -15,21 +15,6 @@ ARG SRC_PATH_PRAXIS=/opt/praxis ARG BASE_IMAGE FROM ${BASE_IMAGE} as wheel-builder -# We need to build some packages from source, bring some dependencies. -RUN < Date: Sat, 11 Nov 2023 09:05:53 +0000 Subject: [PATCH 097/146] edit TE test name --- .github/workflows/_test_te.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_test_te.yaml b/.github/workflows/_test_te.yaml index cc4b92de4..b6001256e 100644 --- a/.github/workflows/_test_te.yaml +++ b/.github/workflows/_test_te.yaml @@ -14,7 +14,7 @@ on: value: ${{ jobs.te-unit-tests.outputs.UNIT_TEST_ARTIFACT_NAME }} INTEGRATION_TEST_ARTIFACT_NAME: description: 'Name of the integration test artifact for downstream workflows' - value: ${{ jobs.single-process-multi-device.outputs.INTEGRATION_TEST_ARTIFACT_NAME }} + value: ${{ jobs.te-multi-gpu.outputs.INTEGRATION_TEST_ARTIFACT_NAME }} env: UNIT_TEST_ARTIFACT_NAME: unit-test-logs @@ -58,7 +58,7 @@ jobs: name: ${{ env.UNIT_TEST_ARTIFACT_NAME }} path: /log/report.jsonl - single-process-multi-device: + te-multi-gpu: strategy: matrix: N_GPU: [1, 2, 4, 8] From fcb29b4c8684f8ccb137cb99979f8cf601664ac8 Mon Sep 17 00:00:00 2001 From: Yu-Hang Maxin Tang Date: Mon, 13 Nov 2023 14:15:13 +0000 Subject: [PATCH 098/146] fix TE arm64 test install error --- .github/container/Dockerfile.jax | 2 +- .github/container/Dockerfile.pax.arm64 | 9 ++------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index fd4a6dde6..aa2726aff 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -98,7 +98,7 @@ pip install 
ninja && rm -rf ~/.cache/pip get-source.sh -f ${REPO_TE} -r ${REF_TE} -d ${SRC_PATH_TE} pushd ${SRC_PATH_TE} python setup.py bdist_wheel && rm -rf build -echo "transformer-engine[test] @ file://$(ls ${SRC_PATH_TE}/dist/*.whl)" >> /opt/pip-tools.d/manifest.te +echo "transformer-engine @ file://$(ls ${SRC_PATH_TE}/dist/*.whl)" >> /opt/pip-tools.d/manifest.te EOF # TODO: properly configure entrypoint diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index 7a20e975e..552c6dddc 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -97,13 +97,8 @@ ARG REF_PRAXIS ARG SRC_PATH_PAXML ARG SRC_PATH_PRAXIS -RUN <> /opt/pip-tools.d/manifest.pax From 22d400b40a90a87e0c90ae64e555c2867ab452c3 Mon Sep 17 00:00:00 2001 From: Yu-Hang Maxin Tang Date: Mon, 13 Nov 2023 14:47:38 +0000 Subject: [PATCH 099/146] remove --install option from get-source.sh --- .github/container/get-source.sh | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/.github/container/get-source.sh b/.github/container/get-source.sh index 069abee0b..b8fff71a7 100755 --- a/.github/container/get-source.sh +++ b/.github/container/get-source.sh @@ -12,14 +12,13 @@ usage() { echo " -d, --dir PATH [Required] Local path to check out the source code." echo " -f, --from URL [Required] URL of the source repo." echo " -h, --help Print usage." - echo " -i, --install Install the package immediately using pip install." echo " -m, --manifest FILE Create a pip manifest file if specified" echo " -r, --ref REF Git commit SHA, branch name, or tag name to checkout. Uses default branch if not specified." echo exit $1 } -args=$(getopt -o d:f:hi:m:r: --long dir:,from:,help,install,manifest:,ref: -- "$@") +args=$(getopt -o d:f:hm:r: --long dir:,from:,help,manifest:,ref: -- "$@") if [[ $? 
-ne 0 ]]; then exit 1 fi @@ -28,7 +27,6 @@ fi GIT_REPO="" GIT_REF="${GIT_REF:-HEAD}" -INSTALL=${INSTALL:-0} INSTALL_DIR="" MANIFEST_FILE="" @@ -46,10 +44,6 @@ while [ : ]; do -h | --help) usage ;; - -i | --install) - INSTALL=true - shift - ;; -m | --manifest) MANIFEST_FILE="$2" shift 2 @@ -93,9 +87,5 @@ git submodule init git submodule update --recursive popd -if (( INSTALL == 1 )); then - pip install -e ${INSTALL_DIR} -elif [[ -n "${MANIFEST_FILE}" ]]; then - echo "Writing to ${MANIFEST_FILE}:" - echo "-e file://${INSTALL_DIR}" | tee -a ${MANIFEST_FILE} -fi +echo "Writing to ${MANIFEST_FILE}:" +echo "-e file://${INSTALL_DIR}" | tee -a ${MANIFEST_FILE} From e9f074fdd1c5781e3c0dc47db9b81eece2e81b49 Mon Sep 17 00:00:00 2001 From: Yu-Hang Maxin Tang Date: Mon, 13 Nov 2023 15:10:44 +0000 Subject: [PATCH 100/146] fix TE arm64 test install error --- .github/container/Dockerfile.pax.amd64 | 3 +++ .github/container/Dockerfile.pax.arm64 | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 index 15d2d5f2b..d9dffbc36 100644 --- a/.github/container/Dockerfile.pax.amd64 +++ b/.github/container/Dockerfile.pax.amd64 @@ -20,6 +20,9 @@ ARG REF_PRAXIS ARG SRC_PATH_PAXML ARG SRC_PATH_PRAXIS +# update TE manifest file to install the [test] extras +RUN sed -i "s/.whl/.whl[test]/g" /opt/pip-tools.d/manifest.te + RUN <<"EOF" bash -ex get-source.sh -f ${REPO_PAXML} -r ${REF_PAXML} -d ${SRC_PATH_PAXML} get-source.sh -f ${REPO_PRAXIS} -r ${REF_PRAXIS} -d ${SRC_PATH_PRAXIS} diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index 552c6dddc..79a1342cc 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -97,9 +97,6 @@ ARG REF_PRAXIS ARG SRC_PATH_PAXML ARG SRC_PATH_PRAXIS -# update TE manifest file to install the [test] extras -RUN sed -i "s/transformer-engine/transformer-engine[test]/g" 
/opt/pip-tools.d/manifest.te - COPY --from=lingvo-builder /tmp/lingvo/dist/lingvo*linux_aarch64.whl /opt/ RUN echo "lingvo @ file://$(ls /opt/lingvo*.whl)" >> /opt/pip-tools.d/manifest.pax From 602002fa1a8df7110bea6e152b560873c4002af8 Mon Sep 17 00:00:00 2001 From: Yu-Hang Maxin Tang Date: Mon, 13 Nov 2023 15:11:15 +0000 Subject: [PATCH 101/146] disable sandbox --- .github/workflows/_sandbox.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 68bf653d9..eaeae2bf6 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -2,7 +2,7 @@ name: "~Sandbox" on: # workflow_dispatch: - push: + # push: permissions: contents: read # to fetch code From 12a57eb72517d4944b441576fd3d7add44e6bb45 Mon Sep 17 00:00:00 2001 From: Yu-Hang Maxin Tang Date: Mon, 13 Nov 2023 15:32:50 +0000 Subject: [PATCH 102/146] i'm jet-lagged --- .github/container/Dockerfile.pax.amd64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 index d9dffbc36..89697dced 100644 --- a/.github/container/Dockerfile.pax.amd64 +++ b/.github/container/Dockerfile.pax.amd64 @@ -21,7 +21,7 @@ ARG SRC_PATH_PAXML ARG SRC_PATH_PRAXIS # update TE manifest file to install the [test] extras -RUN sed -i "s/.whl/.whl[test]/g" /opt/pip-tools.d/manifest.te +RUN sed -i "s/transformer-engine @/transformer-engine[test] @/g" /opt/pip-tools.d/manifest.te RUN <<"EOF" bash -ex get-source.sh -f ${REPO_PAXML} -r ${REF_PAXML} -d ${SRC_PATH_PAXML} From dbaba5b010d58d8a394bf5bfb17d89a901b79cf7 Mon Sep 17 00:00:00 2001 From: Yu-Hang Maxin Tang Date: Mon, 13 Nov 2023 15:38:31 +0000 Subject: [PATCH 103/146] use Pax image for TE testing --- .github/workflows/_ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 7b845b1c6..6f0cd8d8c 100644 --- 
a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -175,7 +175,7 @@ jobs: if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a uses: ./.github/workflows/_test_te.yaml with: - TE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} secrets: inherit test-t5x: From ccafb52460b881a9ad072fd40e7dac887be81ff3 Mon Sep 17 00:00:00 2001 From: "Yu-Hang \"Maxin\" Tang" Date: Mon, 13 Nov 2023 07:57:54 -0800 Subject: [PATCH 104/146] Fix job dependency --- .github/workflows/_ci.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 6f0cd8d8c..1447f5076 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -171,7 +171,7 @@ jobs: secrets: inherit test-te: - needs: build-jax + needs: build-pax if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a uses: ./.github/workflows/_test_te.yaml with: @@ -200,4 +200,4 @@ jobs: # with: # ROSETTA_T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} # secrets: inherit - \ No newline at end of file + From 6974a3adcd73312b2f064faad5d18e68e3484147 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Wed, 15 Nov 2023 21:52:41 -0700 Subject: [PATCH 105/146] Add nightly rosetta build and test --- .github/workflows/_build_rosetta.yaml | 172 +++++++++--------- .github/workflows/_sandbox.yaml | 8 + .github/workflows/_test_pax_rosetta.yaml | 12 +- ...ml => nightly-rosetta-pax-build-test.yaml} | 137 ++++++++------ .../nightly-rosetta-t5x-build-test.yaml | 139 ++++++++------ .github/workflows/nightly-t5x-build.yaml | 2 +- .github/workflows/nightly-te-test.yaml | 6 +- rosetta/Dockerfile.pax | 2 +- rosetta/Dockerfile.t5x | 2 +- 9 files changed, 265 insertions(+), 215 deletions(-) rename .github/workflows/{nightly-rosetta-pax-build.yaml => nightly-rosetta-pax-build-test.yaml} (54%) diff --git a/.github/workflows/_build_rosetta.yaml b/.github/workflows/_build_rosetta.yaml 
index e811e31bf..1f6509e8c 100644 --- a/.github/workflows/_build_rosetta.yaml +++ b/.github/workflows/_build_rosetta.yaml @@ -3,6 +3,10 @@ name: ~build Rosetta container on: workflow_call: inputs: + ARCHITECTURE: + type: string + description: 'CPU architecture to build the image for, e.g. amd64, arm64' + required: true BASE_LIBRARY: type: string description: 'Choice of base library to build on:' @@ -14,18 +18,21 @@ on: required: false BUILD_DATE: type: string - description: "Build date in YYYY-MM-DD format" + description: 'Build date in YYYY-MM-DD format' required: false default: 'NOT SPECIFIED' - PLATFORMS: + BADGE_FILENAME: type: string - description: 'JSON list of platforms. Ex: ["amd64"]' + description: 'Name of the endpoint JSON file for shields.io badge' required: false - default: '["arm64", "amd64"]' + default: 'badge-rosetta-build' outputs: - DOCKER_TAGS: - description: "Tags of the image built" - value: ${{ jobs.merge.outputs.DOCKER_TAGS }} + DOCKER_TAG_MEALKIT: + description: 'Tags of the mealkit image build' + value: $ {{ jobs.build-rosetta.output.DOCKER_TAG_MEALKIT }} + DOCKER_TAG_FINAL: + description: "Tags of the complete image built" + value: ${{ jobs.build-rosetta.outputs.DOCKER_TAG_FINAL }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -38,13 +45,17 @@ permissions: jobs: - build: - strategy: - fail-fast: false - matrix: - PLATFORM: ${{ fromJSON(inputs.PLATFORMS) }} - runs-on: [self-hosted, "${{ matrix.PLATFORM }}", small] + build-rosetta: + runs-on: [self-hosted, "${{ input.ARCHITECTURE }}", small] + env: + BADGE_FILENAME_FULL: ${{ input.BADGE_FILENAME}}-${{ input.ARCHITECTURE}}.json + output: + DOCKER_TAG_MEALKIT: ${{ steps.mealkit-metadata.outputs.tags }} + DOCKER_TAG_FINAL: ${{ steps.final-metadata.outputs.tags }} steps: + - name: Print environment variables + run: env + - name: Set default BASE_IMAGE id: defaults run: | @@ -54,9 +65,6 @@ jobs: echo "BASE_IMAGE=${{ env.DOCKER_REGISTRY }}/upstream-${{ inputs.BASE_LIBRARY }}:latest" >> 
"$GITHUB_OUTPUT" fi - - name: Print environment variables - run: env - - name: Check out the repository under ${GITHUB_WORKSPACE} uses: actions/checkout@v3 @@ -67,92 +75,88 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Set docker metadata - id: meta - uses: docker/metadata-action@v4 - with: - images: ${{ env.UPLD_IMAGE }} - flavor: latest=false - tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ matrix.PLATFORM }} - labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 with: driver-opts: | image=moby/buildkit:v0.12.1 - - name: Build docker images + - name: Set docker metadata - mealkit + id: mealkit-metadata + uses: docker/metadata-action@v4 + with: + images: ${{ env.UPLD_IMAGE }} + flavor: latest=false + tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ input.ARCHITECHTURE }}-mealkit + labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} + + - name: Build docker images - mealkit + id: mealkit-build uses: docker/build-push-action@v4 with: context: rosetta/ push: true file: rosetta/Dockerfile.${{ inputs.BASE_LIBRARY }} - platforms: linux/${{ matrix.PLATFORM }} - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} + platforms: linux/${{ input.ARCHITECHTURE }} + tags: ${{ steps.mealkit-metadata.outputs.tags }} + labels: ${{ steps.mealkit-metadata.outputs.labels }} target: rosetta build-args: | BASE_IMAGE=${{ steps.defaults.outputs.BASE_IMAGE }} - # Temporary workaround until the following issues are solved: - # https://github.com/orgs/community/discussions/17245 - # https://github.com/actions/runner/pull/2477 - # https://github.com/orgs/community/discussions/26639 - - name: Save image name as text file - shell: bash -x -e {0} - run: | - echo "${{ steps.meta.outputs.tags }}" >> image-name.txt - - - name: Upload image name file as artifact - uses: 
actions/upload-artifact@v3 + - name: Set docker metadata - final + id: final-metadata + uses: docker/metadata-action@v4 with: - name: image-name-${{ inputs.BASE_LIBRARY }}-${{ matrix.PLATFORM }} - path: image-name.txt - - merge: - runs-on: ubuntu-latest - needs: build - outputs: - DOCKER_TAGS: ${{ steps.meta.outputs.tags }} - steps: - # TODO: currently downloading all artifacts of the entire workflow - # Revise when this request is fulfilled: - # https://github.com/actions/download-artifact/issues/214 - - name: Download image name files into separate folders - uses: actions/download-artifact@v3 + images: ${{ env.UPLD_IMAGE }} + flavor: latest=false + tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ input.ARCHITECHTURE }}-final + labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 + - name: Build docker images - final + uses: docker/build-push-action@v4 with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} + context: rosetta/ + push: true + file: rosetta/Dockerfile.${{ inputs.BASE_LIBRARY }} + platforms: linux/${{ input.ARCHITECHTURE }} + tags: ${{ steps.final-metadata.outputs.tags }} + labels: ${{ steps.final-metadata.outputs.labels }} + target: rosetta + build-args: | + BASE_IMAGE=${{ steps.defaults.outputs.BASE_IMAGE }} - - name: Set docker metadata - id: meta - uses: docker/metadata-action@v4 - with: - images: | - ${{ env.UPLD_IMAGE }} - flavor: | - latest=false - tags: | - type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-multiarch - labels: - org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - - name: Combine images into a single multi-arch image + + - name: Generate sitrep + if: success() || failure() shell: bash -x -e {0} run: | - docker manifest create ${{ steps.meta.outputs.tags }} $( - for IMAGE in $(cat image-name-${{ inputs.BASE_LIBRARY }}-*/image-name.txt); do - 
REPO=$(echo $IMAGE | cut -d: -f1) - DIGEST=$( - docker manifest inspect $IMAGE |\ - jq -r '.manifests[] | select(.platform.os == "linux") | .digest' - ) - echo $REPO@${DIGEST} - done - ) - docker manifest push ${{ steps.meta.outputs.tags }} \ No newline at end of file + # bring in utility functions + source .github/workflows/scripts/to_json.sh + + badge_label='JAX ${{ inputs.ARCHITECTURE }} build' + tags="${{ steps.final-metadata.outputs.tags }}" + digest="${{ steps.final-build.outputs.digest }}" + outcome="${{ steps.final-build.outcome }}" + + if [[ ${outcome} == "success" ]]; then + badge_message="pass" + badge_color=brightgreen + summary="JAX build on ${{ inputs.ARCHITECTURE }}: $badge_message" + else + badge_message="fail" + badge_color=red + summary="JAX build on ${{ inputs.ARCHITECTURE }}: $badge_message" + fi + + to_json \ + summary \ + badge_label tags digest outcome \ + > sitrep.json + + schemaVersion=1 \ + label="${badge_label}" \ + message="${badge_message}" \ + color="${badge_color}" \ + to_json schemaVersion label message color \ + > ${{ env.BADGE_FILENAME_FULL }} \ No newline at end of file diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index eaeae2bf6..67a444e88 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -32,6 +32,14 @@ jobs: TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} secrets: inherit + build-rosetta: + needs: [build-pax] + uses: ./.github/worklows/_build_rosetta.yaml + with: + ARCHITECTURE: amd64 + BASE_LIBRARY: pax + BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} + # test-pax: # needs: [build-pax] # uses: ./.github/workflows/_test_pax.yaml diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml index c69736a4c..14919e06a 100644 --- a/.github/workflows/_test_pax_rosetta.yaml +++ b/.github/workflows/_test_pax_rosetta.yaml @@ -1,4 +1,4 @@ -name: ~test Pax, MGMN +name: ~test Pax, multi-node on: workflow_call: 
@@ -20,7 +20,7 @@ on: jobs: - multi-gpu-multi-node-te: + rosetta-pax-multi-node-te: strategy: matrix: PARALLEL_CONFIG: @@ -157,7 +157,7 @@ jobs: name: ${{ steps.meta.outputs.JOB_NAME }} path: output/* - multi-gpu-multi-node: + rosetta-pax-multi-node: strategy: matrix: PARALLEL_CONFIG: @@ -292,7 +292,7 @@ jobs: path: output/* - multi-gpu-single-node-dropout-te: + rosetta-pax-single-node-dropout-te: strategy: matrix: PARALLEL_CONFIG: @@ -428,7 +428,7 @@ jobs: metrics: - needs: [multi-gpu-multi-node, multi-gpu-multi-node-te, multi-gpu-single-node-dropout-te] + needs: [rosetta-pax-multi-node, rosetta-pax-multi-node-te, rosetta-pax-single-node-dropout-te] runs-on: ubuntu-22.04 steps: @@ -466,7 +466,7 @@ jobs: publish-test: - needs: [multi-gpu-multi-node, multi-gpu-multi-node-te, multi-gpu-single-node-dropout-te, metrics] + needs: [rosetta-pax-multi-node, rosetta-pax-multi-node-te, rosetta-pax-single-node-dropout-te, metrics] uses: ./.github/workflows/_publish_badge.yaml if: ( always() ) secrets: inherit diff --git a/.github/workflows/nightly-rosetta-pax-build.yaml b/.github/workflows/nightly-rosetta-pax-build-test.yaml similarity index 54% rename from .github/workflows/nightly-rosetta-pax-build.yaml rename to .github/workflows/nightly-rosetta-pax-build-test.yaml index c12cfd8f0..e28dd0f2e 100644 --- a/.github/workflows/nightly-rosetta-pax-build.yaml +++ b/.github/workflows/nightly-rosetta-pax-build-test.yaml @@ -31,61 +31,108 @@ permissions: jobs: metadata: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' runs-on: ubuntu-22.04 outputs: - BUILD_DATE: ${{ steps.meta-vars.outputs.BUILD_DATE }} - BASE_LIBRARY: ${{ steps.meta-vars.outputs.BASE_LIBRARY }} - BASE_IMAGE: ${{ steps.meta-vars.outputs.BASE_IMAGE }} - PUBLISH: ${{ steps.meta-vars.outputs.PUBLISH }} + BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} + BASE_LIBRARY: ${{ steps.base-metadata.outputs.BASE_LIBRARY }} + BASE_IMAGE: 
${{ steps.base-metadata.outputs.BASE_IMAGE }} + PUBLISH: ${{ steps.if-publish.outputs.PUBLISH }} steps: - - name: Set build metadata - id: meta-vars + - name: Check if the triggering workflow failed + id: if-upstream-failed + shell: bash -x -e {0} + run: | + echo "UPSTREAM_FAILED=${{ github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT + + - name: Cancel workflow if upstream workflow did not success + if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} + uses: styfle/cancel-workflow-action@0.12.0 + + - name: Determine if the resulting container should be 'published' + id: if-publish + shell: bash -x -e {0} + run: + # A container should be published if: + # 1) the workflow is triggered by workflow_dispatch and the PUBLISH input is true, or + # 2) the workflow is triggered by workflow_run (i.e., a nightly build) + echo "PUBLISH=${{ github.event_name == 'workflow_run' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) }}" >> $GITHUB_OUTPUT + + - name: Set build date + id: date shell: bash -x -e {0} run: | BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') + echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT + + - name: Set base library and image + id: base-metadata + shell: bash -x -e {0} + run: | if [[ -z "${{ inputs.BASE_IMAGE }}" ]]; then BASE_IMAGE=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:latest else BASE_IMAGE=${{ inputs.BASE_IMAGE }} fi - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT + echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT echo "BASE_IMAGE=${BASE_IMAGE}" >> $GITHUB_OUTPUT - echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT - build: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + amd64: needs: metadata uses: ./.github/workflows/_build_rosetta.yaml with: + ARCHITECTURE: amd64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} BASE_LIBRARY: ${{ 
needs.metadata.outputs.BASE_LIBRARY }} BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE }} secrets: inherit - publish-build: - needs: [metadata, build] - uses: ./.github/workflows/_publish_badge.yaml - if: ( always() ) + arm64: + needs: metadata + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: arm64 + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + BASE_LIBRARY: ${{ needs.metadata.outputs.BASE_LIBRARY }} + BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE }} secrets: inherit + + public-mealkit: + needs: [metadata, amd64, arm64] + if: needs.metadata.output.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml + with: | + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} + ${{ needs.arm64.outputs.DOCKER_TAG_MEALKIT }} + TARGET_IMAGE: upstream-pax + TARGET_TAGS: | + type=raw,value=mealkit,priority=500 + type=raw,value=mealkit-${{ needs.metadata.outputs.BUILD_DATE }},priority=500 + + publish-final: + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml with: - ENDPOINT_FILENAME: 'rosetta-pax-build-status.json' - PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }} - SCRIPT: | - if [[ ${{ needs.build.result }} == "success" ]]; then - BADGE_COLOR=brightgreen - MSG=passing - else - BADGE_COLOR=red - MSG=failing - fi - echo "LABEL='nightly'" >> $GITHUB_OUTPUT - echo "MESSAGE='${MSG}'" >> $GITHUB_OUTPUT - echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} + ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }} + TARGET_IMAGE: upstream-pax + TARGET_TAGS: | + type=raw,value=latest,priority=1000 + type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 + + finalize: + if: always() + needs: [metadata, amd64, arm64] + uses: ./.github/workflows/_finalize.yaml + with: + PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }} 
+ secrets: inherit test-pax: - needs: build + needs: [metadata, amd64, arm64] uses: ./.github/workflows/_test_pax_rosetta.yaml if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: @@ -93,7 +140,7 @@ jobs: secrets: inherit publish-test: - needs: [metadata, build, test-pax] + needs: [metadata, amd64, arm64, test-pax] uses: ./.github/workflows/_publish_badge.yaml if: ( always() ) secrets: inherit @@ -119,32 +166,4 @@ jobs: fi echo "MESSAGE='${MESSAGE}'" >> $GITHUB_OUTPUT - echo "COLOR='${COLOR}'" >> $GITHUB_OUTPUT - - publish-latest-container: - needs: [metadata, build, test-pax] - if: ( ${{ needs.test-pax.outputs.TEST_STATUS == 'success' }} ) && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)) - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} - TARGET_IMAGE: pax - TARGET_TAGS: | - type=raw,value=latest,priority=1000 - - publish-container: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build] - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} - TARGET_IMAGE: pax - TARGET_TAGS: | - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - - if-upstream-failed: - runs-on: ubuntu-latest - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch' - steps: - - run: echo 'Upstream workflow failed, aborting run' && exit 1 + echo "COLOR='${COLOR}'" >> $GITHUB_OUTPUT \ No newline at end of file diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml 
b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index 360f8f586..ed0c46efd 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -31,71 +31,109 @@ permissions: jobs: metadata: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' runs-on: ubuntu-22.04 outputs: - BUILD_DATE: ${{ steps.meta-vars.outputs.BUILD_DATE }} - BASE_LIBRARY: ${{ steps.meta-vars.outputs.BASE_LIBRARY }} - BASE_IMAGE: ${{ steps.meta-vars.outputs.BASE_IMAGE }} - PUBLISH: ${{ steps.meta-vars.outputs.PUBLISH }} + BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} + BASE_LIBRARY: ${{ steps.base-metadata.outputs.BASE_LIBRARY }} + BASE_IMAGE: ${{ steps.base-metadata.outputs.BASE_IMAGE }} + PUBLISH: ${{ steps.if-publish.outputs.PUBLISH }} steps: - - name: Set build metadata - id: meta-vars + - name: Check if the triggering workflow failed + id: if-upstream-failed + shell: bash -x -e {0} + run: | + echo "UPSTREAM_FAILED=${{ github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT + + - name: Cancel workflow if upstream workflow did not success + if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} + uses: styfle/cancel-workflow-action@0.12.0 + + - name: Determine if the resulting container should be 'published' + id: if-publish + shell: bash -x -e {0} + run: + # A container should be published if: + # 1) the workflow is triggered by workflow_dispatch and the PUBLISH input is true, or + # 2) the workflow is triggered by workflow_run (i.e., a nightly build) + echo "PUBLISH=${{ github.event_name == 'workflow_run' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) }}" >> $GITHUB_OUTPUT + + - name: Set build date + id: date shell: bash -x -e {0} run: | BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') + echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT + + - name: Set base library and image + id: 
base-metadata + shell: bash -x -e {0} + run: | if [[ -z "${{ inputs.BASE_IMAGE }}" ]]; then BASE_IMAGE=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:latest else BASE_IMAGE=${{ inputs.BASE_IMAGE }} fi - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT + echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT echo "BASE_IMAGE=${BASE_IMAGE}" >> $GITHUB_OUTPUT - echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT - - build: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + + amd64: needs: metadata uses: ./.github/workflows/_build_rosetta.yaml with: + ARCHITECTURE: amd64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} BASE_LIBRARY: ${{ needs.metadata.outputs.BASE_LIBRARY }} BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE }} - # TODO: Can't build ARM until https://github.com/NVIDIA/JAX-Toolbox/pull/252 is available - PLATFORMS: '["amd64"]' secrets: inherit + + arm64: + needs: metadata + runs-on: ubuntu-22.04 + outputs: + DOCKER_TAG_MEALKIT='' + steps: + - name: Generate placeholder warning + shell: bash -x -e {0} + run: | + echo "WARNING: arm64 build is not yet supported" + + public-mealkit: + needs: [metadata, amd64, arm64] + if: needs.metadata.output.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml + with: | + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} + ${{ needs.arm64.outputs.DOCKER_TAG_MEALKIT }} + TARGET_IMAGE: upstream-pax + TARGET_TAGS: | + type=raw,value=mealkit,priority=500 + type=raw,value=mealkit-${{ needs.metadata.outputs.BUILD_DATE }},priority=500 - publish-build: - needs: [metadata, build] - uses: ./.github/workflows/_publish_badge.yaml - if: ( always() ) - secrets: inherit + publish-final: + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml with: - ENDPOINT_FILENAME: 'rosetta-t5x-build-status.json' - PUBLISH: ${{ 
github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }} - SCRIPT: | - if [[ ${{ needs.build.result }} == "success" ]]; then - BADGE_COLOR=brightgreen - MSG=passing - else - BADGE_COLOR=red - MSG=failing - fi - echo "LABEL='nightly'" >> $GITHUB_OUTPUT - echo "MESSAGE='${MSG}'" >> $GITHUB_OUTPUT - echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} + ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }} + TARGET_IMAGE: upstream-t5x + TARGET_TAGS: | + type=raw,value=latest,priority=1000 + type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 test-unit: if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' - needs: build + needs: [metadata, amd64, arm64] uses: ./.github/workflows/_test_rosetta.yaml with: ROSETTA_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} secrets: inherit test-t5x: - needs: build + needs: [metadata, amd64, arm64] uses: ./.github/workflows/_test_t5x_rosetta.yaml if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: @@ -103,7 +141,7 @@ jobs: secrets: inherit test-vit: - needs: build + needs: [metadata, amd64, arm64] uses: ./.github/workflows/_test_vit.yaml if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: @@ -120,7 +158,7 @@ jobs: secrets: inherit publish-test: - needs: [metadata, build, test-unit, test-t5x, test-vit] + needs: [metadata, test-unit, test-t5x, test-vit] uses: ./.github/workflows/_publish_badge.yaml if: ( always() ) secrets: inherit @@ -156,30 +194,11 @@ jobs: echo "MESSAGE='${MESSAGE}'" >> $GITHUB_OUTPUT echo "COLOR='${COLOR}'" >> $GITHUB_OUTPUT - publish-latest-container: - needs: [metadata, build, test-t5x, test-unit, test-vit] - if: ( needs.test-unit.outputs.TEST_STATUS 
== 'success' && needs.test-t5x.outputs.TEST_STATUS == 'success' && needs.test-vit.outputs.TEST_STATUS == 'success' ) && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)) - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit + finalize: + if: always() + needs: [metadata, amd64, arm64] + uses: ./.github/workflows/_finalize.yaml with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} - TARGET_IMAGE: t5x - TARGET_TAGS: | - type=raw,value=latest,priority=1000 - - publish-container: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build] - uses: ./.github/workflows/_publish_container.yaml + PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }} secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} - TARGET_IMAGE: t5x - TARGET_TAGS: | - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - if-upstream-failed: - runs-on: ubuntu-latest - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch' - steps: - - run: echo 'Upstream workflow failed, aborting run' && exit 1 diff --git a/.github/workflows/nightly-t5x-build.yaml b/.github/workflows/nightly-t5x-build.yaml index 4c29136cf..8f0ad277f 100644 --- a/.github/workflows/nightly-t5x-build.yaml +++ b/.github/workflows/nightly-t5x-build.yaml @@ -65,7 +65,7 @@ jobs: needs: metadata runs-on: ubuntu-22.04 outputs: - DOCKER_TAG_FINAL: '' + DOCKER_TAG_MEALKIT: '' steps: - name: Generate placeholder warning shell: bash -x -e {0} diff --git a/.github/workflows/nightly-te-test.yaml b/.github/workflows/nightly-te-test.yaml index 64644294f..d95de68c1 100644 --- a/.github/workflows/nightly-te-test.yaml +++ 
b/.github/workflows/nightly-te-test.yaml @@ -3,7 +3,7 @@ run-name: Nightly Transformer Engine test (${{ github.event_name == 'workflow_ru on: workflow_run: - workflows: [Nightly JAX build] + workflows: [Nightly Pax build] types: [completed] branches: [main] workflow_dispatch: @@ -12,7 +12,7 @@ on: type: string description: 'JAX-TE image build by NVIDIA/JAX-Toolbox' required: true - default: 'ghcr.io/nvidia/jax:latest' + default: 'ghcr.io/nvidia/upstream-pax:latest' PUBLISH: type: boolean description: Update status badge? @@ -25,7 +25,7 @@ permissions: packages: write # to upload container env: - DEFAULT_JAX_TE_IMAGE: 'ghcr.io/nvidia/jax:latest' + DEFAULT_JAX_TE_IMAGE: 'ghcr.io/nvidia/upstream-pax:latest' jobs: diff --git a/rosetta/Dockerfile.pax b/rosetta/Dockerfile.pax index 8250827e3..503be8dcb 100644 --- a/rosetta/Dockerfile.pax +++ b/rosetta/Dockerfile.pax @@ -1,5 +1,5 @@ # syntax=docker/dockerfile:1-labs -ARG BASE_IMAGE=ghcr.io/nvidia/upstream-pax:latest +ARG BASE_IMAGE=ghcr.io/nvidia/upstream-pax:mealkit ARG GIT_USER_EMAIL=jax@nvidia.com ARG GIT_USER_NAME=NVIDIA diff --git a/rosetta/Dockerfile.t5x b/rosetta/Dockerfile.t5x index 3878ff5c0..b25223fab 100644 --- a/rosetta/Dockerfile.t5x +++ b/rosetta/Dockerfile.t5x @@ -1,5 +1,5 @@ # syntax=docker/dockerfile:1-labs -ARG BASE_IMAGE=ghcr.io/nvidia/upstream-t5x:latest +ARG BASE_IMAGE=ghcr.io/nvidia/upstream-t5x:mealkit ARG GIT_USER_EMAIL=jax@nvidia.com ARG GIT_USER_NAME=NVIDIA From c5f8f2342bb2f5d9f97a0cef80cb05b110ab7bbb Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Wed, 15 Nov 2023 21:54:19 -0700 Subject: [PATCH 106/146] wip: fix typo --- .github/workflows/_sandbox.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 67a444e88..af1a723fb 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -34,7 +34,7 @@ jobs: build-rosetta: needs: [build-pax] - uses: 
./.github/worklows/_build_rosetta.yaml + uses: ./.github/workflows/_build_rosetta.yaml with: ARCHITECTURE: amd64 BASE_LIBRARY: pax From 1d9d2823491d88eaaab1b9bf12877eb6350ee2b3 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Wed, 15 Nov 2023 21:55:15 -0700 Subject: [PATCH 107/146] wip: build sandbox on dispatch --- .github/workflows/_sandbox.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index af1a723fb..e7b83eac0 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -1,8 +1,8 @@ name: "~Sandbox" on: - # workflow_dispatch: - # push: + workflow_dispatch: + #push: permissions: contents: read # to fetch code From e6bf405b88656cf04338309971aef252a77e7d18 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Wed, 15 Nov 2023 21:57:38 -0700 Subject: [PATCH 108/146] wip: fix build_rosetta --- .github/workflows/_build_rosetta.yaml | 49 ++++----------------------- 1 file changed, 7 insertions(+), 42 deletions(-) diff --git a/.github/workflows/_build_rosetta.yaml b/.github/workflows/_build_rosetta.yaml index 1f6509e8c..d0e1c5f76 100644 --- a/.github/workflows/_build_rosetta.yaml +++ b/.github/workflows/_build_rosetta.yaml @@ -46,9 +46,9 @@ permissions: jobs: build-rosetta: - runs-on: [self-hosted, "${{ input.ARCHITECTURE }}", small] + runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", small] env: - BADGE_FILENAME_FULL: ${{ input.BADGE_FILENAME}}-${{ input.ARCHITECTURE}}.json + BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME}}-${{ inputs.ARCHITECTURE}}.json output: DOCKER_TAG_MEALKIT: ${{ steps.mealkit-metadata.outputs.tags }} DOCKER_TAG_FINAL: ${{ steps.final-metadata.outputs.tags }} @@ -87,7 +87,7 @@ jobs: with: images: ${{ env.UPLD_IMAGE }} flavor: latest=false - tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ input.ARCHITECHTURE }}-mealkit + tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ 
inputs.ARCHITECHTURE }}-mealkit labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - name: Build docker images - mealkit @@ -97,7 +97,7 @@ jobs: context: rosetta/ push: true file: rosetta/Dockerfile.${{ inputs.BASE_LIBRARY }} - platforms: linux/${{ input.ARCHITECHTURE }} + platforms: linux/${{ inputs.ARCHITECHTURE }} tags: ${{ steps.mealkit-metadata.outputs.tags }} labels: ${{ steps.mealkit-metadata.outputs.labels }} target: rosetta @@ -110,7 +110,7 @@ jobs: with: images: ${{ env.UPLD_IMAGE }} flavor: latest=false - tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ input.ARCHITECHTURE }}-final + tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ inputs.ARCHITECHTURE }}-final labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - name: Build docker images - final @@ -119,44 +119,9 @@ jobs: context: rosetta/ push: true file: rosetta/Dockerfile.${{ inputs.BASE_LIBRARY }} - platforms: linux/${{ input.ARCHITECHTURE }} + platforms: linux/${{ inputs.ARCHITECHTURE }} tags: ${{ steps.final-metadata.outputs.tags }} labels: ${{ steps.final-metadata.outputs.labels }} target: rosetta build-args: | - BASE_IMAGE=${{ steps.defaults.outputs.BASE_IMAGE }} - - - - name: Generate sitrep - if: success() || failure() - shell: bash -x -e {0} - run: | - # bring in utility functions - source .github/workflows/scripts/to_json.sh - - badge_label='JAX ${{ inputs.ARCHITECTURE }} build' - tags="${{ steps.final-metadata.outputs.tags }}" - digest="${{ steps.final-build.outputs.digest }}" - outcome="${{ steps.final-build.outcome }}" - - if [[ ${outcome} == "success" ]]; then - badge_message="pass" - badge_color=brightgreen - summary="JAX build on ${{ inputs.ARCHITECTURE }}: $badge_message" - else - badge_message="fail" - badge_color=red - summary="JAX build on ${{ inputs.ARCHITECTURE }}: $badge_message" - fi - - to_json \ - summary \ - badge_label tags digest outcome \ - > sitrep.json - - schemaVersion=1 \ - 
label="${badge_label}" \ - message="${badge_message}" \ - color="${badge_color}" \ - to_json schemaVersion label message color \ - > ${{ env.BADGE_FILENAME_FULL }} \ No newline at end of file + BASE_IMAGE=${{ steps.defaults.outputs.BASE_IMAGE }} \ No newline at end of file From 4c5835690e4f42af47ef361a1417f64fce99f801 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Wed, 15 Nov 2023 21:59:50 -0700 Subject: [PATCH 109/146] wip: update yaml structure for nightly build of rosseta y5x --- .github/workflows/nightly-rosetta-t5x-build-test.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index ed0c46efd..a06afac3f 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -111,7 +111,7 @@ jobs: type=raw,value=mealkit,priority=500 type=raw,value=mealkit-${{ needs.metadata.outputs.BUILD_DATE }},priority=500 - publish-final: + publish-final: needs: [metadata, amd64, arm64] if: needs.metadata.outputs.PUBLISH == 'true' uses: ./.github/workflows/_publish_container.yaml @@ -194,7 +194,7 @@ jobs: echo "MESSAGE='${MESSAGE}'" >> $GITHUB_OUTPUT echo "COLOR='${COLOR}'" >> $GITHUB_OUTPUT - finalize: + finalize: if: always() needs: [metadata, amd64, arm64] uses: ./.github/workflows/_finalize.yaml From 1fe5f697a3defb6300706d1c0e7d3fb30e860a43 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Wed, 15 Nov 2023 22:06:19 -0700 Subject: [PATCH 110/146] wip: update yaml structure for nightly build of rosseta t5x --- .github/workflows/nightly-rosetta-t5x-build-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index a06afac3f..d8d1fc7ef 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ 
b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -91,7 +91,7 @@ jobs: needs: metadata runs-on: ubuntu-22.04 outputs: - DOCKER_TAG_MEALKIT='' + DOCKER_TAG_MEALKIT: '' steps: - name: Generate placeholder warning shell: bash -x -e {0} From 95f5729b36171b2717d44aee9afd6c7fc259bc9a Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Wed, 15 Nov 2023 22:09:46 -0700 Subject: [PATCH 111/146] wip: fix yaml structure for nightly build of rosseta t5x --- .github/workflows/nightly-rosetta-pax-build-test.yaml | 4 ++-- .github/workflows/nightly-rosetta-t5x-build-test.yaml | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/nightly-rosetta-pax-build-test.yaml b/.github/workflows/nightly-rosetta-pax-build-test.yaml index e28dd0f2e..a338d0a06 100644 --- a/.github/workflows/nightly-rosetta-pax-build-test.yaml +++ b/.github/workflows/nightly-rosetta-pax-build-test.yaml @@ -101,8 +101,8 @@ jobs: needs: [metadata, amd64, arm64] if: needs.metadata.output.PUBLISH == 'true' uses: ./.github/workflows/_publish_container.yaml - with: | - SOURCE_IMAGE: | + with: + SOURCE_IMAGE: | ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} ${{ needs.arm64.outputs.DOCKER_TAG_MEALKIT }} TARGET_IMAGE: upstream-pax diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index d8d1fc7ef..73fe9ecfb 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -98,15 +98,15 @@ jobs: run: | echo "WARNING: arm64 build is not yet supported" - public-mealkit: + public-mealkit: needs: [metadata, amd64, arm64] if: needs.metadata.output.PUBLISH == 'true' uses: ./.github/workflows/_publish_container.yaml - with: | - SOURCE_IMAGE: | + with: + SOURCE_IMAGE: | ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} ${{ needs.arm64.outputs.DOCKER_TAG_MEALKIT }} - TARGET_IMAGE: upstream-pax + TARGET_IMAGE: upstream-t5x TARGET_TAGS: | 
type=raw,value=mealkit,priority=500 type=raw,value=mealkit-${{ needs.metadata.outputs.BUILD_DATE }},priority=500 From d89f93f8498676b82912b4843638d37f476b88a0 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Wed, 15 Nov 2023 22:12:42 -0700 Subject: [PATCH 112/146] wip: fix yaml structure for nightly build of rosseta pax --- .github/workflows/nightly-rosetta-pax-build-test.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nightly-rosetta-pax-build-test.yaml b/.github/workflows/nightly-rosetta-pax-build-test.yaml index a338d0a06..7d29dcbe0 100644 --- a/.github/workflows/nightly-rosetta-pax-build-test.yaml +++ b/.github/workflows/nightly-rosetta-pax-build-test.yaml @@ -45,7 +45,7 @@ jobs: echo "UPSTREAM_FAILED=${{ github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT - name: Cancel workflow if upstream workflow did not success - if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} + if: steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' uses: styfle/cancel-workflow-action@0.12.0 - name: Determine if the resulting container should be 'published' @@ -110,7 +110,7 @@ jobs: type=raw,value=mealkit,priority=500 type=raw,value=mealkit-${{ needs.metadata.outputs.BUILD_DATE }},priority=500 - publish-final: + publish-final: needs: [metadata, amd64, arm64] if: needs.metadata.outputs.PUBLISH == 'true' uses: ./.github/workflows/_publish_container.yaml @@ -123,7 +123,7 @@ jobs: type=raw,value=latest,priority=1000 type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - finalize: + finalize: if: always() needs: [metadata, amd64, arm64] uses: ./.github/workflows/_finalize.yaml From 08fb7f958330556c46135fbb6e345871a19bbd54 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Wed, 15 Nov 2023 22:14:58 -0700 Subject: [PATCH 113/146] wip: fix typo in yaml structure for nightly build of rosseta pax --- .../nightly-rosetta-pax-build-test.yaml | 20 +++++++++---------- 1 file 
changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/nightly-rosetta-pax-build-test.yaml b/.github/workflows/nightly-rosetta-pax-build-test.yaml index 7d29dcbe0..955021056 100644 --- a/.github/workflows/nightly-rosetta-pax-build-test.yaml +++ b/.github/workflows/nightly-rosetta-pax-build-test.yaml @@ -99,7 +99,7 @@ jobs: public-mealkit: needs: [metadata, amd64, arm64] - if: needs.metadata.output.PUBLISH == 'true' + if: needs.metadata.outputs.PUBLISH == 'true' uses: ./.github/workflows/_publish_container.yaml with: SOURCE_IMAGE: | @@ -123,14 +123,6 @@ jobs: type=raw,value=latest,priority=1000 type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - finalize: - if: always() - needs: [metadata, amd64, arm64] - uses: ./.github/workflows/_finalize.yaml - with: - PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }} - secrets: inherit - test-pax: needs: [metadata, amd64, arm64] uses: ./.github/workflows/_test_pax_rosetta.yaml @@ -166,4 +158,12 @@ jobs: fi echo "MESSAGE='${MESSAGE}'" >> $GITHUB_OUTPUT - echo "COLOR='${COLOR}'" >> $GITHUB_OUTPUT \ No newline at end of file + echo "COLOR='${COLOR}'" >> $GITHUB_OUTPUT + + finalize: + if: always() + needs: [metadata, amd64, arm64] + uses: ./.github/workflows/_finalize.yaml + with: + PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }} + secrets: inherit From a7f95f1a5b88c962a1e58063237b7150f5e17cbf Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Wed, 15 Nov 2023 22:22:43 -0700 Subject: [PATCH 114/146] wip: fix typo in yaml structure for nightly build of rosseta pax 2 --- .github/workflows/nightly-rosetta-pax-build-test.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nightly-rosetta-pax-build-test.yaml b/.github/workflows/nightly-rosetta-pax-build-test.yaml index 955021056..9282962cb 100644 --- a/.github/workflows/nightly-rosetta-pax-build-test.yaml +++ b/.github/workflows/nightly-rosetta-pax-build-test.yaml @@ 
-97,9 +97,9 @@ jobs: BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE }} secrets: inherit - public-mealkit: + publish-mealkit: needs: [metadata, amd64, arm64] - if: needs.metadata.outputs.PUBLISH == 'true' + if: needs.metadata.outputs.PUBLISH == 'true' uses: ./.github/workflows/_publish_container.yaml with: SOURCE_IMAGE: | @@ -108,7 +108,7 @@ jobs: TARGET_IMAGE: upstream-pax TARGET_TAGS: | type=raw,value=mealkit,priority=500 - type=raw,value=mealkit-${{ needs.metadata.outputs.BUILD_DATE }},priority=500 + type=raw,value=mealkit-${{ needs.metadata.outputs.BUILD_DATE }},priority=500 publish-final: needs: [metadata, amd64, arm64] From 2d357148ada16f4fe31b8e625a3068bebb3160f9 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Wed, 15 Nov 2023 22:23:53 -0700 Subject: [PATCH 115/146] wip --- .github/workflows/nightly-rosetta-t5x-build-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index 73fe9ecfb..4057e8e75 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -98,7 +98,7 @@ jobs: run: | echo "WARNING: arm64 build is not yet supported" - public-mealkit: + publish-mealkit: needs: [metadata, amd64, arm64] if: needs.metadata.output.PUBLISH == 'true' uses: ./.github/workflows/_publish_container.yaml From 5ae2e7913558ff134e17c592ea40f0aadc1f04d2 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Wed, 15 Nov 2023 22:26:16 -0700 Subject: [PATCH 116/146] wip: --- .github/workflows/_build_rosetta.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_build_rosetta.yaml b/.github/workflows/_build_rosetta.yaml index d0e1c5f76..b4e0bfa02 100644 --- a/.github/workflows/_build_rosetta.yaml +++ b/.github/workflows/_build_rosetta.yaml @@ -29,7 +29,7 @@ on: outputs: DOCKER_TAG_MEALKIT: description: 'Tags of the mealkit image build' 
- value: $ {{ jobs.build-rosetta.output.DOCKER_TAG_MEALKIT }} + value: $ {{ jobs.build-rosetta.outputs.DOCKER_TAG_MEALKIT }} DOCKER_TAG_FINAL: description: "Tags of the complete image built" value: ${{ jobs.build-rosetta.outputs.DOCKER_TAG_FINAL }} @@ -49,7 +49,7 @@ jobs: runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", small] env: BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME}}-${{ inputs.ARCHITECTURE}}.json - output: + outputs: DOCKER_TAG_MEALKIT: ${{ steps.mealkit-metadata.outputs.tags }} DOCKER_TAG_FINAL: ${{ steps.final-metadata.outputs.tags }} steps: From ce8ba2d01da84d46e5a3998ede76b5045bc5fbac Mon Sep 17 00:00:00 2001 From: Yu-Hang Maxin Tang Date: Thu, 16 Nov 2023 13:39:55 +0000 Subject: [PATCH 117/146] use the _publish_container reusable workflow for base container weekly build --- .github/workflows/weekly-base-build.yaml | 61 +++++++----------------- 1 file changed, 17 insertions(+), 44 deletions(-) diff --git a/.github/workflows/weekly-base-build.yaml b/.github/workflows/weekly-base-build.yaml index ed661e801..71a589124 100644 --- a/.github/workflows/weekly-base-build.yaml +++ b/.github/workflows/weekly-base-build.yaml @@ -12,10 +12,6 @@ on: default: false required: false -env: - DOCKER_REGISTRY: ghcr.io/nvidia - DOCKER_IMAGE: jax-toolbox - permissions: contents: read # to fetch code actions: write # to cancel previous workflows @@ -27,6 +23,7 @@ jobs: runs-on: ubuntu-22.04 outputs: BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} + PUBLISH: ${{ steps.if-publish.outputs.PUBLISH }} steps: - name: Set build date id: date @@ -34,6 +31,12 @@ jobs: run: | BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT + + - name: Determine whether results will be 'published' + id: if-publish + shell: bash -x -e {0} + run: | + echo "PUBLISH=${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) }}" >> $GITHUB_OUTPUT amd64: needs: metadata @@ -52,47 +55,17 @@ jobs: secrets: 
inherit publish: - if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - runs-on: ubuntu-latest needs: [metadata, amd64, arm64] - steps: - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set docker metadata - id: meta - uses: docker/metadata-action@v4 - with: - images: | - ${{ env.DOCKER_REGISTRY }}/${{ env.DOCKER_IMAGE }} - flavor: | - latest=false - tags: | - type=raw,value=base,priority=1000 - type=raw,value=base-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - labels: - org.opencontainers.image.created=${{ needs.metadata.outputs.BUILD_DATE }} - - - name: Combine images into a single multi-arch image - shell: bash -x -e {0} - run: | - for tag in $(echo "${{ steps.meta.outputs.tags }}"); do - docker manifest create ${tag} $( - for IMAGE in ${{ needs.amd64.outputs.DOCKER_TAG }} ${{ needs.arm64.outputs.DOCKER_TAG }}; do - REPO=$(echo $IMAGE | cut -d: -f1) - DIGEST=$( - docker manifest inspect $IMAGE |\ - jq -r '.manifests[] | select(.platform.os == "linux") | .digest' - ) - echo $REPO@${DIGEST} - done - ) - docker manifest push ${tag} - done + if: needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml + with: + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG }} + ${{ needs.arm64.outputs.DOCKER_TAG }} + TARGET_IMAGE: jax-toolbox + TARGET_TAGS: | + type=raw,value=base,priority=1000 + type=raw,value=base-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 finalize: if: always() From 6501ae905fad29b4869eba7c7a1f749b088cf616 Mon Sep 17 00:00:00 2001 From: Yu-Hang Maxin Tang Date: Thu, 16 Nov 2023 13:46:05 +0000 Subject: [PATCH 118/146] fix base build output arg name error --- .github/workflows/_ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml 
index 1447f5076..801e7299a 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -31,7 +31,7 @@ on: outputs: TAG_BASE: description: "Tags of the base image built" - value: ${{ jobs.build-base.outputs.DOCKER_TAGS }} + value: ${{ jobs.build-base.outputs.DOCKER_TAG }} TAG_JAX: description: "Tags of the JAX image built" value: ${{ jobs.build-jax.outputs.DOCKER_TAG_FINAL }} From aae58652a888fc826a0bde9683dec3d46eb4a5ca Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Thu, 16 Nov 2023 09:44:53 -0700 Subject: [PATCH 119/146] wip: add base build --- .github/workflows/_sandbox.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index e7b83eac0..72e4b0e85 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -11,6 +11,12 @@ permissions: jobs: + build-base: + uses: ./.github/workflows/_build_base.yaml + with: + ARCHITECTURE: amd64 + secrets: inherit + build-jax: uses: ./.github/workflows/_build_jax.yaml with: From 94d2db7664f61ad71526dec1e3791a5b52789b56 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Thu, 16 Nov 2023 09:47:58 -0700 Subject: [PATCH 120/146] wip: add base build --- .github/workflows/_sandbox.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 72e4b0e85..79cec6f24 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -2,7 +2,7 @@ name: "~Sandbox" on: workflow_dispatch: - #push: + push: permissions: contents: read # to fetch code @@ -16,8 +16,9 @@ jobs: with: ARCHITECTURE: amd64 secrets: inherit - + build-jax: + needs: [build-base] uses: ./.github/workflows/_build_jax.yaml with: ARCHITECTURE: amd64 From 9bb80b468f5bdb840e3fae66e8ce94d62de15eb3 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Thu, 16 Nov 2023 10:06:07 -0700 Subject: [PATCH 121/146] wip: add base build --- 
.github/workflows/_sandbox.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 79cec6f24..b664fbda8 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -22,6 +22,7 @@ jobs: uses: ./.github/workflows/_build_jax.yaml with: ARCHITECTURE: amd64 + BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} secrets: inherit build-pax: From 3442369981e50a672519df52880c2f376fec97ae Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Thu, 16 Nov 2023 11:47:26 -0700 Subject: [PATCH 122/146] wip: build t5x and rosetta/pax --- .github/workflows/_sandbox.yaml | 56 +++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index b664fbda8..e76c3953c 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -11,42 +11,50 @@ permissions: jobs: - build-base: - uses: ./.github/workflows/_build_base.yaml - with: - ARCHITECTURE: amd64 - secrets: inherit + # build-base: + # uses: ./.github/workflows/_build_base.yaml + # with: + # ARCHITECTURE: amd64 + # secrets: inherit - build-jax: - needs: [build-base] - uses: ./.github/workflows/_build_jax.yaml - with: - ARCHITECTURE: amd64 - BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} - secrets: inherit + # build-jax: + # needs: [build-base] + # uses: ./.github/workflows/_build_jax.yaml + # with: + # ARCHITECTURE: amd64 + # BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} + # secrets: inherit - build-pax: - needs: [build-jax] - uses: ./.github/workflows/_build_pax.yaml + # build-pax: + # needs: [build-jax] + # uses: ./.github/workflows/_build_pax.yaml + # with: + # ARCHITECTURE: amd64 + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # secrets: inherit + + build-t5x: +# needs: [build-jax] + uses: ./.github/workflows/_build_t5x.yaml with: ARCHITECTURE: amd64 - BASE_IMAGE: ${{ 
needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6894053657-jax-amd64 secrets: inherit - test-te: - needs: [build-pax] - uses: ./.github/workflows/_test_te.yaml - with: - TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-te: + # needs: [build-pax] + # uses: ./.github/workflows/_test_te.yaml + # with: + # TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit build-rosetta: - needs: [build-pax] + # needs: [build-pax] uses: ./.github/workflows/_build_rosetta.yaml with: ARCHITECTURE: amd64 BASE_LIBRARY: pax - BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} + BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6894053657-upstream-pax-amd64 # test-pax: # needs: [build-pax] From f567e2c90b79b3d61a10582059a8cc5b4a466d39 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Thu, 16 Nov 2023 11:59:56 -0700 Subject: [PATCH 123/146] wip: build all in question --- .github/workflows/_build_rosetta.yaml | 8 +-- .github/workflows/_sandbox.yaml | 91 ++++++++++++++------------- 2 files changed, 50 insertions(+), 49 deletions(-) diff --git a/.github/workflows/_build_rosetta.yaml b/.github/workflows/_build_rosetta.yaml index b4e0bfa02..816e35ab7 100644 --- a/.github/workflows/_build_rosetta.yaml +++ b/.github/workflows/_build_rosetta.yaml @@ -87,7 +87,7 @@ jobs: with: images: ${{ env.UPLD_IMAGE }} flavor: latest=false - tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ inputs.ARCHITECHTURE }}-mealkit + tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ inputs.ARCHITECTURE }}-mealkit labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - name: Build docker images - mealkit @@ -97,7 +97,7 @@ jobs: context: rosetta/ push: true file: rosetta/Dockerfile.${{ inputs.BASE_LIBRARY }} - platforms: linux/${{ inputs.ARCHITECHTURE }} + platforms: linux/${{ inputs.ARCHITECTURE }} tags: ${{ steps.mealkit-metadata.outputs.tags }} 
labels: ${{ steps.mealkit-metadata.outputs.labels }} target: rosetta @@ -110,7 +110,7 @@ jobs: with: images: ${{ env.UPLD_IMAGE }} flavor: latest=false - tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ inputs.ARCHITECHTURE }}-final + tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ inputs.ARCHITECTURE }}-final labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - name: Build docker images - final @@ -119,7 +119,7 @@ jobs: context: rosetta/ push: true file: rosetta/Dockerfile.${{ inputs.BASE_LIBRARY }} - platforms: linux/${{ inputs.ARCHITECHTURE }} + platforms: linux/${{ inputs.ARCHITECTURE }} tags: ${{ steps.final-metadata.outputs.tags }} labels: ${{ steps.final-metadata.outputs.labels }} target: rosetta diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index e76c3953c..af999f836 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -11,63 +11,64 @@ permissions: jobs: - # build-base: - # uses: ./.github/workflows/_build_base.yaml - # with: - # ARCHITECTURE: amd64 - # secrets: inherit + build-base: + uses: ./.github/workflows/_build_base.yaml + with: + ARCHITECTURE: amd64 + secrets: inherit - # build-jax: - # needs: [build-base] - # uses: ./.github/workflows/_build_jax.yaml - # with: - # ARCHITECTURE: amd64 - # BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} - # secrets: inherit + build-jax: + needs: [build-base] + uses: ./.github/workflows/_build_jax.yaml + with: + ARCHITECTURE: amd64 + BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} + secrets: inherit - # build-pax: - # needs: [build-jax] - # uses: ./.github/workflows/_build_pax.yaml - # with: - # ARCHITECTURE: amd64 - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # secrets: inherit + build-pax: + needs: [build-jax] + uses: ./.github/workflows/_build_pax.yaml + with: + ARCHITECTURE: amd64 + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + 
secrets: inherit build-t5x: -# needs: [build-jax] + needs: [build-jax] uses: ./.github/workflows/_build_t5x.yaml with: ARCHITECTURE: amd64 - BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6894053657-jax-amd64 + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + secrets: inherit + + test-pax: + needs: [build-pax] + uses: ./.github/workflows/_test_pax.yaml + with: + PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} secrets: inherit - # test-te: - # needs: [build-pax] - # uses: ./.github/workflows/_test_te.yaml - # with: - # TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - build-rosetta: - # needs: [build-pax] + test-te: + needs: [build-pax] + uses: ./.github/workflows/_test_te.yaml + with: + TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + build-rosetta-pax: + needs: [build-pax] uses: ./.github/workflows/_build_rosetta.yaml with: ARCHITECTURE: amd64 BASE_LIBRARY: pax - BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6894053657-upstream-pax-amd64 - - # test-pax: - # needs: [build-pax] - # uses: ./.github/workflows/_test_pax.yaml - # with: - # PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit + BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} - # finalize: - # if: always() - # # TODO: use dynamic matrix to make dependencies self-updating - # needs: [build-jax, build-t5x, build-pax] - # uses: ./.github/workflows/_finalize.yaml - # with: - # PUBLISH_BADGE: false - # secrets: inherit + finalize: + if: always() + # TODO: use dynamic matrix to make dependencies self-updating + needs: [build-jax, build-t5x, build-pax] + uses: ./.github/workflows/_finalize.yaml + with: + PUBLISH_BADGE: false + secrets: inherit From efb339c43ce4dc0b6b81e249124fe57fcc8ffb72 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Thu, 16 Nov 2023 12:03:48 -0700 Subject: [PATCH 124/146] wip: build all in question 2 --- .github/workflows/_sandbox.yaml | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index af999f836..60ca24488 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -34,7 +34,7 @@ jobs: secrets: inherit build-t5x: - needs: [build-jax] + needs: [build-jax] uses: ./.github/workflows/_build_t5x.yaml with: ARCHITECTURE: amd64 From dca817c699d121166222db846c05be614a3c9558 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Thu, 16 Nov 2023 13:16:59 -0700 Subject: [PATCH 125/146] Build whole pipeline --- .github/workflows/_sandbox.yaml | 103 +++++++++++--------------------- 1 file changed, 35 insertions(+), 68 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 60ca24488..37fa6ca68 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -2,73 +2,40 @@ name: "~Sandbox" on: workflow_dispatch: - push: - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container jobs: - - build-base: - uses: ./.github/workflows/_build_base.yaml - with: - ARCHITECTURE: amd64 - secrets: inherit - - build-jax: - needs: [build-base] - uses: ./.github/workflows/_build_jax.yaml - with: - ARCHITECTURE: amd64 - BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} - secrets: inherit - - build-pax: - needs: [build-jax] - uses: ./.github/workflows/_build_pax.yaml - with: - ARCHITECTURE: amd64 - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - secrets: inherit - - build-t5x: - needs: [build-jax] - uses: ./.github/workflows/_build_t5x.yaml - with: - ARCHITECTURE: amd64 - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - secrets: inherit - - test-pax: - needs: [build-pax] - uses: ./.github/workflows/_test_pax.yaml - with: - PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit - - - test-te: - needs: [build-pax] - uses: 
./.github/workflows/_test_te.yaml - with: - TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit - - build-rosetta-pax: - needs: [build-pax] - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: amd64 - BASE_LIBRARY: pax - BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} - - finalize: - if: always() - # TODO: use dynamic matrix to make dependencies self-updating - needs: [build-jax, build-t5x, build-pax] - uses: ./.github/workflows/_finalize.yaml - with: - PUBLISH_BADGE: false - secrets: inherit + sandbox: + runs-on: ubuntu-22.04 + steps: + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Print usage + run: | + cat << EOF + This is an empty workflow file located in the main branch of your + repository. It serves as a testing ground for new GitHub Actions on + development branches before merging them to the main branch. By + defining and overloading this workflow on your development branch, + you can test new actions without affecting your main branch, ensuring + a smooth integration process once the changes are ready to be merged. + + Usage: + + 1. In your development branch, modify the sandbox.yml workflow file + to include the new actions you want to test. Make sure to commit + the changes to the development branch. + 2. Navigate to the 'Actions' tab in your repository, select the + '~Sandbox' workflow, and choose your development branch from the + branch dropdown menu. Click on 'Run workflow' to trigger the + workflow on your development branch. + 3. Once you have tested and verified the new actions in the Sandbox + workflow, you can incorporate them into your main workflow(s) and + merge the development branch into the main branch. Remember to + revert the changes to the sandbox.yml file in the main branch to + keep it empty for future testing. 
+ EOF \ No newline at end of file From 44d7897e2af778f17c5a71098a5a71f66ef2adbe Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Thu, 16 Nov 2023 13:55:04 -0700 Subject: [PATCH 126/146] Build rosetta in CI pipeline --- .github/workflows/_ci.yaml | 56 +++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 801e7299a..fc99e7b4f 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -139,28 +139,28 @@ jobs: REF_PRAXIS: ${{ needs.metadata.outputs.REF_PRAXIS }} secrets: inherit - # build-rosetta-t5x: - # uses: ./.github/workflows/_build_rosetta.yaml - # needs: [metadata, build-t5x] - # with: - # BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAG_MEALKIT }} - # BASE_LIBRARY: t5x - # secrets: inherit - - # build-rosetta-pax: - # uses: ./.github/workflows/_build_rosetta.yaml - # needs: [metadata, build-pax] - # with: - # BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_MEALKIT }} - # BASE_LIBRARY: pax - # secrets: inherit - - # test-distribution: - # needs: metadata - # uses: ./.github/workflows/_test_distribution.yaml - # secrets: inherit + build-rosetta-t5x: + uses: ./.github/workflows/_build_rosetta.yaml + needs: [metadata, build-t5x] + with: + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: t5x + secrets: inherit + + build-rosetta-pax: + uses: ./.github/workflows/_build_rosetta.yaml + needs: [metadata, build-pax] + with: + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: pax + secrets: inherit + + test-distribution: + needs: metadata + uses: ./.github/workflows/_test_distribution.yaml + secrets: inherit test-jax: needs: build-jax @@ -194,10 +194,10 @@ jobs: 
PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} secrets: inherit - # test-vit: - # needs: build-rosetta-t5x - # uses: ./.github/workflows/_test_vit.yaml - # with: - # ROSETTA_T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} - # secrets: inherit + test-vit: + needs: build-rosetta-t5x + uses: ./.github/workflows/_test_vit.yaml + with: + ROSETTA_T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} + secrets: inherit From 420162fab4ec665140f621cc87b617a29e1cd4a7 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Thu, 16 Nov 2023 13:58:28 -0700 Subject: [PATCH 127/146] Build rosetta in CI pipeline --- .github/workflows/_ci.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index fc99e7b4f..50d70b4a2 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -143,6 +143,7 @@ jobs: uses: ./.github/workflows/_build_rosetta.yaml needs: [metadata, build-t5x] with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAG_MEALKIT }} BASE_LIBRARY: t5x @@ -152,6 +153,7 @@ jobs: uses: ./.github/workflows/_build_rosetta.yaml needs: [metadata, build-pax] with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_MEALKIT }} BASE_LIBRARY: pax From 2a12b051b515c5255eb393c5d4e94ed7cfc5b090 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Thu, 16 Nov 2023 15:02:01 -0700 Subject: [PATCH 128/146] Debug rosetta build --- .github/workflows/_sandbox.yaml | 96 +++++++++++++++++++++------------ 1 file changed, 61 insertions(+), 35 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 37fa6ca68..6bb139dfa 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -2,40 +2,66 @@ name: "~Sandbox" on: workflow_dispatch: + push: + 
+permissions: + contents: read # to fetch code + actions: write # to cancel previous workflows + packages: write # to upload container jobs: - sandbox: - runs-on: ubuntu-22.04 - steps: - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Print usage - run: | - cat << EOF - This is an empty workflow file located in the main branch of your - repository. It serves as a testing ground for new GitHub Actions on - development branches before merging them to the main branch. By - defining and overloading this workflow on your development branch, - you can test new actions without affecting your main branch, ensuring - a smooth integration process once the changes are ready to be merged. - - Usage: - - 1. In your development branch, modify the sandbox.yml workflow file - to include the new actions you want to test. Make sure to commit - the changes to the development branch. - 2. Navigate to the 'Actions' tab in your repository, select the - '~Sandbox' workflow, and choose your development branch from the - branch dropdown menu. Click on 'Run workflow' to trigger the - workflow on your development branch. - 3. Once you have tested and verified the new actions in the Sandbox - workflow, you can incorporate them into your main workflow(s) and - merge the development branch into the main branch. Remember to - revert the changes to the sandbox.yml file in the main branch to - keep it empty for future testing. 
- EOF \ No newline at end of file + + # build-base: + # uses: ./.github/workflows/_build_base.yaml + # with: + # ARCHITECTURE: amd64 + # secrets: inherit + + # build-jax: + # needs: [build-base] + # uses: ./.github/workflows/_build_jax.yaml + # with: + # ARCHITECTURE: amd64 + # BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} + # secrets: inherit + + # build-pax: + # needs: [build-jax] + # uses: ./.github/workflows/_build_pax.yaml + # with: + # ARCHITECTURE: amd64 + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # secrets: inherit + + # build-t5x: + # needs: [build-jax] + # uses: ./.github/workflows/_build_t5x.yaml + # with: + # ARCHITECTURE: amd64 + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # secrets: inherit + + # test-pax: + # needs: [build-pax] + # uses: ./.github/workflows/_test_pax.yaml + # with: + # PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit + + + # test-te: + # needs: [build-pax] + # uses: ./.github/workflows/_test_te.yaml + # with: + # TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit + + build-rosetta-pax: + uses: ./.github/workflows/_build_rosetta.yaml + # needs: [metadata, build-pax] + with: + ARCHITECTURE: amd64 + BUILD_DATE: "2023-11-15" + BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6896408819-upstream-pax-amd64-mealkit + BASE_LIBRARY: pax + secrets: inherit From 507c6c14cbf146051b7588101b925f03e48b29d1 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Thu, 16 Nov 2023 15:55:44 -0700 Subject: [PATCH 129/146] Debug rosetta build: correct container --- .github/workflows/_sandbox.yaml | 2 +- rosetta/Dockerfile.pax | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 6bb139dfa..65f65329e 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -62,6 +62,6 @@ jobs: with: ARCHITECTURE: amd64 BUILD_DATE: 
"2023-11-15" - BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6896408819-upstream-pax-amd64-mealkit + BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6896408819-upstream-pax-amd64 BASE_LIBRARY: pax secrets: inherit diff --git a/rosetta/Dockerfile.pax b/rosetta/Dockerfile.pax index 503be8dcb..39f719769 100644 --- a/rosetta/Dockerfile.pax +++ b/rosetta/Dockerfile.pax @@ -1,9 +1,14 @@ # syntax=docker/dockerfile:1-labs -ARG BASE_IMAGE=ghcr.io/nvidia/upstream-pax:mealkit +ARG BASE_IMAGE=ghcr.io/nvidia/upstream-pax:latest ARG GIT_USER_EMAIL=jax@nvidia.com ARG GIT_USER_NAME=NVIDIA +ARG SRC_PATH_PAXML=/opt/paxml +ARG SRC_PATH_PRAXIS=/opt/praxis FROM scratch as rosetta-source +ARG SRC_PATH_PAXML +ARG SRC_PATH_PRAXIS + COPY . / FROM scratch as pax-mirror-source @@ -25,7 +30,7 @@ EOF COPY --from=rosetta-source / /opt/rosetta WORKDIR /opt/rosetta RUN --mount=target=/opt/pax-mirror,from=pax-mirror-source,readwrite \ - --mount=target=/opt/praxis-mirror,from=praxis-mirror-source,readwrite <<"EOF" bash -e + --mount=target=/opt/praxis-mirror,from=praxis-mirror-source,readwrite < Date: Thu, 16 Nov 2023 17:43:03 -0700 Subject: [PATCH 130/146] Debug rosetta build: fix odd issue from docker file --- rosetta/Dockerfile.pax | 2 +- rosetta/Dockerfile.t5x | 2 +- rosetta/create-distribution.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rosetta/Dockerfile.pax b/rosetta/Dockerfile.pax index 39f719769..aa356c99d 100644 --- a/rosetta/Dockerfile.pax +++ b/rosetta/Dockerfile.pax @@ -30,7 +30,7 @@ EOF COPY --from=rosetta-source / /opt/rosetta WORKDIR /opt/rosetta RUN --mount=target=/opt/pax-mirror,from=pax-mirror-source,readwrite \ - --mount=target=/opt/praxis-mirror,from=praxis-mirror-source,readwrite < Date: Thu, 16 Nov 2023 17:45:55 -0700 Subject: [PATCH 131/146] Full pipelilne 2 --- .github/workflows/_build_rosetta.yaml | 45 ++++++++++++++------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/.github/workflows/_build_rosetta.yaml 
b/.github/workflows/_build_rosetta.yaml index 816e35ab7..9295782fa 100644 --- a/.github/workflows/_build_rosetta.yaml +++ b/.github/workflows/_build_rosetta.yaml @@ -80,29 +80,30 @@ jobs: with: driver-opts: | image=moby/buildkit:v0.12.1 + + # TODO: not sure, we need this here + # - name: Set docker metadata - mealkit + # id: mealkit-metadata + # uses: docker/metadata-action@v4 + # with: + # images: ${{ env.UPLD_IMAGE }} + # flavor: latest=false + # tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ inputs.ARCHITECTURE }}-mealkit + # labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Set docker metadata - mealkit - id: mealkit-metadata - uses: docker/metadata-action@v4 - with: - images: ${{ env.UPLD_IMAGE }} - flavor: latest=false - tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ inputs.ARCHITECTURE }}-mealkit - labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - - name: Build docker images - mealkit - id: mealkit-build - uses: docker/build-push-action@v4 - with: - context: rosetta/ - push: true - file: rosetta/Dockerfile.${{ inputs.BASE_LIBRARY }} - platforms: linux/${{ inputs.ARCHITECTURE }} - tags: ${{ steps.mealkit-metadata.outputs.tags }} - labels: ${{ steps.mealkit-metadata.outputs.labels }} - target: rosetta - build-args: | - BASE_IMAGE=${{ steps.defaults.outputs.BASE_IMAGE }} + # - name: Build docker images - mealkit + # id: mealkit-build + # uses: docker/build-push-action@v4 + # with: + # context: rosetta/ + # push: true + # file: rosetta/Dockerfile.${{ inputs.BASE_LIBRARY }} + # platforms: linux/${{ inputs.ARCHITECTURE }} + # tags: ${{ steps.mealkit-metadata.outputs.tags }} + # labels: ${{ steps.mealkit-metadata.outputs.labels }} + # target: rosetta + # build-args: | + # BASE_IMAGE=${{ steps.defaults.outputs.BASE_IMAGE }} - name: Set docker metadata - final id: final-metadata From 85a88176dd046c4199055e3d7e61285d03a6456d Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov 
Date: Thu, 16 Nov 2023 22:36:21 -0700 Subject: [PATCH 132/146] Addressed Ann's comments --- .github/workflows/_ci.yaml | 2 +- .github/workflows/_test_pax.yaml | 125 ++++++++++++++++- .github/workflows/_test_pax_rosetta.yaml | 126 +++++++++++++++++- .../nightly-rosetta-pax-build-test.yaml | 2 +- .../nightly-rosetta-t5x-build-test.yaml | 4 +- 5 files changed, 251 insertions(+), 8 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 50d70b4a2..949959739 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -200,6 +200,6 @@ jobs: needs: build-rosetta-t5x uses: ./.github/workflows/_test_vit.yaml with: - ROSETTA_T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} + ROSETTA_T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS_FINAL }} secrets: inherit diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index a5a5a1d92..f35dee0d2 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -20,6 +20,127 @@ on: jobs: + single-process-multi-device: + strategy: + matrix: + PARALLEL_CONFIG: + - [1, 8, 1, 1] + - [1, 1, 2, 4] + fail-fast: false + + runs-on: ubuntu-22.04 + + steps: + - name: Print environment variables + run: env + + - name: Setup SSH agent + uses: webfactory/ssh-agent@v0.8.0 + with: + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + + - name: Setup SSH known hosts + id: ssh-known-hosts + run: | + mkdir -p ~/.ssh + cat >> ~/.ssh/known_hosts << EOF + ${{ vars.SSH_KNOWN_HOSTS }} + EOF + chmod 600 ~/.ssh/known_hosts + echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT + - name: Labels and metadata + id: meta + shell: bash -x -e {0} + run: | + IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" + TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_single_process + TOTAL_TASKS=1 + MAX_GPUS_PER_NODE=8 + NODES=1 + GPUS_PER_NODE=8 
+ JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME} + LOG_FILE=/nfs/cluster/${JOB_NAME}.log + MODEL_PATH=/nfs/cluster/${JOB_NAME} + for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do + echo "$var=${!var}" >> $GITHUB_OUTPUT + done + - name: Submit SLURM jobs over SSH + id: submit + shell: bash -O expand_aliases -x -e {0} + run: | + alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' + sshx "date && hostname && sinfo" + sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} + JOB=$(sshx sbatch --parsable << EOF + #!/bin/bash + #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} + #SBATCH --exclusive + #SBATCH --nodes=${{ steps.meta.outputs.NODES }} + #SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }} + #SBATCH --tasks=${{ steps.meta.outputs.TOTAL_TASKS }} + #SBATCH --time=00:30:00 + #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} + #SBATCH --export="VOCAB_PATH=gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model,ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }}" + time srun \ + --container-image=${{ steps.meta.outputs.IMAGE }} \ + --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ + --container-entrypoint \ + test-pax.sh \ + --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \ + --dtype bfloat16 \ + --batch-per-gpu 4 \ + --steps 500 \ + --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \ + --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \ + --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \ + --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \ + --nodes ${{ steps.meta.outputs.NODES }} \ + ${{ inputs.EXTRA_TEST_ARGS }} + EOF + ) + set +x + while sshx squeue -j $JOB | grep -q $JOB; do + echo "SLURM Job $JOB is still running." + sleep 15 + done + echo "SLURM Job $JOB finished." 
+ # Gather job info + SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) + SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') + echo "SLURM Job state is ${SLURM_STATE}" + echo "SLURM Job exit code is ${SLURM_EXITCODE}" + echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" + echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" + set -x + - name: Retrieve training logs and upload to TensorBoard server + shell: bash -x -e {0} + run: | + mkdir output/ + rsync -rtz --progress \ + ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \ + output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true + rsync -rtz --progress \ + ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \ + output/ || true + rsync -rtz --progress \ + output/ \ + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true + - name: Write SLURM job status to file + shell: bash -x -e {0} + run: | + python << EOF + import json + with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: + dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} + json.dump(dump, f) + EOF + + - name: Upload training logs as artifacts + uses: actions/upload-artifact@v3 + with: + name: ${{ steps.meta.outputs.JOB_NAME }} + path: output/* + pax-multi-node: strategy: matrix: @@ -289,7 +410,7 @@ jobs: path: output/* metrics: - needs: pax-multi-node + needs: [single-process-multi-device, pax-multi-node, single-process-evaluation] runs-on: ubuntu-22.04 steps: @@ -327,7 +448,7 @@ jobs: publish-test: - needs: [pax-multi-node, metrics] + needs: [single-process-multi-device, pax-multi-node, single-process-evaluation, metrics] uses: ./.github/workflows/_publish_badge.yaml 
if: ( always() ) secrets: inherit diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml index 1229d6c5e..ca8715adf 100644 --- a/.github/workflows/_test_pax_rosetta.yaml +++ b/.github/workflows/_test_pax_rosetta.yaml @@ -20,6 +20,128 @@ on: jobs: + single-process-multi-device-te: + strategy: + matrix: + PARALLEL_CONFIG: + - [1, 8, 1, 1] + - [1, 1, 2, 4] + fail-fast: false + + runs-on: ubuntu-22.04 + + steps: + - name: Print environment variables + run: env + + - name: Setup SSH agent + uses: webfactory/ssh-agent@v0.8.0 + with: + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + + - name: Setup SSH known hosts + id: ssh-known-hosts + run: | + mkdir -p ~/.ssh + cat >> ~/.ssh/known_hosts << EOF + ${{ vars.SSH_KNOWN_HOSTS }} + EOF + chmod 600 ~/.ssh/known_hosts + echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT + - name: Labels and metadata + id: meta + shell: bash -x -e {0} + run: | + IMAGE="$(echo ${{inputs.PAX_IMAGE}} | sed 's/\//#/')" + TEST_CASE_NAME=${{ matrix.PARALLEL_CONFIG[1] }}DP${{ matrix.PARALLEL_CONFIG[2] }}FSDP${{ matrix.PARALLEL_CONFIG[3] }}TP${{ matrix.PARALLEL_CONFIG[0] }}PP_single_process_TE + TOTAL_TASKS=1 + MAX_GPUS_PER_NODE=8 + NODES=1 + GPUS_PER_NODE=8 + JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME} + LOG_FILE=/nfs/cluster/${JOB_NAME}.log + MODEL_PATH=/nfs/cluster/${JOB_NAME} + for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do + echo "$var=${!var}" >> $GITHUB_OUTPUT + done + - name: Submit SLURM jobs over SSH + id: submit + shell: bash -O expand_aliases -x -e {0} + run: | + alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' + sshx "date && hostname && sinfo" + sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} + JOB=$(sshx sbatch --parsable << EOF + #!/bin/bash + #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} + #SBATCH --exclusive + #SBATCH --nodes=${{ steps.meta.outputs.NODES }} + 
#SBATCH --gpus-per-node=${{ steps.meta.outputs.GPUS_PER_NODE }} + #SBATCH --tasks=${{ steps.meta.outputs.TOTAL_TASKS }} + #SBATCH --time=00:30:00 + #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} + #SBATCH --export="VOCAB_PATH=gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model,ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }}" + time srun \ + --container-image=${{ steps.meta.outputs.IMAGE }} \ + --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ + --container-entrypoint \ + test-pax.sh \ + --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \ + --dtype bfloat16 \ + --batch-per-gpu 4 \ + --steps 500 \ + --pipeline-parallel ${{ matrix.PARALLEL_CONFIG[0] }} \ + --data-parallel ${{ matrix.PARALLEL_CONFIG[1] }} \ + --fsdp ${{ matrix.PARALLEL_CONFIG[2] }} \ + --tensor-parallel ${{ matrix.PARALLEL_CONFIG[3] }} \ + --nodes ${{ steps.meta.outputs.NODES }} \ + --enable-te \ + ${{ inputs.EXTRA_TEST_ARGS }} + EOF + ) + set +x + while sshx squeue -j $JOB | grep -q $JOB; do + echo "SLURM Job $JOB is still running." + sleep 15 + done + echo "SLURM Job $JOB finished." 
+ # Gather job info + SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) + SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') + echo "SLURM Job state is ${SLURM_STATE}" + echo "SLURM Job exit code is ${SLURM_EXITCODE}" + echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" + echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" + set -x + - name: Retrieve training logs and upload to TensorBoard server + shell: bash -x -e {0} + run: | + mkdir output/ + rsync -rtz --progress \ + ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \ + output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true + rsync -rtz --progress \ + ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \ + output/ || true + rsync -rtz --progress \ + output/ \ + ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true + - name: Write SLURM job status to file + shell: bash -x -e {0} + run: | + python << EOF + import json + with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: + dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} + json.dump(dump, f) + EOF + + - name: Upload training logs as artifacts + uses: actions/upload-artifact@v3 + with: + name: ${{ steps.meta.outputs.JOB_NAME }} + path: output/* + rosetta-pax-multi-node-te: strategy: matrix: @@ -559,7 +681,7 @@ jobs: path: output/* metrics: - needs: [rosetta-pax-multi-node, rosetta-pax-multi-node-te, rosetta-pax-single-node-dropout-te] + needs: [single-process-multi-device-te, rosetta-pax-multi-node, rosetta-pax-multi-node-te, rosetta-pax-single-node-dropout-te, single-process-evaluation-te] runs-on: ubuntu-22.04 steps: @@ -597,7 +719,7 @@ jobs: publish-test: - needs: 
[rosetta-pax-multi-node, rosetta-pax-multi-node-te, rosetta-pax-single-node-dropout-te, metrics] + needs: [single-process-multi-device-te, rosetta-pax-multi-node, rosetta-pax-multi-node-te, rosetta-pax-single-node-dropout-te, single-process-evaluation-te, metrics] uses: ./.github/workflows/_publish_badge.yaml if: ( always() ) secrets: inherit diff --git a/.github/workflows/nightly-rosetta-pax-build-test.yaml b/.github/workflows/nightly-rosetta-pax-build-test.yaml index 9282962cb..f7d74103b 100644 --- a/.github/workflows/nightly-rosetta-pax-build-test.yaml +++ b/.github/workflows/nightly-rosetta-pax-build-test.yaml @@ -128,7 +128,7 @@ jobs: uses: ./.github/workflows/_test_pax_rosetta.yaml if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: - PAX_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} + PAX_IMAGE: ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} secrets: inherit publish-test: diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index 4057e8e75..0d866efaf 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -129,7 +129,7 @@ jobs: needs: [metadata, amd64, arm64] uses: ./.github/workflows/_test_rosetta.yaml with: - ROSETTA_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} + ROSETTA_IMAGE: ${{ needs.amd64.outputs.DOCKER_TAGS_FINAL }} secrets: inherit test-t5x: @@ -137,7 +137,7 @@ jobs: uses: ./.github/workflows/_test_t5x_rosetta.yaml if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: - T5X_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} + T5X_IMAGE: ${{ needs.amd64.outputs.DOCKER_TAGS_FINAL }} secrets: inherit test-vit: From 630d1df4692af4b917eb35ebcd65ea2c0d271c8f Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Fri, 17 Nov 2023 11:15:47 
-0700 Subject: [PATCH 133/146] Full pipeline: revert _sandbox.yaml --- .github/workflows/_sandbox.yaml | 96 ++++++++++++--------------------- 1 file changed, 35 insertions(+), 61 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 65f65329e..37fa6ca68 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -2,66 +2,40 @@ name: "~Sandbox" on: workflow_dispatch: - push: - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container jobs: - - # build-base: - # uses: ./.github/workflows/_build_base.yaml - # with: - # ARCHITECTURE: amd64 - # secrets: inherit - - # build-jax: - # needs: [build-base] - # uses: ./.github/workflows/_build_jax.yaml - # with: - # ARCHITECTURE: amd64 - # BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} - # secrets: inherit - - # build-pax: - # needs: [build-jax] - # uses: ./.github/workflows/_build_pax.yaml - # with: - # ARCHITECTURE: amd64 - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # secrets: inherit - - # build-t5x: - # needs: [build-jax] - # uses: ./.github/workflows/_build_t5x.yaml - # with: - # ARCHITECTURE: amd64 - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # secrets: inherit - - # test-pax: - # needs: [build-pax] - # uses: ./.github/workflows/_test_pax.yaml - # with: - # PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - - # test-te: - # needs: [build-pax] - # uses: ./.github/workflows/_test_te.yaml - # with: - # TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - build-rosetta-pax: - uses: ./.github/workflows/_build_rosetta.yaml - # needs: [metadata, build-pax] - with: - ARCHITECTURE: amd64 - BUILD_DATE: "2023-11-15" - BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6896408819-upstream-pax-amd64 - BASE_LIBRARY: pax - secrets: inherit + sandbox: + runs-on: ubuntu-22.04 +
steps: + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Print usage + run: | + cat << EOF + This is an empty workflow file located in the main branch of your + repository. It serves as a testing ground for new GitHub Actions on + development branches before merging them to the main branch. By + defining and overloading this workflow on your development branch, + you can test new actions without affecting your main branch, ensuring + a smooth integration process once the changes are ready to be merged. + + Usage: + + 1. In your development branch, modify the sandbox.yml workflow file + to include the new actions you want to test. Make sure to commit + the changes to the development branch. + 2. Navigate to the 'Actions' tab in your repository, select the + '~Sandbox' workflow, and choose your development branch from the + branch dropdown menu. Click on 'Run workflow' to trigger the + workflow on your development branch. + 3. Once you have tested and verified the new actions in the Sandbox + workflow, you can incorporate them into your main workflow(s) and + merge the development branch into the main branch. Remember to + revert the changes to the sandbox.yml file in the main branch to + keep it empty for future testing. 
+ EOF \ No newline at end of file From 1cf80c295ad8dfa2508e0f3a504a078979db62fb Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Fri, 17 Nov 2023 13:58:46 -0700 Subject: [PATCH 134/146] Rename nightly rosetta to be able to run nightly rosetta pax build manually --- ...rosetta-pax-build-test.yaml => nightly-rosetta-pax-build.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{nightly-rosetta-pax-build-test.yaml => nightly-rosetta-pax-build.yaml} (100%) diff --git a/.github/workflows/nightly-rosetta-pax-build-test.yaml b/.github/workflows/nightly-rosetta-pax-build.yaml similarity index 100% rename from .github/workflows/nightly-rosetta-pax-build-test.yaml rename to .github/workflows/nightly-rosetta-pax-build.yaml From 0885302fd7b751e200faf5584b687fec37ed011b Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Fri, 17 Nov 2023 14:24:20 -0700 Subject: [PATCH 135/146] Add arch to rosetta-pax build --- .../workflows/nightly-rosetta-pax-build.yaml | 17 ++++++++++------- .../nightly-rosetta-t5x-build-test.yaml | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/.github/workflows/nightly-rosetta-pax-build.yaml b/.github/workflows/nightly-rosetta-pax-build.yaml index f7d74103b..e7e4da474 100644 --- a/.github/workflows/nightly-rosetta-pax-build.yaml +++ b/.github/workflows/nightly-rosetta-pax-build.yaml @@ -35,7 +35,8 @@ jobs: outputs: BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} BASE_LIBRARY: ${{ steps.base-metadata.outputs.BASE_LIBRARY }} - BASE_IMAGE: ${{ steps.base-metadata.outputs.BASE_IMAGE }} + BASE_IMAGE_AMD64: ${{ steps.base-metadata.outputs.BASE_IMAGE_AMD64 }} + BASE_IMAGE_ARM64: ${{ steps.base-metadata.outputs.BASE_IMAGE_ARM64 }} PUBLISH: ${{ steps.if-publish.outputs.PUBLISH }} steps: - name: Check if the triggering workflow failed @@ -69,13 +70,15 @@ jobs: shell: bash -x -e {0} run: | if [[ -z "${{ inputs.BASE_IMAGE }}" ]]; then - BASE_IMAGE=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY
}}:latest + BASE_IMAGE_AMD64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:latest + BASE_IMAGE_ARM64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:latest else - BASE_IMAGE=${{ inputs.BASE_IMAGE }} + BASE_IMAGE_AMD64=${{ inputs.BASE_IMAGE }}-amd64 + BASE_IMAGE_ARM64=${{ inputs.BASE_IMAGE }}-arm64 fi - echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT - echo "BASE_IMAGE=${BASE_IMAGE}" >> $GITHUB_OUTPUT + echo "BASE_IMAGE_AMD64=${BASE_IMAGE_AMD64}" >> $GITHUB_OUTPUT + echo "BASE_IMAGE_ARM64=${BASE_IMAGE_ARM64}" >> $GITHUB_OUTPUT amd64: needs: metadata @@ -84,7 +87,7 @@ jobs: ARCHITECTURE: amd64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} BASE_LIBRARY: ${{ needs.metadata.outputs.BASE_LIBRARY }} - BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE }} + BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE_AMD64 }} secrets: inherit arm64: @@ -94,7 +97,7 @@ jobs: ARCHITECTURE: arm64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} BASE_LIBRARY: ${{ needs.metadata.outputs.BASE_LIBRARY }} - BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE }} + BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE_ARM64 }} secrets: inherit publish-mealkit: diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index 0d866efaf..03551310f 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -84,7 +84,7 @@ jobs: ARCHITECTURE: amd64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} BASE_LIBRARY: ${{ needs.metadata.outputs.BASE_LIBRARY }} - BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE }} + BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE }}-amd64 secrets: inherit arm64: From a97659e7b91eccadbc938695257df6ec21dd6f45 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Fri, 17 Nov 2023 15:47:44 -0700 Subject: [PATCH 136/146] Add arch to rosetta-pax build --- .github/workflows/_build_rosetta.yaml | 43 
+++++++++---------- .../workflows/nightly-rosetta-pax-build.yaml | 10 ++--- .../nightly-rosetta-t5x-build-test.yaml | 15 ++++--- 3 files changed, 35 insertions(+), 33 deletions(-) diff --git a/.github/workflows/_build_rosetta.yaml b/.github/workflows/_build_rosetta.yaml index 9295782fa..fd544fde0 100644 --- a/.github/workflows/_build_rosetta.yaml +++ b/.github/workflows/_build_rosetta.yaml @@ -81,29 +81,28 @@ jobs: driver-opts: | image=moby/buildkit:v0.12.1 - # TODO: not sure, we need this here - # - name: Set docker metadata - mealkit - # id: mealkit-metadata - # uses: docker/metadata-action@v4 - # with: - # images: ${{ env.UPLD_IMAGE }} - # flavor: latest=false - # tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ inputs.ARCHITECTURE }}-mealkit - # labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} + - name: Set docker metadata - mealkit + id: mealkit-metadata + uses: docker/metadata-action@v4 + with: + images: ${{ env.UPLD_IMAGE }} + flavor: latest=false + tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ inputs.ARCHITECTURE }}-mealkit + labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - # - name: Build docker images - mealkit - # id: mealkit-build - # uses: docker/build-push-action@v4 - # with: - # context: rosetta/ - # push: true - # file: rosetta/Dockerfile.${{ inputs.BASE_LIBRARY }} - # platforms: linux/${{ inputs.ARCHITECTURE }} - # tags: ${{ steps.mealkit-metadata.outputs.tags }} - # labels: ${{ steps.mealkit-metadata.outputs.labels }} - # target: rosetta - # build-args: | - # BASE_IMAGE=${{ steps.defaults.outputs.BASE_IMAGE }} + - name: Build docker images - mealkit + id: mealkit-build + uses: docker/build-push-action@v4 + with: + context: rosetta/ + push: true + file: rosetta/Dockerfile.${{ inputs.BASE_LIBRARY }} + platforms: linux/${{ inputs.ARCHITECTURE }} + tags: ${{ steps.mealkit-metadata.outputs.tags }} + labels: ${{ steps.mealkit-metadata.outputs.labels }} + target: 
rosetta + build-args: | + BASE_IMAGE=${{ steps.defaults.outputs.BASE_IMAGE }} - name: Set docker metadata - final id: final-metadata diff --git a/.github/workflows/nightly-rosetta-pax-build.yaml b/.github/workflows/nightly-rosetta-pax-build.yaml index e7e4da474..d23b6c4e9 100644 --- a/.github/workflows/nightly-rosetta-pax-build.yaml +++ b/.github/workflows/nightly-rosetta-pax-build.yaml @@ -11,7 +11,7 @@ on: BASE_IMAGE: type: string description: 'PAX image built by NVIDIA/JAX-Toolbox' - default: 'ghcr.io/nvidia/upstream-pax:latest' + default: 'ghcr.io/nvidia/upstream-pax:mealkit' required: true PUBLISH: type: boolean @@ -70,11 +70,11 @@ jobs: shell: bash -x -e {0} run: | if [[ -z "${{ inputs.BASE_IMAGE }}" ]]; then - BASE_IMAGE_AMD64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:latest - BASE_IMAGE_ARM64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:latest + BASE_IMAGE_AMD64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit + BASE_IMAGE_ARM64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit else - BASE_IMAGE_AMD64=${{ inputs.BASE_IMAGE }}-amd64 - BASE_IMAGE_ARM64=${{ inputs.BASE_IMAGE }}-arm64 + BASE_IMAGE_AMD64=${BASE_IMAGE_AMD64}-amd64-mealkit + BASE_IMAGE_ARM64=${BASE_IMAGE_ARM64}-arm64-mealkit fi echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT echo "BASE_IMAGE_AMD64=${BASE_IMAGE_AMD64}" >> $GITHUB_OUTPUT diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index 03551310f..99ae0321c 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -35,7 +35,8 @@ jobs: outputs: BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} BASE_LIBRARY: ${{ steps.base-metadata.outputs.BASE_LIBRARY }} - BASE_IMAGE: ${{ steps.base-metadata.outputs.BASE_IMAGE }} + BASE_IMAGE_AMD64: ${{ steps.base-metadata.outputs.BASE_IMAGE_AMD64 }} + BASE_IMAGE_ARM64: ${{ 
steps.base-metadata.outputs.BASE_IMAGE_ARM64 }} PUBLISH: ${{ steps.if-publish.outputs.PUBLISH }} steps: - name: Check if the triggering workflow failed @@ -69,13 +70,15 @@ jobs: shell: bash -x -e {0} run: | if [[ -z "${{ inputs.BASE_IMAGE }}" ]]; then - BASE_IMAGE=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:latest + BASE_IMAGE_AMD64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit + BASE_IMAGE_ARM64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit else - BASE_IMAGE=${{ inputs.BASE_IMAGE }} + BASE_IMAGE_AMD64=${BASE_IMAGE_AMD64}-amd64-mealkit + BASE_IMAGE_ARM64=${BASE_IMAGE_ARM64}-arm64-mealkit fi - echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT - echo "BASE_IMAGE=${BASE_IMAGE}" >> $GITHUB_OUTPUT + echo "BASE_IMAGE_AMD64=${BASE_IMAGE_AMD64}" >> $GITHUB_OUTPUT + echo "BASE_IMAGE_ARM64=${BASE_IMAGE_ARM64}" >> $GITHUB_OUTPUT amd64: needs: metadata @@ -84,7 +87,7 @@ jobs: ARCHITECTURE: amd64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} BASE_LIBRARY: ${{ needs.metadata.outputs.BASE_LIBRARY }} - BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE }}-amd64 + BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE_AMD64 }} secrets: inherit arm64: From 2744e6749fd3414e0081b1e9bd555c45fc73d7e3 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Fri, 17 Nov 2023 15:52:16 -0700 Subject: [PATCH 137/146] Add arch to rosetta-pax build: fix typo --- .github/workflows/nightly-rosetta-pax-build.yaml | 4 ++-- .github/workflows/nightly-rosetta-t5x-build-test.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/nightly-rosetta-pax-build.yaml b/.github/workflows/nightly-rosetta-pax-build.yaml index d23b6c4e9..ce073edf1 100644 --- a/.github/workflows/nightly-rosetta-pax-build.yaml +++ b/.github/workflows/nightly-rosetta-pax-build.yaml @@ -73,8 +73,8 @@ jobs: BASE_IMAGE_AMD64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit BASE_IMAGE_ARM64=${{ 
env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit else - BASE_IMAGE_AMD64=${BASE_IMAGE_AMD64}-amd64-mealkit - BASE_IMAGE_ARM64=${BASE_IMAGE_ARM64}-arm64-mealkit + BASE_IMAGE_AMD64=${BASE_IMAGE}-amd64-mealkit + BASE_IMAGE_ARM64=${BASE_IMAGE}-arm64-mealkit fi echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT echo "BASE_IMAGE_AMD64=${BASE_IMAGE_AMD64}" >> $GITHUB_OUTPUT diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index 99ae0321c..d7a47b270 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -73,8 +73,8 @@ jobs: BASE_IMAGE_AMD64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit BASE_IMAGE_ARM64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit else - BASE_IMAGE_AMD64=${BASE_IMAGE_AMD64}-amd64-mealkit - BASE_IMAGE_ARM64=${BASE_IMAGE_ARM64}-arm64-mealkit + BASE_IMAGE_AMD64=${BASE_IMAGE}-amd64-mealkit + BASE_IMAGE_ARM64=${BASE_IMAGE}-arm64-mealkit fi echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT echo "BASE_IMAGE_AMD64=${BASE_IMAGE_AMD64}" >> $GITHUB_OUTPUT From 858881ba6f49e3cf1aa683aef3ce32a898e34104 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Fri, 17 Nov 2023 15:54:40 -0700 Subject: [PATCH 138/146] Add arch to rosetta-pax build: fix typo --- .github/workflows/nightly-rosetta-pax-build.yaml | 4 ++-- .github/workflows/nightly-rosetta-t5x-build-test.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/nightly-rosetta-pax-build.yaml b/.github/workflows/nightly-rosetta-pax-build.yaml index ce073edf1..31ac1d657 100644 --- a/.github/workflows/nightly-rosetta-pax-build.yaml +++ b/.github/workflows/nightly-rosetta-pax-build.yaml @@ -73,8 +73,8 @@ jobs: BASE_IMAGE_AMD64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit BASE_IMAGE_ARM64=${{ env.DOCKER_REGISTRY }}/upstream-${{ 
env.BASE_LIBRARY }}:mealkit else - BASE_IMAGE_AMD64=${BASE_IMAGE}-amd64-mealkit - BASE_IMAGE_ARM64=${BASE_IMAGE}-arm64-mealkit + BASE_IMAGE_AMD64=${inputs.BASE_IMAGE}-amd64-mealkit + BASE_IMAGE_ARM64=${inputs.BASE_IMAGE}-arm64-mealkit fi echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT echo "BASE_IMAGE_AMD64=${BASE_IMAGE_AMD64}" >> $GITHUB_OUTPUT diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index d7a47b270..f136f1c18 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -73,8 +73,8 @@ jobs: BASE_IMAGE_AMD64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit BASE_IMAGE_ARM64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit else - BASE_IMAGE_AMD64=${BASE_IMAGE}-amd64-mealkit - BASE_IMAGE_ARM64=${BASE_IMAGE}-arm64-mealkit + BASE_IMAGE_AMD64=${inputs.BASE_IMAGE}-amd64-mealkit + BASE_IMAGE_ARM64=${inputs.BASE_IMAGE}-arm64-mealkit fi echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT echo "BASE_IMAGE_AMD64=${BASE_IMAGE_AMD64}" >> $GITHUB_OUTPUT From 2d6a11cc281ecd324a32df60d24b923c133152df Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Fri, 17 Nov 2023 15:56:20 -0700 Subject: [PATCH 139/146] Add arch to rosetta-pax build: fix typo --- .github/workflows/nightly-rosetta-pax-build.yaml | 4 ++-- .github/workflows/nightly-rosetta-t5x-build-test.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/nightly-rosetta-pax-build.yaml b/.github/workflows/nightly-rosetta-pax-build.yaml index 31ac1d657..e44df83f4 100644 --- a/.github/workflows/nightly-rosetta-pax-build.yaml +++ b/.github/workflows/nightly-rosetta-pax-build.yaml @@ -73,8 +73,8 @@ jobs: BASE_IMAGE_AMD64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit BASE_IMAGE_ARM64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit else - 
BASE_IMAGE_AMD64=${inputs.BASE_IMAGE}-amd64-mealkit - BASE_IMAGE_ARM64=${inputs.BASE_IMAGE}-arm64-mealkit + BASE_IMAGE_AMD64=${{ inputs.BASE_IMAGE }}-amd64-mealkit + BASE_IMAGE_ARM64=${{ inputs.BASE_IMAGE }}-arm64-mealkit fi echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT echo "BASE_IMAGE_AMD64=${BASE_IMAGE_AMD64}" >> $GITHUB_OUTPUT diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index f136f1c18..8f838dd05 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -73,8 +73,8 @@ jobs: BASE_IMAGE_AMD64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit BASE_IMAGE_ARM64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit else - BASE_IMAGE_AMD64=${inputs.BASE_IMAGE}-amd64-mealkit - BASE_IMAGE_ARM64=${inputs.BASE_IMAGE}-arm64-mealkit + BASE_IMAGE_AMD64=${{ inputs.BASE_IMAGE }}-amd64-mealkit + BASE_IMAGE_ARM64=${{ inputs.BASE_IMAGE }}-arm64-mealkit fi echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT echo "BASE_IMAGE_AMD64=${BASE_IMAGE_AMD64}" >> $GITHUB_OUTPUT From 332bf98ed9ef558a4329554de9877f00d5977fca Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Mon, 20 Nov 2023 12:40:17 -0700 Subject: [PATCH 140/146] Addressed Terry's comments --- .github/workflows/nightly-rosetta-pax-build.yaml | 2 +- .github/workflows/nightly-rosetta-t5x-build-test.yaml | 2 +- rosetta/Dockerfile.pax | 3 +-- rosetta/Dockerfile.t5x | 3 +-- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/nightly-rosetta-pax-build.yaml b/.github/workflows/nightly-rosetta-pax-build.yaml index e44df83f4..82c02e2ff 100644 --- a/.github/workflows/nightly-rosetta-pax-build.yaml +++ b/.github/workflows/nightly-rosetta-pax-build.yaml @@ -11,7 +11,7 @@ on: BASE_IMAGE: type: string description: 'PAX image built by NVIDIA/JAX-Toolbox' - default: 'ghcr.io/nvidia/upstream-pax:mealkit' + 
default: '' required: true PUBLISH: type: boolean diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index 8f838dd05..a774873e8 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -11,7 +11,7 @@ on: BASE_IMAGE: type: string description: 'T5x image built by NVIDIA/JAX-Toolbox' - default: 'ghcr.io/nvidia/upstream-t5x:latest' + default: '' required: true PUBLISH: type: boolean diff --git a/rosetta/Dockerfile.pax b/rosetta/Dockerfile.pax index aa356c99d..9e56962bf 100644 --- a/rosetta/Dockerfile.pax +++ b/rosetta/Dockerfile.pax @@ -30,7 +30,7 @@ EOF COPY --from=rosetta-source / /opt/rosetta WORKDIR /opt/rosetta RUN --mount=target=/opt/pax-mirror,from=pax-mirror-source,readwrite \ - --mount=target=/opt/praxis-mirror,from=praxis-mirror-source,readwrite < Date: Mon, 20 Nov 2023 13:08:24 -0700 Subject: [PATCH 141/146] Pass git username and email thru params --- .github/container/Dockerfile.base | 8 +++++-- .github/container/Dockerfile.jax | 8 +++++-- .github/workflows/_build_base.yaml | 13 +++++++++++ .github/workflows/_build_jax.yaml | 12 ++++++++++ .github/workflows/_test_distribution.yaml | 27 +++++++++++++++++++++-- 5 files changed, 62 insertions(+), 6 deletions(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index 682bca310..8660e31de 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -1,6 +1,10 @@ ARG BASE_IMAGE=nvidia/cuda:12.2.0-devel-ubuntu22.04 +ARG GIT_USER_NAME="JAX Toolbox" +ARG GIT_USER_EMAIL=jax@nvidia.com FROM ${BASE_IMAGE} +ARG GIT_USER_EMAIL +ARG GIT_USER_NAME ############################################################################### ## Install Python and essential tools @@ -31,8 +35,8 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* RUN <<"EOF" bash -ex -git config --global user.name "JAX Toolbox" -git 
config --global user.email "jax@nvidia.com" +git config --global user.name "${GIT_USER_NAME}" +git config --global user.email "${GIT_USER_EMAIL}" EOF RUN pip install --upgrade --no-cache-dir pip pip-tools && rm -rf ~/.cache/* RUN mkdir -p /opt/pip-tools.d diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index aa2726aff..e8fb38b80 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -11,6 +11,8 @@ ARG SRC_PATH_JAX=/opt/jax ARG SRC_PATH_XLA=/opt/xla-source ARG SRC_PATH_FLAX=/opt/flax ARG SRC_PATH_TE=/opt/transformer-engine-source +ARG GIT_USER_NAME="JAX Toolbox" +ARG GIT_USER_EMAIL=jax@nvidia.com ARG BAZEL_CACHE=/tmp ARG BUILD_DATE @@ -27,6 +29,8 @@ ARG REF_XLA ARG SRC_PATH_JAX ARG SRC_PATH_XLA ARG BAZEL_CACHE +ARG GIT_USER_NAME +ARG GIT_USER_EMAIL RUN git clone "${REPO_JAX}" "${SRC_PATH_JAX}" && cd "${SRC_PATH_JAX}" && git checkout ${REF_JAX} RUN --mount=type=ssh \ @@ -37,8 +41,8 @@ RUN --mount=type=ssh \ RUN < Date: Mon, 20 Nov 2023 15:37:00 -0700 Subject: [PATCH 142/146] Address Terry's final comments --- rosetta/Dockerfile.pax | 39 ++++++++++++++++++++++++---------- rosetta/Dockerfile.t5x | 48 ++++++++++++++++++++++++------------------ 2 files changed, 55 insertions(+), 32 deletions(-) diff --git a/rosetta/Dockerfile.pax b/rosetta/Dockerfile.pax index 9e56962bf..9108693b7 100644 --- a/rosetta/Dockerfile.pax +++ b/rosetta/Dockerfile.pax @@ -1,9 +1,12 @@ # syntax=docker/dockerfile:1-labs -ARG BASE_IMAGE=ghcr.io/nvidia/upstream-pax:latest +ARG BASE_IMAGE=ghcr.io/nvidia/upstream-pax:mealkit ARG GIT_USER_EMAIL=jax@nvidia.com ARG GIT_USER_NAME=NVIDIA ARG SRC_PATH_PAXML=/opt/paxml ARG SRC_PATH_PRAXIS=/opt/praxis +# These patchlist paths should be relative to this script +ARG PAXML_PATCHLIST=patchlist-paxml.txt +ARG PRAXIS_PATCHLIST=patchlist-praxis.txt FROM scratch as rosetta-source ARG SRC_PATH_PAXML @@ -17,29 +20,43 @@ ADD --keep-git-dir=true https://github.com/google/paxml.git#main / FROM scratch as 
praxis-mirror-source ADD --keep-git-dir=true https://github.com/google/praxis.git#main / -FROM ${BASE_IMAGE} AS rosetta +############################################################################### +### Download source and add auxiliary scripts +################################################################################ + +FROM ${BASE_IMAGE} AS mealkit ENV ENABLE_TE=1 ARG GIT_USER_EMAIL ARG GIT_USER_NAME -RUN <> /opt/pip-tools.d/manifest.t5x +echo "-e file:///opt/rosetta" >> /opt/pip-tools.d/manifest.t5x EOF WORKDIR /opt/rosetta -RUN < Date: Mon, 20 Nov 2023 20:47:10 -0700 Subject: [PATCH 143/146] Address Terry's LGTM comments --- rosetta/Dockerfile.pax | 3 +-- rosetta/Dockerfile.t5x | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/rosetta/Dockerfile.pax b/rosetta/Dockerfile.pax index 9108693b7..e3c220bfd 100644 --- a/rosetta/Dockerfile.pax +++ b/rosetta/Dockerfile.pax @@ -35,8 +35,7 @@ ARG PRAXIS_PATCHLIST COPY --from=rosetta-source / /opt/rosetta WORKDIR /opt/rosetta RUN --mount=target=/opt/pax-mirror,from=pax-mirror-source,readwrite \ - --mount=target=/opt/praxis-mirror,from=praxis-mirror-source,readwrite < Date: Mon, 20 Nov 2023 20:54:38 -0700 Subject: [PATCH 144/146] Rosetta dockerfiles to have 2 stages: mealkit and final --- .github/workflows/_build_rosetta.yaml | 4 ++-- .github/workflows/_build_t5x.yaml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_build_rosetta.yaml b/.github/workflows/_build_rosetta.yaml index fd544fde0..02dcc0951 100644 --- a/.github/workflows/_build_rosetta.yaml +++ b/.github/workflows/_build_rosetta.yaml @@ -100,7 +100,7 @@ jobs: platforms: linux/${{ inputs.ARCHITECTURE }} tags: ${{ steps.mealkit-metadata.outputs.tags }} labels: ${{ steps.mealkit-metadata.outputs.labels }} - target: rosetta + target: mealkit build-args: | BASE_IMAGE=${{ steps.defaults.outputs.BASE_IMAGE }} @@ -122,6 +122,6 @@ jobs: platforms: linux/${{ inputs.ARCHITECTURE }} tags: ${{ 
steps.final-metadata.outputs.tags }} labels: ${{ steps.final-metadata.outputs.labels }} - target: rosetta + target: final build-args: | BASE_IMAGE=${{ steps.defaults.outputs.BASE_IMAGE }} \ No newline at end of file diff --git a/.github/workflows/_build_t5x.yaml b/.github/workflows/_build_t5x.yaml index 606840b71..6efcde8f9 100644 --- a/.github/workflows/_build_t5x.yaml +++ b/.github/workflows/_build_t5x.yaml @@ -147,6 +147,7 @@ jobs: platforms: linux/${{ inputs.ARCHITECTURE }} tags: ${{ steps.final-metadata.outputs.tags }} labels: ${{ steps.final-metadata.outputs.labels }} + target: final build-args: | BASE_IMAGE=${{ inputs.BASE_IMAGE }} BUILD_DATE=${{ inputs.BUILD_DATE }} From 85182efa01c61b87ca4575dabadeee455be27ba5 Mon Sep 17 00:00:00 2001 From: Vladislav Kozlov Date: Mon, 20 Nov 2023 21:01:05 -0700 Subject: [PATCH 145/146] Rosetta dockerfiles: remove __pycache__ deletion --- rosetta/Dockerfile.pax | 1 - rosetta/Dockerfile.t5x | 2 -- 2 files changed, 3 deletions(-) diff --git a/rosetta/Dockerfile.pax b/rosetta/Dockerfile.pax index e3c220bfd..b03e1ffd8 100644 --- a/rosetta/Dockerfile.pax +++ b/rosetta/Dockerfile.pax @@ -49,7 +49,6 @@ bash create-distribution.sh \ -m https://github.com/nvjax-svc-0/praxis.git \ -d /opt/praxis \ -e /opt/praxis-mirror -rm -rf $(find /opt -name "__pycache__") ~/.gitconfig EOF ############################################################################### diff --git a/rosetta/Dockerfile.t5x b/rosetta/Dockerfile.t5x index 20661166b..7e17c8eec 100644 --- a/rosetta/Dockerfile.t5x +++ b/rosetta/Dockerfile.t5x @@ -43,8 +43,6 @@ bash create-distribution.sh \ -m https://github.com/nvjax-svc-0/flax.git \ -d /opt/flax \ -e /opt/flax-mirror -rm -rf $(find /opt -name "__pycache__") ~/.gitconfig - echo "--extra-index-url https://developer.download.nvidia.com/compute/redist" >> /opt/pip-tools.d/manifest.t5x echo "-e file:///opt/rosetta" >> /opt/pip-tools.d/manifest.t5x EOF From 4ea17429d70cb1f725d68142986071eb4d34be28 Mon Sep 17 00:00:00 2001
From: Vladislav Kozlov Date: Mon, 20 Nov 2023 21:24:13 -0700 Subject: [PATCH 146/146] Revert test_distribution --- .github/workflows/_test_distribution.yaml | 29 +++-------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/.github/workflows/_test_distribution.yaml b/.github/workflows/_test_distribution.yaml index f6735f1f4..577225017 100644 --- a/.github/workflows/_test_distribution.yaml +++ b/.github/workflows/_test_distribution.yaml @@ -2,30 +2,7 @@ name: ~test core distribution logic on: workflow_call: - inputs: - GIT_USER_NAME: - type: string - description: 'Username in GIT to perform git pull/push' - required: false - default: 'JAX Toolbox' - GIT_USER_EMAIL: - type: string - description: 'User email in GIT to perform git pull/push' - required: false - default: 'jax@nvidia.com' workflow_dispatch: - inputs: - GIT_USER_NAME: - type: string - description: 'Username in GIT to perform git pull/push' - required: false - default: 'JAX Toolbox' - GIT_USER_EMAIL: - type: string - description: 'User email in GIT to perform git pull/push' - required: false - default: 'jax@nvidia.com' - jobs: test-create-distribution: @@ -40,9 +17,9 @@ jobs: - name: Set git login for tests run: | - git config --global user.name "${GIT_USER_NAME}" - git config --global user.email "${GIT_USER_EMAIL}" - + git config --global user.email "jax@nvidia.com" + git config --global user.name "JAX-Toolbox CI" + - name: Check out the repository under ${GITHUB_WORKSPACE} uses: actions/checkout@v3