diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index 6e71ed702..8660e31de 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -1,5 +1,10 @@ ARG BASE_IMAGE=nvidia/cuda:12.2.0-devel-ubuntu22.04 +ARG GIT_USER_NAME="JAX Toolbox" +ARG GIT_USER_EMAIL=jax@nvidia.com + FROM ${BASE_IMAGE} +ARG GIT_USER_EMAIL +ARG GIT_USER_NAME ############################################################################### ## Install Python and essential tools @@ -17,13 +22,28 @@ RUN apt-get update && \ git \ lld \ vim \ + bat \ + curl \ + git \ + gnupg \ + rsync \ python-is-python3 \ python3-pip \ + liblzma-dev \ wget \ && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN pip install --upgrade --no-cache-dir pip +RUN <<"EOF" bash -ex +git config --global user.name "${GIT_USER_NAME}" +git config --global user.email "${GIT_USER_EMAIL}" +EOF +RUN pip install --upgrade --no-cache-dir pip pip-tools && rm -rf ~/.cache/* +RUN mkdir -p /opt/pip-tools.d +ADD --chmod=777 \ + get-source.sh \ + pip-finalize.sh \ + /usr/local/bin/ ############################################################################### ## Install cuDNN diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index cfcee01df..8ab5fe0b7 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -1,10 +1,19 @@ ARG BASE_IMAGE=ghcr.io/nvidia/jax-toolbox:base ARG REPO_JAX="https://github.com/google/jax.git" ARG REPO_XLA="https://github.com/openxla/xla.git" +ARG REPO_FLAX="https://github.com/google/flax.git" +ARG REPO_TE="https://github.com/NVIDIA/TransformerEngine.git" ARG REF_JAX=main ARG REF_XLA=main -ARG SRC_PATH_JAX=/opt/jax-source +ARG REF_FLAX=main +ARG REF_TE=main +ARG SRC_PATH_JAX=/opt/jax ARG SRC_PATH_XLA=/opt/xla-source +ARG SRC_PATH_FLAX=/opt/flax +ARG SRC_PATH_TE=/opt/transformer-engine-source +ARG GIT_USER_NAME="JAX Toolbox" +ARG GIT_USER_EMAIL=jax@nvidia.com + ARG BAZEL_CACHE=/tmp ARG 
BUILD_DATE @@ -12,7 +21,7 @@ ARG BUILD_DATE ## Build JAX ############################################################################### -FROM ${BASE_IMAGE} as jax-builder +FROM ${BASE_IMAGE} as builder ARG REPO_JAX ARG REPO_XLA ARG REF_JAX @@ -20,6 +29,8 @@ ARG REF_XLA ARG SRC_PATH_JAX ARG SRC_PATH_XLA ARG BAZEL_CACHE +ARG GIT_USER_NAME +ARG GIT_USER_EMAIL RUN git clone "${REPO_JAX}" "${SRC_PATH_JAX}" && cd "${SRC_PATH_JAX}" && git checkout ${REF_JAX} RUN --mount=type=ssh \ @@ -30,8 +41,8 @@ RUN --mount=type=ssh \ RUN <> /opt/pip-tools.d/manifest.jax +echo "jaxlib @ file://$(ls ${SRC_PATH_JAX}/dist/*.whl)" >> /opt/pip-tools.d/manifest.jax +EOF -# Install software stack in JAX ecosystem -# Made this optional since tensorstore cannot build on Ubuntu 20.04 + ARM -RUN { pip install flax || true; } && rm -rf ~/.cache/* +## Flax +ARG REPO_FLAX +ARG REF_FLAX +ARG SRC_PATH_FLAX +RUN get-source.sh -f ${REPO_FLAX} -r ${REF_FLAX} -d ${SRC_PATH_FLAX} -m /opt/pip-tools.d/manifest.flax + +## Transformer engine: check out source and build wheel +ARG REPO_TE +ARG REF_TE +ARG SRC_PATH_TE +ENV NVTE_FRAMEWORK=jax +ENV SRC_PATH_TE=${SRC_PATH_TE} +RUN <<"EOF" bash -ex +set -o pipefail +pip install ninja && rm -rf ~/.cache/pip +get-source.sh -f ${REPO_TE} -r ${REF_TE} -d ${SRC_PATH_TE} +pushd ${SRC_PATH_TE} +python setup.py bdist_wheel && rm -rf build +echo "transformer-engine @ file://$(ls ${SRC_PATH_TE}/dist/*.whl)" >> /opt/pip-tools.d/manifest.te +EOF # TODO: properly configure entrypoint -# COPY entrypoint.d/ /opt/nvidia/entrypoint.d/ ############################################################################### -## Build 'devel' image with build scripts and git metadata +## Install primary packages and transitive dependencies ############################################################################### -FROM runtime-image as devel-image -ARG SRC_PATH_JAX -ARG SRC_PATH_XLA - -ADD build-jax.sh local_cuda_arch test-jax.sh /usr/local/bin/ +FROM mealkit as final -COPY 
--from=jax-builder ${SRC_PATH_JAX}/.git ${SRC_PATH_JAX}/.git -COPY --from=jax-builder ${SRC_PATH_XLA}/.git ${SRC_PATH_XLA}/.git +RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.pax.amd64 b/.github/container/Dockerfile.pax.amd64 index f6072f53a..89697dced 100644 --- a/.github/container/Dockerfile.pax.amd64 +++ b/.github/container/Dockerfile.pax.amd64 @@ -1,37 +1,54 @@ # syntax=docker/dockerfile:1-labs -############################################################################### -## Pax -############################################################################### ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest -FROM ${BASE_IMAGE} - -ADD install-pax.sh /usr/local/bin -ADD install-flax.sh /usr/local/bin -ADD install-te.sh /usr/local/bin - -ENV NVTE_FRAMEWORK=jax ARG REPO_PAXML=https://github.com/google/paxml.git ARG REPO_PRAXIS=https://github.com/google/praxis.git ARG REF_PAXML=main ARG REF_PRAXIS=main -ARG REPO_TE=https://github.com/NVIDIA/TransformerEngine.git -# TODO: This is a temporary pinning of TE as the API in TE no longer matches the TE patch -# This should be reverted to main ASAP -ARG REF_TE=7976bd003fcf084dd068069b92a9a79b1743316a +ARG SRC_PATH_PAXML=/opt/paxml +ARG SRC_PATH_PRAXIS=/opt/praxis + +############################################################################### +## Download source and add auxiliary scripts +############################################################################### + +FROM ${BASE_IMAGE} as mealkit +ARG REPO_PAXML +ARG REPO_PRAXIS +ARG REF_PAXML +ARG REF_PRAXIS +ARG SRC_PATH_PAXML +ARG SRC_PATH_PRAXIS + +# update TE manifest file to install the [test] extras +RUN sed -i "s/transformer-engine @/transformer-engine[test] @/g" /opt/pip-tools.d/manifest.te + RUN <<"EOF" bash -ex -install-pax.sh --defer --from_paxml ${REPO_PAXML} --from_praxis ${REPO_PRAXIS} --ref_paxml ${REF_PAXML} --ref_praxis ${REF_PRAXIS} -install-flax.sh --defer -install-te.sh --defer --from ${REPO_TE} --ref ${REF_TE} - -if [[ -f 
/opt/requirements-defer.txt ]]; then - # SKIP_HEAD_INSTALLS avoids having to install jax from Github source so that - # we do not overwrite the jax that was already installed. - SKIP_HEAD_INSTALLS=true pip install -r /opt/requirements-defer.txt -fi -if [[ -f /opt/cleanup.sh ]]; then - bash -ex /opt/cleanup.sh -fi +get-source.sh -f ${REPO_PAXML} -r ${REF_PAXML} -d ${SRC_PATH_PAXML} +get-source.sh -f ${REPO_PRAXIS} -r ${REF_PRAXIS} -d ${SRC_PATH_PRAXIS} +echo "-e file://${SRC_PATH_PAXML}[gpu]" >> /opt/pip-tools.d/manifest.pax +echo "-e file://${SRC_PATH_PRAXIS}" >> /opt/pip-tools.d/manifest.pax + +for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do + pushd ${src} + sed -i "s| @ git+https://github.com/google/flax||g" requirements.in + sed -i "s| @ git+https://github.com/google/jax||g" requirements.in + if git diff --quiet; then + echo "URL specs no longer present in select dependencies for ${src}" + exit 1 + else + git commit -a -m "remove URL specs from select dependencies for ${src}" + fi + popd +done EOF ADD test-pax.sh /usr/local/bin + +############################################################################### +## Install accumulated packages from the base image and the previous stage +############################################################################### + +FROM mealkit as final + +RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.pax.arm64 b/.github/container/Dockerfile.pax.arm64 index 5d55bf2a5..79a1342cc 100644 --- a/.github/container/Dockerfile.pax.arm64 +++ b/.github/container/Dockerfile.pax.arm64 @@ -1,65 +1,152 @@ # syntax=docker/dockerfile:1-labs -############################################################################### -## Pax for AArch64 -############################################################################### ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest -FROM ${BASE_IMAGE} - -# We need to build some packages from source, bring some dependencies. 
-RUN apt-get update && \ - apt-get update && \ - apt-get install -y \ - bat \ - curl \ - git \ - gnupg \ - rsync \ - liblzma-dev \ - && \ - apt-get autoremove -y && apt-get clean && rm -rf /var/lib/apt/lists +ARG REPO_PAXML=https://github.com/google/paxml.git +ARG REPO_PRAXIS=https://github.com/google/praxis.git +ARG REF_PAXML=main +ARG REF_PRAXIS=main +ARG SRC_PATH_PAXML=/opt/paxml +ARG SRC_PATH_PRAXIS=/opt/praxis +############################################################################### +## build tensorflow-text and lingvo, which do not have working arm64 pip wheels +############################################################################### +ARG BASE_IMAGE +FROM ${BASE_IMAGE} as wheel-builder + +# tensorflow-text and lingvo build needs bazel RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-arm64 -O /usr/bin/bazel && \ chmod a+x /usr/bin/bazel +#------------------------------------------------------------------------------ +# build tensorflow-text 2.13.0 from source +#------------------------------------------------------------------------------ + +FROM wheel-builder as tftext-builder + +RUN <<"EOT" bash -exu +set -o pipefail +pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 +git clone http://github.com/tensorflow/text.git /opt/tensorflow-text +cd /opt/tensorflow-text +git checkout v2.13.0 +./oss_scripts/run_build.sh +EOT + +#------------------------------------------------------------------------------ +# build lingvo +#------------------------------------------------------------------------------ + +FROM wheel-builder as lingvo-builder +ARG REPO_LINGVO=https://github.com/tensorflow/lingvo.git +ARG REF_LINGVO=master +ARG SRC_PATH_LINGVO=/opt/lingvo + +COPY --from=tftext-builder /opt/tensorflow-text/tensorflow_text*.whl /opt/ + +RUN get-source.sh -f ${REPO_LINGVO} -r ${REF_LINGVO} -d ${SRC_PATH_LINGVO} + +# build lingvo +RUN <<"EOT" bash -exu +set -o pipefail + +pushd ${SRC_PATH_LINGVO} +git 
fetch origin pull/329/head:pr329 +git cherry-pick --allow-empty pr329 + +# Disable 2 flaky tests here +patch -p1 <<"EOF" +diff --git a/pip_package/build.sh b/pip_package/build.sh +index ef62c432e..659e78956 100755 +--- a/pip_package/build.sh ++++ b/pip_package/build.sh +@@ -89,7 +89,7 @@ bazel clean + bazel build $@ ... + if ! [[ $SKIP_TESTS ]]; then + # Just test the core for the purposes of the pip package. +- bazel test $@ lingvo/core/... ++ bazel test $@ lingvo/core/... -- -//lingvo/tasks/mt:model_test -//lingvo/core:saver_test + fi + + DST_DIR="/tmp/lingvo/dist" +EOF + +pip install tensorflow_datasets==4.9.2 auditwheel tensorflow==2.13.0 /opt/tensorflow_text*.whl +sed -i 's/tensorflow=/#tensorflow=/' docker/dev.requirements.txt +sed -i 's/tensorflow-text=/#tensorflow-text=/' docker/dev.requirements.txt +sed -i 's/dataclasses=/#dataclasses=/' docker/dev.requirements.txt +pip install -r docker/dev.requirements.txt -# Lingvo -ADD install_lingvo_aarch64.sh /opt/ -ADD lingvo.patch /opt/ -RUN /opt/install_lingvo_aarch64.sh - -ADD install-pax.sh /usr/local/bin -RUN install-pax.sh - -ENV NVTE_FRAMEWORK=jax -ADD install-te.sh /usr/local/bin -RUN install-te.sh -# Lingvo has pinned TF to 2.13, so we need to downgrade the pydantic version so that its -# transitive dependency on typing-extensions satisfies TF 2.13's req of typing-extensions>=3.6.6,<4.6.0. -# This version of pydantic is the latest version that satisfies the typing-extensions requirement -RUN pip install pydantic==1.10.13 - -# Install T5 now, Pip will build the wheel from source, it needs Rust. 
-RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > /tmp/rustup.sh && \ - echo "be3535b3033ff5e0ecc4d589a35d3656f681332f860c5fd6684859970165ddcc /tmp/rustup.sh" | sha256sum --check && \ - bash /tmp/rustup.sh -y && \ - export PATH=$PATH:/root/.cargo/bin && \ - pip install t5 && \ - rm -Rf /root/.cargo /root/.rustup && \ - mv /root/.profile /root/.profile.save && \ - grep -v cargo /root/.profile.save > /root/.profile && \ - rm /root/.profile.save && \ - mv /root/.bashrc /root/.bashrc.save && \ - grep -v cargo /root/.bashrc.save > /root/.bashrc && \ - rm /root/.bashrc.save && \ - rm -Rf /root/.cache /tmp/* +# Some tests are flaky right now (see the patch above), if needed we can skip +# running the tests entirely by uncommenting the following line. +# SKIP_TEST=1 +PYTHON_MINOR_VERSION=$(python --version | cut -d ' ' -f 2 | cut -d '.' -f 2) pip_package/build.sh +EOT + +############################################################################### +## Pax for AArch64 +############################################################################### + +ARG BASE_IMAGE +FROM ${BASE_IMAGE} as mealkit +ARG REPO_PAXML +ARG REPO_PRAXIS +ARG REF_PAXML +ARG REF_PRAXIS +ARG SRC_PATH_PAXML +ARG SRC_PATH_PRAXIS + +COPY --from=lingvo-builder /tmp/lingvo/dist/lingvo*linux_aarch64.whl /opt/ +RUN echo "lingvo @ file://$(ls /opt/lingvo*.whl)" >> /opt/pip-tools.d/manifest.pax + +COPY --from=tftext-builder /opt/tensorflow-text/tensorflow_text*.whl /opt/ +RUN echo "tensorflow-text @ file://$(ls /opt/tensorflow_text*.whl)" >> /opt/pip-tools.d/manifest.pax + +# paxml + praxis +RUN <<"EOT" bash -ex +echo "tensorflow==2.13.0" >> /opt/pip-tools.d/manifest.pax +echo "tensorflow_datasets==4.9.2" >> /opt/pip-tools.d/manifest.pax +echo "chex==0.1.7" >> /opt/pip-tools.d/manifest.pax +echo "auditwheel" >> /opt/pip-tools.d/manifest.pax + +get-source.sh -f ${REPO_PAXML} -r ${REF_PAXML} -d ${SRC_PATH_PAXML} +get-source.sh -f ${REPO_PRAXIS} -r ${REF_PRAXIS} -d ${SRC_PATH_PRAXIS} +echo "-e 
file://${SRC_PATH_PAXML}[gpu]" >> /opt/pip-tools.d/manifest.pax +echo "-e file://${SRC_PATH_PRAXIS}" >> /opt/pip-tools.d/manifest.pax + +for src in ${SRC_PATH_PAXML} ${SRC_PATH_PRAXIS}; do + pushd ${src} + + for pattern in \ + "s| @ git+https://github.com/google/flax||g" \ + "s| @ git+https://github.com/google/jax||g" \ + "s|^tensorflow|#tensorflow|" \ + "s|^lingvo|#lingvo|" \ + "s|^scikit-learn|#scikit-learn|" \ + "s|^t5|#t5|" \ + "s|^protobuf|#protobuf|" \ + "s|^numpy|#numpy|" \ + ; do + sed -i "${pattern}" */pip_package/requirements.txt requirements.in + done + + if git diff --quiet; then + echo "broken dependencies no longer present in ${src}" + exit 1 + else + git commit -a -m "remove broken dependencies from ${src}" + fi + popd +done +EOT ADD test-pax.sh /usr/local/bin -# TODO: Utilize these build-args and use them when installing pax -# ARG REPO_PAXML=https://github.com/google/paxml.git -# ARG REPO_PRAXIS=https://github.com/google/praxis.git -# ARG REF_PAXML=main -# ARG REF_PRAXIS=main -# install-pax.sh --defer --from_paxml ${REPO_PAXML} --from_praxis ${REPO_PRAXIS} --ref_paxml ${REF_PAXML} --ref_praxis ${REF_PRAXIS} +############################################################################### +## Install accumulated packages from the base image and the previous stage +############################################################################### + +FROM mealkit as final + +RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.t5x b/.github/container/Dockerfile.t5x index 657459706..6b55f056c 100644 --- a/.github/container/Dockerfile.t5x +++ b/.github/container/Dockerfile.t5x @@ -1,35 +1,42 @@ # syntax=docker/dockerfile:1-labs + +ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest +ARG REPO_T5X=https://github.com/google-research/t5x.git +ARG REF_T5X=main +ARG SRC_PATH_T5X=/opt/t5x + ############################################################################### -## T5X +## Download source and add auxiliary scripts 
############################################################################### -ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest -FROM ${BASE_IMAGE} +FROM ${BASE_IMAGE} as mealkit -ADD install-t5x.sh /usr/local/bin -ADD install-flax.sh /usr/local/bin -ADD install-te.sh /usr/local/bin +ARG REPO_T5X +ARG REF_T5X +ARG SRC_PATH_T5X -ENV NVTE_FRAMEWORK=jax -ARG REPO_T5X=https://github.com/google-research/t5x.git -ARG REF_T5X=main -ARG REPO_TE=https://github.com/NVIDIA/TransformerEngine.git -# TODO: This is a temporary pinning of TE as the API in TE no longer matches the TE patch -# This should be reverted to main ASAP -ARG REF_TE=7976bd003fcf084dd068069b92a9a79b1743316a RUN <<"EOF" bash -ex -install-t5x.sh --defer --from ${REPO_T5X} --ref ${REF_T5X} -install-te.sh --defer --from ${REPO_TE} --ref ${REF_TE} +get-source.sh -f ${REPO_T5X} -r ${REF_T5X} -d ${SRC_PATH_T5X} +echo "-e file://${SRC_PATH_T5X}[gpu]" >> /opt/pip-tools.d/manifest.t5x -if [[ -f /opt/requirements-defer.txt ]]; then - pip install -r /opt/requirements-defer.txt +# remove head-of-tree specs from select dependencies +pushd ${SRC_PATH_T5X} +sed -i "s| @ git+https://github.com/google/flax#egg=flax||g" setup.py +if git diff --quiet; then + echo "URL specs no longer present in select dependencies of t5x" + exit 1 +else + git commit -a -m "remove URL specs from select dependencies of t5x" fi -if [[ -f /opt/cleanup.sh ]]; then - bash -ex /opt/cleanup.sh -fi - -# Note: Install after t5x installation b/c t5x installs flax from source -install-flax.sh +popd EOF ADD test-t5x.sh /usr/local/bin + +############################################################################### +## Install accumulated packages from the base image and the previous stage +############################################################################### + +FROM mealkit as final + +RUN pip-finalize.sh diff --git a/.github/container/Dockerfile.te b/.github/container/Dockerfile.te deleted file mode 100644 index ffa2c9761..000000000 --- 
a/.github/container/Dockerfile.te +++ /dev/null @@ -1,12 +0,0 @@ -############################################################################### -## Transformer Engine -############################################################################### - -ARG BASE_IMAGE=ghcr.io/nvidia/jax:latest -FROM ${BASE_IMAGE} -ARG REPO_TE="https://github.com/NVIDIA/TransformerEngine.git" -ARG REF_TE=main -ARG SRC_PATH_TE=/opt/transformer-engine - -ADD install-te.sh /usr/local/bin -RUN install-te.sh --from=${REPO_TE} --ref=${REF_TE} --dir=${SRC_PATH_TE} \ No newline at end of file diff --git a/.github/container/get-source.sh b/.github/container/get-source.sh new file mode 100755 index 000000000..b8fff71a7 --- /dev/null +++ b/.github/container/get-source.sh @@ -0,0 +1,91 @@ +#!/bin/bash +## Fetch a Python package from a git repo and write the pip-tools input manifest to stdout +## Example: +## get-source.sh -f https://github.com/google/flax.git -r main -d /opt/flax +## Output: +## -e file:///opt/flax + +## Parse command-line arguments + +usage() { + echo "Usage: $0 [OPTION]..." + echo " -d, --dir PATH [Required] Local path to check out the source code." + echo " -f, --from URL [Required] URL of the source repo." + echo " -h, --help Print usage." + echo " -m, --manifest FILE Create a pip manifest file if specified" + echo " -r, --ref REF Git commit SHA, branch name, or tag name to checkout. Uses default branch if not specified." + echo + exit $1 +} + +args=$(getopt -o d:f:hm:r: --long dir:,from:,help,manifest:,ref: -- "$@") +if [[ $? 
-ne 0 ]]; then + exit 1 +fi + +## Set default arguments + +GIT_REPO="" +GIT_REF="${GIT_REF:-HEAD}" +INSTALL_DIR="" +MANIFEST_FILE="" + +eval set -- "$args" +while [ : ]; do + case "$1" in + -d | --dir) + INSTALL_DIR="$2" + shift 2 + ;; + -f | --from) + GIT_REPO="$2" + shift 2 + ;; + -h | --help) + usage + ;; + -m | --manifest) + MANIFEST_FILE="$2" + shift 2 + ;; + -r | --ref) + GIT_REF="$2" + shift 2 + ;; + --) + shift; + break + ;; + esac +done + +if [[ $# -ge 1 ]]; then + echo "Un-recognized argument: $*" && echo + usage 1 +fi + +if [[ ! -n "${GIT_REPO}" ]]; then + echo "Source repository not speicified." && echo + usage 1 +fi + +if [[ ! -n "${INSTALL_DIR}" ]]; then + echo "Check out destination not specified." && echo + usage 1 +fi + +## check out the source + +echo "Fetching $GIT_REPO#$GIT_REF to $INSTALL_DIR" + +set -ex -o pipefail + +git clone ${GIT_REPO} ${INSTALL_DIR} +pushd ${INSTALL_DIR} +git checkout ${GIT_REF} +git submodule init +git submodule update --recursive +popd + +echo "Writing to ${MANIFEST_FILE}:" +echo "-e file://${INSTALL_DIR}" | tee -a ${MANIFEST_FILE} diff --git a/.github/container/install-flax.sh b/.github/container/install-flax.sh deleted file mode 100755 index 30802e0d4..000000000 --- a/.github/container/install-flax.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - -## Parse command-line arguments - -usage() { - echo "Usage: $0 [OPTION]..." - echo ' --defer When passed, will defer the installation of the main package. Can be installed afterwards with `pip install -r requirements-defer.txt` and any deferred cleanup commands can be run with `bash cleanup.sh`' - echo " -d, --dir=PATH Path to store flax source. Defaults to /opt/flax" - echo " -f, --from=URL URL of the flax repo. Defaults to https://github.com/google/flax.git" - echo " -h, --help Print usage." - echo " -r, --ref=REF Git commit hash or tag name that specifies the version of flax to install. Defaults to HEAD." 
- exit $1 -} - -args=$(getopt -o d:f:hr: --long defer,dir:,from:,help,ref: -- "$@") -if [[ $? -ne 0 ]]; then - exit 1 -fi - -eval set -- "$args" -while [ : ]; do - case "$1" in - --defer) - DEFER=true - shift - ;; - -d | --dir) - INSTALL_DIR="$2" - shift 2 - ;; - -f | --from) - FLAX_REPO="$2" - shift 2 - ;; - -h | --help) - usage - ;; - -r | --ref) - FLAX_REF="$2" - shift 2 - ;; - --) - shift; - break - ;; - esac -done - -if [[ $# -ge 1 ]]; then - echo "Un-recognized argument: $*" && echo - usage 1 -fi - -## Set default arguments if not provided via command-line - -DEFER=${DEFER:-false} -FLAX_REF="${FLAX_REF:-HEAD}" -FLAX_REPO="${FLAX_REPO:-https://github.com/google/flax.git}" -INSTALL_DIR="${INSTALL_DIR:-/opt/flax}" - -echo "Installing flax $FLAX_REF from $FLAX_REPO to $INSTALL_DIR" - -maybe_defer_pip_install() { - if [[ "$DEFER" = true ]]; then - echo "Deferring installation of 'pip install $*'" - echo "$*" >> /opt/requirements-defer.txt - else - pip install $@ - fi -} - -set -ex - -## Install flax - -git clone ${FLAX_REPO} ${INSTALL_DIR} -cd ${INSTALL_DIR} -git checkout ${FLAX_REF} -# We currently require installing editable (-e) to build a distribution since -# we edit the source in place and do not re-install -maybe_defer_pip_install -e ${INSTALL_DIR} \ No newline at end of file diff --git a/.github/container/install-pax.sh b/.github/container/install-pax.sh index 083f4ce59..f03ad790b 100755 --- a/.github/container/install-pax.sh +++ b/.github/container/install-pax.sh @@ -1,101 +1,3 @@ -#!/bin/bash -exu - -## Parse command-line arguments - -usage() { - echo "Usage: $0 [OPTION]..." - echo ' --defer When passed, will defer the installation of the main package. Can be installed afterwards with `pip install -r requirements-defer.txt` and any deferred cleanup commands can be run with `bash cleanup.sh`' - echo " -d, --dir=PATH Path to store Pax source. Defaults to /opt" - echo " --from_paxml=URL URL of the Paxml repo. 
Defaults to https://github.com/google/paxml.git" - echo " --from_praxis=URL URL of the Praxis repo. Defaults to https://github.com/google/praxis.git" - echo " -h, --help Print usage." - echo " --ref_paxml=REF Git commit hash or tag name that specifies the version of Paxml to install. Defaults to HEAD." - echo " --ref_praxis=REF Git commit hash or tag name that specifies the version of Praxis to install. Defaults to HEAD." - exit $1 -} - -args=$(getopt -o d:h --long defer,dir:,from_paxml:,from_praxis:,help,ref_paxml:,ref_praxis: -- "$@") -if [[ $? -ne 0 ]]; then - exit 1 -fi - -eval set -- "$args" -while [ : ]; do - case "$1" in - --defer) - DEFER=true - shift - ;; - -d | --dir) - INSTALL_DIR="$2" - shift 2 - ;; - --from_paxml) - PAXML_REPO="$2" - shift 2 - ;; - --from_praxis) - PRAXIS_REPO="$2" - shift 2 - ;; - -h | --help) - usage - ;; - --ref_paxml) - PAXML_REF="$2" - shift 2 - ;; - --ref_praxis) - PRAXIS_REF="$2" - shift 2 - ;; - --) - shift; - break - ;; - esac -done - -if [[ $# -ge 1 ]]; then - echo "Un-recognized argument: $*" && echo - usage 1 -fi - -## Set default arguments if not provided via command-line - -DEFER=${DEFER:-false} -PAXML_REF="${PAXML_REF:-HEAD}" -PAXML_REPO="${PAXML_REPO:-https://github.com/google/paxml.git}" -PRAXIS_REF="${PRAXIS_REF:-HEAD}" -PRAXIS_REPO="${PRAXIS_REPO:-https://github.com/google/praxis.git}" -INSTALL_DIR="${INSTALL_DIR:-/opt}" - -echo "Installing Paxml $PAXML_REF from $PAXML_REPO and $PRAXIS_REF from $PRAXIS_REPO to $INSTALL_DIR" - -maybe_defer_cleanup() { - if [[ "$DEFER" = true ]]; then - echo "# Cleanup from: $0" - echo "$*" >> /opt/cleanup.sh - else - $@ - fi -} - -maybe_defer_pip_install() { - if [[ "$DEFER" = true ]]; then - echo "Deferring installation of 'pip install $*'" - for arg in $@; do - if [[ $arg == "-e" ]]; then - echo -n "$arg " >>/opt/requirements-defer.txt - else - echo "$arg" >> /opt/requirements-defer.txt - fi - done - else - pip install $@ - fi -} - set -ex ## Install Praxis @@ -106,8 +8,8 @@ pushd 
${PRAXIS_INSTALLED_DIR} git checkout ${PRAXIS_REF} if [[ $(uname -m) == "aarch64" ]]; then # These dependencies are broken on ARM64 right now, we handle them separately - sed -i 's/^tensorflow/#tensorflow/' praxis/pip_package/requirements.txt requirements.in - sed -i 's/^lingvo/#lingvo/' praxis/pip_package/requirements.txt requirements.in + # sed -i 's/^tensorflow/#tensorflow/' praxis/pip_package/requirements.txt requirements.in + # sed -i 's/^lingvo/#lingvo/' praxis/pip_package/requirements.txt requirements.in sed -i 's/^scikit-learn/#scikit-learn/' praxis/pip_package/requirements.txt requirements.in fi popd @@ -121,8 +23,8 @@ git checkout ${PAXML_REF} if [[ $(uname -m) == "aarch64" ]]; then # These dependencies are broken on ARM64 right now, we handle them separately pip install chex==0.1.7 - sed -i 's/^tensorflow/#tensorflow/' paxml/pip_package/requirements.txt requirements.in - sed -i 's/^lingvo/#lingvo/' paxml/pip_package/requirements.txt requirements.in + # sed -i 's/^tensorflow/#tensorflow/' paxml/pip_package/requirements.txt requirements.in + # sed -i 's/^lingvo/#lingvo/' paxml/pip_package/requirements.txt requirements.in sed -i 's/^scikit-learn/#scikit-learn/' paxml/pip_package/requirements.txt requirements.in sed -i 's/^t5/#t5/' paxml/pip_package/requirements.txt requirements.in sed -i 's/^jax/#jax/' paxml/pip_package/requirements.txt requirements.in diff --git a/.github/container/install-t5x.sh b/.github/container/install-t5x.sh deleted file mode 100755 index a74eb7424..000000000 --- a/.github/container/install-t5x.sh +++ /dev/null @@ -1,106 +0,0 @@ -#!/bin/bash - -## Parse command-line arguments - -usage() { - echo "Usage: $0 [OPTION]..." - echo ' --defer When passed, will defer the installation of the main package. Can be installed afterwards with `pip install -r requirements-defer.txt` and any deferred cleanup commands can be run with `bash cleanup.sh`' - echo " -d, --dir=PATH Path to store T5X source. 
Defaults to /opt" - echo " -f, --from=URL URL of the T5X repo. Defaults to https://github.com/google-research/t5x.git" - echo " -h, --help Print usage." - echo " -r, --ref=REF Git commit hash or tag name that specifies the version of T5X to install. Defaults to HEAD." - exit $1 -} - -args=$(getopt -o d:f:hr: --long defer,dir:,from:,help,ref: -- "$@") -if [[ $? -ne 0 ]]; then - exit 1 -fi - -eval set -- "$args" -while [ : ]; do - case "$1" in - --defer) - DEFER=true - shift - ;; - -d | --dir) - INSTALL_DIR="$2" - shift 2 - ;; - -f | --from) - T5X_REPO="$2" - shift 2 - ;; - -h | --help) - usage - ;; - -r | --ref) - T5X_REF="$2" - shift 2 - ;; - --) - shift; - break - ;; - esac -done - -if [[ $# -ge 1 ]]; then - echo "Un-recognized argument: $*" && echo - usage 1 -fi - -## Set default arguments if not provided via command-line - -DEFER=${DEFER:-false} -T5X_REF="${T5X_REF:-HEAD}" -T5X_REPO="${T5X_REPO:-https://github.com/google-research/t5x.git}" -INSTALL_DIR="${INSTALL_DIR:-/opt}" - -echo "Installing T5X $T5X_REF from $T5X_REPO to $INSTALL_DIR" - -maybe_defer_cleanup() { - if [[ "$DEFER" = true ]]; then - echo "# Cleanup from: $0" - echo "$*" >> /opt/cleanup.sh - else - $@ - fi -} - -maybe_defer_pip_install() { - if [[ "$DEFER" = true ]]; then - echo "Deferring installation of 'pip install $*'" - echo "$*" >> /opt/requirements-defer.txt - else - pip install $@ - fi -} - -set -ex - -## Install dependencies - -apt-get update -apt-get install -y \ - build-essential \ - cmake \ - clang \ - git - -## Install T5X - -T5X_INSTALLED_DIR=${INSTALL_DIR}/t5x - -git clone ${T5X_REPO} ${T5X_INSTALLED_DIR} -cd ${T5X_INSTALLED_DIR} -git checkout ${T5X_REF} -# We currently require installing editable (-e) to build a distribution since -# we edit the source in place and do not re-install -maybe_defer_pip_install -e ${T5X_INSTALLED_DIR}[gpu] - -maybe_defer_cleanup apt-get autoremove -y -maybe_defer_cleanup apt-get clean -maybe_defer_cleanup rm -rf /var/lib/apt/lists/* 
-maybe_defer_cleanup rm -rf ~/.cache/pip/ diff --git a/.github/container/install-te.sh b/.github/container/install-te.sh deleted file mode 100755 index cfa78ff8f..000000000 --- a/.github/container/install-te.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash - -## Parse command-line arguments - -usage() { - echo "Usage: $0 [OPTION]..." - echo ' --defer When passed, will defer the installation of the main package. Can be installed afterwards with `pip install -r requirements-defer.txt` and any deferred cleanup commands can be run with `bash cleanup.sh`' - echo " -d, --dir=PATH Path to store TE source. Defaults to /opt/transformer-engine" - echo " -f, --from=URL URL of the TE repo. Defaults to https://github.com/NVIDIA/TransformerEngine.git" - echo " -h, --help Print usage." - echo " -r, --ref=REF Git commit hash or tag name that specifies the version of TE to install. Defaults to HEAD." - exit $1 -} - -args=$(getopt -o d:f:hr: --long defer,dir:,from:,help,ref: -- "$@") -if [[ $? -ne 0 ]]; then - exit 1 -fi - -eval set -- "$args" -while [ : ]; do - case "$1" in - --defer) - DEFER=true - shift - ;; - -d | --dir) - INSTALL_DIR="$2" - shift 2 - ;; - -f | --from) - TE_REPO="$2" - shift 2 - ;; - -h | --help) - usage - ;; - -r | --ref) - TE_REF="$2" - shift 2 - ;; - --) - shift; - break - ;; - esac -done - -if [[ $# -ge 1 ]]; then - echo "Un-recognized argument: $*" && echo - usage 1 -fi - -## Set default arguments if not provided via command-line - -DEFER=${DEFER:-false} -TE_REF="${TE_REF:-HEAD}" -TE_REPO="${TE_REPO:-https://github.com/NVIDIA/TransformerEngine.git}" -INSTALL_DIR="${INSTALL_DIR:-/opt/transformer-engine}" - -echo "Installing TE $TE_REF from $TE_REPO to $INSTALL_DIR" - -maybe_defer_pip_install() { - if [[ "$DEFER" = true ]]; then - echo "Deferring installation of 'pip install $*'" - echo "$*" >> /opt/requirements-defer.txt - else - pip install $@ - fi -} - -set -ex - -## Install dependencies - -pip install --no-cache-dir pybind11 ninja packaging - -## Install TE 
- -git clone ${TE_REPO} ${INSTALL_DIR} -cd ${INSTALL_DIR} -git checkout ${TE_REF} -git submodule init -git submodule update --recursive -maybe_defer_pip_install -e ${INSTALL_DIR} diff --git a/.github/container/install_lingvo_aarch64.sh b/.github/container/install_lingvo_aarch64.sh deleted file mode 100755 index 1c1499684..000000000 --- a/.github/container/install_lingvo_aarch64.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -exu -set -o pipefail -INSTALL_DIR="${INSTALL_DIR:-/opt}" -LINGVO_REF="${LINGVO_REF:-HEAD}" -LINGVO_REPO="${LINGVO_REPO:-https://github.com/tensorflow/lingvo.git}" - -## Install tensorflow-text -cd ${INSTALL_DIR} -pip install tensorflow_datasets==4.9.2 # force a recent version to have latest protobuf dep -pip install auditwheel -pip install tensorflow==2.13.0 -git clone http://github.com/tensorflow/text.git -pushd text -git checkout v2.13.0 -./oss_scripts/run_build.sh -find * | grep '.whl$' -pip install ./tensorflow_text-*.whl -popd -rm -Rf text - -## Install lingvo -LINGVO_INSTALLED_DIR=${INSTALL_DIR}/lingvo - -[[ -d lingvo ]] || git clone ${LINGVO_REPO} ${LINGVO_INSTALLED_DIR} - -pushd ${LINGVO_INSTALLED_DIR} -# Local patches, two PR waiting to be merged + one custom patch -# git fetch origin pull/326/head:pr326 ## merged upstream -# git fetch origin pull/328/head:pr328 ## merged upstream -git fetch origin pull/329/head:pr329 -git config user.name "JAX Toolbox" -git config user.email "jax@nvidia.com" -# git cherry-pick pr326 pr328 pr329 ## pr326, pr328 merged -git cherry-pick --allow-empty pr329 - -# Disable 2 flaky tests here -patch -p1 < /opt/lingvo.patch - -sed -i 's/tensorflow=/#tensorflow=/' docker/dev.requirements.txt -sed -i 's/tensorflow-text=/#tensorflow-text=/' docker/dev.requirements.txt -sed -i 's/dataclasses=/#dataclasses=/' docker/dev.requirements.txt -pip install -r docker/dev.requirements.txt -pip install protobuf==3.20 -pip install patchelf - -# Some tests are flaky right now (see the patch abovbe), if needed we can skip -# 
running the tests entirely by uncommentin the following line. -# SKIP_TEST=1 -PYTHON_MINOR_VERSION=10 pip_package/build.sh -pip install /tmp/lingvo/dist/lingvo*linux_aarch64.whl -popd -rm -Rf *lingvo* -rm -Rf /root/.cache diff --git a/.github/container/lingvo.patch b/.github/container/lingvo.patch deleted file mode 100644 index c4184a09f..000000000 --- a/.github/container/lingvo.patch +++ /dev/null @@ -1,13 +0,0 @@ -diff --git a/pip_package/build.sh b/pip_package/build.sh -index ef62c432e..659e78956 100755 ---- a/pip_package/build.sh -+++ b/pip_package/build.sh -@@ -89,7 +89,7 @@ bazel clean - bazel build $@ ... - if ! [[ $SKIP_TESTS ]]; then - # Just test the core for the purposes of the pip package. -- bazel test $@ lingvo/core/... -+ bazel test $@ lingvo/core/... -- -//lingvo/tasks/mt:model_test -//lingvo/core:saver_test - fi - - DST_DIR="/tmp/lingvo/dist" diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh new file mode 100755 index 000000000..0b9de9526 --- /dev/null +++ b/.github/container/pip-finalize.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -ex -o pipefail + +pushd /opt/pip-tools.d + +pip-compile -o requirements.txt manifest.* + +pip-sync --pip-args '--src /opt' requirements.txt + +rm -rf ~/.cache/* diff --git a/.github/workflows/_build_base.yaml b/.github/workflows/_build_base.yaml index dc35b653d..197a90905 100644 --- a/.github/workflows/_build_base.yaml +++ b/.github/workflows/_build_base.yaml @@ -3,6 +3,10 @@ name: ~build CUDA+Python base container on: workflow_call: inputs: + ARCHITECTURE: + type: string + description: 'CPU architecture to build the image for, e.g. amd64, arm64' + required: true BASE_IMAGE: type: string description: 'Base CUDA image, e.g.
nvidia/cuda:X.Y.Z-devel-ubuntu22.04' @@ -13,10 +17,31 @@ on: description: "Build date in YYYY-MM-DD format" required: false default: 'NOT SPECIFIED' + ARTIFACT_NAME: + type: string + description: 'Name of the artifact zip file' + required: false + default: 'artifact-base-build' + BADGE_FILENAME: + type: string + description: 'Name of the endpoint JSON file for shields.io badge' + required: false + default: 'badge-base-build' + GIT_USER_NAME: + type: string + description: 'Username in GIT to perform git pull/push' + required: false + default: 'JAX Toolbox' + GIT_USER_EMAIL: + type: string + description: 'User email in GIT to perform git pull/push' + required: false + default: 'jax@nvidia.com' + outputs: - DOCKER_TAGS: - description: "Tags of the image built" - value: ${{ jobs.merge.outputs.DOCKER_TAGS }} + DOCKER_TAG: + description: "Tag of the image built" + value: ${{ jobs.build-base.outputs.DOCKER_TAG }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -28,12 +53,12 @@ permissions: jobs: - build: - strategy: - fail-fast: false - matrix: - PLATFORM: [amd64, arm64] - runs-on: [self-hosted, "${{ matrix.PLATFORM }}", small] + build-base: + runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", small] + env: + BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json + outputs: + DOCKER_TAG: ${{ steps.meta.outputs.tags }} steps: - name: Print environment variables run: env @@ -48,6 +73,12 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + with: + driver-opts: | + image=moby/buildkit:v0.12.1 + - name: Set docker metadata id: meta uses: docker/metadata-action@v4 @@ -57,84 +88,64 @@ jobs: flavor: | latest=false tags: | - type=raw,value=${{ github.run_id }}-base-${{ matrix.PLATFORM }} + type=raw,value=${{ github.run_id }}-base-${{ inputs.ARCHITECTURE }} labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Set up 
Docker Buildx - uses: docker/setup-buildx-action@v2 - with: - driver-opts: | - image=moby/buildkit:v0.12.1 - - name: Build docker images + id: build uses: docker/build-push-action@v4 with: context: .github/container push: true file: .github/container/Dockerfile.base - platforms: linux/${{ matrix.PLATFORM }} + platforms: linux/${{ inputs.ARCHITECTURE }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} build-args: | + GIT_USER_NAME=${{ inputs.GIT_USER_NAME }} + GIT_USER_EMAIL=${{ inputs.GIT_USER_EMAIL }} BUILD_DATE=${{ inputs.BUILD_DATE }} ${{ inputs.BASE_IMAGE != 'latest' && format('BASE_IMAGE={0}', inputs.BASE_IMAGE) }} - # Temporary workaround until the following issues are solved: - # https://github.com/orgs/community/discussions/17245 - # https://github.com/actions/runner/pull/2477 - # https://github.com/orgs/community/discussions/26639 - - name: Save image name as text file + - name: Generate sitrep + if: success() || failure() shell: bash -x -e {0} run: | - echo "${{ steps.meta.outputs.tags }}" >> image-name.txt - - - name: Upload image name file as artifact + # bring in utility functions + source .github/workflows/scripts/to_json.sh + + badge_label='Base image ${{ inputs.ARCHITECTURE }} build' + tags="${{ steps.meta.outputs.tags }}" + digest="${{ steps.build.outputs.digest }}" + outcome="${{ steps.build.outcome }}" + + if [[ ${outcome} == "success" ]]; then + badge_message="pass" + badge_color=brightgreen + summary="Base image build on ${{ inputs.ARCHITECTURE }}: $badge_message" + else + badge_message="fail" + badge_color=red + summary="Base image build on ${{ inputs.ARCHITECTURE }}: $badge_message" + fi + + to_json \ + summary \ + badge_label tags digest outcome \ + > sitrep.json + + schemaVersion=1 \ + label="${badge_label}" \ + message="${badge_message}" \ + color="${badge_color}" \ + to_json schemaVersion label message color \ + > ${{ env.BADGE_FILENAME_FULL }} + + - name: Upload sitrep and badge uses: 
actions/upload-artifact@v3 with: - name: image-name-base-${{ matrix.PLATFORM }} - path: image-name.txt - - merge: - runs-on: ubuntu-latest - needs: build - outputs: - DOCKER_TAGS: ${{ steps.meta.outputs.tags }} - steps: - - name: Download image name files into separate folders - uses: actions/download-artifact@v3 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set docker metadata - id: meta - uses: docker/metadata-action@v4 - with: - images: | - ${{ env.UPLD_IMAGE }} - flavor: | - latest=false - tags: | - type=raw,value=${{ github.run_id }}-base-multiarch - labels: - org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - - name: Combine images into a single multi-arch image - shell: bash -x -e {0} - run: | - docker manifest create ${{ steps.meta.outputs.tags }} $( - for IMAGE in $(cat image-name-base-*/image-name.txt); do - REPO=$(echo $IMAGE | cut -d: -f1) - DIGEST=$( - docker manifest inspect $IMAGE |\ - jq -r '.manifests[] | select(.platform.os == "linux") | .digest' - ) - echo $REPO@${DIGEST} - done - ) - docker manifest push ${{ steps.meta.outputs.tags }} + name: ${{ inputs.ARTIFACT_NAME }}-${{ inputs.ARCHITECTURE }} + path: | + sitrep.json + ${{ env.BADGE_FILENAME_FULL }} diff --git a/.github/workflows/_build_jax.yaml b/.github/workflows/_build_jax.yaml index 311336a36..3732c374a 100644 --- a/.github/workflows/_build_jax.yaml +++ b/.github/workflows/_build_jax.yaml @@ -3,6 +3,10 @@ name: ~build JAX container on: workflow_call: inputs: + ARCHITECTURE: + type: string + description: 'CPU architecture to build the image for, e.g. 
amd64, arm64' + required: true BASE_IMAGE: type: string description: 'Base docker image that provides CUDA and Python:' @@ -33,6 +37,16 @@ on: description: Git commit, tag, or branch for XLA required: false default: main + REPO_TE: + type: string + description: URL of transformer engine repository to check out + required: false + default: "https://github.com/NVIDIA/TransformerEngine.git" + REF_TE: + type: string + description: Git commit, tag, or branch for TE + required: false + default: main ARTIFACT_NAME: type: string description: 'Name of the artifact zip file' @@ -43,10 +57,23 @@ on: description: 'Name of the endpoint JSON file for shields.io badge' required: false default: 'badge-jax-build' + GIT_USER_NAME: + type: string + description: 'Username in GIT to perform git pull/push' + required: false + default: 'JAX Toolbox' + GIT_USER_EMAIL: + type: string + description: 'User email in GIT to perform git pull/push' + required: false + default: 'jax@nvidia.com' outputs: - DOCKER_TAGS: - description: "Tags of the image built" - value: ${{ jobs.merge.outputs.DOCKER_TAGS }} + DOCKER_TAG_MEALKIT: + description: "Tags of the 'mealkit' image built" + value: ${{ jobs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + DOCKER_TAG_FINAL: + description: "Tags of the complete image built" + value: ${{ jobs.build-jax.outputs.DOCKER_TAG_FINAL }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -58,14 +85,13 @@ permissions: jobs: - build: - strategy: - fail-fast: false - matrix: - PLATFORM: [amd64, arm64] - runs-on: [self-hosted, "${{ matrix.PLATFORM }}", large] + build-jax: + runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", large] env: - BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ matrix.PLATFORM }}.json + BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json + outputs: + DOCKER_TAG_MEALKIT: ${{ steps.mealkit-metadata.outputs.tags }} + DOCKER_TAG_FINAL: ${{ steps.final-metadata.outputs.tags }} steps: - name: Print environment variables
run: env @@ -95,8 +121,14 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Set docker metadata - id: meta + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + with: + driver-opts: | + image=moby/buildkit:v0.12.1 + + - name: Set docker metadata - mealkit + id: mealkit-metadata uses: docker/metadata-action@v4 with: images: | @@ -104,26 +136,60 @@ jobs: flavor: | latest=false tags: | - type=raw,value=${{ github.run_id }}-jax-${{ matrix.PLATFORM }} + type=raw,value=${{ github.run_id }}-jax-${{ inputs.ARCHITECTURE }}-mealkit labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + - name: Build mealkit image + id: mealkit-build + uses: docker/build-push-action@v4 with: - driver-opts: | - image=moby/buildkit:v0.12.1 + context: .github/container + push: true + file: .github/container/Dockerfile.jax + platforms: linux/${{ inputs.ARCHITECTURE }} + target: mealkit + tags: ${{ steps.mealkit-metadata.outputs.tags }} + labels: ${{ steps.mealkit-metadata.outputs.labels }} + ssh: default + secret-files: | + "SSH_KNOWN_HOSTS=${{ steps.ssh-known-hosts.outputs.FILE }}" + build-args: | + BASE_IMAGE=${{ inputs.BASE_IMAGE }} + BAZEL_CACHE=${{ vars.BAZEL_REMOTE_CACHE_URL }} + BUILD_DATE=${{ inputs.BUILD_DATE }} + REPO_JAX=${{ inputs.REPO_JAX }} + REPO_XLA=${{ inputs.REPO_XLA }} + REPO_TE=${{ inputs.REPO_TE }} + REF_JAX=${{ inputs.REF_JAX }} + REF_XLA=${{ inputs.REF_XLA }} + REF_TE=${{ inputs.REF_TE }} + GIT_USER_NAME=${{ inputs.GIT_USER_NAME }} + GIT_USER_EMAIL=${{ inputs.GIT_USER_EMAIL }} - - name: Build docker images - id: build + - name: Set docker metadata - final + id: final-metadata + uses: docker/metadata-action@v4 + with: + images: | + ${{ env.UPLD_IMAGE }} + flavor: | + latest=false + tags: | + type=raw,value=${{ github.run_id }}-jax-${{ inputs.ARCHITECTURE }} + labels: + org.opencontainers.image.created=${{ inputs.BUILD_DATE }} + + 
- name: Build final image + id: final-build uses: docker/build-push-action@v4 with: context: .github/container push: true file: .github/container/Dockerfile.jax - platforms: linux/${{ matrix.PLATFORM }} - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} + platforms: linux/${{ inputs.ARCHITECTURE }} + tags: ${{ steps.final-metadata.outputs.tags }} + labels: ${{ steps.final-metadata.outputs.labels }} ssh: default secret-files: | "SSH_KNOWN_HOSTS=${{ steps.ssh-known-hosts.outputs.FILE }}" @@ -143,19 +209,19 @@ jobs: # bring in utility functions source .github/workflows/scripts/to_json.sh - badge_label='JAX ${{ matrix.PLATFORM }} build' - tags="${{ steps.meta.outputs.tags }}" - digest="${{ steps.build.outputs.digest }}" - outcome="${{ steps.build.outcome }}" + badge_label='JAX ${{ inputs.ARCHITECTURE }} build' + tags="${{ steps.final-metadata.outputs.tags }}" + digest="${{ steps.final-build.outputs.digest }}" + outcome="${{ steps.final-build.outcome }}" if [[ ${outcome} == "success" ]]; then badge_message="pass" badge_color=brightgreen - summary="JAX build on ${{ matrix.PLATFORM }}: $badge_message" + summary="JAX build on ${{ inputs.ARCHITECTURE }}: $badge_message" else badge_message="fail" badge_color=red - summary="JAX build on ${{ matrix.PLATFORM }}: $badge_message" + summary="JAX build on ${{ inputs.ARCHITECTURE }}: $badge_message" fi to_json \ @@ -170,72 +236,25 @@ jobs: to_json schemaVersion label message color \ > ${{ env.BADGE_FILENAME_FULL }} - # Temporary workaround until the following issues are solved: - # https://github.com/orgs/community/discussions/17245 - # https://github.com/actions/runner/pull/2477 - # https://github.com/orgs/community/discussions/26639 - - name: Save image name as text file - shell: bash -x -e {0} - run: | - echo "${{ steps.meta.outputs.tags }}" >> image-name.txt + # # Temporary workaround until the following issues are solved: + # # https://github.com/orgs/community/discussions/17245 + # # 
https://github.com/actions/runner/pull/2477 + # # https://github.com/orgs/community/discussions/26639 + # - name: Save image name as text file + # shell: bash -x -e {0} + # run: | + # echo "${{ steps.final-metadata.outputs.tags }}" >> image-name.txt - - name: Upload image name file as artifact - uses: actions/upload-artifact@v3 - with: - name: image-name-jax-${{ matrix.PLATFORM }} - path: image-name.txt + # - name: Upload image name file as artifact + # uses: actions/upload-artifact@v3 + # with: + # name: image-name-jax-${{ inputs.ARCHITECTURE }} + # path: image-name.txt - name: Upload sitrep and badge uses: actions/upload-artifact@v3 with: - name: ${{ inputs.ARTIFACT_NAME }}-${{ matrix.PLATFORM }} + name: ${{ inputs.ARTIFACT_NAME }}-${{ inputs.ARCHITECTURE }} path: | sitrep.json ${{ env.BADGE_FILENAME_FULL }} - - merge: - runs-on: ubuntu-latest - needs: build - outputs: - DOCKER_TAGS: ${{ steps.meta.outputs.tags }} - steps: - # TODO: currently downloading all artifacts of the entire workflow - # Revise when this request is fulfilled: - # https://github.com/actions/download-artifact/issues/214 - - name: Download image name files into separate folders - uses: actions/download-artifact@v3 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set docker metadata - id: meta - uses: docker/metadata-action@v4 - with: - images: | - ${{ env.UPLD_IMAGE }} - flavor: | - latest=false - tags: | - type=raw,value=${{ github.run_id }}-jax-multiarch - labels: - org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - - name: Combine images into a single multi-arch image - shell: bash -x -e {0} - run: | - docker manifest create ${{ steps.meta.outputs.tags }} $( - for IMAGE in $(cat image-name-jax-*/image-name.txt); do - REPO=$(echo $IMAGE | cut -d: -f1) - DIGEST=$( - docker manifest inspect $IMAGE |\ - jq -r '.manifests[] | 
select(.platform.os == "linux") | .digest' - ) - echo $REPO@${DIGEST} - done - ) - docker manifest push ${{ steps.meta.outputs.tags }} diff --git a/.github/workflows/_build_pax.yaml b/.github/workflows/_build_pax.yaml index 84012afa8..62bc175ad 100644 --- a/.github/workflows/_build_pax.yaml +++ b/.github/workflows/_build_pax.yaml @@ -3,11 +3,15 @@ name: ~build Pax container on: workflow_call: inputs: + ARCHITECTURE: + type: string + description: 'CPU architecture to build the image for, e.g. amd64, arm64' + required: true BASE_IMAGE: type: string description: 'Base docker image that provides JAX' required: false - default: ghcr.io/nvidia/jax:latest + default: ghcr.io/nvidia/jax:mealkit BUILD_DATE: type: string description: "Build date in YYYY-MM-DD format" @@ -33,10 +37,23 @@ on: description: Git commit, tag, or branch for Praxis required: false default: main + ARTIFACT_NAME: + type: string + description: 'Name of the artifact zip file' + required: false + default: 'artifact-pax-build' + BADGE_FILENAME: + type: string + description: 'Name of the endpoint JSON file for shields.io badge' + required: false + default: 'badge-pax-build' outputs: - DOCKER_TAGS: - description: "Tags of the image built" - value: ${{ jobs.merge.outputs.DOCKER_TAGS }} + DOCKER_TAG_MEALKIT: + description: "Tags of the 'mealkit' image built" + value: ${{ jobs.build-pax.outputs.DOCKER_TAG_MEALKIT }} + DOCKER_TAG_FINAL: + description: "Tags of the complete image built" + value: ${{ jobs.build-pax.outputs.DOCKER_TAG_FINAL }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -48,12 +65,13 @@ permissions: jobs: - build: - strategy: - fail-fast: false - matrix: - PLATFORM: [amd64, arm64] - runs-on: [self-hosted, "${{ matrix.PLATFORM }}", small] + build-pax: + runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", small] + env: + BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json + outputs: + DOCKER_TAG_MEALKIT: ${{ steps.mealkit-metadata.outputs.tags }} + 
DOCKER_TAG_FINAL: ${{ steps.final-metadata.outputs.tags }} steps: - name: Print environment variables run: env @@ -68,8 +86,14 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Set docker metadata - id: meta + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + with: + driver-opts: | + image=moby/buildkit:v0.12.1 + + - name: Set docker metadata - mealkit + id: mealkit-metadata uses: docker/metadata-action@v4 with: images: | @@ -77,25 +101,21 @@ jobs: flavor: | latest=false tags: | - type=raw,value=${{ github.run_id }}-upstream-pax-${{ matrix.PLATFORM }} + type=raw,value=${{ github.run_id }}-upstream-pax-${{ inputs.ARCHITECTURE }}-mealkit labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - with: - driver-opts: | - image=moby/buildkit:v0.12.1 - - - name: Build docker images + - name: Build mealkit image + id: mealkit-build uses: docker/build-push-action@v4 with: context: .github/container push: true - file: .github/container/Dockerfile.pax.${{ matrix.PLATFORM }} - platforms: linux/${{ matrix.PLATFORM }} - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} + file: .github/container/Dockerfile.pax.${{ inputs.ARCHITECTURE }} + platforms: linux/${{ inputs.ARCHITECTURE }} + target: mealkit + tags: ${{ steps.mealkit-metadata.outputs.tags }} + labels: ${{ steps.mealkit-metadata.outputs.labels }} build-args: | BASE_IMAGE=${{ inputs.BASE_IMAGE }} BUILD_DATE=${{ inputs.BUILD_DATE }} @@ -104,42 +124,8 @@ jobs: REF_PAXML=${{ inputs.REF_PAXML }} REF_PRAXIS=${{ inputs.REF_PRAXIS }} - # Temporary workaround until the following issues are solved: - # https://github.com/orgs/community/discussions/17245 - # https://github.com/actions/runner/pull/2477 - # https://github.com/orgs/community/discussions/26639 - - name: Save image name as text file - shell: bash -x -e {0} - run: | - echo "${{ 
steps.meta.outputs.tags }}" >> image-name.txt - - - name: Upload image name file as artifact - uses: actions/upload-artifact@v3 - with: - name: image-name-upstream-pax-${{ matrix.PLATFORM }} - path: image-name.txt - - merge: - runs-on: ubuntu-latest - needs: build - outputs: - DOCKER_TAGS: ${{ steps.meta.outputs.tags }} - steps: - # TODO: currently downloading all artifacts of the entire workflow - # Revise when this request is fulfilled: - # https://github.com/actions/download-artifact/issues/214 - - name: Download image name files into separate folders - uses: actions/download-artifact@v3 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set docker metadata - id: meta + - name: Set docker metadata - final + id: final-metadata uses: docker/metadata-action@v4 with: images: | @@ -147,21 +133,67 @@ jobs: flavor: | latest=false tags: | - type=raw,value=${{ github.run_id }}-upstream-pax-multiarch + type=raw,value=${{ github.run_id }}-upstream-pax-${{ inputs.ARCHITECTURE }} labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Combine images into a single multi-arch image + - name: Build final image + id: final-build + uses: docker/build-push-action@v4 + with: + context: .github/container + push: true + file: .github/container/Dockerfile.pax.${{ inputs.ARCHITECTURE }} + platforms: linux/${{ inputs.ARCHITECTURE }} + target: final + tags: ${{ steps.final-metadata.outputs.tags }} + labels: ${{ steps.final-metadata.outputs.labels }} + build-args: | + BASE_IMAGE=${{ inputs.BASE_IMAGE }} + BUILD_DATE=${{ inputs.BUILD_DATE }} + REPO_PAXML=${{ inputs.REPO_PAXML }} + REPO_PRAXIS=${{ inputs.REPO_PRAXIS }} + REF_PAXML=${{ inputs.REF_PAXML }} + REF_PRAXIS=${{ inputs.REF_PRAXIS }} + + - name: Generate sitrep + if: success() || failure() shell: bash -x -e {0} run: | - docker manifest create ${{ 
steps.meta.outputs.tags }} $( - for IMAGE in $(cat image-name-upstream-pax-*/image-name.txt); do - REPO=$(echo $IMAGE | cut -d: -f1) - DIGEST=$( - docker manifest inspect $IMAGE |\ - jq -r '.manifests[] | select(.platform.os == "linux") | .digest' - ) - echo $REPO@${DIGEST} - done - ) - docker manifest push ${{ steps.meta.outputs.tags }} + # bring in utility functions + source .github/workflows/scripts/to_json.sh + + badge_label='PAX ${{ inputs.ARCHITECTURE }} build' + tags="${{ steps.final-metadata.outputs.tags }}" + digest="${{ steps.final-build.outputs.digest }}" + outcome="${{ steps.final-build.outcome }}" + + if [[ ${outcome} == "success" ]]; then + badge_message="pass" + badge_color=brightgreen + summary="PAX build on ${{ inputs.ARCHITECTURE }}: $badge_message" + else + badge_message="fail" + badge_color=red + summary="PAX build on ${{ inputs.ARCHITECTURE }}: $badge_message" + fi + + to_json \ + summary \ + badge_label tags digest outcome \ + > sitrep.json + + schemaVersion=1 \ + label="${badge_label}" \ + message="${badge_message}" \ + color="${badge_color}" \ + to_json schemaVersion label message color \ + > ${{ env.BADGE_FILENAME_FULL }} + + - name: Upload sitrep and badge + uses: actions/upload-artifact@v3 + with: + name: ${{ inputs.ARTIFACT_NAME }}-${{ inputs.ARCHITECTURE }} + path: | + sitrep.json + ${{ env.BADGE_FILENAME_FULL }} diff --git a/.github/workflows/_build_rosetta.yaml b/.github/workflows/_build_rosetta.yaml index e811e31bf..02dcc0951 100644 --- a/.github/workflows/_build_rosetta.yaml +++ b/.github/workflows/_build_rosetta.yaml @@ -3,6 +3,10 @@ name: ~build Rosetta container on: workflow_call: inputs: + ARCHITECTURE: + type: string + description: 'CPU architecture to build the image for, e.g. 
amd64, arm64' + required: true BASE_LIBRARY: type: string description: 'Choice of base library to build on:' @@ -14,18 +18,21 @@ on: required: false BUILD_DATE: type: string - description: "Build date in YYYY-MM-DD format" + description: 'Build date in YYYY-MM-DD format' required: false default: 'NOT SPECIFIED' - PLATFORMS: + BADGE_FILENAME: type: string - description: 'JSON list of platforms. Ex: ["amd64"]' + description: 'Name of the endpoint JSON file for shields.io badge' required: false - default: '["arm64", "amd64"]' + default: 'badge-rosetta-build' outputs: - DOCKER_TAGS: - description: "Tags of the image built" - value: ${{ jobs.merge.outputs.DOCKER_TAGS }} + DOCKER_TAG_MEALKIT: + description: 'Tags of the mealkit image build' + value: ${{ jobs.build-rosetta.outputs.DOCKER_TAG_MEALKIT }} + DOCKER_TAG_FINAL: + description: "Tags of the complete image built" + value: ${{ jobs.build-rosetta.outputs.DOCKER_TAG_FINAL }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -38,13 +45,17 @@ permissions: jobs: - build: - strategy: - fail-fast: false - matrix: - PLATFORM: ${{ fromJSON(inputs.PLATFORMS) }} - runs-on: [self-hosted, "${{ matrix.PLATFORM }}", small] + build-rosetta: + runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", small] + env: + BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME}}-${{ inputs.ARCHITECTURE}}.json + outputs: + DOCKER_TAG_MEALKIT: ${{ steps.mealkit-metadata.outputs.tags }} + DOCKER_TAG_FINAL: ${{ steps.final-metadata.outputs.tags }} steps: + - name: Print environment variables + run: env + - name: Set default BASE_IMAGE id: defaults run: | @@ -54,9 +65,6 @@ jobs: echo "BASE_IMAGE=${{ env.DOCKER_REGISTRY }}/upstream-${{ inputs.BASE_LIBRARY }}:latest" >> "$GITHUB_OUTPUT" fi - - name: Print environment variables - run: env - - name: Check out the repository under ${GITHUB_WORKSPACE} uses: actions/checkout@v3 @@ -67,92 +75,53 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Set docker
metadata - id: meta - uses: docker/metadata-action@v4 - with: - images: ${{ env.UPLD_IMAGE }} - flavor: latest=false - tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ matrix.PLATFORM }} - labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 with: driver-opts: | image=moby/buildkit:v0.12.1 + + - name: Set docker metadata - mealkit + id: mealkit-metadata + uses: docker/metadata-action@v4 + with: + images: ${{ env.UPLD_IMAGE }} + flavor: latest=false + tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ inputs.ARCHITECTURE }}-mealkit + labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Build docker images + - name: Build docker images - mealkit + id: mealkit-build uses: docker/build-push-action@v4 with: context: rosetta/ push: true file: rosetta/Dockerfile.${{ inputs.BASE_LIBRARY }} - platforms: linux/${{ matrix.PLATFORM }} - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - target: rosetta + platforms: linux/${{ inputs.ARCHITECTURE }} + tags: ${{ steps.mealkit-metadata.outputs.tags }} + labels: ${{ steps.mealkit-metadata.outputs.labels }} + target: mealkit build-args: | BASE_IMAGE=${{ steps.defaults.outputs.BASE_IMAGE }} - # Temporary workaround until the following issues are solved: - # https://github.com/orgs/community/discussions/17245 - # https://github.com/actions/runner/pull/2477 - # https://github.com/orgs/community/discussions/26639 - - name: Save image name as text file - shell: bash -x -e {0} - run: | - echo "${{ steps.meta.outputs.tags }}" >> image-name.txt - - - name: Upload image name file as artifact - uses: actions/upload-artifact@v3 - with: - name: image-name-${{ inputs.BASE_LIBRARY }}-${{ matrix.PLATFORM }} - path: image-name.txt - - merge: - runs-on: ubuntu-latest - needs: build - outputs: - DOCKER_TAGS: ${{ steps.meta.outputs.tags }} - steps: - # TODO: currently 
downloading all artifacts of the entire workflow - # Revise when this request is fulfilled: - # https://github.com/actions/download-artifact/issues/214 - - name: Download image name files into separate folders - uses: actions/download-artifact@v3 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set docker metadata - id: meta + - name: Set docker metadata - final + id: final-metadata uses: docker/metadata-action@v4 with: - images: | - ${{ env.UPLD_IMAGE }} - flavor: | - latest=false - tags: | - type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-multiarch - labels: - org.opencontainers.image.created=${{ inputs.BUILD_DATE }} + images: ${{ env.UPLD_IMAGE }} + flavor: latest=false + tags: type=raw,value=${{ github.run_id }}-${{ inputs.BASE_LIBRARY }}-${{ inputs.ARCHITECTURE }}-final + labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Combine images into a single multi-arch image - shell: bash -x -e {0} - run: | - docker manifest create ${{ steps.meta.outputs.tags }} $( - for IMAGE in $(cat image-name-${{ inputs.BASE_LIBRARY }}-*/image-name.txt); do - REPO=$(echo $IMAGE | cut -d: -f1) - DIGEST=$( - docker manifest inspect $IMAGE |\ - jq -r '.manifests[] | select(.platform.os == "linux") | .digest' - ) - echo $REPO@${DIGEST} - done - ) - docker manifest push ${{ steps.meta.outputs.tags }} \ No newline at end of file + - name: Build docker images - final + uses: docker/build-push-action@v4 + with: + context: rosetta/ + push: true + file: rosetta/Dockerfile.${{ inputs.BASE_LIBRARY }} + platforms: linux/${{ inputs.ARCHITECTURE }} + tags: ${{ steps.final-metadata.outputs.tags }} + labels: ${{ steps.final-metadata.outputs.labels }} + target: final + build-args: | + BASE_IMAGE=${{ steps.defaults.outputs.BASE_IMAGE }} \ No newline at end of file diff --git a/.github/workflows/_build_t5x.yaml 
b/.github/workflows/_build_t5x.yaml index fffbb7731..6efcde8f9 100644 --- a/.github/workflows/_build_t5x.yaml +++ b/.github/workflows/_build_t5x.yaml @@ -3,11 +3,15 @@ name: ~build T5X container on: workflow_call: inputs: + ARCHITECTURE: + type: string + description: 'CPU architecture to build the image for, e.g. amd64, arm64' + required: true BASE_IMAGE: type: string description: 'Base docker image that provides JAX' required: false - default: ghcr.io/nvidia/jax:latest + default: ghcr.io/nvidia/jax:mealkit BUILD_DATE: type: string description: "Build date in YYYY-MM-DD format" @@ -32,13 +36,24 @@ on: type: string description: Git commit, tag, or branch for TE required: false - # TODO: This is a temporary pinning of TE as the API in TE no longer matches the TE patch - # This should be reverted to main ASAP - default: 7976bd003fcf084dd068069b92a9a79b1743316a + default: main + ARTIFACT_NAME: + type: string + description: 'Name of the artifact zip file' + required: false + default: 'artifact-t5x-build' + BADGE_FILENAME: + type: string + description: 'Name of the endpoint JSON file for shields.io badge' + required: false + default: 'badge-t5x-build' outputs: - DOCKER_TAGS: - description: "Tags of the image built" - value: ${{ jobs.build.outputs.DOCKER_TAGS }} + DOCKER_TAG_MEALKIT: + description: "Tags of the 'mealkit' image built" + value: ${{ jobs.build-t5x.outputs.DOCKER_TAG_MEALKIT }} + DOCKER_TAG_FINAL: + description: "Tags of the complete image built" + value: ${{ jobs.build-t5x.outputs.DOCKER_TAG_FINAL }} env: UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal @@ -50,10 +65,13 @@ permissions: jobs: - build: + build-t5x: + runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", small] + env: + BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json outputs: - DOCKER_TAGS: ${{ steps.meta.outputs.tags }} - runs-on: [self-hosted, x86, small] + DOCKER_TAG_MEALKIT: ${{ steps.mealkit-metadata.outputs.tags }} + DOCKER_TAG_FINAL: ${{ 
steps.final-metadata.outputs.tags }} steps: - name: Print environment variables run: env @@ -68,8 +86,14 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Set docker metadata - id: meta + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + with: + driver-opts: | + image=moby/buildkit:v0.12.1 + + - name: Set docker metadata - mealkit + id: mealkit-metadata uses: docker/metadata-action@v4 with: images: | @@ -77,24 +101,53 @@ jobs: flavor: | latest=false tags: | - type=raw,value=${{ github.run_id }}-upstream-t5x + type=raw,value=${{ github.run_id }}-upstream-t5x-${{ inputs.ARCHITECTURE }}-mealkit labels: org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + - name: Build mealkit image + id: mealkit-build + uses: docker/build-push-action@v4 with: - driver-opts: | - image=moby/buildkit:v0.10.6 + context: .github/container + push: true + file: .github/container/Dockerfile.t5x + platforms: linux/${{ inputs.ARCHITECTURE }} + target: mealkit + tags: ${{ steps.mealkit-metadata.outputs.tags }} + labels: ${{ steps.mealkit-metadata.outputs.labels }} + build-args: | + BASE_IMAGE=${{ inputs.BASE_IMAGE }} + BUILD_DATE=${{ inputs.BUILD_DATE }} + REPO_T5X=${{ inputs.REPO_T5X }} + REF_T5X=${{ inputs.REF_T5X }} + REPO_TE=${{ inputs.REPO_TE }} + REF_TE=${{ inputs.REF_TE }} - - name: Build docker images + - name: Set docker metadata - final + id: final-metadata + uses: docker/metadata-action@v4 + with: + images: | + ${{ env.UPLD_IMAGE }} + flavor: | + latest=false + tags: | + type=raw,value=${{ github.run_id }}-upstream-t5x-${{ inputs.ARCHITECTURE }} + labels: + org.opencontainers.image.created=${{ inputs.BUILD_DATE }} + + - name: Build final image + id: final-build uses: docker/build-push-action@v4 with: context: .github/container push: true file: .github/container/Dockerfile.t5x - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ 
steps.meta.outputs.labels }} + platforms: linux/${{ inputs.ARCHITECTURE }} + tags: ${{ steps.final-metadata.outputs.tags }} + labels: ${{ steps.final-metadata.outputs.labels }} + target: final build-args: | BASE_IMAGE=${{ inputs.BASE_IMAGE }} BUILD_DATE=${{ inputs.BUILD_DATE }} @@ -102,3 +155,47 @@ jobs: REF_T5X=${{ inputs.REF_T5X }} REPO_TE=${{ inputs.REPO_TE }} REF_TE=${{ inputs.REF_TE }} + + - name: Generate sitrep + if: success() || failure() + shell: bash -x -e {0} + run: | + # bring in utility functions + source .github/workflows/scripts/to_json.sh + + badge_label='T5X ${{ inputs.ARCHITECTURE }} build' + tags="${{ steps.final-metadata.outputs.tags }}" + digest="${{ steps.final-build.outputs.digest }}" + outcome="${{ steps.final-build.outcome }}" + + if [[ ${outcome} == "success" ]]; then + badge_message="pass" + badge_color=brightgreen + summary="T5X build on ${{ inputs.ARCHITECTURE }}: $badge_message" + else + badge_message="fail" + badge_color=red + summary="T5X build on ${{ inputs.ARCHITECTURE }}: $badge_message" + fi + + to_json \ + summary \ + badge_label tags digest outcome \ + > sitrep.json + + schemaVersion=1 \ + label="${badge_label}" \ + message="${badge_message}" \ + color="${badge_color}" \ + to_json schemaVersion label message color \ + > ${{ env.BADGE_FILENAME_FULL }} + + - name: Upload sitrep and badge + # Upload even when the build step failed so the "fail" sitrep and badge still get published + if: success() || failure() + uses: actions/upload-artifact@v3 + with: + name: ${{ inputs.ARTIFACT_NAME }}-${{ inputs.ARCHITECTURE }} + path: | + sitrep.json + ${{ env.BADGE_FILENAME_FULL }} diff --git a/.github/workflows/_build_te.yaml b/.github/workflows/_build_te.yaml deleted file mode 100644 index d80f334ef..000000000 --- a/.github/workflows/_build_te.yaml +++ /dev/null @@ -1,90 +0,0 @@ -name: ~build Transformer Engine container - -on: - workflow_call: - inputs: - BASE_IMAGE: - type: string - description: 'Base docker image that provides JAX' - required: false - default: ghcr.io/nvidia/jax:latest - BUILD_DATE: - type: string - description: "Build date in YYYY-MM-DD
format" - required: false - default: 'NOT SPECIFIED' - REPO_TE: - type: string - description: URL of TE repository to check out - required: false - default: "https://github.com/NVIDIA/TransformerEngine.git" - REF_TE: - type: string - description: Git commit, tag, or branch for TE - required: false - default: main - outputs: - DOCKER_TAGS: - description: "Tags of the image built" - value: ${{ jobs.build.outputs.DOCKER_TAGS }} - -env: - UPLD_IMAGE: ghcr.io/nvidia/jax-toolbox-internal - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container - -jobs: - - build: - outputs: - DOCKER_TAGS: ${{ steps.meta.outputs.tags }} - runs-on: [self-hosted, x86, small] - steps: - - name: Print environment variables - run: env - - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v3 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set docker metadata - id: meta - uses: docker/metadata-action@v4 - with: - images: | - ${{ env.UPLD_IMAGE }} - flavor: | - latest=false - tags: | - type=raw,value=${{ github.run_id }}-te - labels: - org.opencontainers.image.created=${{ inputs.BUILD_DATE }} - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - with: - driver-opts: | - image=moby/buildkit:v0.10.6 - - - name: Build docker images - uses: docker/build-push-action@v4 - with: - context: .github/container - push: true - file: .github/container/Dockerfile.te - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - build-args: | - BASE_IMAGE=${{ inputs.BASE_IMAGE }} - BUILD_DATE=${{ inputs.BUILD_DATE }} - REPO_TE=${{ inputs.REPO_TE }} - REF_TE=${{ inputs.REF_TE }} \ No newline at end of file diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml new file mode 100644 index 
000000000..949959739 --- /dev/null +++ b/.github/workflows/_ci.yaml @@ -0,0 +1,205 @@ +name: CI +run-name: CI-${{ inputs.ARCHITECTURE }} + +on: + workflow_call: + inputs: + ARCHITECTURE: + type: string + required: true + CUDA_IMAGE: + type: string + required: true + SRC_JAX: + type: string + required: true + SRC_XLA: + type: string + required: true + SRC_TE: + type: string + required: true + SRC_T5X: + type: string + required: true + SRC_PAXML: + type: string + required: true + SRC_PRAXIS: + type: string + required: true + outputs: + TAG_BASE: + description: "Tags of the base image built" + value: ${{ jobs.build-base.outputs.DOCKER_TAG }} + TAG_JAX: + description: "Tags of the JAX image built" + value: ${{ jobs.build-jax.outputs.DOCKER_TAG_FINAL }} + TAG_T5X: + description: "Tags of the T5X image built" + value: ${{ jobs.build-t5x.outputs.DOCKER_TAG_FINAL }} + TAG_PAX: + description: "Tags of the PAX image built" + value: ${{ jobs.build-pax.outputs.DOCKER_TAG_FINAL }} + +permissions: + contents: read # to fetch code + actions: write # to cancel previous workflows + packages: write # to upload container + +jobs: + + metadata: + runs-on: ubuntu-22.04 + outputs: + BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} + REPO_JAX: ${{ steps.parse-inputs.outputs.REPO_JAX }} + REF_JAX: ${{ steps.parse-inputs.outputs.REF_JAX }} + REPO_XLA: ${{ steps.parse-inputs.outputs.REPO_XLA }} + REF_XLA: ${{ steps.parse-inputs.outputs.REF_XLA }} + REPO_TE: ${{ steps.parse-inputs.outputs.REPO_TE }} + REF_TE: ${{ steps.parse-inputs.outputs.REF_TE }} + REPO_T5X: ${{ steps.parse-inputs.outputs.REPO_T5X }} + REF_T5X: ${{ steps.parse-inputs.outputs.REF_T5X }} + REPO_PAXML: ${{ steps.parse-inputs.outputs.REPO_PAXML }} + REF_PAXML: ${{ steps.parse-inputs.outputs.REF_PAXML }} + REPO_PRAXIS: ${{ steps.parse-inputs.outputs.REPO_PRAXIS }} + REF_PRAXIS: ${{ steps.parse-inputs.outputs.REF_PRAXIS }} + steps: + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v3 + + - 
name: Set build date + id: date + shell: bash -x -e {0} + run: | + BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') + echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT + + - name: split input "repo#ref" into repo and ref parts + id: parse-inputs + shell: bash -x -e {0} + run: | + source .github/workflows/scripts/parse_git_src.sh + + # default values are for `pull_request` event types + parse_git_src JAX "${{ inputs.SRC_JAX }}" + parse_git_src XLA "${{ inputs.SRC_XLA }}" + parse_git_src TE "${{ inputs.SRC_TE }}" + parse_git_src T5X "${{ inputs.SRC_T5X }}" + parse_git_src PAXML "${{ inputs.SRC_PAXML }}" + parse_git_src PRAXIS "${{ inputs.SRC_PRAXIS }}" + + build-base: + needs: metadata + uses: ./.github/workflows/_build_base.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BASE_IMAGE: ${{ inputs.CUDA_IMAGE || 'latest' }} + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + secrets: inherit + + build-jax: + needs: [metadata, build-base] + uses: ./.github/workflows/_build_jax.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} + REPO_JAX: ${{ needs.metadata.outputs.REPO_JAX }} + REF_JAX: ${{ needs.metadata.outputs.REF_JAX }} + REPO_XLA: ${{ needs.metadata.outputs.REPO_XLA }} + REF_XLA: ${{ needs.metadata.outputs.REF_XLA }} + REPO_TE: ${{ needs.metadata.outputs.REPO_TE }} + REF_TE: ${{ needs.metadata.outputs.REF_TE }} + secrets: inherit + + build-t5x: + needs: [metadata, build-jax] + if: inputs.ARCHITECTURE == 'amd64' # T5X arm64 build is wip in PR 252 + uses: ./.github/workflows/_build_t5x.yaml + with: + ARCHITECTURE: amd64 + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + REPO_T5X: ${{ needs.metadata.outputs.REPO_T5X }} + REF_T5X: ${{ needs.metadata.outputs.REF_T5X }} + secrets: inherit + + build-pax: + needs: [metadata, build-jax] + uses: 
./.github/workflows/_build_pax.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + REPO_PAXML: ${{ needs.metadata.outputs.REPO_PAXML }} + REF_PAXML: ${{ needs.metadata.outputs.REF_PAXML }} + REPO_PRAXIS: ${{ needs.metadata.outputs.REPO_PRAXIS }} + REF_PRAXIS: ${{ needs.metadata.outputs.REF_PRAXIS }} + secrets: inherit + + build-rosetta-t5x: + uses: ./.github/workflows/_build_rosetta.yaml + needs: [metadata, build-t5x] + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: t5x + secrets: inherit + + build-rosetta-pax: + uses: ./.github/workflows/_build_rosetta.yaml + needs: [metadata, build-pax] + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: pax + secrets: inherit + + test-distribution: + needs: metadata + uses: ./.github/workflows/_test_distribution.yaml + secrets: inherit + + test-jax: + needs: build-jax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_jax.yaml + with: + JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-te: + needs: build-pax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_te.yaml + with: + TE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-t5x: + needs: build-t5x + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_t5x.yaml + with: + T5X_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-pax: + needs: build-pax + if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + uses: ./.github/workflows/_test_pax.yaml + with: + PAX_IMAGE: ${{ 
needs.build-pax.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + + test-vit: + needs: build-rosetta-t5x + uses: ./.github/workflows/_test_vit.yaml + with: + ROSETTA_T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS_FINAL }} + secrets: inherit + diff --git a/.github/workflows/_copy_gist.yaml b/.github/workflows/_copy_gist.yaml index 492d9fd70..fa2c09845 100644 --- a/.github/workflows/_copy_gist.yaml +++ b/.github/workflows/_copy_gist.yaml @@ -18,7 +18,7 @@ on: default: '.*' jobs: - action: + copy-gist: runs-on: ubuntu-22.04 steps: - name: copy badge to primary Gist diff --git a/.github/workflows/_finalize.yaml b/.github/workflows/_finalize.yaml index b77e93074..5844af63c 100644 --- a/.github/workflows/_finalize.yaml +++ b/.github/workflows/_finalize.yaml @@ -10,13 +10,117 @@ on: required: false jobs: + # show-containers: + # runs-on: ubuntu-22.04 + # steps: + # - name: Generate job summary for container build + # shell: bash -x -e {0} + # run: | + # cat > $GITHUB_STEP_SUMMARY << EOF + # # Images created + + # | Image | Link | + # | ------------ | -------------------------------------------------- | + # | Base | ${{ needs.amd64.outputs.TAG_BASE }} | + # | | ${{ needs.arm64.outputs.TAG_BASE }} | + # | JAX | ${{ needs.amd64.outputs.TAG_JAX }} | + # | | ${{ needs.arm64.outputs.TAG_JAX }} | + # | T5X | ${{ needs.amd64.outputs.TAG_T5X }} | + # | | ${{ needs.arm64.outputs.TAG_T5X }} | + # | PAX | ${{ needs.amd64.outputs.TAG_PAX }} | + # | | ${{ needs.arm64.outputs.TAG_PAX }} | + # EOF + + # # | ROSETTA(t5x) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | + # # | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | + upload-badge: - uses: ./.github/workflows/_upload_badge.yaml - secrets: inherit + runs-on: ubuntu-22.04 + env: + # Name/bash regex for shields.io endpoint JSON files + BADGE_FILES: '*badge*.json' + outputs: + GIST_ID: ${{ steps.extract-id.outputs.GIST_ID }} + steps: + - name: Download artifacts specified by input + uses: 
actions/download-artifact@v3 + + - name: Collect all badge files to temporary folder + id: collect + shell: bash -x -e {0} + run: | + workdir=$(mktemp -d) + find -name "${BADGE_FILES}" | while read -s f; do + cp "$f" $workdir + done + echo "WORKDIR=$workdir" >> $GITHUB_OUTPUT + + - name: Upload badge files to gist + id: upload + uses: actions/github-script@v6 + with: + github-token: ${{ secrets.NVJAX_GIST_TOKEN }} + script: | + const currentDateTime = new Date().toISOString(); + const gistDescription = + `Badge endpoint files from Workflow: ${{ github.workflow }}, ` + + `Run ID: ${{ github.run_id }}, ` + + `Repository: ${{ github.repository }}, ` + + `Event: ${{ github.event_name }}, ` + + `Created: ${currentDateTime}`; + + const fs = require('fs').promises; + const workdir = '${{ steps.collect.outputs.WORKDIR }}' + const files = await fs.readdir(workdir); + + gist = await github.rest.gists.create({ + description: gistDescription, + public: false, + files: Object.fromEntries( + await Promise.all( + files.map( + async filename => { + const content = await fs.readFile(`${workdir}/${filename}`, 'utf8'); + return [filename, { content }]; + } + ) + ) + ) + }); + + console.log(gist) + + return gist.data.id; + + - name: Return Gist ID + id: extract-id + shell: bash -x -e {0} + run: | + GIST_ID="${{ steps.upload.outputs.result }}" + echo "GIST_ID=${GIST_ID//\"/}" >> $GITHUB_OUTPUT report: - uses: ./.github/workflows/_summary.yaml - secrets: inherit + runs-on: ubuntu-22.04 + steps: + - name: Download artifacts + uses: actions/download-artifact@v3 + + - name: Write output to step summary + shell: bash -x -e {0} + run: | + find -name "sitrep.json" | while read -s f; do + cat "$f" | jq -r '.summary' | tee -a $GITHUB_STEP_SUMMARY + done + + # - name: Concatenate all sitreps + # shell: bash -x -e {0} + # run: | + # # combine all sitreps files into a single file, where each sitrep json sits + # # in a field named by the folder that contained it + + # find -name "sitrep.json" | 
while read -s f; do + # echo "$(dirname $f): $(cat $f)," >> + # done publish-badge: needs: [upload-badge] diff --git a/.github/workflows/_publish_container.yaml b/.github/workflows/_publish_container.yaml index c2d03c7f7..f0aa38e6f 100644 --- a/.github/workflows/_publish_container.yaml +++ b/.github/workflows/_publish_container.yaml @@ -15,11 +15,11 @@ on: type: string description: 'Target docker tags in docker/metadata-action format:' required: true - EXPOSE_SINGLE_ARCH_IMAGES: - type: boolean - description: 'Also expose single-arch images:' - required: false - default: true + # EXPOSE_SINGLE_ARCH_IMAGES: + # type: boolean + # description: 'Also expose single-arch images:' + # required: false + # default: false outputs: DOCKER_TAGS: description: "Tags of the image published" @@ -53,29 +53,36 @@ jobs: id: get-manifests shell: bash -x -e {0} run: | - SOURCE_REPO=$(echo ${{ inputs.SOURCE_IMAGE }} | cut -d: -f1) - MEDIA_TYPE=$(docker manifest inspect ${{ inputs.SOURCE_IMAGE }} | jq -r '.mediaType') - case "$MEDIA_TYPE" in - # OCI image index - "application/vnd.oci.image.index.v1+json") - MANIFESTS=$( - docker manifest inspect ${{ inputs.SOURCE_IMAGE }} |\ - jq -r '.manifests[] | select(.platform.os != "unknown") | .digest' |\ - xargs -I{} echo ${SOURCE_REPO}@{} |\ - tr '\n' ' ' - ) - ;; - # Docker image manifest - "application/vnd.docker.distribution.manifest.v2+json") - MANIFESTS=${{ inputs.SOURCE_IMAGE }} - ;; - *) - echo "Unknown media type: $MEDIA_TYPE" - exit 1 - ;; - esac + manifests="" + for src_img in $(echo "${{ inputs.SOURCE_IMAGE }}" | tr '\n' ' '); do + repo=$(echo $src_img | cut -d: -f1) + media_type=$(docker manifest inspect $src_img | jq -r '.mediaType') + case "$media_type" in + + # OCI image index + "application/vnd.oci.image.index.v1+json") + manifest=$( + docker manifest inspect ${src_img} |\ + jq -r '.manifests[] | select(.platform.os != "unknown") | .digest' |\ + xargs -I{} echo ${repo}@{} |\ + tr '\n' ' ' + ) + ;; - echo "manifests=$MANIFESTS" >> 
$GITHUB_OUTPUT + # Docker image manifest + "application/vnd.docker.distribution.manifest.v2+json") + manifest=${src_img} + ;; + + *) + echo "Unknown media type: $media_type" + exit 1 + ;; + esac + manifests="$manifests $manifest" + done + + echo "manifests=$manifests" >> $GITHUB_OUTPUT - name: Create multi-arch images id: multi-arch @@ -85,26 +92,26 @@ jobs: docker buildx imagetools create --tag $tag ${{ steps.get-manifests.outputs.manifests }} done - - name: Create single-arch images - if: ${{ inputs.EXPOSE_SINGLE_ARCH_IMAGES }} - shell: bash -x -e {0} - run: | - # Create new manifest list from extracted manifests - for manifest in ${{ steps.get-manifests.outputs.manifests }}; do - os=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.os') - arch=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.architecture') - for tag in $(echo "${{ steps.meta.outputs.tags }}"); do - skopeo copy --format v2s2 docker://$manifest docker://$tag-${os}-${arch} - done - done + # - name: Create single-arch images + # if: ${{ inputs.EXPOSE_SINGLE_ARCH_IMAGES }} + # shell: bash -x -e {0} + # run: | + # # Create new manifest list from extracted manifests + # for manifest in ${{ steps.get-manifests.outputs.manifests }}; do + # os=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.os') + # arch=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.architecture') + # for tag in $(echo "${{ steps.meta.outputs.tags }}"); do + # skopeo copy --format v2s2 docker://$manifest docker://$tag-${os}-${arch} + # done + # done - - name: Generate outputs and artifacts - shell: bash -x -e {0} - run: | - echo "${{ steps.meta.outputs.tags }}" > image-tags-${{ inputs.TARGET_IMAGE }}.txt + # - name: Generate outputs and artifacts + # shell: bash -x -e {0} + # run: | + # echo "${{ steps.meta.outputs.tags }}" > image-tags-${{ inputs.TARGET_IMAGE }}.txt - - name: Upload image tags as artifacts - uses: actions/upload-artifact@v3 - with: - name:
image-tags-${{ inputs.TARGET_IMAGE }} - path: image-tags-${{ inputs.TARGET_IMAGE }}.txt + # - name: Upload image tags as artifacts + # uses: actions/upload-artifact@v3 + # with: + # name: image-tags-${{ inputs.TARGET_IMAGE }} + # path: image-tags-${{ inputs.TARGET_IMAGE }}.txt diff --git a/.github/workflows/_runner_ondemand_slurm.yaml b/.github/workflows/_runner_ondemand_slurm.yaml index 507a67139..1c676310a 100644 --- a/.github/workflows/_runner_ondemand_slurm.yaml +++ b/.github/workflows/_runner_ondemand_slurm.yaml @@ -19,7 +19,7 @@ on: jobs: - launch: + launch-slurm-runner: runs-on: ubuntu-latest steps: - name: Print environment variables diff --git a/.github/workflows/_summary.yaml b/.github/workflows/_summary.yaml deleted file mode 100644 index d0f453d0c..000000000 --- a/.github/workflows/_summary.yaml +++ /dev/null @@ -1,18 +0,0 @@ -name: ~create summary for an entire workflow run - -on: - workflow_call: - -jobs: - action: - runs-on: ubuntu-22.04 - steps: - - name: Download artifacts - uses: actions/download-artifact@v3 - - - name: Write output to step summary - shell: bash -x -e {0} - run: | - find -name "sitrep.json" | while read -s f; do - cat "$f" | jq -r '.summary' | tee -a $GITHUB_STEP_SUMMARY - done diff --git a/.github/workflows/_test_distribution.yaml b/.github/workflows/_test_distribution.yaml index 014584050..577225017 100644 --- a/.github/workflows/_test_distribution.yaml +++ b/.github/workflows/_test_distribution.yaml @@ -19,7 +19,7 @@ jobs: run: | git config --global user.email "jax@nvidia.com" git config --global user.name "JAX-Toolbox CI" - + - name: Check out the repository under ${GITHUB_WORKSPACE} uses: actions/checkout@v3 diff --git a/.github/workflows/_test_jax.yaml b/.github/workflows/_test_jax.yaml index c7e26eec1..02f4de9a0 100644 --- a/.github/workflows/_test_jax.yaml +++ b/.github/workflows/_test_jax.yaml @@ -28,7 +28,7 @@ jobs: TIME: "01:00:00" secrets: inherit - unit-test: + jax-unit-test: strategy: fail-fast: false matrix: diff 
--git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index 547f9567f..f35dee0d2 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -1,4 +1,4 @@ -name: ~test Pax, MGMN +name: ~test Pax, multi-node on: workflow_call: @@ -48,7 +48,6 @@ jobs: EOF chmod 600 ~/.ssh/known_hosts echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT - - name: Labels and metadata id: meta shell: bash -x -e {0} @@ -59,14 +58,12 @@ jobs: MAX_GPUS_PER_NODE=8 NODES=1 GPUS_PER_NODE=8 - JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME} LOG_FILE=/nfs/cluster/${JOB_NAME}.log MODEL_PATH=/nfs/cluster/${JOB_NAME} for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do echo "$var=${!var}" >> $GITHUB_OUTPUT done - - name: Submit SLURM jobs over SSH id: submit shell: bash -O expand_aliases -x -e {0} @@ -101,15 +98,12 @@ jobs: ${{ inputs.EXTRA_TEST_ARGS }} EOF ) - set +x while sshx squeue -j $JOB | grep -q $JOB; do echo "SLURM Job $JOB is still running." sleep 15 done - echo "SLURM Job $JOB finished." 
- # Gather job info SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') @@ -117,9 +111,7 @@ jobs: echo "SLURM Job exit code is ${SLURM_EXITCODE}" echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" - set -x - - name: Retrieve training logs and upload to TensorBoard server shell: bash -x -e {0} run: | @@ -133,7 +125,6 @@ jobs: rsync -rtz --progress \ output/ \ ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true - - name: Write SLURM job status to file shell: bash -x -e {0} run: | @@ -149,8 +140,8 @@ jobs: with: name: ${{ steps.meta.outputs.JOB_NAME }} path: output/* - - multi-process-multi-device: + + pax-multi-node: strategy: matrix: PARALLEL_CONFIG: @@ -419,7 +410,7 @@ jobs: path: output/* metrics: - needs: [single-process-multi-device, multi-process-multi-device, single-process-evaluation] + needs: [single-process-multi-device, pax-multi-node, single-process-evaluation] runs-on: ubuntu-22.04 steps: @@ -457,7 +448,7 @@ jobs: publish-test: - needs: [single-process-multi-device, multi-process-multi-device, single-process-evaluation, metrics] + needs: [single-process-multi-device, pax-multi-node, single-process-evaluation, metrics] uses: ./.github/workflows/_publish_badge.yaml if: ( always() ) secrets: inherit diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml index 9f4f74cee..ca8715adf 100644 --- a/.github/workflows/_test_pax_rosetta.yaml +++ b/.github/workflows/_test_pax_rosetta.yaml @@ -1,4 +1,4 @@ -name: ~test Pax, MGMN +name: ~test Pax, multi-node on: workflow_call: @@ -48,7 +48,6 @@ jobs: EOF chmod 600 ~/.ssh/known_hosts echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT - - name: Labels and metadata id: meta shell: 
bash -x -e {0} @@ -59,14 +58,12 @@ jobs: MAX_GPUS_PER_NODE=8 NODES=1 GPUS_PER_NODE=8 - JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME} LOG_FILE=/nfs/cluster/${JOB_NAME}.log MODEL_PATH=/nfs/cluster/${JOB_NAME} for var in IMAGE TEST_CASE_NAME TOTAL_TASKS NODES GPUS_PER_NODE JOB_NAME LOG_FILE MODEL_PATH; do echo "$var=${!var}" >> $GITHUB_OUTPUT done - - name: Submit SLURM jobs over SSH id: submit shell: bash -O expand_aliases -x -e {0} @@ -102,15 +99,12 @@ jobs: ${{ inputs.EXTRA_TEST_ARGS }} EOF ) - set +x while sshx squeue -j $JOB | grep -q $JOB; do echo "SLURM Job $JOB is still running." sleep 15 done - echo "SLURM Job $JOB finished." - # Gather job info SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') @@ -118,9 +112,7 @@ jobs: echo "SLURM Job exit code is ${SLURM_EXITCODE}" echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" - set -x - - name: Retrieve training logs and upload to TensorBoard server shell: bash -x -e {0} run: | @@ -134,7 +126,6 @@ jobs: rsync -rtz --progress \ output/ \ ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true - - name: Write SLURM job status to file shell: bash -x -e {0} run: | @@ -150,8 +141,8 @@ jobs: with: name: ${{ steps.meta.outputs.JOB_NAME }} path: output/* - - multi-process-multi-device-te: + + rosetta-pax-multi-node-te: strategy: matrix: PARALLEL_CONFIG: @@ -288,7 +279,7 @@ jobs: name: ${{ steps.meta.outputs.JOB_NAME }} path: output/* - multi-process-multi-device: + rosetta-pax-multi-node: strategy: matrix: PARALLEL_CONFIG: @@ -423,7 +414,7 @@ jobs: path: output/* - multi-process-multi-device-dropout-te: + rosetta-pax-single-node-dropout-te: strategy: matrix: PARALLEL_CONFIG: @@ -690,7 +681,7 @@ jobs: path: output/* metrics: - needs: 
[single-process-multi-device-te, multi-process-multi-device, multi-process-multi-device-te, multi-process-multi-device-dropout-te, single-process-evaluation-te] + needs: [single-process-multi-device-te, rosetta-pax-multi-node, rosetta-pax-multi-node-te, rosetta-pax-single-node-dropout-te, single-process-evaluation-te] runs-on: ubuntu-22.04 steps: @@ -728,7 +719,7 @@ jobs: publish-test: - needs: [single-process-multi-device-te, multi-process-multi-device, multi-process-multi-device-te, multi-process-multi-device-dropout-te, single-process-evaluation-te, metrics] + needs: [single-process-multi-device-te, rosetta-pax-multi-node, rosetta-pax-multi-node-te, rosetta-pax-single-node-dropout-te, single-process-evaluation-te, metrics] uses: ./.github/workflows/_publish_badge.yaml if: ( always() ) secrets: inherit diff --git a/.github/workflows/_test_t5x.yaml b/.github/workflows/_test_t5x.yaml index 4a4f9ab21..cb9d7d6b9 100644 --- a/.github/workflows/_test_t5x.yaml +++ b/.github/workflows/_test_t5x.yaml @@ -1,4 +1,4 @@ -name: ~test T5X, MGMN +name: ~test T5X, multi-node on: workflow_call: @@ -25,7 +25,7 @@ on: jobs: - single-process-multi-device: + t5x-multi-gpu: strategy: matrix: N_GPU: [1, 2, 4, 8] @@ -144,7 +144,7 @@ jobs: name: ${{ steps.meta.outputs.JOB_NAME }} path: output/* - multi-gpu-multi-node: + t5x-multi-node: strategy: matrix: N_GPU: [1, 2, 4, 8] @@ -269,7 +269,7 @@ jobs: path: output/* metrics: - needs: [multi-gpu-multi-node, single-process-multi-device] + needs: [t5x-multi-node, t5x-multi-gpu] runs-on: ubuntu-22.04 steps: @@ -307,7 +307,7 @@ jobs: publish-test: - needs: [multi-gpu-multi-node, single-process-multi-device, metrics] + needs: [t5x-multi-node, t5x-multi-gpu, metrics] uses: ./.github/workflows/_publish_badge.yaml if: ( always() ) secrets: inherit diff --git a/.github/workflows/_test_te.yaml b/.github/workflows/_test_te.yaml index 3f7d571b9..b6001256e 100644 --- a/.github/workflows/_test_te.yaml +++ b/.github/workflows/_test_te.yaml @@ -1,21 +1,20 @@ 
name: ~test TransformerEngine on: - # Called from another workflow workflow_call: inputs: - JAX_TE_IMAGE: + TE_IMAGE: type: string - description: 'JAX-TE image build by NVIDIA/JAX-Toolbox' + description: 'JAX+TE+PAXML image' required: true - default: 'ghcr.io/nvidia/jax-te:latest' + default: 'ghcr.io/nvidia/upstream-pax:latest' outputs: UNIT_TEST_ARTIFACT_NAME: description: 'Name of the unit test artifact for downstream workflows' value: ${{ jobs.te-unit-tests.outputs.UNIT_TEST_ARTIFACT_NAME }} INTEGRATION_TEST_ARTIFACT_NAME: description: 'Name of the integration test artifact for downstream workflows' - value: ${{ jobs.single-process-multi-device.outputs.INTEGRATION_TEST_ARTIFACT_NAME }} + value: ${{ jobs.te-multi-gpu.outputs.INTEGRATION_TEST_ARTIFACT_NAME }} env: UNIT_TEST_ARTIFACT_NAME: unit-test-logs @@ -40,45 +39,26 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Pull JAX-TE image + - name: Pull TE image shell: bash -x -e {0} run: | - docker pull ${{ inputs.JAX_TE_IMAGE }} - docker tag ${{ inputs.JAX_TE_IMAGE }} jax:te + docker pull ${{ inputs.TE_IMAGE }} + docker tag ${{ inputs.TE_IMAGE }} te:local - - name: Run JAX-TE unit tests with docker - shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log jax:te bash -x /cmd.sh + - name: Run TE unit tests with docker + shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log te:local bash -x /cmd.sh run: | - # TE test reqs - TEST_REQS="pytest-reportlog" - TE_PATH=$(dirname $(python -c "import transformer_engine as te; print(*te.__path__)")) - ## WAR: BEGIN - # This installation step is temporary and should be removed and replaced with - # $ NVTE_FRAMEWORK=jax pip install -e ${TE_PATH}[test] $TEST_REQS - # when praxis no longer uses jax at github head as a requirement. 
Praxis requirements - # are defined here: https://github.com/google/praxis/blob/main/requirements.in - - # After https://github.com/google/praxis/pull/20 was merged, SKIP_HEAD_INSTALLS was introduced - # as an environment variable to allow skipping of head installs like jax/fiddle that overrode - # the jax package that we already installed in our base image. Once SKIP_HEAD_INSTALLS is - # set, the user must specify the head installs manually to ensure they are respected by pip's - # dependency resolver. This is brittle since new head installs may be missed, so they must be - # manually added to TEST_REQS below. Praxis is still installed from head as opposed to pypi - # because no wheel exists with this feature yet. - if ! pip show praxis >/dev/null 2>&1; then - TEST_REQS+=" fiddle git+https://github.com/google/praxis" - fi - SKIP_HEAD_INSTALLS=1 NVTE_FRAMEWORK=jax pip install -e ${TE_PATH}[test] $TEST_REQS - ## WAR: END - pytest --report-log=/log/report.jsonl ${TE_PATH}/tests/jax || true + pip install pytest-reportlog + pytest --report-log=/log/report.jsonl ${SRC_PATH_TE}/tests/jax - name: Upload unit test json logs + if: success() || failure() uses: actions/upload-artifact@v3 with: name: ${{ env.UNIT_TEST_ARTIFACT_NAME }} path: /log/report.jsonl - single-process-multi-device: + te-multi-gpu: strategy: matrix: N_GPU: [1, 2, 4, 8] @@ -111,7 +91,7 @@ jobs: id: meta shell: bash -x -e {0} run: | - PYXIS_IMAGE_NAME=${{ inputs.JAX_TE_IMAGE }} + PYXIS_IMAGE_NAME=${{ inputs.TE_IMAGE }} PYXIS_IMAGE_NAME=${PYXIS_IMAGE_NAME/ghcr.io\//ghcr.io#} TEST_CASE_NAME=1P${{ matrix.N_GPU }}G JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME} @@ -144,7 +124,7 @@ jobs: --container-entrypoint \ bash -e -x -c 'nvidia-smi pip install pytest pytest-reportlog cuda-python - cd \$(dirname \$(python -c "import transformer_engine as te; print(*te.__path__)"))/examples/jax/encoder + cd \${SRC_PATH_TE}/examples/jax/encoder pip install -r requirements.txt pytest --report-log=/output/$(basename ${{ 
steps.meta.outputs.PYTEST_LOG_FILE }}) \ test_single_gpu_encoder.py \ diff --git a/.github/workflows/_upload_badge.yaml b/.github/workflows/_upload_badge.yaml deleted file mode 100644 index 936bdcae7..000000000 --- a/.github/workflows/_upload_badge.yaml +++ /dev/null @@ -1,77 +0,0 @@ -name: ~upload shields.io endpoint json files as a GitHub Gist - -on: - workflow_call: - inputs: - BADGE_FILES: - type: string - description: 'Name/bash regex for shields.io endpoint JSON files' - default: '*badge*.json' - required: false - outputs: - GIST_ID: - description: 'Id of the created Gist' - value: ${{ jobs.action.outputs.GIST_ID }} - -jobs: - action: - runs-on: ubuntu-22.04 - outputs: - GIST_ID: ${{ steps.extract-id.outputs.GIST_ID }} - steps: - - name: Download artifacts specified by input - uses: actions/download-artifact@v3 - - - name: Collect all badge files to temporary folder - id: collect - shell: bash -x -e {0} - run: | - workdir=$(mktemp -d) - find -name "${{ inputs.BADGE_FILES }}" | while read -s f; do - cp "$f" $workdir - done - echo "WORKDIR=$workdir" >> $GITHUB_OUTPUT - - - name: Upload badge files to gist - id: upload - uses: actions/github-script@v6 - with: - github-token: ${{ secrets.NVJAX_GIST_TOKEN }} - script: | - const currentDateTime = new Date().toISOString(); - const gistDescription = - `Badge endpoint files from Workflow: ${{ github.workflow }}, ` + - `Run ID: ${{ github.run_id }}, ` + - `Repository: ${{ github.repository }}, ` + - `Event: ${{ github.event_name }}, ` + - `Created: ${currentDateTime}`; - - const fs = require('fs').promises; - const workdir = '${{ steps.collect.outputs.WORKDIR }}' - const files = await fs.readdir(workdir); - - gist = await github.rest.gists.create({ - description: gistDescription, - public: false, - files: Object.fromEntries( - await Promise.all( - files.map( - async filename => { - const content = await fs.readFile(`${workdir}/${filename}`, 'utf8'); - return [filename, { content }]; - } - ) - ) - ) - }); - - 
console.log(gist) - - return gist.data.id; - - - name: Return Gist ID - id: extract-id - shell: bash -x -e {0} - run: | - GIST_ID="${{ steps.upload.outputs.result }}" - echo "GIST_ID=${GIST_ID//\"/}" >> $GITHUB_OUTPUT diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index d128a17de..d95bc0592 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -55,259 +55,35 @@ permissions: jobs: - metadata: - runs-on: ubuntu-22.04 - outputs: - BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} - REPO_JAX: ${{ steps.parse-inputs.outputs.REPO_JAX }} - REF_JAX: ${{ steps.parse-inputs.outputs.REF_JAX }} - REPO_XLA: ${{ steps.parse-inputs.outputs.REPO_XLA }} - REF_XLA: ${{ steps.parse-inputs.outputs.REF_XLA }} - REPO_TE: ${{ steps.parse-inputs.outputs.REPO_TE }} - REF_TE: ${{ steps.parse-inputs.outputs.REF_TE }} - REPO_T5X: ${{ steps.parse-inputs.outputs.REPO_T5X }} - REF_T5X: ${{ steps.parse-inputs.outputs.REF_T5X }} - REPO_PAXML: ${{ steps.parse-inputs.outputs.REPO_PAXML }} - REF_PAXML: ${{ steps.parse-inputs.outputs.REF_PAXML }} - REPO_PRAXIS: ${{ steps.parse-inputs.outputs.REPO_PRAXIS }} - REF_PRAXIS: ${{ steps.parse-inputs.outputs.REF_PRAXIS }} - steps: - - name: Set build date - id: date - shell: bash -x -e {0} - run: | - BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - - - name: Parse inputs - id: parse-inputs - shell: bash -x -e {0} - run: | - # split input in the format of repo#ref into repo and ref parts - parse_git_src() { - PACKAGE=$1 - INPUT="$2" - DEFAULT="$3" - SRC="${INPUT:-${DEFAULT}}" - echo "REPO_${PACKAGE}=$(echo "${SRC}" | cut -f1 -d#)" >> $GITHUB_OUTPUT - echo "REF_${PACKAGE}=$(echo "${SRC}" | cut -f2 -d#)" >> $GITHUB_OUTPUT - } - - # default values are for `pull_request`` event types - parse_git_src JAX "${{ inputs.SRC_JAX }}" "https://github.com/google/jax.git#main" - parse_git_src XLA "${{ inputs.SRC_XLA }}" "https://github.com/openxla/xla.git#main" - # TODO: This is a 
temporary pinning of TE as the API in TE no longer matches the TE patch - # This should be reverted to main ASAP - parse_git_src TE "${{ inputs.SRC_TE }}" "https://github.com/NVIDIA/TransformerEngine.git#7976bd003fcf084dd068069b92a9a79b1743316a" - parse_git_src T5X "${{ inputs.SRC_T5X }}" "https://github.com/google-research/t5x.git#main" - parse_git_src PAXML "${{ inputs.SRC_PAXML }}" "https://github.com/google/paxml.git#main" - parse_git_src PRAXIS "${{ inputs.SRC_PRAXIS }}" "https://github.com/google/praxis.git#main" - - build-base: - needs: metadata - uses: ./.github/workflows/_build_base.yaml - with: - BASE_IMAGE: ${{ inputs.CUDA_IMAGE || 'latest' }} - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - secrets: inherit - - build-jax: - needs: [metadata, build-base] - uses: ./.github/workflows/_build_jax.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAGS }} - REPO_JAX: ${{ needs.metadata.outputs.REPO_JAX }} - REF_JAX: ${{ needs.metadata.outputs.REF_JAX }} - REPO_XLA: ${{ needs.metadata.outputs.REPO_XLA }} - REF_XLA: ${{ needs.metadata.outputs.REF_XLA }} - secrets: inherit - - build-te: - needs: [metadata, build-jax] - uses: ./.github/workflows/_build_te.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} - REPO_TE: ${{ needs.metadata.outputs.REPO_TE }} - REF_TE: ${{ needs.metadata.outputs.REF_TE }} - secrets: inherit - - build-t5x: - needs: [metadata, build-jax] - uses: ./.github/workflows/_build_t5x.yaml + amd64: + uses: ./.github/workflows/_ci.yaml with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} - REPO_T5X: ${{ needs.metadata.outputs.REPO_T5X }} - REF_T5X: ${{ needs.metadata.outputs.REF_T5X }} + ARCHITECTURE: amd64 + CUDA_IMAGE: ${{ inputs.CUDA_IMAGE || 'latest' }} + SRC_JAX: ${{ inputs.SRC_JAX || 'https://github.com/google/jax.git#main' }} + 
SRC_XLA: ${{ inputs.SRC_XLA || 'https://github.com/openxla/xla.git#main'}} + SRC_TE: ${{ inputs.SRC_TE || 'https://github.com/NVIDIA/TransformerEngine.git#main'}} + SRC_T5X: ${{ inputs.SRC_T5X || 'https://github.com/google-research/t5x.git#main'}} + SRC_PAXML: ${{ inputs.SRC_PAXML || 'https://github.com/google/paxml.git#main'}} + SRC_PRAXIS: ${{ inputs.SRC_PRAXIS || 'https://github.com/google/praxis.git#main'}} secrets: inherit - - build-pax: - needs: [metadata, build-jax] - uses: ./.github/workflows/_build_pax.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} - REPO_PAXML: ${{ needs.metadata.outputs.REPO_PAXML }} - REF_PAXML: ${{ needs.metadata.outputs.REF_PAXML }} - REPO_PRAXIS: ${{ needs.metadata.outputs.REPO_PRAXIS }} - REF_PRAXIS: ${{ needs.metadata.outputs.REF_PRAXIS }} - secrets: inherit - - build-rosetta-t5x: - uses: ./.github/workflows/_build_rosetta.yaml - needs: [metadata, build-t5x] - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }} - BASE_LIBRARY: t5x - PLATFORMS: '["amd64"]' - secrets: inherit - - build-rosetta-pax: - uses: ./.github/workflows/_build_rosetta.yaml - needs: [metadata, build-pax] - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} - BASE_LIBRARY: pax - secrets: inherit - - build-summary: - needs: [build-base, build-jax, build-te, build-t5x, build-pax, build-rosetta-t5x, build-rosetta-pax] - # needs: [build-base, build-jax, build-te, build-t5x, build-pax, build-pax-aarch64, build-rosetta-t5x, build-rosetta-pax] - if: always() - runs-on: ubuntu-22.04 - steps: - - name: Generate job summary for container build - shell: bash -x -e {0} - run: | - cat > $GITHUB_STEP_SUMMARY << EOF - # Images created - - | Image | Link | - | ------------ | -------------------------------------------------- | - | Base | ${{ 
needs.build-base.outputs.DOCKER_TAGS }} | - | JAX | ${{ needs.build-jax.outputs.DOCKER_TAGS }} | - | JAX-TE | ${{ needs.build-te.outputs.DOCKER_TAGS }} | - | T5X | ${{ needs.build-t5x.outputs.DOCKER_TAGS }} | - | PAX | ${{ needs.build-pax.outputs.DOCKER_TAGS }} | - | ROSETTA(t5x) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | - | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | - EOF - - retrofit-containers: - needs: [build-base, build-jax, build-te, build-t5x, build-pax, build-rosetta-t5x, build-rosetta-pax] - if: always() - runs-on: ubuntu-22.04 - env: - DOCKER_REPO: 'ghcr.io/nvidia/jax-toolbox-retrofit' - steps: - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - ## Requires skopeo >= v1.6.0, but Actions only has v1.4.0 - # - name: Create Docker v2s2 multi-arch manifest list - # id: multi-arch - # shell: bash -x -e {0} - # run: | - # for tag in $(echo "${{ steps.meta.outputs.tags }}"); do - # skopeo copy --multi-arch all --format v2s2 docker://${{ inputs.SOURCE_IMAGE }} docker://$tag - # done - - - name: Create Docker v2s2 single-arch manifests - id: single-arch - shell: bash -x -e {0} - run: | - - for source in \ - ${{ needs.build-base.outputs.DOCKER_TAGS }} \ - ${{ needs.build-jax.outputs.DOCKER_TAGS }} \ - ${{ needs.build-te.outputs.DOCKER_TAGS }} \ - ${{ needs.build-t5x.outputs.DOCKER_TAGS }} \ - ${{ needs.build-pax.outputs.DOCKER_TAGS }} \ - ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} \ - ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} \ - ; do - source_repo=$(echo ${source} | cut -d: -f1) - media_type=$(docker manifest inspect ${source} | jq -r '.mediaType') - if [[ ${media_type} != "application/vnd.oci.image.index.v1+json" ]]; then - echo "Image ${source} is already in Docker format v2s2" - dest=${DOCKER_REPO}:$(echo ${source} | cut -d: -f2) - skopeo copy --format v2s2 
docker://${source} docker://${dest} - echo "${dest}" >> $GITHUB_STEP_SUMMARY - else - manifests=$( - docker manifest inspect ${source} |\ - jq -r '.manifests[] | select(.platform.os != "unknown") | .digest' |\ - xargs -I{} echo ${source_repo}@{} |\ - tr '\n' ' ' - ) - - ## registry/org/repo:tag -> repo-tag - # dest_tag=$(echo ${source} | cut -d: -f1 | cut -d/ -f3)-$(echo ${source} | cut -d: -f2) - ## registry/org/repo:tag -> tag - dest_tag=$(echo ${source} | cut -d: -f2) - - for manifest in ${manifests}; do - os=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.os') - arch=$(docker manifest inspect -v $manifest | jq -r '.Descriptor.platform.architecture') - # single_arch_tag="ghcr.io/nvidia/jax-toolbox-retrofit:${{ github.run_id }}-${dest_tag}-${os}-${arch}" - single_arch_tag="${DOCKER_REPO}:${dest_tag}-${os}-${arch}" - skopeo copy --format v2s2 docker://$manifest docker://${single_arch_tag} - echo "${single_arch_tag}" >> $GITHUB_STEP_SUMMARY - done - fi - done - - test-distribution: - needs: metadata - uses: ./.github/workflows/_test_distribution.yaml - secrets: inherit - - test-jax: - needs: build-jax - uses: ./.github/workflows/_test_jax.yaml - with: - JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} - secrets: inherit - - test-te: - needs: build-te - uses: ./.github/workflows/_test_te.yaml - with: - JAX_TE_IMAGE: ${{ needs.build-te.outputs.DOCKER_TAGS }} - secrets: inherit - test-t5x: - needs: build-t5x - uses: ./.github/workflows/_test_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }} - secrets: inherit - - test-pax: - needs: build-pax - uses: ./.github/workflows/_test_pax.yaml + + arm64: + uses: ./.github/workflows/_ci.yaml with: - PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} + ARCHITECTURE: arm64 + CUDA_IMAGE: ${{ inputs.CUDA_IMAGE || 'latest' }} + SRC_JAX: ${{ inputs.SRC_JAX || 'https://github.com/google/jax.git#main' }} + SRC_XLA: ${{ inputs.SRC_XLA || 'https://github.com/openxla/xla.git#main'}} + 
SRC_TE: ${{ inputs.SRC_TE || 'https://github.com/NVIDIA/TransformerEngine.git#main'}} + SRC_T5X: ${{ inputs.SRC_T5X || 'https://github.com/google-research/t5x.git#main'}} + SRC_PAXML: ${{ inputs.SRC_PAXML || 'https://github.com/google/paxml.git#main'}} + SRC_PRAXIS: ${{ inputs.SRC_PRAXIS || 'https://github.com/google/praxis.git#main'}} secrets: inherit - test-vit: - needs: build-rosetta-t5x - uses: ./.github/workflows/_test_vit.yaml - with: - ROSETTA_T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} - secrets: inherit - finalize: + needs: [amd64, arm64] if: always() - # TODO: use dynamic matrix to make dependencies self-updating - needs: [build-summary, test-distribution, test-jax, test-te, test-t5x, test-pax] uses: ./.github/workflows/_finalize.yaml with: PUBLISH_BADGE: false diff --git a/.github/workflows/cuda-121-jax-pin.yaml b/.github/workflows/cuda-121-jax-pin.yaml deleted file mode 100644 index 2e7e3c382..000000000 --- a/.github/workflows/cuda-121-jax-pin.yaml +++ /dev/null @@ -1,191 +0,0 @@ -name: Nightly Containers on CUDA 12.1 (JAX pinned) -run-name: Nightly Containers on CUDA 12.1 (JAX pinned) (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) - -on: - schedule: - - cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC - workflow_dispatch: - inputs: - JAX_BASE_IMAGE: - type: string - description: 'Base Multiarch JAX Image' - default: 'ghcr.io/nvidia/jax-toolbox-internal:6473019396-jax-multiarch' - required: true - REPO_T5X: - type: string - description: URL of T5X repository to check out - required: false - default: "https://github.com/nvjax-svc-0/t5x.git" - REF_T5X: - type: string - description: Git commit, tag, or branch for T5X - required: false - default: unpin-tfds-gpu-extra - REPO_TE: - type: string - description: URL of TE repository to check out - required: false - default: "https://github.com/NVIDIA/TransformerEngine.git" - REF_TE: - type: string - description: 
Git commit, tag, or branch for TE - required: false - default: v0.13 - PUBLISH: - type: boolean - description: Publish dated images and update the 'latest' tag? - default: false - required: false - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container - -env: - DEFAULT_JAX_BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6473019396-jax-multiarch - DEFAULT_REPO_T5X: https://github.com/nvjax-svc-0/t5x.git - DEFAULT_REF_T5X: unpin-tfds-gpu-extra - DEFAULT_REPO_TE: https://github.com/NVIDIA/TransformerEngine.git - DEFAULT_REF_TE: v0.13 - -jobs: - - metadata: - runs-on: ubuntu-22.04 - outputs: - BUILD_DATE: ${{ steps.meta.outputs.BUILD_DATE }} - JAX_BASE_IMAGE: ${{ steps.meta.outputs.JAX_BASE_IMAGE}} - REPO_T5X: ${{ steps.meta.outputs.REPO_T5X }} - REF_T5X: ${{ steps.meta.outputs.REF_T5X }} - REPO_TE: ${{ steps.meta.outputs.REPO_TE }} - REF_TE: ${{ steps.meta.outputs.REF_TE }} - steps: - - name: Set build date and base image - id: meta - shell: bash -x -e {0} - run: | - BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - if [[ -z "${{ inputs.JAX_BASE_IMAGE }}" ]]; then - echo "JAX_BASE_IMAGE=${{ env.DEFAULT_JAX_BASE_IMAGE }}" >> $GITHUB_OUTPUT - else - echo "JAX_BASE_IMAGE=${{ inputs.JAX_BASE_IMAGE }}" >> $GITHUB_OUTPUT - fi - if [[ -z "${{ inputs.REPO_T5X }}" ]]; then - echo "REPO_T5X=${{ env.DEFAULT_REPO_T5X }}" >> $GITHUB_OUTPUT - else - echo "REPO_T5X=${{ inputs.REPO_T5X }}" >> $GITHUB_OUTPUT - fi - if [[ -z "${{ inputs.REF_T5X }}" ]]; then - echo "REF_T5X=${{ env.DEFAULT_REF_T5X }}" >> $GITHUB_OUTPUT - else - echo "REF_T5X=${{ inputs.REF_T5X }}" >> $GITHUB_OUTPUT - fi - if [[ -z "${{ inputs.REPO_TE }}" ]]; then - echo "REPO_TE=${{ env.DEFAULT_REPO_TE }}" >> $GITHUB_OUTPUT - else - echo 
"REPO_TE=${{ inputs.REPO_TE }}" >> $GITHUB_OUTPUT - fi - if [[ -z "${{ inputs.REF_TE }}" ]]; then - echo "REF_TE=${{ env.DEFAULT_REF_TE }}" >> $GITHUB_OUTPUT - else - echo "REF_TE=${{ inputs.REF_TE }}" >> $GITHUB_OUTPUT - fi - - build-pax: - needs: [metadata] - uses: ./.github/workflows/_build_pax.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.metadata.outputs.JAX_BASE_IMAGE }} - secrets: inherit - - build-rosetta-pax: - uses: ./.github/workflows/_build_rosetta.yaml - needs: [metadata, build-pax] - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} - BASE_LIBRARY: pax - PLATFORMS: '["amd64"]' - secrets: inherit - - build-t5x: - needs: [metadata] - uses: ./.github/workflows/_build_t5x.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.metadata.outputs.JAX_BASE_IMAGE }} - REPO_T5X: ${{ needs.metadata.outputs.REPO_T5X }} - REF_T5X: ${{ needs.metadata.outputs.REF_T5X }} - REPO_TE: ${{ needs.metadata.outputs.REPO_TE }} - REF_TE: ${{ needs.metadata.outputs.REF_TE }} - secrets: inherit - - build-rosetta-t5x: - uses: ./.github/workflows/_build_rosetta.yaml - needs: [metadata, build-t5x] - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }} - BASE_LIBRARY: t5x - PLATFORMS: '["amd64"]' - secrets: inherit - - build-summary: - needs: [metadata, build-t5x, build-rosetta-t5x, build-pax, build-rosetta-pax] - if: always() - runs-on: ubuntu-22.04 - steps: - - name: Generate job summary for container build - shell: bash -x -e {0} - run: | - cat > $GITHUB_STEP_SUMMARY << EOF - # Images created - - | Image | Link | - | ------------ | -------------------------------------------------- | - | JAX (input) | ${{ needs.metadata.outputs.JAX_BASE_IMAGE }} | - | T5X | ${{ needs.build-t5x.outputs.DOCKER_TAGS }} | - | ROSETTA(T5X) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS 
}} | - | PAX | ${{ needs.build-pax.outputs.DOCKER_TAGS }} | - | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | - EOF - - test-jax: - needs: metadata - uses: ./.github/workflows/_test_jax.yaml - with: - JAX_IMAGE: ${{ needs.metadata.outputs.JAX_BASE_IMAGE }} - secrets: inherit - - test-pax: - needs: build-pax - uses: ./.github/workflows/_test_pax.yaml - with: - PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} - secrets: inherit - - test-t5x: - needs: build-t5x - uses: ./.github/workflows/_test_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }} - secrets: inherit - - # TODO(terry): This is missing the rosetta tests which can only be added - # After a fix for the TB log collision is pushed. - - finalize: - if: always() - # TODO: use dynamic matrix to make dependencies self-updating - needs: [build-summary, test-jax, test-pax] - uses: ./.github/workflows/_finalize.yaml - with: - PUBLISH_BADGE: false - secrets: inherit diff --git a/.github/workflows/cuda-122-jax-pin.yaml b/.github/workflows/cuda-122-jax-pin.yaml deleted file mode 100644 index cb12d1037..000000000 --- a/.github/workflows/cuda-122-jax-pin.yaml +++ /dev/null @@ -1,190 +0,0 @@ -name: Nightly Containers on CUDA 12.2 (JAX pinned) -run-name: Nightly Containers on CUDA 12.2 (JAX pinned) (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) - -on: - schedule: - - cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC - workflow_dispatch: - inputs: - JAX_BASE_IMAGE: - type: string - description: 'Base Multiarch JAX Image' - default: 'ghcr.io/nvidia/jax-toolbox-internal:6475553977-jax-multiarch' - required: true - REPO_T5X: - type: string - description: URL of T5X repository to check out - required: false - default: "https://github.com/nvjax-svc-0/t5x.git" - REF_T5X: - type: string - description: Git commit, tag, or branch for T5X - required: false - default: unpin-tfds-gpu-extra - REPO_TE: - type: 
string - description: URL of TE repository to check out - required: false - default: "https://github.com/NVIDIA/TransformerEngine.git" - REF_TE: - type: string - description: Git commit, tag, or branch for TE - required: false - default: v0.13 - PUBLISH: - type: boolean - description: Publish dated images and update the 'latest' tag? - default: false - required: false - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container - -env: - DEFAULT_JAX_BASE_IMAGE: ghcr.io/nvidia/jax-toolbox-internal:6475553977-jax-multiarch - DEFAULT_REPO_T5X: https://github.com/nvjax-svc-0/t5x.git - DEFAULT_REF_T5X: unpin-tfds-gpu-extra - DEFAULT_REPO_TE: https://github.com/NVIDIA/TransformerEngine.git - DEFAULT_REF_TE: v0.13 - -jobs: - - metadata: - runs-on: ubuntu-22.04 - outputs: - BUILD_DATE: ${{ steps.meta.outputs.BUILD_DATE }} - JAX_BASE_IMAGE: ${{ steps.meta.outputs.JAX_BASE_IMAGE}} - REPO_T5X: ${{ steps.meta.outputs.REPO_T5X }} - REF_T5X: ${{ steps.meta.outputs.REF_T5X }} - REPO_TE: ${{ steps.meta.outputs.REPO_TE }} - REF_TE: ${{ steps.meta.outputs.REF_TE }} - steps: - - name: Set build date and base image - id: meta - shell: bash -x -e {0} - run: | - BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - if [[ -z "${{ inputs.JAX_BASE_IMAGE }}" ]]; then - echo "JAX_BASE_IMAGE=${{ env.DEFAULT_JAX_BASE_IMAGE }}" >> $GITHUB_OUTPUT - else - echo "JAX_BASE_IMAGE=${{ inputs.JAX_BASE_IMAGE }}" >> $GITHUB_OUTPUT - fi - if [[ -z "${{ inputs.REPO_T5X }}" ]]; then - echo "REPO_T5X=${{ env.DEFAULT_REPO_T5X }}" >> $GITHUB_OUTPUT - else - echo "REPO_T5X=${{ inputs.REPO_T5X }}" >> $GITHUB_OUTPUT - fi - if [[ -z "${{ inputs.REF_T5X }}" ]]; then - echo "REF_T5X=${{ env.DEFAULT_REF_T5X }}" >> $GITHUB_OUTPUT - else - echo 
"REF_T5X=${{ inputs.REF_T5X }}" >> $GITHUB_OUTPUT - fi - if [[ -z "${{ inputs.REPO_TE }}" ]]; then - echo "REPO_TE=${{ env.DEFAULT_REPO_TE }}" >> $GITHUB_OUTPUT - else - echo "REPO_TE=${{ inputs.REPO_TE }}" >> $GITHUB_OUTPUT - fi - if [[ -z "${{ inputs.REF_TE }}" ]]; then - echo "REF_TE=${{ env.DEFAULT_REF_TE }}" >> $GITHUB_OUTPUT - else - echo "REF_TE=${{ inputs.REF_TE }}" >> $GITHUB_OUTPUT - fi - - build-pax: - needs: [metadata] - uses: ./.github/workflows/_build_pax.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.metadata.outputs.JAX_BASE_IMAGE }} - secrets: inherit - - build-rosetta-pax: - uses: ./.github/workflows/_build_rosetta.yaml - needs: [metadata, build-pax] - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} - BASE_LIBRARY: pax - secrets: inherit - - build-t5x: - needs: [metadata] - uses: ./.github/workflows/_build_t5x.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.metadata.outputs.JAX_BASE_IMAGE }} - REPO_T5X: ${{ needs.metadata.outputs.REPO_T5X }} - REF_T5X: ${{ needs.metadata.outputs.REF_T5X }} - REPO_TE: ${{ needs.metadata.outputs.REPO_TE }} - REF_TE: ${{ needs.metadata.outputs.REF_TE }} - secrets: inherit - - build-rosetta-t5x: - uses: ./.github/workflows/_build_rosetta.yaml - needs: [metadata, build-t5x] - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }} - BASE_LIBRARY: t5x - PLATFORMS: '["amd64"]' - secrets: inherit - - build-summary: - needs: [metadata, build-t5x, build-rosetta-t5x, build-pax, build-rosetta-pax] - if: always() - runs-on: ubuntu-22.04 - steps: - - name: Generate job summary for container build - shell: bash -x -e {0} - run: | - cat > $GITHUB_STEP_SUMMARY << EOF - # Images created - - | Image | Link | - | ------------ | -------------------------------------------------- | - | JAX (input) | ${{ 
needs.metadata.outputs.JAX_BASE_IMAGE }} | - | T5X | ${{ needs.build-t5x.outputs.DOCKER_TAGS }} | - | ROSETTA(T5X) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | - | PAX | ${{ needs.build-pax.outputs.DOCKER_TAGS }} | - | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | - EOF - - test-jax: - needs: metadata - uses: ./.github/workflows/_test_jax.yaml - with: - JAX_IMAGE: ${{ needs.metadata.outputs.JAX_BASE_IMAGE }} - secrets: inherit - - test-pax: - needs: build-pax - uses: ./.github/workflows/_test_pax.yaml - with: - PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} - secrets: inherit - - test-t5x: - needs: build-t5x - uses: ./.github/workflows/_test_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }} - secrets: inherit - - # TODO(terry): This is missing the rosetta tests which can only be added - # After a fix for the TB log collision is pushed. - - finalize: - if: always() - # TODO: use dynamic matrix to make dependencies self-updating - needs: [build-summary, test-jax, test-pax] - uses: ./.github/workflows/_finalize.yaml - with: - PUBLISH_BADGE: false - secrets: inherit diff --git a/.github/workflows/nightly-jax-build.yaml b/.github/workflows/nightly-jax-build.yaml index 5e513a6f1..38c738436 100644 --- a/.github/workflows/nightly-jax-build.yaml +++ b/.github/workflows/nightly-jax-build.yaml @@ -38,20 +38,43 @@ jobs: run: | echo "PUBLISH=${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) }}" >> $GITHUB_OUTPUT - build: + amd64: needs: metadata uses: ./.github/workflows/_build_jax.yaml with: + ARCHITECTURE: amd64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} secrets: inherit - publish: - if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build] - uses: ./.github/workflows/_publish_container.yaml + arm64: + needs: metadata + uses: ./.github/workflows/_build_jax.yaml + with: + ARCHITECTURE: 
arm64 + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} secrets: inherit + + publish-mealkit: + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml + with: + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} + ${{ needs.arm64.outputs.DOCKER_TAG_MEALKIT }} + TARGET_IMAGE: jax + TARGET_TAGS: | + type=raw,value=mealkit,priority=500 + type=raw,value=mealkit-${{ needs.metadata.outputs.BUILD_DATE }},priority=500 + + publish-final: + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} + ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }} TARGET_IMAGE: jax TARGET_TAGS: | type=raw,value=latest,priority=1000 @@ -59,7 +82,7 @@ jobs: finalize: if: always() - needs: [metadata, build] + needs: [metadata, amd64, arm64] uses: ./.github/workflows/_finalize.yaml with: PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }} diff --git a/.github/workflows/nightly-pax-build.yaml b/.github/workflows/nightly-pax-build.yaml index 64728265a..845737462 100644 --- a/.github/workflows/nightly-pax-build.yaml +++ b/.github/workflows/nightly-pax-build.yaml @@ -22,11 +22,30 @@ permissions: jobs: metadata: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' runs-on: ubuntu-22.04 outputs: + PUBLISH: ${{ steps.if-publish.outputs.PUBLISH }} BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} steps: + - name: Check if the triggering workflow failed + id: if-upstream-failed + shell: bash -x -e {0} + run: | + echo "UPSTREAM_FAILED=${{ github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT + + - name: Cancel workflow if upstream workflow did not success + if: ${{ 
steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} + uses: styfle/cancel-workflow-action@0.12.0 + + - name: Determine if the resulting container should be 'published' + id: if-publish + shell: bash -x -e {0} + run: + # A container should be published if: + # 1) the workflow is triggered by workflow_dispatch and the PUBLISH input is true, or + # 2) the workflow is triggered by workflow_run (i.e., a nightly build) + echo "PUBLISH=${{ github.event_name == 'workflow_run' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) }}" >> $GITHUB_OUTPUT + - name: Set build date id: date shell: bash -x -e {0} @@ -34,28 +53,52 @@ jobs: BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - build: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + amd64: needs: metadata uses: ./.github/workflows/_build_pax.yaml with: + ARCHITECTURE: amd64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} secrets: inherit - publish: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build] - uses: ./.github/workflows/_publish_container.yaml + arm64: + needs: metadata + uses: ./.github/workflows/_build_pax.yaml + with: + ARCHITECTURE: arm64 + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} secrets: inherit + + publish-mealkit: + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml + with: + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} + ${{ needs.arm64.outputs.DOCKER_TAG_MEALKIT }} + TARGET_IMAGE: upstream-pax + TARGET_TAGS: | + type=raw,value=mealkit,priority=500 + type=raw,value=mealkit-${{ needs.metadata.outputs.BUILD_DATE }},priority=500 + + publish-final: + needs: [metadata, amd64, arm64] + if: 
needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} + ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }} TARGET_IMAGE: upstream-pax TARGET_TAGS: | type=raw,value=latest,priority=1000 type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - on-upstream-failure: - runs-on: ubuntu-latest - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch' - steps: - - run: echo 'Upstream workflow failed, aborting run' && exit 1 + finalize: + if: always() + needs: [metadata, amd64, arm64] + uses: ./.github/workflows/_finalize.yaml + with: + PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }} + secrets: inherit diff --git a/.github/workflows/nightly-rosetta-pax-build.yaml b/.github/workflows/nightly-rosetta-pax-build.yaml index c12cfd8f0..82c02e2ff 100644 --- a/.github/workflows/nightly-rosetta-pax-build.yaml +++ b/.github/workflows/nightly-rosetta-pax-build.yaml @@ -11,7 +11,7 @@ on: BASE_IMAGE: type: string description: 'PAX image built by NVIDIA/JAX-Toolbox' - default: 'ghcr.io/nvidia/upstream-pax:latest' + default: '' required: true PUBLISH: type: boolean @@ -31,69 +31,111 @@ permissions: jobs: metadata: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' runs-on: ubuntu-22.04 outputs: - BUILD_DATE: ${{ steps.meta-vars.outputs.BUILD_DATE }} - BASE_LIBRARY: ${{ steps.meta-vars.outputs.BASE_LIBRARY }} - BASE_IMAGE: ${{ steps.meta-vars.outputs.BASE_IMAGE }} - PUBLISH: ${{ steps.meta-vars.outputs.PUBLISH }} + BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} + BASE_LIBRARY: ${{ steps.base-metadata.outputs.BASE_LIBRARY }} + BASE_IMAGE_AMD64: ${{ steps.base-metadata.outputs.BASE_IMAGE_AMD64 }} + BASE_IMAGE_ARM64: ${{ 
steps.base-metadata.outputs.BASE_IMAGE_ARM64 }} + PUBLISH: ${{ steps.if-publish.outputs.PUBLISH }} steps: - - name: Set build metadata - id: meta-vars + - name: Check if the triggering workflow failed + id: if-upstream-failed + shell: bash -x -e {0} + run: | + echo "UPSTREAM_FAILED=${{ github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT + + - name: Cancel workflow if upstream workflow did not success + if: steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' + uses: styfle/cancel-workflow-action@0.12.0 + + - name: Determine if the resulting container should be 'published' + id: if-publish + shell: bash -x -e {0} + run: + # A container should be published if: + # 1) the workflow is triggered by workflow_dispatch and the PUBLISH input is true, or + # 2) the workflow is triggered by workflow_run (i.e., a nightly build) + echo "PUBLISH=${{ github.event_name == 'workflow_run' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) }}" >> $GITHUB_OUTPUT + + - name: Set build date + id: date shell: bash -x -e {0} run: | BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') + echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT + + - name: Set base library and image + id: base-metadata + shell: bash -x -e {0} + run: | if [[ -z "${{ inputs.BASE_IMAGE }}" ]]; then - BASE_IMAGE=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:latest + BASE_IMAGE_AMD64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit + BASE_IMAGE_ARM64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit else - BASE_IMAGE=${{ inputs.BASE_IMAGE }} + BASE_IMAGE_AMD64=${{ inputs.BASE_IMAGE }}-amd64-mealkit + BASE_IMAGE_ARM64=${{ inputs.BASE_IMAGE }}-arm64-mealkit fi - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT - echo "BASE_IMAGE=${BASE_IMAGE}" >> $GITHUB_OUTPUT - echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT + echo "BASE_IMAGE_AMD64=${BASE_IMAGE_AMD64}" >> 
$GITHUB_OUTPUT + echo "BASE_IMAGE_ARM64=${BASE_IMAGE_ARM64}" >> $GITHUB_OUTPUT - build: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + amd64: needs: metadata uses: ./.github/workflows/_build_rosetta.yaml with: + ARCHITECTURE: amd64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} BASE_LIBRARY: ${{ needs.metadata.outputs.BASE_LIBRARY }} - BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE }} + BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE_AMD64 }} secrets: inherit - publish-build: - needs: [metadata, build] - uses: ./.github/workflows/_publish_badge.yaml - if: ( always() ) + arm64: + needs: metadata + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: arm64 + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + BASE_LIBRARY: ${{ needs.metadata.outputs.BASE_LIBRARY }} + BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE_ARM64 }} secrets: inherit + + publish-mealkit: + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml with: - ENDPOINT_FILENAME: 'rosetta-pax-build-status.json' - PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }} - SCRIPT: | - if [[ ${{ needs.build.result }} == "success" ]]; then - BADGE_COLOR=brightgreen - MSG=passing - else - BADGE_COLOR=red - MSG=failing - fi - echo "LABEL='nightly'" >> $GITHUB_OUTPUT - echo "MESSAGE='${MSG}'" >> $GITHUB_OUTPUT - echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} + ${{ needs.arm64.outputs.DOCKER_TAG_MEALKIT }} + TARGET_IMAGE: upstream-pax + TARGET_TAGS: | + type=raw,value=mealkit,priority=500 + type=raw,value=mealkit-${{ needs.metadata.outputs.BUILD_DATE }},priority=500 + + publish-final: + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' + uses: 
./.github/workflows/_publish_container.yaml + with: + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} + ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }} + TARGET_IMAGE: upstream-pax + TARGET_TAGS: | + type=raw,value=latest,priority=1000 + type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 test-pax: - needs: build + needs: [metadata, amd64, arm64] uses: ./.github/workflows/_test_pax_rosetta.yaml if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: - PAX_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} + PAX_IMAGE: ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} secrets: inherit publish-test: - needs: [metadata, build, test-pax] + needs: [metadata, amd64, arm64, test-pax] uses: ./.github/workflows/_publish_badge.yaml if: ( always() ) secrets: inherit @@ -121,30 +163,10 @@ jobs: echo "MESSAGE='${MESSAGE}'" >> $GITHUB_OUTPUT echo "COLOR='${COLOR}'" >> $GITHUB_OUTPUT - publish-latest-container: - needs: [metadata, build, test-pax] - if: ( ${{ needs.test-pax.outputs.TEST_STATUS == 'success' }} ) && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)) - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit + finalize: + if: always() + needs: [metadata, amd64, arm64] + uses: ./.github/workflows/_finalize.yaml with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} - TARGET_IMAGE: pax - TARGET_TAGS: | - type=raw,value=latest,priority=1000 - - publish-container: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build] - uses: ./.github/workflows/_publish_container.yaml + PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }} secrets: inherit - with: - SOURCE_IMAGE: ${{ 
needs.build.outputs.DOCKER_TAGS }} - TARGET_IMAGE: pax - TARGET_TAGS: | - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - - if-upstream-failed: - runs-on: ubuntu-latest - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch' - steps: - - run: echo 'Upstream workflow failed, aborting run' && exit 1 diff --git a/.github/workflows/nightly-rosetta-t5x-build-test.yaml b/.github/workflows/nightly-rosetta-t5x-build-test.yaml index 360f8f586..a774873e8 100644 --- a/.github/workflows/nightly-rosetta-t5x-build-test.yaml +++ b/.github/workflows/nightly-rosetta-t5x-build-test.yaml @@ -11,7 +11,7 @@ on: BASE_IMAGE: type: string description: 'T5x image built by NVIDIA/JAX-Toolbox' - default: 'ghcr.io/nvidia/upstream-t5x:latest' + default: '' required: true PUBLISH: type: boolean @@ -31,79 +31,120 @@ permissions: jobs: metadata: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' runs-on: ubuntu-22.04 outputs: - BUILD_DATE: ${{ steps.meta-vars.outputs.BUILD_DATE }} - BASE_LIBRARY: ${{ steps.meta-vars.outputs.BASE_LIBRARY }} - BASE_IMAGE: ${{ steps.meta-vars.outputs.BASE_IMAGE }} - PUBLISH: ${{ steps.meta-vars.outputs.PUBLISH }} + BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} + BASE_LIBRARY: ${{ steps.base-metadata.outputs.BASE_LIBRARY }} + BASE_IMAGE_AMD64: ${{ steps.base-metadata.outputs.BASE_IMAGE_AMD64 }} + BASE_IMAGE_ARM64: ${{ steps.base-metadata.outputs.BASE_IMAGE_ARM64 }} + PUBLISH: ${{ steps.if-publish.outputs.PUBLISH }} steps: - - name: Set build metadata - id: meta-vars + - name: Check if the triggering workflow failed + id: if-upstream-failed + shell: bash -x -e {0} + run: | + echo "UPSTREAM_FAILED=${{ github.event_name == 'workflow_run' && github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT + + - name: Cancel workflow if upstream workflow did not succeed + if: ${{ 
steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} + uses: styfle/cancel-workflow-action@0.12.0 + + - name: Determine if the resulting container should be 'published' + id: if-publish + shell: bash -x -e {0} + run: + # A container should be published if: + # 1) the workflow is triggered by workflow_dispatch and the PUBLISH input is true, or + # 2) the workflow is triggered by workflow_run (i.e., a nightly build) + echo "PUBLISH=${{ github.event_name == 'workflow_run' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) }}" >> $GITHUB_OUTPUT + + - name: Set build date + id: date shell: bash -x -e {0} run: | BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') + echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT + + - name: Set base library and image + id: base-metadata + shell: bash -x -e {0} + run: | if [[ -z "${{ inputs.BASE_IMAGE }}" ]]; then - BASE_IMAGE=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:latest + BASE_IMAGE_AMD64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit + BASE_IMAGE_ARM64=${{ env.DOCKER_REGISTRY }}/upstream-${{ env.BASE_LIBRARY }}:mealkit else - BASE_IMAGE=${{ inputs.BASE_IMAGE }} + BASE_IMAGE_AMD64=${{ inputs.BASE_IMAGE }}-amd64-mealkit + BASE_IMAGE_ARM64=${{ inputs.BASE_IMAGE }}-arm64-mealkit fi - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT echo "BASE_LIBRARY=${{ env.BASE_LIBRARY }}" >> $GITHUB_OUTPUT - echo "BASE_IMAGE=${BASE_IMAGE}" >> $GITHUB_OUTPUT - echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT - - build: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + echo "BASE_IMAGE_AMD64=${BASE_IMAGE_AMD64}" >> $GITHUB_OUTPUT + echo "BASE_IMAGE_ARM64=${BASE_IMAGE_ARM64}" >> $GITHUB_OUTPUT + + amd64: needs: metadata uses: ./.github/workflows/_build_rosetta.yaml with: + ARCHITECTURE: amd64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} BASE_LIBRARY: ${{ 
needs.metadata.outputs.BASE_LIBRARY }} - BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE }} - # TODO: Can't build ARM until https://github.com/NVIDIA/JAX-Toolbox/pull/252 is available - PLATFORMS: '["amd64"]' + BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE_AMD64 }} secrets: inherit + + arm64: + needs: metadata + runs-on: ubuntu-22.04 + outputs: + DOCKER_TAG_MEALKIT: '' + steps: + - name: Generate placeholder warning + shell: bash -x -e {0} + run: | + echo "WARNING: arm64 build is not yet supported" + + publish-mealkit: + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml + with: + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} + ${{ needs.arm64.outputs.DOCKER_TAG_MEALKIT }} + TARGET_IMAGE: upstream-t5x + TARGET_TAGS: | + type=raw,value=mealkit,priority=500 + type=raw,value=mealkit-${{ needs.metadata.outputs.BUILD_DATE }},priority=500 - publish-build: - needs: [metadata, build] - uses: ./.github/workflows/_publish_badge.yaml - if: ( always() ) - secrets: inherit + publish-final: + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml with: - ENDPOINT_FILENAME: 'rosetta-t5x-build-status.json' - PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }} - SCRIPT: | - if [[ ${{ needs.build.result }} == "success" ]]; then - BADGE_COLOR=brightgreen - MSG=passing - else - BADGE_COLOR=red - MSG=failing - fi - echo "LABEL='nightly'" >> $GITHUB_OUTPUT - echo "MESSAGE='${MSG}'" >> $GITHUB_OUTPUT - echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} + ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }} + TARGET_IMAGE: upstream-t5x + TARGET_TAGS: | + type=raw,value=latest,priority=1000 + type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 test-unit: if: (github.event_name == 'workflow_run' 
&& github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' - needs: build + needs: [metadata, amd64, arm64] uses: ./.github/workflows/_test_rosetta.yaml with: - ROSETTA_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} + ROSETTA_IMAGE: ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} secrets: inherit test-t5x: - needs: build + needs: [metadata, amd64, arm64] uses: ./.github/workflows/_test_t5x_rosetta.yaml if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: - T5X_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} + T5X_IMAGE: ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} secrets: inherit test-vit: - needs: build + needs: [metadata, amd64, arm64] uses: ./.github/workflows/_test_vit.yaml if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' with: @@ -120,7 +161,7 @@ jobs: secrets: inherit publish-test: - needs: [metadata, build, test-unit, test-t5x, test-vit] + needs: [metadata, test-unit, test-t5x, test-vit] uses: ./.github/workflows/_publish_badge.yaml if: ( always() ) secrets: inherit @@ -156,30 +197,11 @@ jobs: echo "MESSAGE='${MESSAGE}'" >> $GITHUB_OUTPUT echo "COLOR='${COLOR}'" >> $GITHUB_OUTPUT - publish-latest-container: - needs: [metadata, build, test-t5x, test-unit, test-vit] - if: ( needs.test-unit.outputs.TEST_STATUS == 'success' && needs.test-t5x.outputs.TEST_STATUS == 'success' && needs.test-vit.outputs.TEST_STATUS == 'success' ) && ((github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH)) - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit + finalize: + if: always() + needs: [metadata, amd64, arm64] + uses: ./.github/workflows/_finalize.yaml with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} - TARGET_IMAGE: t5x - TARGET_TAGS: | 
- type=raw,value=latest,priority=1000 - - publish-container: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build] - uses: ./.github/workflows/_publish_container.yaml + PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }} secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} - TARGET_IMAGE: t5x - TARGET_TAGS: | - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - if-upstream-failed: - runs-on: ubuntu-latest - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch' - steps: - - run: echo 'Upstream workflow failed, aborting run' && exit 1 diff --git a/.github/workflows/nightly-t5x-build.yaml b/.github/workflows/nightly-t5x-build.yaml index 089f94069..8f0ad277f 100644 --- a/.github/workflows/nightly-t5x-build.yaml +++ b/.github/workflows/nightly-t5x-build.yaml @@ -22,11 +22,30 @@ permissions: jobs: metadata: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' runs-on: ubuntu-22.04 outputs: + PUBLISH: ${{ steps.if-publish.outputs.PUBLISH }} BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} steps: + - name: Check if the triggering workflow failed + id: if-upstream-failed + shell: bash -x -e {0} + run: | + echo "UPSTREAM_FAILED=${{ github.event_name == 'workflow_run' && github.event.workflow_run.conclusion != 'success' }}" >> $GITHUB_OUTPUT + + - name: Cancel workflow if upstream workflow did not succeed + if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} + uses: styfle/cancel-workflow-action@0.12.0 + + - name: Determine if the resulting container should be 'published' + id: if-publish + shell: bash -x -e {0} + run: + # A container should be published if: + # 1) the workflow is triggered by workflow_dispatch and the PUBLISH 
input is true, or + # 2) the workflow is triggered by workflow_run (i.e., a nightly build) + echo "PUBLISH=${{ github.event_name == 'workflow_run' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) }}" >> $GITHUB_OUTPUT + - name: Set build date id: date shell: bash -x -e {0} @@ -34,28 +53,55 @@ jobs: BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - build: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + amd64: needs: metadata uses: ./.github/workflows/_build_t5x.yaml with: + ARCHITECTURE: amd64 BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} secrets: inherit - publish: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build] + arm64: + needs: metadata + runs-on: ubuntu-22.04 + outputs: + DOCKER_TAG_MEALKIT: '' + steps: + - name: Generate placeholder warning + shell: bash -x -e {0} + run: | + echo "WARNING: arm64 build is not yet supported" + + publish-mealkit: + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' uses: ./.github/workflows/_publish_container.yaml - secrets: inherit with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} + ${{ needs.arm64.outputs.DOCKER_TAG_MEALKIT }} + TARGET_IMAGE: upstream-t5x + TARGET_TAGS: | + type=raw,value=mealkit,priority=500 + type=raw,value=mealkit-${{ needs.metadata.outputs.BUILD_DATE }},priority=500 + + publish-final: + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' + uses: ./.github/workflows/_publish_container.yaml + with: + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} + ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }} TARGET_IMAGE: upstream-t5x TARGET_TAGS: | 
type=raw,value=latest,priority=1000 type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - if-upstream-failed: - runs-on: ubuntu-latest - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch' - steps: - - run: echo 'Upstream workflow failed, aborting run' && exit 1 + finalize: + if: always() + needs: [metadata, amd64, arm64] + uses: ./.github/workflows/_finalize.yaml + with: + PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }} + secrets: inherit diff --git a/.github/workflows/nightly-te-build.yaml b/.github/workflows/nightly-te-build.yaml deleted file mode 100644 index 2b9cc3c30..000000000 --- a/.github/workflows/nightly-te-build.yaml +++ /dev/null @@ -1,65 +0,0 @@ -name: Nightly Transformer Engine build -run-name: Nightly Transformer Engine build (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) - -on: - workflow_run: - workflows: [Nightly JAX build] - types: [completed] - branches: [main] - workflow_dispatch: - inputs: - PUBLISH: - type: boolean - description: Publish dated images and update the 'latest' tag? 
- default: false - required: false - -env: - TARGET: jax-te - DOCKER_REGISTRY: ghcr.io/nvidia - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container - -jobs: - - metadata: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' - runs-on: ubuntu-22.04 - outputs: - BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} - steps: - - name: Set build date - id: date - shell: bash -x -e {0} - run: | - BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - - build: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' - needs: metadata - uses: ./.github/workflows/_build_te.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - secrets: inherit - - publish: - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build] - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} - TARGET_IMAGE: jax-te - TARGET_TAGS: | - type=raw,value=latest,priority=1000 - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - - if-upstream-failed: - runs-on: ubuntu-latest - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch' - steps: - - run: echo 'Upstream workflow failed, aborting run' && exit 1 \ No newline at end of file diff --git a/.github/workflows/nightly-te-test.yaml b/.github/workflows/nightly-te-test.yaml index e4e03881e..d95de68c1 100644 --- a/.github/workflows/nightly-te-test.yaml +++ b/.github/workflows/nightly-te-test.yaml @@ -3,7 +3,7 @@ run-name: Nightly 
Transformer Engine test (${{ github.event_name == 'workflow_ru on: workflow_run: - workflows: [Nightly Transformer Engine build] + workflows: [Nightly Pax build] types: [completed] branches: [main] workflow_dispatch: @@ -12,7 +12,7 @@ on: type: string description: 'JAX-TE image build by NVIDIA/JAX-Toolbox' required: true - default: 'ghcr.io/nvidia/jax-te:latest' + default: 'ghcr.io/nvidia/upstream-pax:latest' PUBLISH: type: boolean description: Update status badge? @@ -25,7 +25,7 @@ permissions: packages: write # to upload container env: - DEFAULT_JAX_TE_IMAGE: 'ghcr.io/nvidia/jax-te:latest' + DEFAULT_JAX_TE_IMAGE: 'ghcr.io/nvidia/upstream-pax:latest' jobs: diff --git a/.github/workflows/pax-cuda-121.yaml b/.github/workflows/pax-cuda-121.yaml deleted file mode 100644 index 01330beaa..000000000 --- a/.github/workflows/pax-cuda-121.yaml +++ /dev/null @@ -1,186 +0,0 @@ -name: Nightly Containers on CUDA 12.1 -run-name: Nightly Containers on CUDA 12.1 (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) - -on: - schedule: - - cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC - workflow_dispatch: - inputs: - PUBLISH: - type: boolean - description: Publish dated images and update the 'latest' tag? 
- default: false - required: false - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container - -jobs: - - metadata: - runs-on: ubuntu-22.04 - outputs: - BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} - steps: - - name: Set build date - id: date - shell: bash -x -e {0} - run: | - BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - - build-base: - needs: metadata - uses: ./.github/workflows/_build_base.yaml - with: - BASE_IMAGE: 'nvidia/cuda:12.1.1-devel-ubuntu22.04' - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - secrets: inherit - - build-jax: - needs: [metadata, build-base] - uses: ./.github/workflows/_build_jax.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAGS }} - secrets: inherit - - build-pax: - needs: [metadata, build-jax] - uses: ./.github/workflows/_build_pax.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} - secrets: inherit - - build-rosetta-pax: - uses: ./.github/workflows/_build_rosetta.yaml - needs: [metadata, build-pax] - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} - BASE_LIBRARY: pax - PLATFORMS: '["amd64"]' - secrets: inherit - - build-t5x: - needs: [metadata, build-jax] - uses: ./.github/workflows/_build_t5x.yaml - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} - secrets: inherit - - build-rosetta-t5x: - uses: ./.github/workflows/_build_rosetta.yaml - needs: [metadata, build-t5x] - with: - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - BASE_IMAGE: ${{ 
needs.build-t5x.outputs.DOCKER_TAGS }} - BASE_LIBRARY: t5x - PLATFORMS: '["amd64"]' - secrets: inherit - - build-summary: - needs: [build-base, build-jax, build-t5x, build-rosetta-t5x, build-pax, build-rosetta-pax] - if: always() - runs-on: ubuntu-22.04 - steps: - - name: Generate job summary for container build - shell: bash -x -e {0} - run: | - cat > $GITHUB_STEP_SUMMARY << EOF - # Images created - - | Image | Link | - | ------------ | -------------------------------------------------- | - | Base | ${{ needs.build-base.outputs.DOCKER_TAGS }} | - | JAX | ${{ needs.build-jax.outputs.DOCKER_TAGS }} | - | T5X | ${{ needs.build-t5x.outputs.DOCKER_TAGS }} | - | ROSETTA(T5X) | ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} | - | PAX | ${{ needs.build-pax.outputs.DOCKER_TAGS }} | - | ROSETTA(pax) | ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} | - EOF - - publish-upstream-pax: - if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build-pax] - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} - TARGET_IMAGE: upstream-pax - TARGET_TAGS: | - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }}-cuda-12.1,priority=900 - - publish-pax: - if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build-rosetta-pax] - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAGS }} - TARGET_IMAGE: pax - TARGET_TAGS: | - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }}-cuda-12.1,priority=900 - - publish-upstream-t5x: - if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build-t5x] - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit - with: - SOURCE_IMAGE: ${{ 
needs.build-t5x.outputs.DOCKER_TAGS }} - TARGET_IMAGE: upstream-t5x - TARGET_TAGS: | - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }}-cuda-12.1,priority=900 - - publish-t5x: - if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build-rosetta-t5x] - uses: ./.github/workflows/_publish_container.yaml - secrets: inherit - with: - SOURCE_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }} - TARGET_IMAGE: t5x - TARGET_TAGS: | - type=raw,value=nightly-${{ needs.metadata.outputs.BUILD_DATE }}-cuda-12.1,priority=900 - - test-jax: - needs: build-jax - uses: ./.github/workflows/_test_jax.yaml - with: - JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAGS }} - secrets: inherit - - test-pax: - needs: build-pax - uses: ./.github/workflows/_test_pax.yaml - with: - PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }} - secrets: inherit - - test-t5x: - needs: build-t5x - uses: ./.github/workflows/_test_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-t5x.outputs.DOCKER_TAGS }} - secrets: inherit - - # TODO(terry): This is missing the rosetta tests which can only be added - # After a fix for the TB log collision is pushed. 
- - finalize: - if: always() - # TODO: use dynamic matrix to make dependencies self-updating - needs: [build-summary, test-jax, test-pax] - uses: ./.github/workflows/_finalize.yaml - with: - PUBLISH_BADGE: false - secrets: inherit diff --git a/.github/workflows/scripts/parse_git_src.sh b/.github/workflows/scripts/parse_git_src.sh new file mode 100755 index 000000000..16e95cdf9 --- /dev/null +++ b/.github/workflows/scripts/parse_git_src.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +parse_git_src() { + PACKAGE=$1 + SRC="$2" + echo "REPO_${PACKAGE}=$(echo "${SRC}" | cut -f1 -d#)" >> $GITHUB_OUTPUT + echo "REF_${PACKAGE}=$(echo "${SRC}" | cut -f2 -d#)" >> $GITHUB_OUTPUT +} \ No newline at end of file diff --git a/.github/workflows/weekly-base-build.yaml b/.github/workflows/weekly-base-build.yaml index 7211f478e..71a589124 100644 --- a/.github/workflows/weekly-base-build.yaml +++ b/.github/workflows/weekly-base-build.yaml @@ -12,10 +12,6 @@ on: default: false required: false -env: - TARGET: jax-toolbox - DOCKER_REGISTRY: ghcr.io/nvidia - permissions: contents: read # to fetch code actions: write # to cancel previous workflows @@ -27,6 +23,7 @@ jobs: runs-on: ubuntu-22.04 outputs: BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} + PUBLISH: ${{ steps.if-publish.outputs.PUBLISH }} steps: - name: Set build date id: date @@ -34,22 +31,46 @@ jobs: run: | BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT + + - name: Determine whether results will be 'published' + id: if-publish + shell: bash -x -e {0} + run: | + echo "PUBLISH=${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) }}" >> $GITHUB_OUTPUT - build: + amd64: needs: metadata uses: ./.github/workflows/_build_base.yaml with: + ARCHITECTURE: amd64 + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} + secrets: inherit + + arm64: + needs: metadata + uses: ./.github/workflows/_build_base.yaml + with: + ARCHITECTURE: arm64 BUILD_DATE: 
${{ needs.metadata.outputs.BUILD_DATE }} secrets: inherit publish: - if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) - needs: [metadata, build] + needs: [metadata, amd64, arm64] + if: needs.metadata.outputs.PUBLISH == 'true' uses: ./.github/workflows/_publish_container.yaml - secrets: inherit with: - SOURCE_IMAGE: ${{ needs.build.outputs.DOCKER_TAGS }} + SOURCE_IMAGE: | + ${{ needs.amd64.outputs.DOCKER_TAG }} + ${{ needs.arm64.outputs.DOCKER_TAG }} TARGET_IMAGE: jax-toolbox TARGET_TAGS: | type=raw,value=base,priority=1000 - type=raw,value=base-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 \ No newline at end of file + type=raw,value=base-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 + + finalize: + if: always() + needs: [metadata, amd64, arm64] + uses: ./.github/workflows/_finalize.yaml + with: + PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }} + secrets: inherit diff --git a/rosetta/Dockerfile.pax b/rosetta/Dockerfile.pax index 8250827e3..b03e1ffd8 100644 --- a/rosetta/Dockerfile.pax +++ b/rosetta/Dockerfile.pax @@ -1,9 +1,17 @@ # syntax=docker/dockerfile:1-labs -ARG BASE_IMAGE=ghcr.io/nvidia/upstream-pax:latest +ARG BASE_IMAGE=ghcr.io/nvidia/upstream-pax:mealkit ARG GIT_USER_EMAIL=jax@nvidia.com ARG GIT_USER_NAME=NVIDIA +ARG SRC_PATH_PAXML=/opt/paxml +ARG SRC_PATH_PRAXIS=/opt/praxis +# These patchlist paths should be relative to this script +ARG PAXML_PATCHLIST=patchlist-paxml.txt +ARG PRAXIS_PATCHLIST=patchlist-praxis.txt FROM scratch as rosetta-source +ARG SRC_PATH_PAXML +ARG SRC_PATH_PRAXIS + COPY . 
/ FROM scratch as pax-mirror-source @@ -12,30 +20,41 @@ ADD --keep-git-dir=true https://github.com/google/paxml.git#main / FROM scratch as praxis-mirror-source ADD --keep-git-dir=true https://github.com/google/praxis.git#main / -FROM ${BASE_IMAGE} AS rosetta +############################################################################### +### Download source and add auxiliary scripts +################################################################################ + +FROM ${BASE_IMAGE} AS mealkit ENV ENABLE_TE=1 ARG GIT_USER_EMAIL ARG GIT_USER_NAME -RUN <> /opt/pip-tools.d/manifest.t5x +echo "-e file:///opt/rosetta" >> /opt/pip-tools.d/manifest.t5x EOF WORKDIR /opt/rosetta -RUN <