From 0bbdc06b24047767ccd4657d8c9591e230f0df45 Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Wed, 9 Oct 2024 14:39:13 -0700 Subject: [PATCH] Add GPU CI/CD (#253) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add yaml files to gh workflows Signed-off-by: Sarah Yurick * edit spacing Signed-off-by: Sarah Yurick * no cache dir Signed-off-by: Sarah Yurick * cmake Signed-off-by: Sarah Yurick * fasttext wheel Signed-off-by: Sarah Yurick * python3 dev Signed-off-by: Sarah Yurick * get update Signed-off-by: Sarah Yurick * c installs Signed-off-by: Sarah Yurick * setuptools pip upgrade Signed-off-by: Sarah Yurick * use stable rapids Signed-off-by: Sarah Yurick * remove wheel see what happens Signed-off-by: Sarah Yurick * edit readme and remove autolabel for now Signed-off-by: Sarah Yurick * add container logic Signed-off-by: Sarah Yurick * add dockerfile and oliver's other suggestions Signed-off-by: Sarah Yurick * fix run format Signed-off-by: Sarah Yurick * forked repo url Signed-off-by: Sarah Yurick * docker run with all gpus Signed-off-by: Sarah Yurick * remove running container Signed-off-by: Sarah Yurick * Update .github/workflows/gpuci.yml Co-authored-by: oliver könig Signed-off-by: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> * re add test Signed-off-by: Sarah Yurick * debug attempt Signed-off-by: Sarah Yurick * remove it Signed-off-by: Sarah Yurick * add library path Signed-off-by: Sarah Yurick * remove nvcc check Signed-off-by: Sarah Yurick * more debugging Signed-off-by: Sarah Yurick * specify curator dir Signed-off-by: Sarah Yurick * more debugging Signed-off-by: Sarah Yurick * try pytorch container Signed-off-by: Sarah Yurick * use rapids container Signed-off-by: Sarah Yurick * fix RUN instructions Signed-off-by: Sarah Yurick * add comments and review suggestions Signed-off-by: Sarah Yurick * update runners Signed-off-by: Sarah Yurick * move args 
Signed-off-by: Sarah Yurick --------- Signed-off-by: Sarah Yurick Signed-off-by: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Co-authored-by: oliver könig --- .github/workflows/_build_container.yml | 99 ++++++++++++++++++++++++++ .github/workflows/gpuci.yml | 72 +++++++++++++++++++ Dockerfile | 33 +++++++++ README.md | 6 +- nemo_curator/utils/import_utils.py | 2 +- 5 files changed, 208 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/_build_container.yml create mode 100644 .github/workflows/gpuci.yml create mode 100644 Dockerfile diff --git a/.github/workflows/_build_container.yml b/.github/workflows/_build_container.yml new file mode 100644 index 00000000..7e15468b --- /dev/null +++ b/.github/workflows/_build_container.yml @@ -0,0 +1,99 @@ +name: Build NeMo Curator container +on: + # This script is called by "gpuci.yml" + # We specify a Git reference to checkout, defaulting to the SHA of the commit that triggered the workflow + workflow_call: + inputs: + ref: + description: Git ref to checkout + default: ${{ github.sha }} + required: false + type: string + +defaults: + # Sets default options for executing shell commands in the workflow + # `-x` enables debugging output + # `-e` ensures that the workflow fails fast on errors + # `-u` treats unset variables as errors + # `-o pipefail` ensures that any failures in a pipeline are detected + run: + shell: bash -x -e -u -o pipefail {0} + +jobs: + main: + # This is the tag on our Azure runner found in Actions -> Runners -> Self-hosted runners + # It is designated for build jobs + runs-on: self-hosted-azure-builder + steps: + # Checks out the repository code using the actions/checkout action, + # storing it in a directory named after the unique workflow run ID + # It checks out the specific commit or branch based on the input ref provided when the workflow is called + - name: Checkout repository + uses: actions/checkout@v4 + with: + path: ${{ github.run_id }} + ref: ${{ inputs.ref }} +
# Cleans up unused Docker resources that haven't been used in the last 24 hours + - name: Clean runner cache + run: | + docker system prune --filter "until=24h" --force + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + # We use `docker` driver as this speeds things up for + # trivial (non-multi-stage) builds. + driver: docker + + # Pull cached Docker images from a specified Azure Container Registry + # It first attempts to pull an image with a tag based on the current PR number (if available) and defaults to buildcache if not + # It then tries to pull the buildcache image regardless of the outcome of the previous command + # The use of || true allows the workflow to continue even if one or both pull commands fail, + # which ensures that the workflow can proceed without interruption + - name: Pull cache images + run: | + docker pull nemoci.azurecr.io/nemo_curator_container:${{ github.event.pull_request.number || 'buildcache' }} || true + docker pull nemoci.azurecr.io/nemo_curator_container:buildcache || true + + - name: Build and push + uses: docker/build-push-action@v5 + with: + # Specifies the path to the Dockerfile to use for building the Docker image (located in the root of the repository) + file: Dockerfile + # The built image should be pushed to the container registry after it is successfully built + push: true + # Specifies build arguments that can be passed into the Dockerfile + # `FORKED_REPO_URL` is the URL to the user's forked repository + # `CURATOR_COMMIT` is the PR's head SHA if available; otherwise, it falls back to the current commit SHA + build-args: | + FORKED_REPO_URL=https://github.com/${{ github.event.pull_request.head.repo.full_name }}.git + CURATOR_COMMIT=${{ github.event.pull_request.head.sha || github.sha }} + # Specifies the images to use as cache sources during the build process + cache-from: | + nemoci.azurecr.io/nemo_curator_container:${{ github.event.pull_request.number || 'buildcache' }} + 
nemoci.azurecr.io/nemo_curator_container:buildcache + # Inline caching allows the cache to be available for future builds without needing to push it to a separate repository + cache-to: type=inline + # Specifies the tag under which the built image will be pushed to the container registry + # Uses the "github.run_id" to ensure that each build has a unique tag + tags: nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} + + # Updates the Docker image associated with a PR by tagging the built image with the PR number + # and then pushing that tagged image to the Azure Container Registry + - name: Update PR image + if: github.event_name == 'pull_request' + run: | + docker tag nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} nemoci.azurecr.io/nemo_curator_container:${{ github.event.pull_request.number }} + docker push nemoci.azurecr.io/nemo_curator_container:${{ github.event.pull_request.number }} + + - name: Update buildcache image + # Only executes when there is a push to the main branch + # Ensures that the build cache is updated only for stable versions of the codebase + if: github.ref == 'refs/heads/main' + # Updates the Docker image tagged as the build cache by: + # 1. Tagging the built image from the current workflow run with the buildcache tag, and + # 2. 
Pushing that tagged image to the Azure Container Registry + run: | + docker tag nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} nemoci.azurecr.io/nemo_curator_container:buildcache + docker push nemoci.azurecr.io/nemo_curator_container:buildcache diff --git a/.github/workflows/gpuci.yml b/.github/workflows/gpuci.yml new file mode 100644 index 00000000..b63acf74 --- /dev/null +++ b/.github/workflows/gpuci.yml @@ -0,0 +1,72 @@ +name: "GPU CI/CD" + +on: + pull_request: + branches: + # We can run gpuCI on any PR targeting these branches + - 'main' + - '[rv][0-9].[0-9].[0-9]' + - '[rv][0-9].[0-9].[0-9]rc[0-9]' + # PR has to be labeled with "gpuCI" label + # If new commits are added, the "gpuCI" label has to be removed and re-added to rerun gpuCI + types: [ labeled ] + +jobs: + # First, we build and push a NeMo-Curator container + build-container: + # "build-container" job is run if the "gpuci" label is added to the PR + if: ${{ github.event.label.name == 'gpuci' }} + uses: ./.github/workflows/_build_container.yml + + # Then, we run our PyTests in the container we just built + run-gpu-tests: + needs: build-container + # This is the tag on our Azure runner found in Actions -> Runners -> Self-hosted runners + # It has 2 A100 GPUs + runs-on: self-hosted-azure + # "run-gpu-tests" job is run if the "gpuci" label is added to the PR + if: ${{ github.event.label.name == 'gpuci' }} + + steps: + # If something went wrong during the last cleanup, this step ensures any existing container is removed + - name: Remove existing container if it exists + run: | + if [ "$(docker ps -aq -f name=nemo-curator-container)" ]; then + docker rm -f nemo-curator-container + fi + + # This runs the container which was pushed by build-container, which we call "nemo-curator-container" + # `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container + # We use "github.run_id" to identify the PR with the commits we want to run the PyTests with + 
# `bash -c "sleep infinity"` keeps the container running indefinitely without exiting + - name: Run Docker container + run: | + docker run --gpus all --name nemo-curator-container -d nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} bash -c "sleep infinity" + + # Expect `whoami` to be "azureuser" + # Expect `nvidia-smi` to show our 2 A100 GPUs + - name: Check GPUs + run: | + whoami + docker exec nemo-curator-container nvidia-smi + + # In the virtual environment (called "curator") we created in the container, + # list all of our packages. Useful for debugging + - name: Verify installations + run: | + docker exec nemo-curator-container conda run -n curator pip list + + # In the virtual environment (called "curator") we created in the container, + # run our PyTests marked with `@pytest.mark.gpu` + # We specify the `rootdir` to help locate the "pyproject.toml" file (which is in the root directory of the repository), + # and then the directory where the PyTests are located + - name: Run PyTests with GPU mark + run: | + docker exec nemo-curator-container conda run -n curator pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests + + # After running `docker stop`, the container remains in an exited state + # It is still present on our system and could be restarted with `docker start` + # Thus, we use `docker rm` to permanently remove it from the system + - name: Cleanup + run: | + docker stop nemo-curator-container && docker rm nemo-curator-container diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..aa782055 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,33 @@ +# See https://github.com/rapidsai/ci-imgs for ARG options +# NeMo Curator requires Python 3.10, Ubuntu 22.04/20.04, and CUDA 12 (or above) +ARG CUDA_VER=12.5.1 +ARG LINUX_VER=ubuntu22.04 +ARG PYTHON_VER=3.10 +FROM rapidsai/ci-conda:cuda${CUDA_VER}-${LINUX_VER}-py${PYTHON_VER} + +WORKDIR /opt + +# Install the minimal libcu* libraries needed by NeMo Curator +RUN conda create
-y --name curator -c conda-forge -c nvidia \ + python=3.10 \ + cuda-cudart \ + libcufft \ + libcublas \ + libcurand \ + libcusparse \ + libcusolver + +# Needed to navigate to and pull the forked repository's changes +ARG FORKED_REPO_URL +ARG CURATOR_COMMIT + +# Clone the user's repository, find the relevant commit, and install everything we need +RUN bash -exu <