Add ViT MGMN Tests (#270)
Adds nightly ViT MGMN tests to the Rosetta CI

---------

Co-authored-by: Terry Kong <terryk@nvidia.com>
ashors1 and terrykong authored Oct 10, 2023
1 parent 5c7351e commit 5745562
Showing 5 changed files with 354 additions and 11 deletions.
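
For context, a minimal sketch of a standalone nightly caller for the new reusable workflow. This commit actually wires the tests into ci.yaml (second file below); the nightly-vit name, cron time, and report job here are illustrative only, not part of the commit:

name: nightly-vit
on:
  schedule:
    - cron: '0 6 * * *'  # illustrative nightly trigger
jobs:
  test-vit:
    uses: ./.github/workflows/_test_vit.yaml
    with:
      ROSETTA_T5X_IMAGE: 'ghcr.io/nvidia/rosetta-t5x:latest'
      BATCH_SIZE_PER_GPU: 256
    secrets: inherit
  report:
    needs: test-vit
    runs-on: ubuntu-22.04
    steps:
      - run: echo "ViT tests finished with status ${{ needs.test-vit.outputs.TEST_STATUS }}"
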
327 changes: 327 additions & 0 deletions .github/workflows/_test_vit.yaml
@@ -0,0 +1,327 @@
name: ~test ViT, MGMN

on:
  workflow_call:
    inputs:
      ROSETTA_T5X_IMAGE:
        type: string
        description: Rosetta image from ghcr.io/nvidia/rosetta-t5x
        default: 'ghcr.io/nvidia/rosetta-t5x:latest'
        required: false
      BATCH_SIZE_PER_GPU:
        type: number
        description: Batch size per GPU
        default: 256
        required: false
    outputs:
      TEST_STATUS:
        description: 'Summary pass/fail value indicating if results from tests are acceptable'
        value: ${{ jobs.publish-test.outputs.STATUS }}

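# TEST_STATUS resolves to "success" only when every SLURM job below reaches
# state COMPLETED with exit code 0; the publish-test job performs the aggregation.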
jobs:

  single-process-multi-device:
    strategy:
      matrix:
        N_GPU: [8]
      fail-fast: false

    runs-on: ubuntu-22.04

    steps:
      - name: Print environment variables
        run: env

      - name: Setup SSH agent
        uses: webfactory/ssh-agent@v0.8.0
        with:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

      - name: Setup SSH known hosts
        id: ssh-known-hosts
        run: |
          mkdir -p ~/.ssh
          cat >> ~/.ssh/known_hosts << EOF
          ${{ vars.SSH_KNOWN_HOSTS }}
          EOF
          chmod 600 ~/.ssh/known_hosts
          echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT
      - name: Labels and metadata
        id: meta
        shell: bash -x -e {0}
        run: |
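          # pyxis/enroot image references separate the registry host from the
          # image path with '#', so rewrite the first '/' of the Docker-style tag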
          IMAGE="$(echo ${{inputs.ROSETTA_T5X_IMAGE}} | sed 's/\//#/')"
          TEST_CASE_NAME=VIT1P${{ matrix.N_GPU }}G
          JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME}
          LOG_FILE=/nfs/cluster/${JOB_NAME}.log
          MODEL_PATH=/nfs/cluster/${JOB_NAME}
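          # Global batch size scales with device count, e.g. 256 per GPU * 8 GPUs = 2048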
          BATCH_SIZE=$((${{ inputs.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }}))
          for var in IMAGE TEST_CASE_NAME JOB_NAME LOG_FILE MODEL_PATH BATCH_SIZE; do
            echo "$var=${!var}" >> $GITHUB_OUTPUT
          done
      - name: Submit SLURM jobs over SSH
        id: submit
        shell: bash -O expand_aliases -x -e {0}
        run: |
          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
          sshx "date && hostname && sinfo"
          sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
          JOB=$(sshx sbatch --parsable << EOF
          #!/bin/bash
          #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }}
          #SBATCH --exclusive
          #SBATCH --nodes=1
          #SBATCH --tasks=1
          #SBATCH --gpus-per-node=${{ matrix.N_GPU }}
          #SBATCH --time=00:30:00
          #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }}
          #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }},XLA_PYTHON_CLIENT_MEM_FRACTION=0.9"
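          # ENROOT_PASSWORD supplies the GitHub token so enroot can pull the image from ghcr.io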
          time srun \
            --container-image=${{ steps.meta.outputs.IMAGE }} \
            --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \
            --container-entrypoint \
            test-vit.sh \
              --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \
              --dtype bfloat16 \
              --batch-size ${{ steps.meta.outputs.BATCH_SIZE }}
          EOF
          )
          set +x
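          # Poll until the job id disappears from the queue, i.e. the job has finished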
          while sshx squeue -j $JOB | grep -q $JOB; do
            echo "SLURM Job $JOB is still running."
            sleep 15
          done
          echo "SLURM Job $JOB finished."
          # Gather job info
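          # sacct may report one row per job step: take the first State and reduce the ExitCodes to a single value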
          SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1)
          SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g')
          echo "SLURM Job state is ${SLURM_STATE}"
          echo "SLURM Job exit code is ${SLURM_EXITCODE}"
          echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT"
          echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT"
          set -x
      - name: Retrieve training logs and upload to TensorBoard server
        shell: bash -x -e {0}
        run: |
          mkdir output/
          rsync -rtz --progress \
            ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
            output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
          rsync -rtz --progress \
            ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
            output/ || true
          rsync -rtz --progress \
            output/ \
            ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true
      - name: Write SLURM job status to file
        shell: bash -x -e {0}
        run: |
          python << EOF
          import json
          with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f:
              dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
              json.dump(dump, f)
          EOF
      - name: Upload training logs as artifacts
        uses: actions/upload-artifact@v3
        with:
          name: ${{ steps.meta.outputs.JOB_NAME }}
          path: output/*

  multi-gpu-multi-node:
    strategy:
      matrix:
        N_GPU: [1, 8]
        N_NODE: [1, 2]
      fail-fast: false

    runs-on: ubuntu-22.04

    steps:
      - name: Print environment variables
        run: env

      - name: Setup SSH agent
        uses: webfactory/ssh-agent@v0.8.0
        with:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

      - name: Setup SSH known hosts
        id: ssh-known-hosts
        run: |
          mkdir -p ~/.ssh
          cat >> ~/.ssh/known_hosts << EOF
          ${{ vars.SSH_KNOWN_HOSTS }}
          EOF
          chmod 600 ~/.ssh/known_hosts
          echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT
      - name: Labels and metadata
        id: meta
        shell: bash -x -e {0}
        run: |
          IMAGE="$(echo ${{inputs.ROSETTA_T5X_IMAGE}} | sed 's/\//#/')"
          TEST_CASE_NAME=VIT${{ matrix.N_GPU }}G${{ matrix.N_NODE }}N
          TOTAL_TASKS=$((${{ matrix.N_GPU }} * ${{ matrix.N_NODE }}))
          JOB_NAME=${GITHUB_RUN_ID}-${TEST_CASE_NAME}
          LOG_FILE=/nfs/cluster/${JOB_NAME}.log
          MODEL_PATH=/nfs/cluster/${JOB_NAME}
          BATCH_SIZE=$((${{ inputs.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }} * ${{ matrix.N_NODE }}))
          for var in IMAGE TEST_CASE_NAME TOTAL_TASKS JOB_NAME LOG_FILE MODEL_PATH BATCH_SIZE; do
            echo "$var=${!var}" >> $GITHUB_OUTPUT
          done
      - name: Submit SLURM jobs over SSH
        id: submit
        shell: bash -O expand_aliases -x -e {0}
        run: |
          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
          sshx "date && hostname && sinfo"
          sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
          JOB=$(sshx sbatch --parsable << EOF
          #!/bin/bash
          #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }}
          #SBATCH --exclusive
          #SBATCH --nodes=${{ matrix.N_NODE }}
          #SBATCH --gpus-per-node=${{ matrix.N_GPU }}
          #SBATCH --tasks=${{ steps.meta.outputs.TOTAL_TASKS }}
          #SBATCH --tasks-per-node=${{ matrix.N_GPU }}
          #SBATCH --time=00:30:00
          #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }}
          #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }},XLA_PYTHON_CLIENT_MEM_FRACTION=0.9"
          time srun \
            --container-image=${{ steps.meta.outputs.IMAGE }} \
            --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \
            --container-entrypoint \
            test-vit.sh \
              --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \
              --dtype bfloat16 \
              --batch-size ${{ steps.meta.outputs.BATCH_SIZE }} \
              --multiprocess
          EOF
          )
          set +x
          while sshx squeue -j $JOB | grep -q $JOB; do
            echo "SLURM Job $JOB is still running."
            sleep 15
          done
          echo "SLURM Job $JOB finished."
          # Gather job info
          SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1)
          SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g')
          echo "SLURM Job state is ${SLURM_STATE}"
          echo "SLURM Job exit code is ${SLURM_EXITCODE}"
          echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT"
          echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT"
          set -x
      - name: Retrieve training logs and upload to TensorBoard server
        shell: bash -x -e {0}
        run: |
          mkdir output/
          rsync -rtz --progress \
            ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
            output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
          rsync -rtz --progress \
            ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
            output/ || true
          rsync -rtz --progress \
            output/ \
            ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${GITHUB_RUN_ID}/ || true
      - name: Write SLURM job status to file
        shell: bash -x -e {0}
        run: |
          python << EOF
          import json
          with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f:
              dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
              json.dump(dump, f)
          EOF
      - name: Upload training logs as artifacts
        uses: actions/upload-artifact@v3
        with:
          name: ${{ steps.meta.outputs.JOB_NAME }}
          path: output/*

  publish-test:
    needs: [multi-gpu-multi-node, single-process-multi-device]
    uses: ./.github/workflows/_publish_badge.yaml
    if: ( always() )
    secrets: inherit
    with:
      ENDPOINT_FILENAME: 'vit-test-completion-status.json'
      PUBLISH: false
      SCRIPT: |
        EXIT_STATUSES="${GITHUB_RUN_ID}-*/*-status.json"
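        # Each status file holds e.g. {"state": "COMPLETED", "exitcode": "0"},
        # as written by the "Write SLURM job status to file" steps above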
        PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
        FAILED_TESTS=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
        TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l)
        cat <<EOF >>$GITHUB_STEP_SUMMARY
        ## ViT MGMN+SPMD Test Status
        | Test Case | State | Exit Code |
        | --- | --- | --- |
        EOF
        for i in $EXIT_STATUSES; do
          # Files are named <GHID>-<NAME>/<NAME>-status.json
          echo "| $(echo $i | cut -d/ -f1 | cut -d- -f2) | $(jq -r .state $i) | $(jq -r .exitcode $i) |"
        done | tee -a $GITHUB_STEP_SUMMARY
        echo "Test statuses:"
        jq -rc 'input_filename,.' $EXIT_STATUSES
        if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; then
          echo "STATUS=success" >> $GITHUB_OUTPUT
          BADGE_COLOR=brightgreen
        elif [[ $PASSED_TESTS -eq 0 ]]; then
          echo "STATUS=failure" >> $GITHUB_OUTPUT
          BADGE_COLOR=red
        else
          echo "STATUS=failure" >> $GITHUB_OUTPUT
          BADGE_COLOR=yellow
        fi
        echo "LABEL='Completion'" >> $GITHUB_OUTPUT
        echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT
        echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT

  summary:
    runs-on: ubuntu-22.04

    steps:
      - name: Generate TensorBoard query URL
        run: |
          (
          cat << EOF
          ## ViT MGMN training
          [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${GITHUB_RUN_ID}/VIT&_smoothingWeight=0&tagFilter=samples_per)
          EOF
          ) | tee $GITHUB_STEP_SUMMARY

  outcome:
    needs: publish-test
    runs-on: ubuntu-22.04
    if: ( always() )
    steps:
      - name: Set workflow status based on test outputs
        run: |
          if [[ ${{ needs.publish-test.outputs.STATUS }} != success ]]; then
            exit 1
          fi

9 changes: 8 additions & 1 deletion .github/workflows/ci.yaml
@@ -222,7 +222,6 @@ jobs:
    with:
      JAX_TE_IMAGE: ${{ needs.build-te.outputs.DOCKER_TAGS }}
    secrets: inherit

  test-t5x:
    needs: build-t5x
    uses: ./.github/workflows/_test_t5x.yaml
@@ -237,6 +236,14 @@ jobs:
      PAX_IMAGE: ${{ needs.build-pax.outputs.DOCKER_TAGS }}
    secrets: inherit

  test-vit:
    needs: build-rosetta-t5x
    uses: ./.github/workflows/_test_vit.yaml
    with:
      ROSETTA_T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAGS }}
    secrets: inherit


  finalize:
    if: always()
    # TODO: use dynamic matrix to make dependencies self-updating
