# Nightly T5X MGMN performance test #143
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow identity and triggers.
name: Nightly T5X MGMN performance test

on:
  # Fires whenever a "Nightly T5X build" run on main completes; the jobs
  # themselves inspect the upstream conclusion.
  workflow_run:
    workflows:
      - Nightly T5X build
    types:
      - completed
    branches:
      - main
  # Manual trigger with an overridable container image.
  workflow_dispatch:
    inputs:
      T5X_IMAGE:
        description: T5X container
        type: string
        required: true
        default: 'ghcr.io/nvidia/t5x:latest'
      PUBLISH:
        description: Publish dated results to tensorboard server?
        type: boolean
        required: false
        default: false
# Token scopes requested for the jobs of this workflow.
permissions:
  contents: read   # to fetch code
  actions: write   # to cancel previous workflows
  packages: write  # to upload container

# Workflow-level environment shared by every job and step.
env:
  # Container image used when no T5X_IMAGE input is supplied.
  DEFAULT_T5X_IMAGE: 'ghcr.io/nvidia/t5x:latest'
jobs:
  # Computes the values shared by downstream jobs: the build date (US/Los_Angeles)
  # and the container image to test.
  metadata:
    runs-on: ubuntu-22.04
    outputs:
      BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }}
      T5X_IMAGE: ${{ steps.date.outputs.T5X_IMAGE }}
    steps:
      - name: Set metadata
        id: date
        shell: bash -x -e {0}
        env:
          # Pass the dispatch input through the environment instead of pasting it
          # into the script text, so its content is never parsed as shell code.
          INPUT_T5X_IMAGE: ${{ inputs.T5X_IMAGE }}
        run: |
          BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d')
          echo "BUILD_DATE=${BUILD_DATE}" >> "$GITHUB_OUTPUT"

          # Fall back to the workflow-level DEFAULT_T5X_IMAGE when the input is empty.
          T5X_IMAGE="${INPUT_T5X_IMAGE:-${DEFAULT_T5X_IMAGE}}"
          echo "T5X_IMAGE=${T5X_IMAGE}" >> "$GITHUB_OUTPUT"

  # Calls the reusable T5X test workflow. Runs on manual dispatch, or when the
  # triggering upstream workflow run concluded successfully.
  run-jobs:
    needs: metadata
    uses: ./.github/workflows/_test_t5x.yaml
    if: >-
      (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
      || github.event_name == 'workflow_dispatch'
    with:
      T5X_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
    secrets: inherit

  # Copies this run's TensorBoard logs into a dated folder on the tensorboard
  # host and writes a link to them into the job summary.
  # NOTE(review): the PUBLISH workflow_dispatch input is not referenced by any
  # job in this file, so this job is not gated on it -- confirm whether it should be.
  publish:
    needs: [metadata, run-jobs]
    runs-on: ubuntu-22.04
    steps:
      - name: Setup SSH agent
        uses: webfactory/ssh-agent@v0.8.0
        with:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

      - name: Setup SSH known hosts
        id: ssh-known-hosts
        run: |
          mkdir -p ~/.ssh
          cat >> ~/.ssh/known_hosts << EOF
          ${{ vars.SSH_KNOWN_HOSTS }}
          EOF
          chmod 600 ~/.ssh/known_hosts
          echo "FILE=$(realpath ~/.ssh/known_hosts)" >> "$GITHUB_OUTPUT"

      - name: Setup SSH config
        id: ssh-config
        run: |
          mkdir -p ~/.ssh
          cat >> ~/.ssh/config << EOF
          ${{ vars.SSH_CONFIG }}
          EOF
          chmod 600 ~/.ssh/config

      - name: Create dated folder and generate TensorBoard query URL
        id: mkdir
        shell: bash -x -e {0}
        run: |
          FOLDER="${{ needs.metadata.outputs.BUILD_DATE }}/T5X"
          # copy folder
          ssh -T tensorboard mkdir -p /tensorboard-logs/${FOLDER}
          ssh -T tensorboard rsync -rt /tensorboard-logs/${GITHUB_RUN_ID}/ /tensorboard-logs/${FOLDER}/
          # generate query URL; jq's @uri percent-encodes the folder for use as the regexInput filter
          (
          cat << EOF
          ## T5X MGMN nightly training: ${{ needs.metadata.outputs.BUILD_DATE }}
          [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per)
          EOF
          ) | tee "$GITHUB_STEP_SUMMARY"

      # Disabled: per-test-case metric summary table (kept for reference).
      # - uses: actions/setup-python@v4
      #   with:
      #     python-version: '3.x'
      # - name: Generate job summary
      #   id: metric
      #   shell: python {0} >> $GITHUB_STEP_SUMMARY
      #   run: |
      #     import re
      #     import pandas as pd
      #     metrics = pd.DataFrame([
      #       # Extract `metric` and `value` from `timings/metric=value`
      #       {re.split('=|/',s)[1] : float(re.split('=|/',s)[2]) for s in stat}
      #       for stat in re.findall(
      #         r".*collection=train .*"
      #         r"(timing/seconds=[\d.]+), (timing/seqs=[\d.]+), (timing/seqs_per_second=[\d.]+), "
      #         r"(timing/seqs_per_second_per_core=[\d.]+), (timing/steps_per_second=[\d.]+), "
      #         r"(timing/target_tokens_per_second=[\d.]+), (timing/target_tokens_per_second_per_core=[\d.]+).*",
      #         open('output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log').read()
      #       )
      #     ])
      #     summary = pd.DataFrame(metrics.tail(5).mean(axis=0)).transpose()
      #     print(summary.to_markdown(index=False))

  # Fails this workflow run when the triggering upstream workflow run failed.
  # A workflow_run event can never also be a workflow_dispatch event, so the
  # event-name test alone is sufficient.
  if-upstream-failed:
    runs-on: ubuntu-latest
    if: github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure'
    steps:
      - run: echo 'Upstream workflow failed, aborting run' && exit 1