# Skip to content
#
# Nightly T5X MGMN performance test #143
#
# Nightly T5X MGMN performance test
#
# Nightly T5X MGMN performance test #143

name: Nightly T5X MGMN performance test

# Triggers: completion of the "Nightly T5X build" workflow on main, or a
# manual dispatch with an optional image override.
on:
  workflow_run:
    workflows:
      - Nightly T5X build
    types:
      - completed
    branches:
      - main
  workflow_dispatch:
    inputs:
      T5X_IMAGE:
        description: T5X container
        type: string
        required: true
        default: 'ghcr.io/nvidia/t5x:latest'
      PUBLISH:
        description: Publish dated results to tensorboard server?
        type: boolean
        required: false
        default: false
# Token permissions requested for this workflow.
permissions:
  # to fetch code
  contents: read
  # to cancel previous workflows
  actions: write
  # to upload container
  packages: write
# Workflow-wide environment; DEFAULT_T5X_IMAGE is the fallback container image
# used by the metadata job when no T5X_IMAGE input is supplied.
env:
  DEFAULT_T5X_IMAGE: 'ghcr.io/nvidia/t5x:latest'
jobs:
  # Computes the values shared by the downstream jobs: the build date
  # (US/Los_Angeles time) and the container image to test.
  metadata:
    runs-on: ubuntu-22.04
    outputs:
      BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }}
      T5X_IMAGE: ${{ steps.date.outputs.T5X_IMAGE }}
    steps:
      - name: Set metadata
        id: date
        # -x traces each command, -e aborts on the first failing command.
        shell: bash -x -e {0}
        env:
          # Hand the dispatch input to the script through the environment
          # rather than interpolating it into the script text, so its content
          # is never parsed as shell code.
          INPUT_T5X_IMAGE: ${{ inputs.T5X_IMAGE }}
        run: |
          BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d')
          echo "BUILD_DATE=${BUILD_DATE}" >> "$GITHUB_OUTPUT"
          # Fall back to the workflow-level default when the input is empty
          # (DEFAULT_T5X_IMAGE comes from the workflow's top-level env).
          T5X_IMAGE="${INPUT_T5X_IMAGE:-${DEFAULT_T5X_IMAGE}}"
          echo "T5X_IMAGE=${T5X_IMAGE}" >> "$GITHUB_OUTPUT"

  # Calls the reusable T5X test workflow with the selected image. Runs for
  # manual dispatches, or when the triggering workflow run succeeded.
  run-jobs:
    needs: metadata
    if: >-
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'workflow_run' &&
      github.event.workflow_run.conclusion == 'success')
    uses: ./.github/workflows/_test_t5x.yaml
    with:
      T5X_IMAGE: ${{ needs.metadata.outputs.T5X_IMAGE }}
    secrets: inherit

  # Copies this run's TensorBoard logs into a dated folder on the host that
  # the SSH config names `tensorboard`, then writes a link to the job summary.
  # NOTE(review): the workflow_dispatch input PUBLISH is not referenced by this
  # job; it has no `if:` of its own and only depends on `needs`. Confirm
  # whether publishing should be gated on that input.
  publish:
    needs: [metadata, run-jobs]
    runs-on: ubuntu-22.04
    steps:
      - name: Setup SSH agent
        uses: webfactory/ssh-agent@v0.8.0
        with:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

      - name: Setup SSH known hosts
        id: ssh-known-hosts
        # The heredoc delimiter is unquoted, so the shell still expands any
        # `$` or backtick sequences present in the variable's text.
        run: |
          mkdir -p ~/.ssh
          cat >> ~/.ssh/known_hosts << EOF
          ${{ vars.SSH_KNOWN_HOSTS }}
          EOF
          chmod 600 ~/.ssh/known_hosts
          echo "FILE=$(realpath ~/.ssh/known_hosts)" >> "$GITHUB_OUTPUT"

      - name: Setup SSH config
        id: ssh-config
        run: |
          mkdir -p ~/.ssh
          cat >> ~/.ssh/config << EOF
          ${{ vars.SSH_CONFIG }}
          EOF
          chmod 600 ~/.ssh/config

      - name: Create dated folder and generate TensorBoard query URL
        id: mkdir
        shell: bash -x -e {0}
        run: |
          FOLDER="${{ needs.metadata.outputs.BUILD_DATE }}/T5X"
          # copy folder
          ssh -T tensorboard mkdir -p /tensorboard-logs/${FOLDER}
          ssh -T tensorboard rsync -rt /tensorboard-logs/${GITHUB_RUN_ID}/ /tensorboard-logs/${FOLDER}/
          # generate query URL; jq URI-encodes the folder name for the regex filter
          (
          cat << EOF
          ## T5X MGMN nightly training: ${{ needs.metadata.outputs.BUILD_DATE }}
          [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per)
          EOF
          ) | tee -a "$GITHUB_STEP_SUMMARY"

      # Disabled: per-metric job summary built from the training log.
      # NOTE(review): it reads steps.meta.outputs.TEST_CASE_NAME, but no step
      # with id `meta` exists in this job; fix that before re-enabling.
      # - uses: actions/setup-python@v4
      #   with:
      #     python-version: '3.x'
      # - name: Generate job summary
      #   id: metric
      #   shell: python {0} >> $GITHUB_STEP_SUMMARY
      #   run: |
      #     import re
      #     import pandas as pd
      #     metrics = pd.DataFrame([
      #       # Extract `metric` and `value` from `timings/metric=value`
      #       {re.split('=|/',s)[1] : float(re.split('=|/',s)[2]) for s in stat}
      #       for stat in re.findall(
      #         r".*collection=train .*"
      #         r"(timing/seconds=[\d.]+), (timing/seqs=[\d.]+), (timing/seqs_per_second=[\d.]+), "
      #         r"(timing/seqs_per_second_per_core=[\d.]+), (timing/steps_per_second=[\d.]+), "
      #         r"(timing/target_tokens_per_second=[\d.]+), (timing/target_tokens_per_second_per_core=[\d.]+).*",
      #         open('output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log').read()
      #       )
      #     ])
      #     summary = pd.DataFrame(metrics.tail(5).mean(axis=0)).transpose()
      #     print(summary.to_markdown(index=False))

  # Marks the run as failed when the triggering workflow run itself failed.
  # The event-name test already excludes manual dispatches, so no separate
  # `!= 'workflow_dispatch'` clause is needed.
  if-upstream-failed:
    runs-on: ubuntu-latest
    if: github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure'
    steps:
      - run: echo 'Upstream workflow failed, aborting run' && exit 1