Fix metrics for all MGMN tests (#622)
After #509 was merged, the metrics were broken because the experiment
directory names contained an unexpected number of hyphens. This change
obtains the job name more robustly by splitting on GITHUB_RUN_ID, which
every experiment directory name should contain.
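
For example, given a hypothetical experiment dir named
upstream-pax-1234567890-pax-2N8G (the run ID and job name here are made up),
splitting on the run ID recovers the full job name even though it contains a
hyphen, whereas picking a fixed hyphen-delimited field does not:

  i=upstream-pax-1234567890-pax-2N8G                  # ${FW_NAME}-${GITHUB_RUN_ID}-<job name>
  GITHUB_RUN_ID=1234567890
  echo $i | cut -d'-' -f3                             # old approach -> "1234567890" (wrong field)
  echo $i | awk -F "${GITHUB_RUN_ID}-" '{print $2}'   # new approach -> "pax-2N8G"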

* Introduces `--output_json_path` to make the output location explicit (see the example after this list)
* Fixes rosetta-t5x, which still used v3 actions and caused issues when
  downloading artifacts
* Adds size_guidance to the TB EventAccumulator to ensure all steps are
loaded
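
As a rough sketch of the first bullet (the directory and job name below are
illustrative), the metrics JSON path is now passed explicitly rather than
derived from the test config name, and the script refuses to overwrite an
existing file:

  python3 .github/workflows/baselines/summarize_metrics.py \
    upstream-pax-1234567890-pax-2N8G/pax-2N8G \
    --output_json_path pax-2N8G_metrics.json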

---------

Co-authored-by: Yu-Hang "Maxin" Tang <Tang.Maxin@gmail.com>
terrykong and yhtang authored Mar 8, 2024
1 parent 9c2ae8f commit 9cced45
Showing 7 changed files with 52 additions and 64 deletions.
14 changes: 8 additions & 6 deletions .github/workflows/_test_maxtext.yaml
@@ -27,7 +27,7 @@ on:
  type: string
  description: 'Name of the framework being used'
  required: false
- default: 'MaxText'
+ default: 'upstream-maxtext'
  outputs:
  TEST_STATUS:
  description: 'Summary pass/fail value indicating if results from tests are acceptable'
@@ -343,21 +343,23 @@ jobs:
  uses: actions/download-artifact@v4

  - name: Run pytest
- shell: bash -x {0}
+ shell: bash -eux {0}
  run: |
  pip install pytest pytest-reportlog tensorboard
  for i in ${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-*; do
- SUBDIR=$(echo $i | cut -d'-' -f3)
- mv $i/$SUBDIR* .
- python3 .github/workflows/baselines/summarize_metrics.py $SUBDIR --loss_summary_name "learning/loss" --perf_summary_name "perf/step_time_seconds" # create result json in baseline format
+ JOB_NAME=$(echo $i | awk -F "${GITHUB_RUN_ID}-" '{print $2}')
+ METRIC_PATH=${JOB_NAME}_metrics.json
+ python3 .github/workflows/baselines/summarize_metrics.py $i/$JOB_NAME --loss_summary_name "learning/loss" --perf_summary_name "perf/step_time_seconds" --output_json_path $METRIC_PATH
+ # Test script expects the job dir and the log to be in the CWD
+ mv $i/$JOB_NAME $i/${JOB_NAME}.log .
  done
  RESULTS_DIR=$PWD BASELINES_DIR=MAXTEXT/upstream pytest --report-log=report.jsonl .github/workflows/baselines/test_maxtext_metrics.py || true
  - name: Upload metrics test json logs
  uses: actions/upload-artifact@v4
  with:
- name: upstream-${{ inputs.FW_NAME }}-metrics-test-log
+ name: ${{ inputs.FW_NAME }}-metrics-test-log
  path: |
  report.jsonl
  *_metrics.json
27 changes: 6 additions & 21 deletions .github/workflows/_test_pax_rosetta.yaml
@@ -1033,32 +1033,17 @@ jobs:
  uses: actions/download-artifact@v4

  - name: Run pytest
- shell: bash -x {0}
+ shell: bash -eux {0}
  run: |
  pip install pytest pytest-reportlog tensorboard
  for i in ${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-*; do
- SUBDIR=$(echo $i | rev | cut -d'-' -f1 | rev)
- mv $i/$SUBDIR* .
- python3 .github/workflows/baselines/summarize_metrics.py $SUBDIR # create result json in baseline format
+ JOB_NAME=$(echo $i | awk -F "${GITHUB_RUN_ID}-" '{print $2}')
+ METRIC_PATH=${JOB_NAME}_metrics.json
+ python3 .github/workflows/baselines/summarize_metrics.py $i/$JOB_NAME --output_json_path $METRIC_PATH
+ # Test script expects the job dir and the log to be in the CWD
+ mv $i/$JOB_NAME $i/${JOB_NAME}.log .
  done
- echo '## Rosetta PAX MGMN Test Metrics' >> $GITHUB_STEP_SUMMARY
- python <<EOF | tee -a $GITHUB_STEP_SUMMARY
- import json
- files = "$(echo *_metrics.json)".split()
- header = None
- print_row = lambda lst: print('| ' + ' | '.join(str(el) for el in lst) + ' |')
- for path in files:
-     with open(path) as f:
-         obj = json.loads(f.read())
-     if not header:
-         header = list(obj.keys())
-         print_row(["Job Name"] + header)
-         print_row(["---"] * (1+len(header)))
-     job_name = path[:-len('_metrics.json')]
-     print_row([job_name] + [obj[h] for h in header])
- EOF
  RESULTS_DIR=$PWD BASELINES_DIR=PAX_MGMN/rosetta pytest --report-log=report.jsonl .github/workflows/baselines/test_pax_mgmn_metrics.py || true
  - name: Upload metrics test json logs
33 changes: 9 additions & 24 deletions .github/workflows/_test_t5x_rosetta.yaml
@@ -784,42 +784,27 @@ jobs:

  steps:
  - name: Check out the repository under ${GITHUB_WORKSPACE}
- uses: actions/checkout@v3
+ uses: actions/checkout@v4

  - name: Download artifacts
- uses: actions/download-artifact@v3
+ uses: actions/download-artifact@v4

  - name: Run pytest
- shell: bash -x {0}
+ shell: bash -eux {0}
  run: |
  pip install pytest pytest-reportlog tensorboard
  for i in ${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-* ${{ inputs.FW_NAME }}-vit-${GITHUB_RUN_ID}-*; do
- SUBDIR=$(echo $i | rev | cut -d'-' -f1 | rev)
- mv $i/$SUBDIR* .
- python3 .github/workflows/baselines/summarize_metrics.py $SUBDIR # create result json in baseline format
+ JOB_NAME=$(echo $i | awk -F "${GITHUB_RUN_ID}-" '{print $2}')
+ METRIC_PATH=${JOB_NAME}_metrics.json
+ python3 .github/workflows/baselines/summarize_metrics.py $i/$JOB_NAME --perf_summary_name "timing/steps_per_second" --output_json_path $METRIC_PATH
+ # Test script expects the job dir and the log to be in the CWD
+ mv $i/$JOB_NAME $i/${JOB_NAME}.log .
  done
- echo '## Rosetta T5X MGMN Test Metrics' >> $GITHUB_STEP_SUMMARY
- python <<EOF | tee -a $GITHUB_STEP_SUMMARY
- import json
- files = "$(echo *_metrics.json)".split()
- header = None
- print_row = lambda lst: print('| ' + ' | '.join(str(el) for el in lst) + ' |')
- for path in files:
-     with open(path) as f:
-         obj = json.loads(f.read())
-     if not header:
-         header = list(obj.keys())
-         print_row(["Job Name"] + header)
-         print_row(["---"] * (1+len(header)))
-     job_name = path[:-len('_metrics.json')]
-     print_row([job_name] + [obj[h] for h in header])
- EOF
  RESULTS_DIR=$PWD BASELINES_DIR=T5X_MGMN/rosetta pytest --report-log=report.jsonl .github/workflows/baselines/test_t5x_mgmn_metrics.py || true
  - name: Upload metrics test json logs
- uses: actions/upload-artifact@v3
+ uses: actions/upload-artifact@v4
  with:
  name: ${{ inputs.FW_NAME }}-metrics-test-log
  path: |
10 changes: 6 additions & 4 deletions .github/workflows/_test_upstream_pax.yaml
@@ -486,13 +486,15 @@ jobs:
  uses: actions/download-artifact@v4

  - name: Run pytest
- shell: bash -x {0}
+ shell: bash -eux {0}
  run: |
  pip install pytest pytest-reportlog tensorboard
  for i in ${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-*; do
- SUBDIR=$(echo $i | cut -d'-' -f3)
- mv $i/$SUBDIR* .
- python3 .github/workflows/baselines/summarize_metrics.py $SUBDIR # create result json in baseline format
+ JOB_NAME=$(echo $i | awk -F "${GITHUB_RUN_ID}-" '{print $2}')
+ METRIC_PATH=${JOB_NAME}_metrics.json
+ python3 .github/workflows/baselines/summarize_metrics.py $i/$JOB_NAME --output_json_path $METRIC_PATH
+ # Test script expects the job dir and the log to be in the CWD
+ mv $i/$JOB_NAME $i/${JOB_NAME}.log .
  done
  RESULTS_DIR=$PWD BASELINES_DIR=PAX_MGMN/upstream pytest --report-log=report.jsonl .github/workflows/baselines/test_pax_mgmn_metrics.py || true
10 changes: 6 additions & 4 deletions .github/workflows/_test_upstream_t5x.yaml
@@ -324,13 +324,15 @@ jobs:
  uses: actions/download-artifact@v4

  - name: Run pytest
- shell: bash -x {0}
+ shell: bash -eux {0}
  run: |
  pip install pytest pytest-reportlog tensorboard
  for i in ${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-*; do
- SUBDIR=$(echo $i | cut -d'-' -f3)
- mv $i/$SUBDIR* .
- python3 .github/workflows/baselines/summarize_metrics.py $SUBDIR --perf_summary_name "timing/steps_per_second" # create result json in baseline format
+ JOB_NAME=$(echo $i | awk -F "${GITHUB_RUN_ID}-" '{print $2}')
+ METRIC_PATH=${JOB_NAME}_metrics.json
+ python3 .github/workflows/baselines/summarize_metrics.py $i/$JOB_NAME --perf_summary_name "timing/steps_per_second" --output_json_path $METRIC_PATH
+ # Test script expects the job dir and the log to be in the CWD
+ mv $i/$JOB_NAME $i/${JOB_NAME}.log .
  done
  RESULTS_DIR=$PWD BASELINES_DIR=T5X_MGMN/upstream pytest --report-log=report.jsonl .github/workflows/baselines/test_t5x_mgmn_metrics.py || true
8 changes: 5 additions & 3 deletions .github/workflows/baselines/summarize_metrics.py
@@ -32,6 +32,7 @@ def main():
  help='Key in the tensorboard event file containing loss data')
  parser.add_argument('-p', '--perf_summary_name', default='Steps/sec',
  help='Key in the tensorboard event file containing perf data')
+ parser.add_argument('-o', '--output_json_path', type=str)
  args = parser.parse_args()

  try:
@@ -59,9 +60,10 @@ def main():
  e2e_time = read_e2e_time(args.test_config + ".log")

  baseline = _create_summary(loss, train_time, e2e_time)
- json_fname = args.test_config + "_metrics.json"
- print(f'JSON FILENAME: {json_fname}')
- with open(json_fname, "w") as f:
+ print(f'JSON FILENAME: {args.output_json_path}')
+ if os.path.exists(args.output_json_path):
+     raise FileExistsError(f"Did not expect --output_json_path={args.output_json_path} to exist. Delete it before proceeding.")
+ with open(args.output_json_path, "w") as f:
  json.dump(baseline, f, indent=2)

  except KeyError as e:
14 changes: 12 additions & 2 deletions .github/workflows/baselines/test_utils.py
@@ -2,8 +2,18 @@
  from tensorboard.util import tensor_util
  import numpy as np

+ # By default TB tries to be smart about what to load in memory to avoid OOM
+ # Since we expect every step to be there when we do our comparisons, we explicitly
+ # set the size guidance to 0 so that we load everything. It's okay given our tests
+ # are small/short.
+ size_guidance = {
+     event_accumulator.TENSORS: 0,
+     event_accumulator.SCALARS: 0,
+ }
+
+
  def read_tb_tag(tb_file: str, summary_name: str) -> dict:
-     ea = event_accumulator.EventAccumulator(tb_file)
+     ea = event_accumulator.EventAccumulator(tb_file, size_guidance)
      ea.Reload()

      return {
@@ -12,7 +22,7 @@ def read_tb_tag(tb_file: str, summary_name: str) -> dict:
  }

  def read_maxtext_tb_tag(tb_file: str, summary_name: str) -> dict:
-     ea = event_accumulator.EventAccumulator(tb_file)
+     ea = event_accumulator.EventAccumulator(tb_file, size_guidance)
      ea.Reload()

      return {
