# Job `metrics`: summarize the multi-node run artifacts and compare them to the stored baselines.
  metrics:
    needs: multi-gpu-multi-node
    runs-on: ubuntu-22.04

    steps:
      - name: Check out the repository under ${GITHUB_WORKSPACE}
        uses: actions/checkout@v3

      - name: Download artifacts
        uses: actions/download-artifact@v3

      - name: Run pytest
        shell: bash -x {0}
        run: |
          pip install pytest pytest-reportlog tensorboard

          # Unmatched globs expand to nothing instead of being iterated as a literal pattern.
          shopt -s nullglob

          # Artifact directories are named <run id>-<config>; move each config's
          # outputs into the workspace root and summarize them.
          for i in "${GITHUB_RUN_ID}"-*DP*TP*PP; do
            SUBDIR=$(echo "$i" | cut -d'-' -f2)
            mv "$i/$SUBDIR"* .
            python3 .github/workflows/baselines/summarize_metrics.py "$SUBDIR" # create result json in baseline format
          done

          # Publish every summary as a fenced JSON block in the job summary.
          echo '## PAX MGMN Test Metrics' >> "$GITHUB_STEP_SUMMARY"
          for i in *_metrics.json; do
            echo "$i" | cut -d'.' -f1
            echo '```json'
            jq . "$i"
            echo '```'
          done | tee -a "$GITHUB_STEP_SUMMARY"

          # `|| true` keeps this step green when tests fail; the outcome is still
          # recorded in report.jsonl, which the next step uploads.
          RESULTS_DIR="$PWD" pytest --report-log=report.jsonl .github/workflows/baselines/test_pax_mgmn_metrics.py || true

      - name: Upload metrics test json logs
        uses: actions/upload-artifact@v3
        with:
          name: metrics-test-log
          path: report.jsonl
+{"start_step":100,"end_step":500,"step_interval":100,"loss_values":[0.00046839181838246685,2.087271119914173e-05,1.31276870736959e-06,6.912159155233096e-11,0],"step_times":[6.357304414113362,5.979689915974935,6.376240253448486,6.373825391133626,6.355693658192952],"step_time_avg":6.288550726572673,"e2e_time_seconds":295.73600000000005,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160692471/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160694203/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160696525/artifacts"],"date":"2023-07-18"} diff --git a/.github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json new file mode 100644 index 000000000..a994227aa --- /dev/null +++ b/.github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json @@ -0,0 +1 @@ +{"start_step":100,"end_step":500,"step_interval":100,"loss_values":[0.0004682748403865844,2.090286701180351e-05,1.3127760970140419e-06,5.8207657444020455e-11,0],"step_times":[6.688000361124675,6.699192523956299,6.694862047831218,6.698123772939046,6.700749556223552],"step_time_avg":6.6961856524149574,"e2e_time_seconds":223.268,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160692471/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160694203/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160696525/artifacts"],"date":"2023-07-18"} diff --git a/.github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json b/.github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json new file mode 100644 index 000000000..92caac7c4 --- /dev/null +++ b/.github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json @@ -0,0 +1 @@ 
def main():
    """Average per-run metrics for one config and write ``<config>.json``.

    Command line: ``average_baselines.py CONFIG RUN_DIR [RUN_DIR ...]``.

    Every ``RUN_DIR`` must contain ``<CONFIG>_metrics.json``. The files must
    agree on ``start_step``, ``end_step`` and ``step_interval``. The first
    file is used as a template; its ``loss_values``, ``step_times``,
    ``step_time_avg`` and ``e2e_time_seconds`` entries are replaced by the
    mean over all runs, and the result is written to ``<CONFIG>.json`` in the
    current working directory.

    Exits with a non-zero status (and a message on stderr) when too few
    arguments are given or when the step layout differs between runs.
    """
    if len(sys.argv) < 3:
        # Same exit status as before (1), but say what was expected.
        sys.exit(f"Usage: {sys.argv[0]} CONFIG RUN_DIR [RUN_DIR ...]")

    config = sys.argv[1]
    run_dirs = sys.argv[2:]

    # Store metrics data as list of dicts
    src_data = []
    for run_dir in run_dirs:
        with open(f"{run_dir}/{config}_metrics.json", "r") as f:
            src_data.append(json.load(f))

    # Ensure start step, end step, interval equal across runs.
    # An explicit exit is used instead of `assert`, which is skipped under -O.
    for k in ["start_step", "end_step", "step_interval"]:
        values = [metrics[k] for metrics in src_data]
        print("checking equality for", k)
        print(values)
        if any(v != values[0] for v in values):
            sys.exit(f"{k} differs across runs {run_dirs}: {values}")

    # Use first metrics dict as a template
    avg_data = src_data[0].copy()

    # Per-step series: average element-wise across runs.
    for key in ("loss_values", "step_times"):
        per_run = np.array([metrics[key] for metrics in src_data])
        # tolist() yields plain Python floats rather than NumPy scalars.
        avg_data[key] = np.mean(per_run, axis=0).tolist()

    # Scalars: plain mean across runs.
    for key in ("step_time_avg", "e2e_time_seconds"):
        avg_data[key] = float(np.mean([metrics[key] for metrics in src_data]))

    # save to file
    with open(config + ".json", "w") as f:
        json.dump(avg_data, f)
#!/bin/bash
# Download and unpack every artifact of the given GitHub Actions workflow runs.
#
# Usage: download_artifacts.sh WORKFLOW_IDS...
#
# For each workflow run id a directory of the same name is created in the
# current directory and all artifacts of that run are unzipped into it.
# Requires GH_TOKEN (a GitHub access token) in the environment, plus curl, jq
# and unzip on PATH.

set -euo pipefail

usage() {
    echo -e "Usage: ${0} WORKFLOW_IDS..."
    exit 1
}

# ${GH_TOKEN:-} keeps the check working under `set -u` when the variable is unset.
if [[ -z "${GH_TOKEN:-}" ]]; then
    echo "GH_TOKEN env var must be set to download artifacts. Please export the GH_TOKEN var."
    echo "You can create a personal access token here: https://github.com/settings/tokens"
    echo "For more information, see GitHub official docs: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens"
    exit 1
fi

[ "$#" -ge "1" ] || usage

API_URL="https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs"

for WORKFLOW_RUN in "$@"; do
    mkdir -p "${WORKFLOW_RUN}"
    pushd "${WORKFLOW_RUN}"

    # cURL the list of artifacts. per_page=100 requests the largest page the
    # API offers; --fail turns HTTP errors into a non-zero exit instead of
    # feeding an error document to jq.
    ARTIFACTS=$(curl --fail -L \
        -H "Accept: application/vnd.github+json" \
        -H "Authorization: Bearer ${GH_TOKEN}" \
        -H "X-GitHub-Api-Version: 2022-11-28" \
        "${API_URL}/${WORKFLOW_RUN}/artifacts?per_page=100")

    # total_count may exceed the number of entries on this page; iterate only
    # over what was actually returned and warn about the remainder.
    TOTAL=$(jq -r '.total_count' <<< "${ARTIFACTS}")
    LISTED=$(jq -r '.artifacts | length' <<< "${ARTIFACTS}")
    if [[ "${LISTED}" -lt "${TOTAL}" ]]; then
        echo "Warning: run ${WORKFLOW_RUN} has ${TOTAL} artifacts but only ${LISTED} were listed; the rest are skipped." >&2
    fi

    # Download artifacts. Name/URL pairs are read as tab-separated records on
    # fd 3 so names containing spaces survive and stdin stays free for unzip.
    while IFS=$'\t' read -r -u 3 N U; do
        curl --fail -L \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer ${GH_TOKEN}" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            --output "${N}.zip" \
            "${U}"

        unzip "${N}.zip"
        rm "${N}.zip"
    done 3< <(jq -r '.artifacts[] | [.name, .archive_download_url] | @tsv' <<< "${ARTIFACTS}")

    popd
done
# Per-config tolerance for the throughput check: test_step_time requires
# measured average > baseline average * multiplier.
STEP_TIME_MULT = {
    "1DP1TP1PP": 0.95,
    "8DP1TP1PP": 0.95,
    "1DP8TP1PP": 0.95,
    "2DP1TP4PP": 0.95,
    "16DP1TP1PP": 0.95,
    "2DP2TP4PP": 0.95,
}
# Per-config tolerance for the wall-clock check: test_e2e_time requires
# measured time < baseline time / multiplier.
E2E_TIME_MULT = {
    "1DP1TP1PP": 0.95,
    "8DP1TP1PP": 0.95,
    "1DP8TP1PP": 0.95,
    "2DP1TP4PP": 0.95,
    "16DP1TP1PP": 0.95,
    "2DP2TP4PP": 0.95,
}
# Directory containing this test module.
test_dir = os.path.dirname(os.path.abspath(__file__))
# The baseline JSON files live in the PAX_MGMN directory next to this module
# (not in the parent directory), so resolve them relative to test_dir itself.
baselines_dir = os.path.join(test_dir, "PAX_MGMN")
# Root of the run outputs to check; None when RESULTS_DIR is not exported.
results_dir = os.environ.get("RESULTS_DIR")
# TensorBoard tags read from the training event files.
loss_summary_name = "loss"
step_time_summary_name = "Steps/sec"
def _load_baseline(baseline_filename):
    """Return ``(test_config, baseline)`` for one file in ``baselines_dir``.

    ``test_config`` is the file name up to its first ``.``; ``baseline`` is
    the parsed JSON content of the file.
    """
    test_config = baseline_filename.split(".")[0]
    baseline_filepath = os.path.join(baselines_dir, baseline_filename)
    with open(baseline_filepath, "r") as baseline_file:
        return test_config, json.load(baseline_file)


@pytest.mark.parametrize("baseline_filename", os.listdir(baselines_dir))
def test_step_time(baseline_filename):
    """Check the mean of the ``Steps/sec`` summary against the baseline.

    The measured mean must exceed the baseline ``step_time_avg`` scaled by
    the config's ``STEP_TIME_MULT`` entry.
    """
    test_config, baseline = _load_baseline(baseline_filename)
    event_pattern = os.path.join(results_dir, test_config, "summaries/train/events*")
    event_files = glob.glob(event_pattern)
    # Fail with a readable message instead of an IndexError when the run
    # produced no event file.
    assert event_files, f"no TensorBoard event file matches {event_pattern}"

    step_time_values = test_utils.read_tb_tag(event_files[0], step_time_summary_name).values()
    step_time_avg_actual = mean(step_time_values)
    step_time_avg_expected = baseline["step_time_avg"]
    assert step_time_avg_actual > step_time_avg_expected * STEP_TIME_MULT[test_config]


@pytest.mark.parametrize("baseline_filename", os.listdir(baselines_dir))
def test_e2e_time(baseline_filename):
    """Check the wall-clock time in ``<config>.log`` against the baseline.

    The measured time must be below the baseline ``e2e_time_seconds`` divided
    by the config's ``E2E_TIME_MULT`` entry.
    """
    test_config, baseline = _load_baseline(baseline_filename)
    run_log = os.path.join(results_dir, test_config + ".log")
    e2e_time_expected = baseline["e2e_time_seconds"]
    e2e_time_actual = test_utils.read_e2e_time(run_log)
    # read_e2e_time reports "no timing line found" as a large negative number,
    # which would otherwise satisfy the upper-bound check below vacuously.
    assert e2e_time_actual >= 0, f"no wall-clock time found in {run_log}"
    assert e2e_time_actual < e2e_time_expected / E2E_TIME_MULT[test_config]
def read_e2e_time(log_file: str) -> float:
    """Return the wall-clock time, in seconds, recorded in *log_file*.

    The log is scanned for the first line of the form ``real <M>m<S>s``
    (e.g. ``real    2m3.500s``); the result is ``M * 60 + S``. Lines that
    merely begin with the letters "real" but do not have that shape are
    ignored rather than causing a parse error.

    Args:
        log_file: Path of the text log to scan.

    Returns:
        The elapsed time in seconds, or ``-100000000`` when the log contains
        no matching line (callers rely on this sentinel).
    """
    import re

    # Anchored at line start (re.match). A decimal comma is accepted as well
    # as a decimal point.
    timing = re.compile(r"real\s+(\d+)m(\d+(?:[.,]\d*)?)s")

    with open(log_file, "r") as log:
        for line in log:
            found = timing.match(line)
            if found:
                minutes, seconds = found.groups()
                return float(minutes) * 60 + float(seconds.replace(",", "."))
    return -100000000