From 8f3e5dcfe93d221ae0cb8c432110247e6c1f896b Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Mon, 3 Jul 2023 16:04:34 -0500 Subject: [PATCH 01/22] add baseline files --- .github/workflows/baselines/PAX_MGMN/16DP1TP1PP.json | 1 + .github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json | 1 + .github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json | 1 + .github/workflows/baselines/PAX_MGMN/8DP1TP1PP.json | 1 + 4 files changed, 4 insertions(+) create mode 100644 .github/workflows/baselines/PAX_MGMN/16DP1TP1PP.json create mode 100644 .github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json create mode 100644 .github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json create mode 100644 .github/workflows/baselines/PAX_MGMN/8DP1TP1PP.json diff --git a/.github/workflows/baselines/PAX_MGMN/16DP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/16DP1TP1PP.json new file mode 100644 index 000000000..b38a3d22f --- /dev/null +++ b/.github/workflows/baselines/PAX_MGMN/16DP1TP1PP.json @@ -0,0 +1 @@ +{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.00046839181838246685, 2.087271119914173e-05, 1.31276870736959e-06, 6.912159155233096e-11, 0.0], "step_times": [6.357304414113362, 5.979689915974935, 6.376240253448486, 6.373825391133626, 6.355693658192952], "step_time_avg": 6.288550726572673, "e2e_time_seconds": 295.73600000000005} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json new file mode 100644 index 000000000..e3f6480d9 --- /dev/null +++ b/.github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json @@ -0,0 +1 @@ +{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.0004682748403865844, 2.090286701180351e-05, 1.3127760970140419e-06, 5.8207657444020455e-11, 0.0], "step_times": [6.688000361124675, 6.699192523956299, 6.694862047831218, 6.698123772939046, 6.700749556223552], "step_time_avg": 6.6961856524149574, "e2e_time_seconds": 223.268} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json b/.github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json new file mode 100644 index 000000000..d21cf1759 --- /dev/null +++ b/.github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json @@ -0,0 +1 @@ +{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.00043803153675980866, 2.2190377421793528e-05, 1.4306265256891493e-06, 5.8207657444020455e-11, 0.0], "step_times": [2.357959032058716, 2.3574414253234863, 2.3560804526011148, 2.357269843419393, 2.3561060428619385], "step_time_avg": 2.3569713592529298, "e2e_time_seconds": 385.921} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/8DP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/8DP1TP1PP.json new file mode 100644 index 000000000..65a671036 --- /dev/null +++ b/.github/workflows/baselines/PAX_MGMN/8DP1TP1PP.json @@ -0,0 +1 @@ +{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.00046849539891506237, 2.0879013391095214e-05, 1.3132464952529215e-06, 5.8207657444020455e-11, 0.0], "step_times": [6.436240037282308, 6.217730363210042, 6.462920983632405, 6.463934898376465, 6.473924477895101], "step_time_avg": 6.4109501520792636, "e2e_time_seconds": 284.0213333333333} \ No newline at end of file From 22e9e2c46d23f2d99b8d2976f2b3776e25eb1e60 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 5 Jul 2023 22:46:18 -0500 Subject: [PATCH 02/22] add pytest files --- .../baselines/pytest/test_pax_mgmn_metrics.py | 49 +++++++++++++++++++ .../workflows/baselines/pytest/test_utils.py | 25 ++++++++++ 2 files changed, 74 insertions(+) create mode 100644 .github/workflows/baselines/pytest/test_pax_mgmn_metrics.py create mode 100644 .github/workflows/baselines/pytest/test_utils.py diff --git a/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py b/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py new file mode 100644 index 000000000..db21bd8f1 --- /dev/null +++ b/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py @@ -0,0 +1,49 @@ +import pytest +import os +import json +import glob +import test_utils +from statistics import mean + +STEP_TIME_EPSILON = 1.0 +E2E_TIME_EPSILON = 30.0 +test_dir = os.path.dirname(os.path.abspath(__file__)) +baselines_dir = os.path.join(test_dir, "../PAX_MGMN") +results_dir = os.environ.get("RESULTS_DIR") +loss_summary_name = "loss" +step_time_summary_name = "Steps/sec" + +@pytest.mark.parametrize("baseline_filename", os.listdir(baselines_dir)) +def test_loss(baseline_filename): + baseline_filepath = os.path.join(baselines_dir, baseline_filename) + test_config = baseline_filename.split(".")[0] + event_file = os.path.join(results_dir, test_config, "summaries/train/events*") + event_file = glob.glob(event_file)[0] + with open(baseline_filepath, "r") as baseline_file: + end_step = json.load(baseline_file)["end_step"] + loss_actual = test_utils.read_tb_tag(event_file, loss_summary_name) + assert loss_actual[end_step] == 0 + + +@pytest.mark.parametrize("baseline_filename", os.listdir(baselines_dir)) +def test_step_time(baseline_filename): + baseline_filepath = os.path.join(baselines_dir, baseline_filename) + test_config = baseline_filename.split(".")[0] + event_file = os.path.join(results_dir, test_config, "summaries/train/events*") + event_file = glob.glob(event_file)[0] + with open(baseline_filepath, "r") as baseline_file: + step_time_avg_expected = json.load(baseline_file)["step_time_avg"] + step_time_values = test_utils.read_tb_tag(event_file, step_time_summary_name).values() + step_time_avg_actual = mean(step_time_values) + assert step_time_avg_expected + STEP_TIME_EPSILON > step_time_avg_actual + +@pytest.mark.parametrize("baseline_filename", os.listdir(baselines_dir)) +def test_e2e_time(baseline_filename): + baseline_filepath = os.path.join(baselines_dir, baseline_filename) + test_config = baseline_filename.split(".")[0] + run_log = os.path.join(results_dir, test_config + ".log") + with open(baseline_filepath, "r") as baseline_file: + e2e_time_expected = json.load(baseline_file)["e2e_time_seconds"] + e2e_time_actual = test_utils.read_e2e_time(run_log) + assert e2e_time_expected + E2E_TIME_EPSILON > e2e_time_actual + diff --git a/.github/workflows/baselines/pytest/test_utils.py b/.github/workflows/baselines/pytest/test_utils.py new file mode 100644 index 000000000..c746cb3c3 --- /dev/null +++ b/.github/workflows/baselines/pytest/test_utils.py @@ -0,0 +1,25 @@ +import sys +import json +import numpy as np +from tensorboard.backend.event_processing import event_accumulator +from tensorboard.util import tensor_util + + +def read_tb_tag(tb_file: str, summary_name: str) -> dict: + ea = event_accumulator.EventAccumulator(tb_file) + ea.Reload() + + return { + event.step: tensor_util.make_ndarray(event.tensor_proto).item() + for event in ea.Tensors(summary_name) + } + + +def read_e2e_time(log_file: str) -> float: + with open(log_file, "r") as log: + for line in log: + if line.startswith("real"): + minutes = line.split()[1].split('m')[0] + seconds = line.split('m')[1].split('s')[0] + return float(minutes) * 60 + float(seconds) + return -100000000 From 13f24d3a8f0fcf76e823a90dad504e0cd4586165 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 5 Jul 2023 23:05:17 -0500 Subject: [PATCH 03/22] add metrics check job in pax template --- .github/workflows/_sandbox.yaml | 156 +++++++++++++++++++++++++------ .github/workflows/_test_pax.yaml | 23 +++++ 2 files changed, 149 insertions(+), 30 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index 37fa6ca68..bed6dc68c 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -1,41 +1,137 @@ -name: "~Sandbox" +name: Nightly Pax MGMN performance test on: + workflow_run: + workflows: [Nightly Pax build] + types: [completed] + branches: [main] workflow_dispatch: + inputs: + PAX_IMAGE: + type: string + description: Pax container + default: 'ghcr.io/nvidia/pax:latest' + required: true + PUBLISH: + type: boolean + description: Publish dated results to tensorboard server? + default: false + required: false + +permissions: + contents: read # to fetch code + actions: write # to cancel previous workflows + packages: write # to upload container + +env: + DEFAULT_PAX_IMAGE: 'ghcr.io/nvidia/pax:latest' jobs: - sandbox: + + metadata: runs-on: ubuntu-22.04 + outputs: + BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} + PAX_IMAGE: ${{ steps.date.outputs.PAX_IMAGE }} + PUBLISH: ${{ steps.date.outputs.PUBLISH }} steps: - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 + - name: Set metadata + id: date + shell: bash -x -e {0} + run: | + BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') + echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT + + PAX_IMAGE=${{ inputs.PAX_IMAGE }} + PAX_IMAGE=${PAX_IMAGE:-${{ env.DEFAULT_PAX_IMAGE }}} + echo "PAX_IMAGE=${PAX_IMAGE}" >> $GITHUB_OUTPUT + echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT + + run-jobs: + needs: metadata + uses: ./.github/workflows/_test_pax.yaml + if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + with: + PAX_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }} + secrets: inherit + + publish: + needs: [metadata, run-jobs] + runs-on: ubuntu-22.04 + steps: + - name: Setup SSH agent + uses: webfactory/ssh-agent@v0.8.0 with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + + - name: Setup SSH known hosts + id: ssh-known-hosts + run: | + mkdir -p ~/.ssh + cat >> ~/.ssh/known_hosts << EOF + ${{ vars.SSH_KNOWN_HOSTS }} + EOF + chmod 600 ~/.ssh/known_hosts + echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT - - name: Print usage + - name: Setup SSH config + id: ssh-config run: | + mkdir -p ~/.ssh + cat >> ~/.ssh/config << EOF + ${{ vars.SSH_CONFIG }} + EOF + chmod 600 ~/.ssh/config + + - name: Create dated folder and generate TensorBoard query URL + id: mkdir + shell: bash -x -e {0} + run: | + FOLDER="${{ needs.metadata.outputs.BUILD_DATE }}/PAX" + # copy folder + ssh -T tensorboard mkdir -p /tensorboard-logs/${FOLDER} + ssh -T tensorboard rsync -rt /tensorboard-logs/${GITHUB_RUN_ID}/ /tensorboard-logs/${FOLDER}/ + # generate query URL + ( cat << EOF - This is an empty workflow file located in the main branch of your - repository. It serves as a testing ground for new GitHub Actions on - development branches before merging them to the main branch. By - defining and overloading this workflow on your development branch, - you can test new actions without affecting your main branch, ensuring - a smooth integration process once the changes are ready to be merged. - - Usage: - - 1. In your development branch, modify the sandbox.yml workflow file - to include the new actions you want to test. Make sure to commit - the changes to the development branch. - 2. Navigate to the 'Actions' tab in your repository, select the - '~Sandbox' workflow, and choose your development branch from the - branch dropdown menu. Click on 'Run workflow' to trigger the - workflow on your development branch. - 3. Once you have tested and verified the new actions in the Sandbox - workflow, you can incorporate them into your main workflow(s) and - merge the development branch into the main branch. Remember to - revert the changes to the sandbox.yml file in the main branch to - keep it empty for future testing. - EOF \ No newline at end of file + + ## PAX MGMN nightly training: ${{ needs.metadata.outputs.BUILD_DATE }} + + [view metrics](http://${{ vars.HOSTNAME_TENSORBOARD }}:6006/#scalars®exInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per) + + EOF + ) | tee $GITHUB_STEP_SUMMARY + + publish-completion: + needs: [metadata, run-jobs] + uses: ./.github/workflows/_publish_badge.yaml + if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' + secrets: inherit + with: + ENDPOINT_FILENAME: 'pax-test-completion-status.json' + PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }} + SCRIPT: | + EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*TP*PP/*-status.json" + PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) + FAILED_TESTS=$(jq -r '. | select ((.state == "FAILED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) + TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l) + + echo "Test statuses:" + jq -rc 'input_filename,.' $EXIT_STATUSES + + if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; then + BADGE_COLOR=brightgreen + elif [[ $PASSED_TESTS -eq 0 ]]; then + BADGE_COLOR=red + else + BADGE_COLOR=yellow + fi + echo "LABEL='Completion'" >> $GITHUB_OUTPUT + echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT + echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT + + if-upstream-failed: + runs-on: ubuntu-latest + if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch' + steps: + - run: echo 'Upstream workflow failed, aborting run' && exit 1 diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index b14e8f3d4..fd8a13ff8 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -148,6 +148,29 @@ jobs: name: ${{ steps.meta.outputs.JOB_NAME }} path: output/* + metrics: + runs-on: ubuntu-22.04 + + steps: + - name: Download artifacts + uses: actions/download-artifact@v3 + + - name: Check out the repository under ${GITHUB_WORKSPACE} + uses: actions/checkout@v3 + + - name: Run pytest + shell: bash -x {0} + run: | + pwd + ls + RESULTS_DIR=$PWD pytest --report-log=report.jsonl JAX-Toolbox/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py + + - name: Upload metrics test json logs + uses: actions/upload-artifact@v3 + with: + name: metrics-test-log + path: report.jsonl + summary: runs-on: ubuntu-22.04 From ea761ca77c465603d815e885d09729107e787dd8 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 5 Jul 2023 23:13:05 -0500 Subject: [PATCH 04/22] set needs --- .github/workflows/_test_pax.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index fd8a13ff8..04105cbbc 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -149,6 +149,7 @@ jobs: path: output/* metrics: + needs: multi-gpu-multi-node runs-on: ubuntu-22.04 steps: @@ -163,6 +164,7 @@ jobs: run: | pwd ls + pip install pytest pytest-reportlog RESULTS_DIR=$PWD pytest --report-log=report.jsonl JAX-Toolbox/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py - name: Upload metrics test json logs From 6696ff88e042bac4d0e5b5f2154f72adeaf9ff2d Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 6 Jul 2023 00:37:13 -0500 Subject: [PATCH 05/22] change order of steps and rename artifacts --- .github/workflows/_test_pax.yaml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index 04105cbbc..244589f91 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -153,19 +153,23 @@ jobs: runs-on: ubuntu-22.04 steps: - - name: Download artifacts - uses: actions/download-artifact@v3 - - name: Check out the repository under ${GITHUB_WORKSPACE} uses: actions/checkout@v3 + - name: Download artifacts + uses: actions/download-artifact@v3 + - name: Run pytest shell: bash -x {0} run: | pwd ls + for i in ${GITHUB_RUN_ID}-*DP*TP*PP; do + NEW_NAME=$(echo $i | cut -d'-' -f2) + mv $i $NEW_NAME + done pip install pytest pytest-reportlog - RESULTS_DIR=$PWD pytest --report-log=report.jsonl JAX-Toolbox/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py + RESULTS_DIR=$PWD pytest --report-log=report.jsonl .github/workflows/baselines/pytest/test_pax_mgmn_metrics.py - name: Upload metrics test json logs uses: actions/upload-artifact@v3 From 82fb1be8d13ff73dad3fbfa31ca52c490f1784b8 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 6 Jul 2023 01:49:09 -0500 Subject: [PATCH 06/22] remove unneeded imports --- .github/workflows/baselines/pytest/test_utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/baselines/pytest/test_utils.py b/.github/workflows/baselines/pytest/test_utils.py index c746cb3c3..ceed8b639 100644 --- a/.github/workflows/baselines/pytest/test_utils.py +++ b/.github/workflows/baselines/pytest/test_utils.py @@ -1,6 +1,3 @@ -import sys -import json -import numpy as np from tensorboard.backend.event_processing import event_accumulator from tensorboard.util import tensor_util From 6fb8c6a03a25bdbba333757790202d8175b45c0a Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 6 Jul 2023 01:49:55 -0500 Subject: [PATCH 07/22] move debug prints --- .github/workflows/_test_pax.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index 244589f91..3cda2a13d 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -162,13 +162,13 @@ jobs: - name: Run pytest shell: bash -x {0} run: | - pwd - ls for i in ${GITHUB_RUN_ID}-*DP*TP*PP; do NEW_NAME=$(echo $i | cut -d'-' -f2) mv $i $NEW_NAME done pip install pytest pytest-reportlog + pwd + ls RESULTS_DIR=$PWD pytest --report-log=report.jsonl .github/workflows/baselines/pytest/test_pax_mgmn_metrics.py - name: Upload metrics test json logs From 53cd89a1160ec1f208177dbbbdd0c0021fd7c472 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 6 Jul 2023 03:04:25 -0500 Subject: [PATCH 08/22] install tensorboard --- .github/workflows/_test_pax.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index 3cda2a13d..53d015c64 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -166,7 +166,7 @@ jobs: NEW_NAME=$(echo $i | cut -d'-' -f2) mv $i $NEW_NAME done - pip install pytest pytest-reportlog + pip install pytest pytest-reportlog tensorboard pwd ls RESULTS_DIR=$PWD pytest --report-log=report.jsonl .github/workflows/baselines/pytest/test_pax_mgmn_metrics.py From a6c50cb126f9c5c36c5b505c3f5b820709b82afc Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 6 Jul 2023 04:41:35 -0500 Subject: [PATCH 09/22] fix dir structure --- .github/workflows/_test_pax.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index 53d015c64..2bf27aadb 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -162,13 +162,12 @@ jobs: - name: Run pytest shell: bash -x {0} run: | + ls * for i in ${GITHUB_RUN_ID}-*DP*TP*PP; do - NEW_NAME=$(echo $i | cut -d'-' -f2) - mv $i $NEW_NAME + SUBDIR=$(echo $i | cut -d'-' -f2) + mv $i/$SUBDIR* . done pip install pytest pytest-reportlog tensorboard - pwd - ls RESULTS_DIR=$PWD pytest --report-log=report.jsonl .github/workflows/baselines/pytest/test_pax_mgmn_metrics.py - name: Upload metrics test json logs From bee8baec5bfe00180c5895288025f089ef2c2f73 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 6 Jul 2023 07:00:30 -0500 Subject: [PATCH 10/22] ignore pytest failure --- .github/workflows/_test_pax.yaml | 3 +-- .../workflows/baselines/pytest/test_pax_mgmn_metrics.py | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index 2bf27aadb..12777b406 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -162,13 +162,12 @@ jobs: - name: Run pytest shell: bash -x {0} run: | - ls * for i in ${GITHUB_RUN_ID}-*DP*TP*PP; do SUBDIR=$(echo $i | cut -d'-' -f2) mv $i/$SUBDIR* . done pip install pytest pytest-reportlog tensorboard - RESULTS_DIR=$PWD pytest --report-log=report.jsonl .github/workflows/baselines/pytest/test_pax_mgmn_metrics.py + RESULTS_DIR=$PWD pytest --report-log=report.jsonl .github/workflows/baselines/pytest/test_pax_mgmn_metrics.py || true - name: Upload metrics test json logs uses: actions/upload-artifact@v3 diff --git a/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py b/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py index db21bd8f1..068d2a596 100644 --- a/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py +++ b/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py @@ -5,8 +5,8 @@ import test_utils from statistics import mean -STEP_TIME_EPSILON = 1.0 -E2E_TIME_EPSILON = 30.0 +STEP_TIME_DELTA = 1.0 +E2E_TIME_DELTA = 30.0 test_dir = os.path.dirname(os.path.abspath(__file__)) baselines_dir = os.path.join(test_dir, "../PAX_MGMN") results_dir = os.environ.get("RESULTS_DIR") @@ -35,7 +35,7 @@ def test_step_time(baseline_filename): step_time_avg_expected = json.load(baseline_file)["step_time_avg"] step_time_values = test_utils.read_tb_tag(event_file, step_time_summary_name).values() step_time_avg_actual = mean(step_time_values) - assert step_time_avg_expected + STEP_TIME_EPSILON > step_time_avg_actual + assert step_time_avg_expected + STEP_TIME_DELTA > step_time_avg_actual @pytest.mark.parametrize("baseline_filename", os.listdir(baselines_dir)) def test_e2e_time(baseline_filename): @@ -45,5 +45,5 @@ def test_e2e_time(baseline_filename): with open(baseline_filepath, "r") as baseline_file: e2e_time_expected = json.load(baseline_file)["e2e_time_seconds"] e2e_time_actual = test_utils.read_e2e_time(run_log) - assert e2e_time_expected + E2E_TIME_EPSILON > e2e_time_actual + assert e2e_time_expected + E2E_TIME_DELTA > e2e_time_actual From 701761da98b10555b54bbf0e7766b4eb72b12c93 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Fri, 7 Jul 2023 12:25:04 -0500 Subject: [PATCH 11/22] restore sandbox --- .github/workflows/_sandbox.yaml | 156 ++++++-------------------------- 1 file changed, 30 insertions(+), 126 deletions(-) diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml index bed6dc68c..37fa6ca68 100644 --- a/.github/workflows/_sandbox.yaml +++ b/.github/workflows/_sandbox.yaml @@ -1,137 +1,41 @@ -name: Nightly Pax MGMN performance test +name: "~Sandbox" on: - workflow_run: - workflows: [Nightly Pax build] - types: [completed] - branches: [main] workflow_dispatch: - inputs: - PAX_IMAGE: - type: string - description: Pax container - default: 'ghcr.io/nvidia/pax:latest' - required: true - PUBLISH: - type: boolean - description: Publish dated results to tensorboard server? - default: false - required: false - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container - -env: - DEFAULT_PAX_IMAGE: 'ghcr.io/nvidia/pax:latest' jobs: - - metadata: + sandbox: runs-on: ubuntu-22.04 - outputs: - BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} - PAX_IMAGE: ${{ steps.date.outputs.PAX_IMAGE }} - PUBLISH: ${{ steps.date.outputs.PUBLISH }} steps: - - name: Set metadata - id: date - shell: bash -x -e {0} - run: | - BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - - PAX_IMAGE=${{ inputs.PAX_IMAGE }} - PAX_IMAGE=${PAX_IMAGE:-${{ env.DEFAULT_PAX_IMAGE }}} - echo "PAX_IMAGE=${PAX_IMAGE}" >> $GITHUB_OUTPUT - echo "PUBLISH=${{ inputs.PUBLISH }}" >> $GITHUB_OUTPUT - - run-jobs: - needs: metadata - uses: ./.github/workflows/_test_pax.yaml - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' - with: - PAX_IMAGE: ${{ needs.metadata.outputs.PAX_IMAGE }} - secrets: inherit - - publish: - needs: [metadata, run-jobs] - runs-on: ubuntu-22.04 - steps: - - name: Setup SSH agent - uses: webfactory/ssh-agent@v0.8.0 + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 with: - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - - - name: Setup SSH known hosts - id: ssh-known-hosts - run: | - mkdir -p ~/.ssh - cat >> ~/.ssh/known_hosts << EOF - ${{ vars.SSH_KNOWN_HOSTS }} - EOF - chmod 600 ~/.ssh/known_hosts - echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} - - name: Setup SSH config - id: ssh-config + - name: Print usage run: | - mkdir -p ~/.ssh - cat >> ~/.ssh/config << EOF - ${{ vars.SSH_CONFIG }} - EOF - chmod 600 ~/.ssh/config - - - name: Create dated folder and generate TensorBoard query URL - id: mkdir - shell: bash -x -e {0} - run: | - FOLDER="${{ needs.metadata.outputs.BUILD_DATE }}/PAX" - # copy folder - ssh -T tensorboard mkdir -p /tensorboard-logs/${FOLDER} - ssh -T tensorboard rsync -rt /tensorboard-logs/${GITHUB_RUN_ID}/ /tensorboard-logs/${FOLDER}/ - # generate query URL - ( cat << EOF - - ## PAX MGMN nightly training: ${{ needs.metadata.outputs.BUILD_DATE }} - - [view metrics](http://${{ vars.HOSTNAME_TENSORBOARD }}:6006/#scalars®exInput=$(jq -nr --arg url "${FOLDER}" '$url|@uri')&_smoothingWeight=0&tagFilter=seqs_per) - - EOF - ) | tee $GITHUB_STEP_SUMMARY - - publish-completion: - needs: [metadata, run-jobs] - uses: ./.github/workflows/_publish_badge.yaml - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') || github.event_name == 'workflow_dispatch' - secrets: inherit - with: - ENDPOINT_FILENAME: 'pax-test-completion-status.json' - PUBLISH: ${{ github.event_name == 'workflow_run' || needs.metadata.outputs.PUBLISH == 'true' }} - SCRIPT: | - EXIT_STATUSES="${GITHUB_RUN_ID}-*DP*TP*PP/*-status.json" - PASSED_TESTS=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) - FAILED_TESTS=$(jq -r '. | select ((.state == "FAILED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) - TOTAL_TESTS=$(ls $EXIT_STATUSES | wc -l) - - echo "Test statuses:" - jq -rc 'input_filename,.' $EXIT_STATUSES - - if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]] || [[ $PASSED_TESTS -eq $TOTAL_TESTS ]]; then - BADGE_COLOR=brightgreen - elif [[ $PASSED_TESTS -eq 0 ]]; then - BADGE_COLOR=red - else - BADGE_COLOR=yellow - fi - echo "LABEL='Completion'" >> $GITHUB_OUTPUT - echo "MESSAGE='${PASSED_TESTS}/${TOTAL_TESTS} passed'" >> $GITHUB_OUTPUT - echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT - - if-upstream-failed: - runs-on: ubuntu-latest - if: (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure') && github.event_name != 'workflow_dispatch' - steps: - - run: echo 'Upstream workflow failed, aborting run' && exit 1 + This is an empty workflow file located in the main branch of your + repository. It serves as a testing ground for new GitHub Actions on + development branches before merging them to the main branch. By + defining and overloading this workflow on your development branch, + you can test new actions without affecting your main branch, ensuring + a smooth integration process once the changes are ready to be merged. + + Usage: + + 1. In your development branch, modify the sandbox.yml workflow file + to include the new actions you want to test. Make sure to commit + the changes to the development branch. + 2. Navigate to the 'Actions' tab in your repository, select the + '~Sandbox' workflow, and choose your development branch from the + branch dropdown menu. Click on 'Run workflow' to trigger the + workflow on your development branch. + 3. Once you have tested and verified the new actions in the Sandbox + workflow, you can incorporate them into your main workflow(s) and + merge the development branch into the main branch. Remember to + revert the changes to the sandbox.yml file in the main branch to + keep it empty for future testing. + EOF \ No newline at end of file From 93fb42f6898b3e08a3f21e4c529fc9bd37136ad8 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 13 Jul 2023 18:25:18 -0500 Subject: [PATCH 12/22] add script for creating baseline/results json --- .../baselines/pytest/create_baseline.py | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 .github/workflows/baselines/pytest/create_baseline.py diff --git a/.github/workflows/baselines/pytest/create_baseline.py b/.github/workflows/baselines/pytest/create_baseline.py new file mode 100644 index 000000000..d8bd3f2c7 --- /dev/null +++ b/.github/workflows/baselines/pytest/create_baseline.py @@ -0,0 +1,52 @@ +import os +import json +import glob +import sys +import numpy as np +from test_utils import read_tb_tag, read_e2e_time + + +def _create_baseline(loss, train_time, e2e_time): + steps = list(loss.keys()) + intervals = [k2 - k1 for k1, k2 in zip(loss.keys(), steps[1:])] + assert all(i == intervals[0] for i in intervals) + + baseline = { + "start_step": steps[0], + "end_step": steps[-1], + "step_interval": intervals[0], + "loss_values": list(loss.values()), + "step_times": list(train_time.values()), + "step_time_avg": np.mean(list(train_time.values())), + "e2e_time_seconds": e2e_time, + } + return baseline + + +def main(): + loss_summary_name = "loss" + train_time_summary_name = "Steps/sec" + if sys.argv[1]: + test_config = sys.argv[1] + else: + sys.exit(1) + + try: + event_file = os.path.join(test_config, "summaries/train/events*") + event_file = glob.glob(event_file)[0] + loss = read_tb_tag(event_file, loss_summary_name) + train_time = read_tb_tag(event_file, train_time_summary_name) + e2e_time = read_e2e_time(test_config + ".log") + + baseline = _create_baseline(loss, train_time, e2e_time) + json_fname = test_config + "_metrics.json" + with open(json_fname, "w") as f: + json.dump(baseline, f) + + except KeyError as e: + print(e) + print("Run might have failed, see", test_config) + + +if __name__ == "__main__": + main() From 2a4803cfc1eb9e13f944aaa62c45c68cf4dbf479 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 13 Jul 2023 19:07:58 -0500 Subject: [PATCH 13/22] write metrics to step summary --- .github/workflows/_test_pax.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index 12777b406..2e1096def 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -165,7 +165,14 @@ jobs: for i in ${GITHUB_RUN_ID}-*DP*TP*PP; do SUBDIR=$(echo $i | cut -d'-' -f2) mv $i/$SUBDIR* . + python3 .github/workflows/baselines/pytest/create_baseline.py $SUBDIR # create result json in baseline format done + ( + cat << EOF + ## PAX MGMN Test Metrics + $(for i in *_metrics.json; do echo $i | cut -d'.' -f1; echo '```json'; jq . $i; echo '```'; done) + EOF + ) >> $GITHUB_STEP_SUMMARY pip install pytest pytest-reportlog tensorboard RESULTS_DIR=$PWD pytest --report-log=report.jsonl .github/workflows/baselines/pytest/test_pax_mgmn_metrics.py || true From aad5718b0493c33c4a5d430fe9c1c5a15ac9448e Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Fri, 14 Jul 2023 01:56:46 -0500 Subject: [PATCH 14/22] use stats instead of numpy --- .github/workflows/baselines/pytest/create_baseline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/baselines/pytest/create_baseline.py b/.github/workflows/baselines/pytest/create_baseline.py index d8bd3f2c7..8676c049d 100644 --- a/.github/workflows/baselines/pytest/create_baseline.py +++ b/.github/workflows/baselines/pytest/create_baseline.py @@ -2,7 +2,7 @@ import json import glob import sys -import numpy as np +from statistics import mean from test_utils import read_tb_tag, read_e2e_time @@ -17,7 +17,7 @@ def _create_baseline(loss, train_time, e2e_time): "step_interval": intervals[0], "loss_values": list(loss.values()), "step_times": list(train_time.values()), - "step_time_avg": np.mean(list(train_time.values())), + "step_time_avg": mean(list(train_time.values())), "e2e_time_seconds": e2e_time, } return baseline From e2f26fda50b75d288d0b0abd3556f9bf1223bf38 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Fri, 14 Jul 2023 14:09:11 -0500 Subject: [PATCH 15/22] move pip installs --- .github/workflows/_test_pax.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index 2e1096def..e11904557 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -162,6 +162,7 @@ jobs: - name: Run pytest shell: bash -x {0} run: | + pip install pytest pytest-reportlog tensorboard for i in ${GITHUB_RUN_ID}-*DP*TP*PP; do SUBDIR=$(echo $i | cut -d'-' -f2) mv $i/$SUBDIR* . From b8d8b87b7619f3931f5bbec72d603a1b4e1227b8 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Fri, 14 Jul 2023 14:09:42 -0500 Subject: [PATCH 16/22] avoid heredoc to write step summary --- .github/workflows/_test_pax.yaml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index e11904557..db786d0f8 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -168,13 +168,15 @@ jobs: mv $i/$SUBDIR* . python3 .github/workflows/baselines/pytest/create_baseline.py $SUBDIR # create result json in baseline format done - ( - cat << EOF - ## PAX MGMN Test Metrics - $(for i in *_metrics.json; do echo $i | cut -d'.' -f1; echo '```json'; jq . $i; echo '```'; done) - EOF - ) >> $GITHUB_STEP_SUMMARY - pip install pytest pytest-reportlog tensorboard + + echo '## PAX MGMN Test Metrics' >> $GITHUB_STEP_SUMMARY + for i in *_metrics.json; do + echo $i | cut -d'.' -f1 + echo '```json' + jq . $i + echo '```' + done | tee -a $GITHUB_STEP_SUMMARY + RESULTS_DIR=$PWD pytest --report-log=report.jsonl .github/workflows/baselines/pytest/test_pax_mgmn_metrics.py || true - name: Upload metrics test json logs From fe37b046ee1935a8b09d16f5a229b87012e910cf Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Fri, 14 Jul 2023 15:08:38 -0500 Subject: [PATCH 17/22] change delta to multiplier, fix conditions --- .../baselines/pytest/test_pax_mgmn_metrics.py | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py b/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py index 068d2a596..81afb289b 100644 --- a/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py +++ b/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py @@ -5,8 +5,22 @@ import test_utils from statistics import mean -STEP_TIME_DELTA = 1.0 -E2E_TIME_DELTA = 30.0 +STEP_TIME_MULT = { + "1DP1TP1PP": 0.95, + "8DP1TP1PP": 0.95, + "1DP8TP1PP": 0.95, + "2DP1TP4PP": 0.95, + "16DP1TP1PP": 0.95, + "2DP2TP4PP": 0.95, +} +E2E_TIME_MULT = { + "1DP1TP1PP": 0.95, + "8DP1TP1PP": 0.95, + "1DP8TP1PP": 0.95, + "2DP1TP4PP": 0.95, + "16DP1TP1PP": 0.95, + "2DP2TP4PP": 0.95, +} test_dir = os.path.dirname(os.path.abspath(__file__)) baselines_dir = os.path.join(test_dir, "../PAX_MGMN") results_dir = os.environ.get("RESULTS_DIR") @@ -35,7 +49,7 @@ def test_step_time(baseline_filename): step_time_avg_expected = json.load(baseline_file)["step_time_avg"] step_time_values = test_utils.read_tb_tag(event_file, step_time_summary_name).values() step_time_avg_actual = mean(step_time_values) - assert step_time_avg_expected + STEP_TIME_DELTA > step_time_avg_actual + assert step_time_avg_actual > step_time_avg_expected * STEP_TIME_MULT[test_config] @pytest.mark.parametrize("baseline_filename", os.listdir(baselines_dir)) def test_e2e_time(baseline_filename): @@ -45,5 +59,4 @@ def test_e2e_time(baseline_filename): with open(baseline_filepath, "r") as baseline_file: e2e_time_expected = json.load(baseline_file)["e2e_time_seconds"] e2e_time_actual = test_utils.read_e2e_time(run_log) - assert e2e_time_expected + E2E_TIME_DELTA > e2e_time_actual - + assert e2e_time_actual < e2e_time_expected / E2E_TIME_MULT[test_config] From 02c0ece5466e7a32f4f69435653210856cc642d4 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Tue, 18 Jul 2023 15:07:26 -0500 Subject: [PATCH 18/22] reorg and rename files --- .github/workflows/_test_pax.yaml | 4 ++-- .../{pytest/create_baseline.py => summarize_metrics.py} | 4 ++-- .../workflows/baselines/{pytest => }/test_pax_mgmn_metrics.py | 0 .github/workflows/baselines/{pytest => }/test_utils.py | 0 4 files changed, 4 insertions(+), 4 deletions(-) rename .github/workflows/baselines/{pytest/create_baseline.py => summarize_metrics.py} (92%) rename .github/workflows/baselines/{pytest => }/test_pax_mgmn_metrics.py (100%) rename .github/workflows/baselines/{pytest => }/test_utils.py (100%) diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml index db786d0f8..584110e93 100644 --- a/.github/workflows/_test_pax.yaml +++ b/.github/workflows/_test_pax.yaml @@ -166,7 +166,7 @@ jobs: for i in ${GITHUB_RUN_ID}-*DP*TP*PP; do SUBDIR=$(echo $i | cut -d'-' -f2) mv $i/$SUBDIR* . - python3 .github/workflows/baselines/pytest/create_baseline.py $SUBDIR # create result json in baseline format + python3 .github/workflows/baselines/summarize_metrics.py $SUBDIR # create result json in baseline format done echo '## PAX MGMN Test Metrics' >> $GITHUB_STEP_SUMMARY @@ -177,7 +177,7 @@ jobs: echo '```' done | tee -a $GITHUB_STEP_SUMMARY - RESULTS_DIR=$PWD pytest --report-log=report.jsonl .github/workflows/baselines/pytest/test_pax_mgmn_metrics.py || true + RESULTS_DIR=$PWD pytest --report-log=report.jsonl .github/workflows/baselines/test_pax_mgmn_metrics.py || true - name: Upload metrics test json logs uses: actions/upload-artifact@v3 diff --git a/.github/workflows/baselines/pytest/create_baseline.py b/.github/workflows/baselines/summarize_metrics.py similarity index 92% rename from .github/workflows/baselines/pytest/create_baseline.py rename to .github/workflows/baselines/summarize_metrics.py index 8676c049d..6262bc05f 100644 --- a/.github/workflows/baselines/pytest/create_baseline.py +++ b/.github/workflows/baselines/summarize_metrics.py @@ -6,7 +6,7 @@ from test_utils import read_tb_tag, read_e2e_time -def _create_baseline(loss, train_time, e2e_time): +def _create_summary(loss, train_time, e2e_time): steps = list(loss.keys()) intervals = [k2 - k1 for k1, k2 in zip(loss.keys(), steps[1:])] assert all(i == intervals[0] for i in intervals) @@ -38,7 +38,7 @@ def main(): train_time = read_tb_tag(event_file, train_time_summary_name) e2e_time = read_e2e_time(test_config + ".log") - baseline = _create_baseline(loss, train_time, e2e_time) + baseline = _create_summary(loss, train_time, e2e_time) json_fname = test_config + "_metrics.json" with open(json_fname, "w") as f: json.dump(baseline, f) diff --git a/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py b/.github/workflows/baselines/test_pax_mgmn_metrics.py similarity index 100% rename from .github/workflows/baselines/pytest/test_pax_mgmn_metrics.py rename to .github/workflows/baselines/test_pax_mgmn_metrics.py diff --git a/.github/workflows/baselines/pytest/test_utils.py b/.github/workflows/baselines/test_utils.py similarity index 100% rename from .github/workflows/baselines/pytest/test_utils.py rename to .github/workflows/baselines/test_utils.py From c638e9a6944c7fc20e810acdab97beb9eef67d4b Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Tue, 18 Jul 2023 15:49:04 -0500 Subject: [PATCH 19/22] baseline generation scripts for arbitrary workflow runs --- .../workflows/baselines/average_baselines.py | 42 ++++++++++++++++ .../baselines/create_pax_baselines.sh | 33 +++++++++++++ .../workflows/baselines/download_artifacts.sh | 48 +++++++++++++++++++ 3 files changed, 123 insertions(+) create mode 100644 .github/workflows/baselines/average_baselines.py create mode 100644 .github/workflows/baselines/create_pax_baselines.sh create mode 100644 .github/workflows/baselines/download_artifacts.sh diff --git a/.github/workflows/baselines/average_baselines.py b/.github/workflows/baselines/average_baselines.py new file mode 100644 index 000000000..219df938d --- /dev/null +++ b/.github/workflows/baselines/average_baselines.py @@ -0,0 +1,42 @@ +import os +import sys +import numpy as np +import json + +def main(): + if len(sys.argv) < 3: + sys.exit(1) + + config = sys.argv[1] + run_dirs = sys.argv[2:] + + # Store metrics data as list of dicts + json_fnames = [f"{r}/{config}_metrics.json" for r in run_dirs] + src_data = [] + for fname in json_fnames: + with open(fname, "r") as f: + src_data.append(json.load(f)) + + # TODO: Ensure start step, end step, interval equal across runs + assert ... + + # Gather metrics across dirs + avg_data = src_data[0].copy() # Use first metrics dict as a template + loss_data = np.array([metrics["loss_values"] for metrics in src_data]) + step_times_data = np.array([metrics["step_times"] for metrics in src_data]) + mean_step_times_data = np.array([metrics["step_time_avg"] for metrics in src_data]) + e2e_time_data = np.array([metrics["e2e_time_seconds"] for metrics in src_data]) + + # Average + avg_data["loss_values"] = list(np.mean(loss_data, axis=0)) + avg_data["step_times"] = list(np.mean(step_times_data, axis=0)) + avg_data["step_time_avg"] = np.mean(mean_step_times_data) + avg_data["e2e_time_seconds"] = np.mean(e2e_time_data) + + # save to file + fname = config + ".json" + with open(fname, "w") as f: + json.dump(avg_data, f) + +if __name__ == "__main__": + main() diff --git a/.github/workflows/baselines/create_pax_baselines.sh b/.github/workflows/baselines/create_pax_baselines.sh new file mode 100644 index 000000000..4a156f227 --- /dev/null +++ b/.github/workflows/baselines/create_pax_baselines.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +usage() { + echo -e "Usage: ${0} WORKFLOW_IDS..." + exit 1 +} + +[ "$#" -ge "1" ] || usage + +CONFIGS=("1DP1TP1PP" "8DP1TP1PP" "2DP1TP4PP" "16DP1TP1PP") +ALL_WF_RUNS=($*) + +# call download artifacts from this script's dir +UTIL_DIR="$(dirname "$(readlink --canonicalize -- "${BASH_SOURCE[0]}")")" +bash ${UTIL_DIR}/download_artifacts.sh ${ALL_WF_RUNS[@]} + +URLS=() +for WORKFLOW_RUN in ${ALL_WF_RUNS[@]}; do + pushd ${WORKFLOW_RUN} + for CFG in ${CONFIGS[@]}; do + python3 ${UTIL_DIR}/summarize_metrics.py ${CFG} + done + popd + URLS+=("\"https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/${WORKFLOW_RUN}/artifacts\"") +done + +for CFG in ${CONFIGS[@]}; do + # Average metrics data for this config + python3 ${UTIL_DIR}/average_baselines.py ${CFG} ${ALL_WF_RUNS[@]} + + # Append date and workflow sources + cat <<< $(jq -rc '. += {"run_urls":['$(IFS=, ; echo "${URLS[*]}")'], "date":"'$(date +%Y-%m-%d)'"}' "${CFG}.json") > ${CFG}.json +done diff --git a/.github/workflows/baselines/download_artifacts.sh b/.github/workflows/baselines/download_artifacts.sh new file mode 100644 index 000000000..a7ae0afe5 --- /dev/null +++ b/.github/workflows/baselines/download_artifacts.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +usage() { + echo -e "Usage: ${0} WORKFLOW_IDS..." + exit 1 +} + +if [[ -z $GH_TOKEN ]]; then + echo "GH_TOKEN env var must be set to download artifacts. Please export the GH_TOKEN var." # TODO: add token creation URL to message + exit 1 +fi + +[ "$#" -ge "1" ] || usage + + +for WORKFLOW_RUN in $*; do + mkdir -p $WORKFLOW_RUN + pushd $WORKFLOW_RUN + # cURL the list of artifacts + ARTIFACTS=$(curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/${WORKFLOW_RUN}/artifacts") + + COUNT=$(echo $ARTIFACTS | jq -r '.total_count') + NAMES=$(echo $ARTIFACTS | jq -r '.artifacts[].name') + URLS=$(echo $ARTIFACTS | jq -r '.artifacts[].archive_download_url') + NAMES=($NAMES) + URLS=($URLS) + + # Download artifacts + for (( i=0; i<$COUNT; i++ )); do + N=${NAMES[$i]} + U=${URLS[$i]} + + curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${GH_TOKEN}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + --output "${N}.zip" \ + "${U}" + + unzip ${N}.zip + rm ${N}.zip + done + + popd +done From cef46d686650b6e2fbe43e6e0ac5a3c723e05692 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Tue, 18 Jul 2023 15:55:12 -0500 Subject: [PATCH 20/22] update baselines --- .github/workflows/baselines/PAX_MGMN/16DP1TP1PP.json | 2 +- .github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json | 2 +- .github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json | 2 +- .github/workflows/baselines/PAX_MGMN/8DP1TP1PP.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/baselines/PAX_MGMN/16DP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/16DP1TP1PP.json index b38a3d22f..6154dd526 100644 --- a/.github/workflows/baselines/PAX_MGMN/16DP1TP1PP.json +++ b/.github/workflows/baselines/PAX_MGMN/16DP1TP1PP.json @@ -1 +1 @@ -{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.00046839181838246685, 2.087271119914173e-05, 1.31276870736959e-06, 6.912159155233096e-11, 0.0], "step_times": [6.357304414113362, 5.979689915974935, 6.376240253448486, 6.373825391133626, 6.355693658192952], "step_time_avg": 6.288550726572673, "e2e_time_seconds": 295.73600000000005} \ No newline at end of file +{"start_step":100,"end_step":500,"step_interval":100,"loss_values":[0.00046839181838246685,2.087271119914173e-05,1.31276870736959e-06,6.912159155233096e-11,0],"step_times":[6.357304414113362,5.979689915974935,6.376240253448486,6.373825391133626,6.355693658192952],"step_time_avg":6.288550726572673,"e2e_time_seconds":295.73600000000005,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160692471/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160694203/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160696525/artifacts"],"date":"2023-07-18"} diff --git a/.github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json index e3f6480d9..a994227aa 100644 --- a/.github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json +++ b/.github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json @@ -1 +1 @@ -{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.0004682748403865844, 2.090286701180351e-05, 1.3127760970140419e-06, 5.8207657444020455e-11, 0.0], "step_times": [6.688000361124675, 6.699192523956299, 6.694862047831218, 6.698123772939046, 6.700749556223552], "step_time_avg": 6.6961856524149574, "e2e_time_seconds": 223.268} \ No newline at end of file +{"start_step":100,"end_step":500,"step_interval":100,"loss_values":[0.0004682748403865844,2.090286701180351e-05,1.3127760970140419e-06,5.8207657444020455e-11,0],"step_times":[6.688000361124675,6.699192523956299,6.694862047831218,6.698123772939046,6.700749556223552],"step_time_avg":6.6961856524149574,"e2e_time_seconds":223.268,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160692471/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160694203/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160696525/artifacts"],"date":"2023-07-18"} diff --git a/.github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json b/.github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json index d21cf1759..92caac7c4 100644 --- a/.github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json +++ b/.github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json @@ -1 +1 @@ -{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.00043803153675980866, 2.2190377421793528e-05, 1.4306265256891493e-06, 5.8207657444020455e-11, 0.0], "step_times": [2.357959032058716, 2.3574414253234863, 2.3560804526011148, 2.357269843419393, 2.3561060428619385], "step_time_avg": 2.3569713592529298, "e2e_time_seconds": 385.921} \ No newline at end of file +{"start_step":100,"end_step":500,"step_interval":100,"loss_values":[0.00043803153675980866,2.2190377421793528e-05,1.4306265256891493e-06,5.8207657444020455e-11,0],"step_times":[2.357959032058716,2.3574414253234863,2.3560804526011148,2.357269843419393,2.3561060428619385],"step_time_avg":2.3569713592529298,"e2e_time_seconds":385.921,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160692471/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160694203/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160696525/artifacts"],"date":"2023-07-18"} diff --git a/.github/workflows/baselines/PAX_MGMN/8DP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/8DP1TP1PP.json index 65a671036..94e2212ec 100644 --- a/.github/workflows/baselines/PAX_MGMN/8DP1TP1PP.json +++ b/.github/workflows/baselines/PAX_MGMN/8DP1TP1PP.json @@ -1 +1 @@ -{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.00046849539891506237, 2.0879013391095214e-05, 1.3132464952529215e-06, 5.8207657444020455e-11, 0.0], "step_times": [6.436240037282308, 6.217730363210042, 6.462920983632405, 6.463934898376465, 6.473924477895101], "step_time_avg": 6.4109501520792636, "e2e_time_seconds": 284.0213333333333} \ No newline at end of file +{"start_step":100,"end_step":500,"step_interval":100,"loss_values":[0.00046849539891506237,2.0879013391095214e-05,1.3132464952529215e-06,5.8207657444020455e-11,0],"step_times":[6.436240037282308,6.217730363210042,6.462920983632405,6.463934898376465,6.473924477895101],"step_time_avg":6.4109501520792636,"e2e_time_seconds":284.0213333333333,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160692471/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160694203/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160696525/artifacts"],"date":"2023-07-18"} From ab193079b6d760e38dc3faa5874c25b0129c254c Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Tue, 18 Jul 2023 16:31:00 -0500 Subject: [PATCH 21/22] more detailed error message --- .github/workflows/baselines/download_artifacts.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/baselines/download_artifacts.sh b/.github/workflows/baselines/download_artifacts.sh index a7ae0afe5..2949cfb01 100644 --- a/.github/workflows/baselines/download_artifacts.sh +++ b/.github/workflows/baselines/download_artifacts.sh @@ -6,7 +6,9 @@ usage() { } if [[ -z $GH_TOKEN ]]; then - echo "GH_TOKEN env var must be set to download artifacts. Please export the GH_TOKEN var." # TODO: add token creation URL to message + echo "GH_TOKEN env var must be set to download artifacts. Please export the GH_TOKEN var." + echo "You can create a personal access token here: https://github.com/settings/tokens" + echo "For more information, see GitHub official docs: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens" exit 1 fi From 681253785bc9ee25ac219d2a8b4df265a0b5de92 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Tue, 18 Jul 2023 16:32:54 -0500 Subject: [PATCH 22/22] ensure metric step indexes match when averaging --- .github/workflows/baselines/average_baselines.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/baselines/average_baselines.py b/.github/workflows/baselines/average_baselines.py index 219df938d..2c0594f5f 100644 --- a/.github/workflows/baselines/average_baselines.py +++ b/.github/workflows/baselines/average_baselines.py @@ -17,8 +17,13 @@ def main(): with open(fname, "r") as f: src_data.append(json.load(f)) - # TODO: Ensure start step, end step, interval equal across runs - assert ... + # Ensure start step, end step, interval equal across runs + src_data + for k in ["start_step", "end_step", "step_interval"]: + values = [metrics[k] for metrics in src_data] + print("checking equality for", k) + print(values) + assert all([v == values[0] for v in values]) # Gather metrics across dirs avg_data = src_data[0].copy() # Use first metrics dict as a template