NVIDIA · maanug-nv · Jul 20, 2023 · Jul 3, 2023 · Jul 6, 2023 · Jul 6, 2023
diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml
@@ -148,6 +148,33 @@ jobs:
           name: ${{ steps.meta.outputs.JOB_NAME }}
           path: output/*
 
+  # Runs the PAX multi-GPU/multi-node metrics tests against the checked-in
+  # baselines and publishes the pytest report log as an artifact.
+  metrics:
+    # Starts only after the multi-gpu-multi-node job.
+    needs: multi-gpu-multi-node
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Check out the repository under ${GITHUB_WORKSPACE}
+        uses: actions/checkout@v3
+
+      # No artifact name is given. NOTE(review): presumably this fetches every
+      # artifact of the run into a directory named after the artifact; confirm
+      # against the actions/download-artifact@v3 documentation.
+      - name: Download artifacts
+        uses: actions/download-artifact@v3
+
+      # The script below does three things:
+      #   1. For each directory matching <run id>-*DP*TP*PP, takes the second
+      #      '-'-separated field of its name and moves the entries that start
+      #      with that field into the working directory.
+      #   2. Installs pytest, pytest-reportlog and tensorboard.
+      #   3. Runs the metrics tests with RESULTS_DIR set to the working
+      #      directory, writing the report to report.jsonl. The trailing
+      #      `|| true` keeps this step green even when tests fail, so the
+      #      outcome is only visible in the uploaded report.
+      - name: Run pytest
+        shell: bash -x {0}
+        run: |
+          for i in ${GITHUB_RUN_ID}-*DP*TP*PP; do
+            SUBDIR=$(echo $i | cut -d'-' -f2)
+            mv $i/$SUBDIR* .
+          done
+          pip install pytest pytest-reportlog tensorboard
+          RESULTS_DIR=$PWD pytest --report-log=report.jsonl .github/workflows/baselines/pytest/test_pax_mgmn_metrics.py || true
+
+      # Publishes the pytest report log under the artifact name metrics-test-log.
+      - name: Upload metrics test json logs
+        uses: actions/upload-artifact@v3
+        with:
+          name: metrics-test-log
+          path: report.jsonl
+
   summary:
     runs-on: ubuntu-22.04
 

diff --git a/.github/workflows/baselines/PAX_MGMN/16DP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/16DP1TP1PP.json
@@ -0,0 +1 @@
+{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.00046839181838246685, 2.087271119914173e-05, 1.31276870736959e-06, 6.912159155233096e-11, 0.0], "step_times": [6.357304414113362, 5.979689915974935, 6.376240253448486, 6.373825391133626, 6.355693658192952], "step_time_avg": 6.288550726572673, "e2e_time_seconds": 295.73600000000005}
diff --git a/.github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json
@@ -0,0 +1 @@
+{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.0004682748403865844, 2.090286701180351e-05, 1.3127760970140419e-06, 5.8207657444020455e-11, 0.0], "step_times": [6.688000361124675, 6.699192523956299, 6.694862047831218, 6.698123772939046, 6.700749556223552], "step_time_avg": 6.6961856524149574, "e2e_time_seconds": 223.268}
diff --git a/.github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json b/.github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json
@@ -0,0 +1 @@
+{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.00043803153675980866, 2.2190377421793528e-05, 1.4306265256891493e-06, 5.8207657444020455e-11, 0.0], "step_times": [2.357959032058716, 2.3574414253234863, 2.3560804526011148, 2.357269843419393, 2.3561060428619385], "step_time_avg": 2.3569713592529298, "e2e_time_seconds": 385.921}
diff --git a/.github/workflows/baselines/PAX_MGMN/8DP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/8DP1TP1PP.json
@@ -0,0 +1 @@
+{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.00046849539891506237, 2.0879013391095214e-05, 1.3132464952529215e-06, 5.8207657444020455e-11, 0.0], "step_times": [6.436240037282308, 6.217730363210042, 6.462920983632405, 6.463934898376465, 6.473924477895101], "step_time_avg": 6.4109501520792636, "e2e_time_seconds": 284.0213333333333}
diff --git a/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py b/.github/workflows/baselines/pytest/test_pax_mgmn_metrics.py
@@ -0,0 +1,49 @@
+import pytest
+import os
+import json
+import glob
+import test_utils
+from statistics import mean
+
+# Slack added to the baseline "step_time_avg" before it is compared with the
+# measured average in test_step_time.
+STEP_TIME_DELTA = 1.0
+# Slack added to the baseline "e2e_time_seconds" before it is compared with
+# the measured end-to-end time in test_e2e_time.
+E2E_TIME_DELTA = 30.0
+# Directory that contains this test module.
+test_dir = os.path.dirname(os.path.abspath(__file__))
+# Baseline JSON files are read from the sibling PAX_MGMN directory; every
+# entry in it becomes one parametrized test case.
+baselines_dir = os.path.join(test_dir, "../PAX_MGMN")
+# Root of the run outputs. This is None when RESULTS_DIR is not set, in which
+# case the os.path.join calls in the tests raise TypeError.
+results_dir = os.environ.get("RESULTS_DIR")
+# TensorBoard summary tags read from the event files.
+loss_summary_name = "loss"
+# NOTE(review): the tag is named "Steps/sec" but its mean is compared against
+# the baseline "step_time_avg" - confirm both are in the same unit.
+step_time_summary_name = "Steps/sec"
+
+@pytest.mark.parametrize("baseline_filename", os.listdir(baselines_dir))
+def test_loss(baseline_filename):
+    """Check that the loss logged at the baseline's final step is exactly zero.
+
+    The configuration name is the baseline file name up to its first ".".
+    The run's TensorBoard events are looked up under
+    ``<results_dir>/<config>/summaries/train``.
+    """
+    config_name = baseline_filename.split(".")[0]
+    events_glob = os.path.join(results_dir, config_name, "summaries/train/events*")
+    # Only the first matching events file is read.
+    events_path = glob.glob(events_glob)[0]
+
+    with open(os.path.join(baselines_dir, baseline_filename), "r") as baseline_fp:
+        final_step = json.load(baseline_fp)["end_step"]
+
+    losses_by_step = test_utils.read_tb_tag(events_path, loss_summary_name)
+    assert losses_by_step[final_step] == 0
+
+
+@pytest.mark.parametrize("baseline_filename", os.listdir(baselines_dir))
+def test_step_time(baseline_filename):
+    """Check the mean logged step-time value against the baseline average.
+
+    Passes when the mean of all values read from the step-time summary tag is
+    strictly below the baseline ``step_time_avg`` plus ``STEP_TIME_DELTA``.
+    """
+    config_name = baseline_filename.split(".")[0]
+    events_glob = os.path.join(results_dir, config_name, "summaries/train/events*")
+    # Only the first matching events file is read.
+    events_path = glob.glob(events_glob)[0]
+
+    with open(os.path.join(baselines_dir, baseline_filename), "r") as baseline_fp:
+        expected_avg = json.load(baseline_fp)["step_time_avg"]
+
+    recorded = test_utils.read_tb_tag(events_path, step_time_summary_name)
+    observed_avg = mean(recorded.values())
+    assert observed_avg < expected_avg + STEP_TIME_DELTA
+
+@pytest.mark.parametrize("baseline_filename", os.listdir(baselines_dir))
+def test_e2e_time(baseline_filename):
+    """Check the run's end-to-end time against the baseline.
+
+    The time is parsed from ``<results_dir>/<config>.log`` and must be
+    strictly below the baseline ``e2e_time_seconds`` plus ``E2E_TIME_DELTA``.
+    """
+    config_name = baseline_filename.split(".")[0]
+    log_path = os.path.join(results_dir, config_name + ".log")
+
+    with open(os.path.join(baselines_dir, baseline_filename), "r") as baseline_fp:
+        expected_seconds = json.load(baseline_fp)["e2e_time_seconds"]
+
+    # NOTE(review): test_utils.read_e2e_time returns a large negative sentinel
+    # when the log has no "real" line, which satisfies this bound - confirm
+    # that a missing timing line is meant to pass.
+    observed_seconds = test_utils.read_e2e_time(log_path)
+    assert observed_seconds < expected_seconds + E2E_TIME_DELTA
+
diff --git a/.github/workflows/baselines/pytest/test_utils.py b/.github/workflows/baselines/pytest/test_utils.py
@@ -0,0 +1,22 @@
+from tensorboard.backend.event_processing import event_accumulator
+from tensorboard.util import tensor_util
+
+
+def read_tb_tag(tb_file: str, summary_name: str) -> dict:
+    """Return every value logged under a tensor summary tag, keyed by step.
+
+    Args:
+        tb_file: Path handed to ``EventAccumulator`` (a TensorBoard events
+            file).
+        summary_name: Tag of the tensor summary to read.
+
+    Returns:
+        A dict mapping each event's step to its value converted to a Python
+        scalar.
+    """
+    # A size guidance of 0 keeps every tensor event. The accumulator's default
+    # retains only a small sample per tag, which would skew averages on runs
+    # that log more points than that.
+    ea = event_accumulator.EventAccumulator(
+        tb_file, size_guidance={event_accumulator.TENSORS: 0}
+    )
+    ea.Reload()
+
+    return {
+        event.step: tensor_util.make_ndarray(event.tensor_proto).item()
+        for event in ea.Tensors(summary_name)
+    }
+
+
+def read_e2e_time(log_file: str) -> float:
+    """Return the elapsed time, in seconds, from the first ``real`` timing line.
+
+    A timing line starts with ``real``, followed by whitespace and a value of
+    the form ``<minutes>m<seconds>s`` (for example ``real  3m43.268s``).
+    Lines that merely begin with "real" but do not have that shape are
+    skipped instead of raising while being parsed.
+
+    Args:
+        log_file: Path of the run log to scan.
+
+    Returns:
+        ``minutes * 60 + seconds`` as a float, or ``-100000000`` when the log
+        contains no timing line.
+    """
+    import re
+
+    timing_line = re.compile(r"real\s+(\d+)m(\d+(?:\.\d+)?)s")
+    with open(log_file, "r") as log:
+        for line in log:
+            match = timing_line.match(line)
+            if match:
+                minutes, seconds = match.groups()
+                return float(minutes) * 60 + float(seconds)
+    # Sentinel kept for existing callers; it is negative so it can be told
+    # apart from any real duration.
+    return -100000000