diff --git a/.github/workflows/_test_pax.yaml b/.github/workflows/_test_pax.yaml
index b14e8f3d4..584110e93 100644
--- a/.github/workflows/_test_pax.yaml
+++ b/.github/workflows/_test_pax.yaml
@@ -148,6 +148,49 @@ jobs:
           name: ${{ steps.meta.outputs.JOB_NAME }}
           path: output/*
 
+  metrics:
+    needs: multi-gpu-multi-node
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Check out the repository under ${GITHUB_WORKSPACE}
+        uses: actions/checkout@v3
+
+      # No `name:` input is given, so every artifact of this run is fetched,
+      # each into a directory named after the artifact.
+      - name: Download artifacts
+        uses: actions/download-artifact@v3
+
+      - name: Run pytest
+        shell: bash -x {0}
+        run: |
+          pip install pytest pytest-reportlog tensorboard
+          # Skip the loops below instead of iterating over the literal
+          # pattern when a glob matches nothing.
+          shopt -s nullglob
+          for i in "${GITHUB_RUN_ID}"-*DP*TP*PP; do
+            SUBDIR=$(echo "$i" | cut -d'-' -f2)
+            mv "$i/$SUBDIR"* .
+            python3 .github/workflows/baselines/summarize_metrics.py "$SUBDIR" # create result json in baseline format
+          done
+
+          echo '## PAX MGMN Test Metrics' >> "$GITHUB_STEP_SUMMARY"
+          for i in *_metrics.json; do
+            echo "$i" | cut -d'.' -f1
+            echo '```json'
+            jq . "$i"
+            echo '```'
+          done | tee -a "$GITHUB_STEP_SUMMARY"
+
+          # `|| true` keeps this step green when tests fail; the outcome is recorded in report.jsonl.
+          RESULTS_DIR=$PWD pytest --report-log=report.jsonl .github/workflows/baselines/test_pax_mgmn_metrics.py || true
+
+      - name: Upload metrics test json logs
+        uses: actions/upload-artifact@v3
+        with:
+          name: metrics-test-log
+          path: report.jsonl
+
   summary:
     runs-on: ubuntu-22.04
 
diff --git a/.github/workflows/baselines/PAX_MGMN/16DP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/16DP1TP1PP.json
new file mode 100644
index 000000000..6154dd526
--- /dev/null
+++ b/.github/workflows/baselines/PAX_MGMN/16DP1TP1PP.json
@@ -0,0 +1 @@
+{"start_step":100,"end_step":500,"step_interval":100,"loss_values":[0.00046839181838246685,2.087271119914173e-05,1.31276870736959e-06,6.912159155233096e-11,0],"step_times":[6.357304414113362,5.979689915974935,6.376240253448486,6.373825391133626,6.355693658192952],"step_time_avg":6.288550726572673,"e2e_time_seconds":295.73600000000005,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160692471/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160694203/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160696525/artifacts"],"date":"2023-07-18"}
diff --git a/.github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json
new file mode 100644
index 000000000..a994227aa
--- /dev/null
+++ b/.github/workflows/baselines/PAX_MGMN/1DP1TP1PP.json
@@ -0,0 +1 @@
+{"start_step":100,"end_step":500,"step_interval":100,"loss_values":[0.0004682748403865844,2.090286701180351e-05,1.3127760970140419e-06,5.8207657444020455e-11,0],"step_times":[6.688000361124675,6.699192523956299,6.694862047831218,6.698123772939046,6.700749556223552],"step_time_avg":6.6961856524149574,"e2e_time_seconds":223.268,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160692471/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160694203/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160696525/artifacts"],"date":"2023-07-18"}
diff --git a/.github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json b/.github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json
new file mode 100644
index 000000000..92caac7c4
--- /dev/null
+++ b/.github/workflows/baselines/PAX_MGMN/2DP1TP4PP.json
@@ -0,0 +1 @@
+{"start_step":100,"end_step":500,"step_interval":100,"loss_values":[0.00043803153675980866,2.2190377421793528e-05,1.4306265256891493e-06,5.8207657444020455e-11,0],"step_times":[2.357959032058716,2.3574414253234863,2.3560804526011148,2.357269843419393,2.3561060428619385],"step_time_avg":2.3569713592529298,"e2e_time_seconds":385.921,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160692471/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160694203/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160696525/artifacts"],"date":"2023-07-18"}
diff --git a/.github/workflows/baselines/PAX_MGMN/8DP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/8DP1TP1PP.json
new file mode 100644
index 000000000..94e2212ec
--- /dev/null
+++ b/.github/workflows/baselines/PAX_MGMN/8DP1TP1PP.json
@@ -0,0 +1 @@
+{"start_step":100,"end_step":500,"step_interval":100,"loss_values":[0.00046849539891506237,2.0879013391095214e-05,1.3132464952529215e-06,5.8207657444020455e-11,0],"step_times":[6.436240037282308,6.217730363210042,6.462920983632405,6.463934898376465,6.473924477895101],"step_time_avg":6.4109501520792636,"e2e_time_seconds":284.0213333333333,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160692471/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160694203/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/5160696525/artifacts"],"date":"2023-07-18"}
diff --git a/.github/workflows/baselines/average_baselines.py b/.github/workflows/baselines/average_baselines.py
new file mode 100644
index 000000000..2c0594f5f
--- /dev/null
+++ b/.github/workflows/baselines/average_baselines.py
@@ -0,0 +1,60 @@
+import os
+import sys
+import numpy as np
+import json
+
+
+def main():
+    """Average the per-run metrics of one config into a single baseline file.
+
+    Usage: average_baselines.py CONFIG RUN_DIR...
+
+    Reads ``<RUN_DIR>/<CONFIG>_metrics.json`` for every run directory, checks
+    that all runs share the same start step, end step and step interval,
+    averages the loss values, step times, mean step time and end-to-end time
+    across runs, and writes the result to ``<CONFIG>.json`` in the current
+    working directory. All other keys are copied from the first run.
+    """
+    if len(sys.argv) < 3:
+        # Say why we are bailing out instead of exiting silently.
+        print(f"Usage: {sys.argv[0]} CONFIG RUN_DIR...", file=sys.stderr)
+        sys.exit(1)
+
+    config = sys.argv[1]
+    run_dirs = sys.argv[2:]
+
+    # Store metrics data as list of dicts
+    src_data = []
+    for run_dir in run_dirs:
+        with open(os.path.join(run_dir, f"{config}_metrics.json"), "r") as f:
+            src_data.append(json.load(f))
+
+    # Ensure start step, end step, interval equal across runs. This is an
+    # explicit check rather than an assert so it still runs under `python -O`.
+    for k in ["start_step", "end_step", "step_interval"]:
+        values = [metrics[k] for metrics in src_data]
+        print("checking equality for", k)
+        print(values)
+        if any(v != values[0] for v in values):
+            sys.exit(f"{k} differs across runs: {values}")
+
+    # Use first metrics dict as a template
+    avg_data = src_data[0].copy()
+
+    # Average element-wise across runs; tolist() yields plain Python floats.
+    for k in ["loss_values", "step_times"]:
+        per_run = np.array([metrics[k] for metrics in src_data])
+        avg_data[k] = np.mean(per_run, axis=0).tolist()
+
+    # Average the scalar metrics across runs
+    for k in ["step_time_avg", "e2e_time_seconds"]:
+        avg_data[k] = float(np.mean([metrics[k] for metrics in src_data]))
+
+    # save to file
+    fname = config + ".json"
+    with open(fname, "w") as f:
+        json.dump(avg_data, f)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/baselines/create_pax_baselines.sh b/.github/workflows/baselines/create_pax_baselines.sh
new file mode 100644
index 000000000..4a156f227
--- /dev/null
+++ b/.github/workflows/baselines/create_pax_baselines.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# Create averaged PAX baseline files from one or more workflow runs.
+#
+# For every workflow run ID given on the command line: download the run's
+# artifacts and summarize the metrics of each config. Then average each
+# config's metrics across the runs and write <CONFIG>.json, annotated with the
+# source run URLs and today's date, to the current directory.
+
+# Abort on the first failing step so that a broken download or summary is
+# never silently averaged into a baseline.
+set -euo pipefail
+
+usage() {
+    echo -e "Usage: ${0} WORKFLOW_IDS..."
+    exit 1
+}
+
+[ "$#" -ge "1" ] || usage
+
+CONFIGS=("1DP1TP1PP" "8DP1TP1PP" "2DP1TP4PP" "16DP1TP1PP")
+ALL_WF_RUNS=("$@")
+
+# call download artifacts from this script's dir
+UTIL_DIR="$(dirname "$(readlink --canonicalize -- "${BASH_SOURCE[0]}")")"
+bash "${UTIL_DIR}/download_artifacts.sh" "${ALL_WF_RUNS[@]}"
+
+URLS=()
+for WORKFLOW_RUN in "${ALL_WF_RUNS[@]}"; do
+  pushd "${WORKFLOW_RUN}"
+  for CFG in "${CONFIGS[@]}"; do
+    python3 "${UTIL_DIR}/summarize_metrics.py" "${CFG}"
+  done
+  popd
+  URLS+=("https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/${WORKFLOW_RUN}/artifacts")
+done
+
+TODAY="$(date +%Y-%m-%d)"
+for CFG in "${CONFIGS[@]}"; do
+  # Average metrics data for this config
+  python3 "${UTIL_DIR}/average_baselines.py" "${CFG}" "${ALL_WF_RUNS[@]}"
+
+  # Append date and workflow sources. The URLs are handed to jq as positional
+  # string arguments so jq does the JSON quoting, and the result goes through
+  # a temporary file because jq cannot rewrite its input in place.
+  jq -c --arg date "${TODAY}" \
+    '. += {"run_urls": $ARGS.positional, "date": $date}' \
+    "${CFG}.json" --args "${URLS[@]}" > "${CFG}.json.tmp"
+  mv "${CFG}.json.tmp" "${CFG}.json"
+done
diff --git a/.github/workflows/baselines/download_artifacts.sh b/.github/workflows/baselines/download_artifacts.sh
new file mode 100644
index 000000000..2949cfb01
--- /dev/null
+++ b/.github/workflows/baselines/download_artifacts.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+# Download and unpack all artifacts of the given JAX-Toolbox workflow runs.
+# The artifacts of each run are extracted into a directory named after the
+# run ID, created in the current working directory.
+
+usage() {
+    echo -e "Usage: ${0} WORKFLOW_IDS..."
+    exit 1
+}
+
+if [[ -z $GH_TOKEN ]]; then
+  echo "GH_TOKEN env var must be set to download artifacts. Please export the GH_TOKEN var."
+  echo "You can create a personal access token here: https://github.com/settings/tokens"
+  echo "For more information, see GitHub official docs: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens"
+  exit 1
+fi
+
+[ "$#" -ge "1" ] || usage
+
+
+for WORKFLOW_RUN in "$@"; do
+  mkdir -p "$WORKFLOW_RUN"
+  pushd "$WORKFLOW_RUN" || exit 1
+  # cURL the list of artifacts. The listing is paginated (30 entries per page
+  # by default), so request the maximum page size of 100.
+  ARTIFACTS=$(curl -L --fail \
+    -H "Accept: application/vnd.github+json" \
+    -H "Authorization: Bearer ${GH_TOKEN}" \
+    -H "X-GitHub-Api-Version: 2022-11-28" \
+    "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/${WORKFLOW_RUN}/artifacts?per_page=100") || exit 1
+
+  COUNT=$(jq -r '.total_count' <<< "$ARTIFACTS")
+  LISTED=$(jq -r '.artifacts | length' <<< "$ARTIFACTS")
+  if (( COUNT > LISTED )); then
+    echo "Warning: run ${WORKFLOW_RUN} has ${COUNT} artifacts, only the first ${LISTED} are downloaded." >&2
+  fi
+
+  # Download artifacts. Loop over the entries that were actually returned
+  # (one "name<TAB>url" line each) instead of counting up to total_count,
+  # which can exceed the number of entries on the page. The list is read from
+  # fd 3 so that curl and unzip cannot consume it through stdin.
+  while IFS=$'\t' read -r -u 3 N U; do
+    curl -L --fail \
+      -H "Accept: application/vnd.github+json" \
+      -H "Authorization: Bearer ${GH_TOKEN}" \
+      -H "X-GitHub-Api-Version: 2022-11-28" \
+      --output "${N}.zip" \
+      "${U}" || exit 1
+
+    unzip "${N}.zip"
+    rm "${N}.zip"
+  done 3< <(jq -r '.artifacts[] | [.name, .archive_download_url] | @tsv' <<< "$ARTIFACTS")
+
+  popd
+done
diff --git a/.github/workflows/baselines/summarize_metrics.py b/.github/workflows/baselines/summarize_metrics.py
new file mode 100644
index 000000000..6262bc05f
--- /dev/null
+++ b/.github/workflows/baselines/summarize_metrics.py
@@ -0,0 +1,90 @@
+import os
+import json
+import glob
+import sys
+from statistics import mean
+from test_utils import read_tb_tag, read_e2e_time
+
+
+def _create_summary(loss, train_time, e2e_time):
+    """Build a summary dict in baseline format from one run's metrics.
+
+    Args:
+        loss: dict mapping step to loss value; its first and last keys become
+            the start and end step.
+        train_time: dict mapping step to the value logged for that step;
+            stored as step_times.
+        e2e_time: end-to-end run time in seconds.
+
+    Returns:
+        dict with the keys start_step, end_step, step_interval, loss_values,
+        step_times, step_time_avg and e2e_time_seconds.
+
+    Raises:
+        ValueError: if fewer than two steps were logged or the steps are not
+            evenly spaced.
+    """
+    steps = list(loss.keys())
+    # At least two steps are needed to derive an interval; fail with a clear
+    # message instead of an IndexError further down.
+    if len(steps) < 2:
+        raise ValueError(f"need at least two logged steps, got {steps}")
+    intervals = [k2 - k1 for k1, k2 in zip(steps, steps[1:])]
+    if any(i != intervals[0] for i in intervals):
+        raise ValueError(f"logged steps are not evenly spaced: {steps}")
+    step_times = list(train_time.values())
+
+    baseline = {
+        "start_step": steps[0],
+        "end_step": steps[-1],
+        "step_interval": intervals[0],
+        "loss_values": list(loss.values()),
+        "step_times": step_times,
+        "step_time_avg": mean(step_times),
+        "e2e_time_seconds": e2e_time,
+    }
+    return baseline
+
+
+def main():
+    """Summarize one test config's results into ``<config>_metrics.json``.
+
+    Usage: summarize_metrics.py TEST_CONFIG
+
+    Reads the TensorBoard event file under ``<config>/summaries/train/`` and
+    the log ``<config>.log``, both relative to the working directory.
+    """
+    loss_summary_name = "loss"
+    train_time_summary_name = "Steps/sec"
+    # Check the length first: sys.argv[1] raises IndexError when no argument
+    # was given.
+    if len(sys.argv) < 2 or not sys.argv[1]:
+        print(f"Usage: {sys.argv[0]} TEST_CONFIG", file=sys.stderr)
+        sys.exit(1)
+    test_config = sys.argv[1]
+
+    # A run that produced no event file is reported like any other failed run.
+    event_pattern = os.path.join(test_config, "summaries/train/events*")
+    event_files = glob.glob(event_pattern)
+    if not event_files:
+        print("No event file matches", event_pattern)
+        print("Run might have failed, see", test_config)
+        return
+
+    try:
+        loss = read_tb_tag(event_files[0], loss_summary_name)
+        train_time = read_tb_tag(event_files[0], train_time_summary_name)
+        e2e_time = read_e2e_time(test_config + ".log")
+
+        baseline = _create_summary(loss, train_time, e2e_time)
+        json_fname = test_config + "_metrics.json"
+        with open(json_fname, "w") as f:
+            json.dump(baseline, f)
+
+    except KeyError as e:
+        print(e)
+        print("Run might have failed, see", test_config)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/baselines/test_pax_mgmn_metrics.py b/.github/workflows/baselines/test_pax_mgmn_metrics.py
new file mode 100644
index 000000000..81afb289b
--- /dev/null
+++ b/.github/workflows/baselines/test_pax_mgmn_metrics.py
@@ -0,0 +1,83 @@
+import pytest
+import os
+import json
+import glob
+import test_utils
+from statistics import mean
+
+# Per-config tolerance factors applied to the baseline values in
+# test_step_time and test_e2e_time.
+STEP_TIME_MULT = {
+    "1DP1TP1PP":  0.95,
+    "8DP1TP1PP":  0.95,
+    "1DP8TP1PP":  0.95,
+    "2DP1TP4PP":  0.95,
+    "16DP1TP1PP": 0.95,
+    "2DP2TP4PP":  0.95,
+}
+E2E_TIME_MULT = {
+    "1DP1TP1PP":  0.95,
+    "8DP1TP1PP":  0.95,
+    "1DP8TP1PP":  0.95,
+    "2DP1TP4PP":  0.95,
+    "16DP1TP1PP": 0.95,
+    "2DP2TP4PP":  0.95,
+}
+test_dir = os.path.dirname(os.path.abspath(__file__))
+# The baseline JSON files live in the PAX_MGMN directory next to this file.
+baselines_dir = os.path.join(test_dir, "PAX_MGMN")
+results_dir = os.environ.get("RESULTS_DIR")
+loss_summary_name = "loss"
+step_time_summary_name = "Steps/sec"
+# Only *.json files are baselines; sorting keeps the test order stable.
+baseline_filenames = sorted(f for f in os.listdir(baselines_dir) if f.endswith(".json"))
+
+
+def _load_baseline(baseline_filename):
+    """Return the parsed baseline JSON stored under ``baseline_filename``."""
+    with open(os.path.join(baselines_dir, baseline_filename), "r") as baseline_file:
+        return json.load(baseline_file)
+
+
+def _find_event_file(test_config):
+    """Return the first TensorBoard event file in ``test_config``'s results."""
+    assert results_dir, "RESULTS_DIR env var must point to the results directory"
+    pattern = os.path.join(results_dir, test_config, "summaries/train/events*")
+    event_files = glob.glob(pattern)
+    assert event_files, f"no event file matches {pattern}"
+    return event_files[0]
+
+
+@pytest.mark.parametrize("baseline_filename", baseline_filenames)
+def test_loss(baseline_filename):
+    """The loss logged at the baseline's end step must be exactly zero."""
+    test_config = baseline_filename.split(".")[0]
+    end_step = _load_baseline(baseline_filename)["end_step"]
+    event_file = _find_event_file(test_config)
+    loss_actual = test_utils.read_tb_tag(event_file, loss_summary_name)
+    assert loss_actual[end_step] == 0
+
+
+@pytest.mark.parametrize("baseline_filename", baseline_filenames)
+def test_step_time(baseline_filename):
+    """The mean "Steps/sec" value must exceed the scaled baseline average."""
+    test_config = baseline_filename.split(".")[0]
+    step_time_avg_expected = _load_baseline(baseline_filename)["step_time_avg"]
+    event_file = _find_event_file(test_config)
+    step_time_values = test_utils.read_tb_tag(event_file, step_time_summary_name).values()
+    step_time_avg_actual = mean(step_time_values)
+    assert step_time_avg_actual > step_time_avg_expected * STEP_TIME_MULT[test_config]
+
+
+@pytest.mark.parametrize("baseline_filename", baseline_filenames)
+def test_e2e_time(baseline_filename):
+    """The end-to-end time must stay below the scaled baseline time."""
+    assert results_dir, "RESULTS_DIR env var must point to the results directory"
+    test_config = baseline_filename.split(".")[0]
+    run_log = os.path.join(results_dir, test_config + ".log")
+    e2e_time_expected = _load_baseline(baseline_filename)["e2e_time_seconds"]
+    e2e_time_actual = test_utils.read_e2e_time(run_log)
+    # read_e2e_time reports a missing timing line as a large negative number,
+    # which would otherwise satisfy the upper bound below.
+    assert e2e_time_actual >= 0, f"no timing line found in {run_log}"
+    assert e2e_time_actual < e2e_time_expected / E2E_TIME_MULT[test_config]
diff --git a/.github/workflows/baselines/test_utils.py b/.github/workflows/baselines/test_utils.py
new file mode 100644
index 000000000..ceed8b639
--- /dev/null
+++ b/.github/workflows/baselines/test_utils.py
@@ -0,0 +1,47 @@
+import re
+
+from tensorboard.backend.event_processing import event_accumulator
+from tensorboard.util import tensor_util
+
+# Matches a timing line such as "real 4m55.736s" (any whitespace after "real").
+_REAL_TIME_RE = re.compile(r"^real\s+(\d+)m(\d+(?:\.\d+)?)s")
+# Returned by read_e2e_time when the log contains no timing line.
+_E2E_TIME_NOT_FOUND = -100000000
+
+
+def read_tb_tag(tb_file: str, summary_name: str) -> dict:
+    """Read all values of one tensor summary from a TensorBoard event file.
+
+    Args:
+        tb_file: path of the event file.
+        summary_name: tag of the tensor summary to read.
+
+    Returns:
+        dict mapping each event's step to its value as a Python scalar.
+    """
+    ea = event_accumulator.EventAccumulator(tb_file)
+    ea.Reload()
+
+    return {
+        event.step: tensor_util.make_ndarray(event.tensor_proto).item()
+        for event in ea.Tensors(summary_name)
+    }
+
+
+def read_e2e_time(log_file: str) -> float:
+    """Return the end-to-end time in seconds from the first "real" timing line.
+
+    A timing line looks like ``real 4m55.736s``. Lines that merely start
+    with "real" but do not have that shape are skipped.
+
+    Returns:
+        minutes * 60 + seconds of the first timing line, or -100000000 when
+        the log contains none.
+    """
+    with open(log_file, "r") as log:
+        for line in log:
+            match = _REAL_TIME_RE.match(line)
+            if match:
+                minutes, seconds = match.groups()
+                return float(minutes) * 60 + float(seconds)
+    return _E2E_TIME_NOT_FOUND