From c9e8c6a81e635cdb4a1d40407f257bb94cc176ed Mon Sep 17 00:00:00 2001
From: Indrajit Bhosale <iamindrajitb@gmail.com>
Date: Mon, 14 Oct 2024 00:47:26 -0700
Subject: [PATCH 01/11] TC for Metric P0 nv_load_time per model

---
 qa/L0_metrics/general_metrics_test.py | 79 +++++++++++++++++++++++++++
 qa/L0_metrics/test.sh                 | 31 ++++++++++-
 2 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 qa/L0_metrics/general_metrics_test.py

diff --git a/qa/L0_metrics/general_metrics_test.py b/qa/L0_metrics/general_metrics_test.py
new file mode 100644
index 0000000000..adbc78197f
--- /dev/null
+++ b/qa/L0_metrics/general_metrics_test.py
@@ -0,0 +1,79 @@
+#!/usr/bin/python
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import re
+import unittest
+import requests
+
+_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
+MODEL_LOAD_TIME = "nv_model_load_time{model="
+
+def get_model_load_times():
+    r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics")
+    r.raise_for_status()
+    pattern = re.compile(rf'{MODEL_LOAD_TIME}"(.*?)".*?\ (\d+\.\d+)')
+    model_load_times = {}
+    matches = pattern.findall(r.text)
+    for match in matches:
+        model_name, load_time = match
+        model_load_times[model_name] = float(load_time)
+    return model_load_times
+
+class TestGeneralMetrics(tu.TestResultCollector):
+    def setUp(self):
+        self.model_name = "libtorch_float32_float32_float32"
+    
+    def test_metrics_load_time(self):
+        model_load_times = get_model_load_times()
+        load_time = model_load_times.get(self.model_name)
+        
+        self.assertIsNotNone(load_time, "Model Load time not found")
+        
+        dict_size = len(model_load_times)
+        self.assertEqual(dict_size, 1, "Too many model_load_time entries found")
+
+    def test_metrics_load_time_explicit_load(self):
+        model_load_times = get_model_load_times()
+        load_time = model_load_times.get(self.model_name)
+        
+        self.assertIsNotNone(load_time, "Model Load time not found")
+        
+        dict_size = len(model_load_times)
+        self.assertEqual(dict_size, 1, "Too many model_load_time entries found")
+
+    def test_metrics_load_time_explicit_unload(self):
+        r = requests.get(f"http://localhost:8000/v2/repository/models/")
+        r.raise_for_status()
+        print(r.text)
+        model_load_times = get_model_load_times()
+        load_time = model_load_times.get(self.model_name)
+        
+        self.assertIsNone(load_time, "Model Load time found even after unload")
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
diff --git a/qa/L0_metrics/test.sh b/qa/L0_metrics/test.sh
index 76e99e7c48..f966b5fb23 100755
--- a/qa/L0_metrics/test.sh
+++ b/qa/L0_metrics/test.sh
@@ -45,7 +45,6 @@ SERVER=${TRITON_DIR}/bin/tritonserver
 BASE_SERVER_ARGS="--model-repository=${MODELDIR}"
 SERVER_ARGS="${BASE_SERVER_ARGS}"
 SERVER_LOG="./inference_server.log"
-PYTHON_TEST="metrics_config_test.py"
 source ../common/util.sh
 
 CLIENT_LOG="client.log"
@@ -132,12 +131,42 @@ fi
 kill_server
 set -e
 
+### General metrics tests
+
+set +e
+CLIENT_PY="./general_metrics_test.py"
+CLIENT_LOG="general_metrics_test_client.log"
+SERVER_LOG="general_metrics_test_server.log"
+SERVER_ARGS="$BASE_SERVER_ARGS --log-verbose=1"
+PYTHON_TEST="general_metrics_test.py"
+run_and_check_server
+# Test 1 for normal mode
+python3 ${PYTHON_TEST} TestGeneralMetrics.test_metrics_load_time -v 2>&1 | tee ${CLIENT_LOG}
+kill_server
+
+set +e
+CLIENT_PY="./general_metrics_test.py"
+CLIENT_LOG="general_metrics_test_client.log"
+SERVER_LOG="general_metrics_test_server.log"
+SERVER_ARGS="$BASE_SERVER_ARGS --model-control-mode=explicit --log-verbose=1"
+run_and_check_server
+export MODEL_NAME='libtorch_float32_float32_float32'
+code=`curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/load`
+# Test 2 for explicit mode LOAD
+python3 ${PYTHON_TEST} TestGeneralMetrics.test_metrics_load_time_explicit_load -v 2>&1 | tee ${CLIENT_LOG}
+
+code=`curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/unload`
+# Test 3 for explicit mode UNLOAD
+python3 ${PYTHON_TEST} TestGeneralMetrics.test_metrics_load_time_explicit_unload -v 2>&1 | tee ${CLIENT_LOG}
+kill_server
+
 ### Pinned memory metrics tests
 set +e
 CLIENT_PY="./pinned_memory_metrics_test.py"
 CLIENT_LOG="pinned_memory_metrics_test_client.log"
 SERVER_LOG="pinned_memory_metrics_test_server.log"
 SERVER_ARGS="$BASE_SERVER_ARGS --metrics-interval-ms=1 --model-control-mode=explicit --log-verbose=1"
+PYTHON_TEST="metrics_config_test.py"
 run_and_check_server
 python3 ${PYTHON_TEST} MetricsConfigTest.test_pinned_memory_metrics_exist -v 2>&1 | tee ${CLIENT_LOG}
 check_unit_test

From 5b1f62f32c8e8b726f1bf73fa64a8170015e29cb Mon Sep 17 00:00:00 2001
From: Indrajit Bhosale <iamindrajitb@gmail.com>
Date: Mon, 14 Oct 2024 01:13:51 -0700
Subject: [PATCH 02/11] Fix Pre-Commit

---
 qa/L0_metrics/general_metrics_test.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/qa/L0_metrics/general_metrics_test.py b/qa/L0_metrics/general_metrics_test.py
index adbc78197f..a80b5dafa5 100644
--- a/qa/L0_metrics/general_metrics_test.py
+++ b/qa/L0_metrics/general_metrics_test.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+# /usr/bin/python
 # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -28,11 +28,13 @@
 import os
 import re
 import unittest
+
 import requests
 
 _tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
 MODEL_LOAD_TIME = "nv_model_load_time{model="
 
+
 def get_model_load_times():
     r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics")
     r.raise_for_status()
@@ -44,25 +46,26 @@ def get_model_load_times():
         model_load_times[model_name] = float(load_time)
     return model_load_times
 
+
 class TestGeneralMetrics(tu.TestResultCollector):
     def setUp(self):
         self.model_name = "libtorch_float32_float32_float32"
-    
+
     def test_metrics_load_time(self):
         model_load_times = get_model_load_times()
         load_time = model_load_times.get(self.model_name)
-        
+
         self.assertIsNotNone(load_time, "Model Load time not found")
-        
+
         dict_size = len(model_load_times)
         self.assertEqual(dict_size, 1, "Too many model_load_time entries found")
 
     def test_metrics_load_time_explicit_load(self):
         model_load_times = get_model_load_times()
         load_time = model_load_times.get(self.model_name)
-        
+
         self.assertIsNotNone(load_time, "Model Load time not found")
-        
+
         dict_size = len(model_load_times)
         self.assertEqual(dict_size, 1, "Too many model_load_time entries found")
 
@@ -72,8 +75,9 @@ def test_metrics_load_time_explicit_unload(self):
         print(r.text)
         model_load_times = get_model_load_times()
         load_time = model_load_times.get(self.model_name)
-        
+
         self.assertIsNone(load_time, "Model Load time found even after unload")
 
+
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()

From d421e497bc2b1126131063874bc50da11e678593 Mon Sep 17 00:00:00 2001
From: Indrajit Bhosale <iamindrajitb@gmail.com>
Date: Mon, 14 Oct 2024 23:01:47 -0700
Subject: [PATCH 03/11] Fix review comments

---
 qa/L0_metrics/general_metrics_test.py |  2 +-
 qa/L0_metrics/test.sh                 | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/qa/L0_metrics/general_metrics_test.py b/qa/L0_metrics/general_metrics_test.py
index a80b5dafa5..1c452515d0 100644
--- a/qa/L0_metrics/general_metrics_test.py
+++ b/qa/L0_metrics/general_metrics_test.py
@@ -47,7 +47,7 @@ def get_model_load_times():
     return model_load_times
 
 
-class TestGeneralMetrics(tu.TestResultCollector):
+class TestGeneralMetrics(unittest.TestCase):
     def setUp(self):
         self.model_name = "libtorch_float32_float32_float32"
 
diff --git a/qa/L0_metrics/test.sh b/qa/L0_metrics/test.sh
index f966b5fb23..1cb85f4004 100755
--- a/qa/L0_metrics/test.sh
+++ b/qa/L0_metrics/test.sh
@@ -140,8 +140,8 @@ SERVER_LOG="general_metrics_test_server.log"
 SERVER_ARGS="$BASE_SERVER_ARGS --log-verbose=1"
 PYTHON_TEST="general_metrics_test.py"
 run_and_check_server
-# Test 1 for normal mode
-python3 ${PYTHON_TEST} TestGeneralMetrics.test_metrics_load_time -v 2>&1 | tee ${CLIENT_LOG}
+# Test 1 for default model control mode (all models loaded at startup)
+python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time >> $CLIENT_LOG 2>&1
 kill_server
 
 set +e
@@ -150,14 +150,14 @@ CLIENT_LOG="general_metrics_test_client.log"
 SERVER_LOG="general_metrics_test_server.log"
 SERVER_ARGS="$BASE_SERVER_ARGS --model-control-mode=explicit --log-verbose=1"
 run_and_check_server
-export MODEL_NAME='libtorch_float32_float32_float32'
+MODEL_NAME='libtorch_float32_float32_float32'
 code=`curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/load`
 # Test 2 for explicit mode LOAD
-python3 ${PYTHON_TEST} TestGeneralMetrics.test_metrics_load_time_explicit_load -v 2>&1 | tee ${CLIENT_LOG}
+python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_explicit_load.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_explicit_load >> $CLIENT_LOG 2>&1
 
 code=`curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/unload`
 # Test 3 for explicit mode UNLOAD
-python3 ${PYTHON_TEST} TestGeneralMetrics.test_metrics_load_time_explicit_unload -v 2>&1 | tee ${CLIENT_LOG}
+python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_explicit_unload.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_explicit_unload >> $CLIENT_LOG 2>&1
 kill_server
 
 ### Pinned memory metrics tests

From d47ebe5068c25c2275c22bb86ff45baaa06f258d Mon Sep 17 00:00:00 2001
From: Indrajit Bhosale <iamindrajitb@gmail.com>
Date: Thu, 17 Oct 2024 18:02:02 -0700
Subject: [PATCH 04/11] Update Docs for new metric model load time

---
 docs/user_guide/metrics.md            | 9 +++++++++
 qa/L0_metrics/general_metrics_test.py | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md
index b8fc0d8ee0..88f8b49c3a 100644
--- a/docs/user_guide/metrics.md
+++ b/docs/user_guide/metrics.md
@@ -183,6 +183,15 @@ There are some places where a request would not be considered pending:
     generally brief, it will not be considered pending from Triton's
     perspective until Triton core has received the request from the frontend.
 
+#### Load Time Per-Model
+The *Model Load Duration* metric reports the time, in seconds, taken to load each version of a model from storage into GPU/CPU memory.
+```
+# HELP nv_model_load_duration_secs Model load time in seconds
+# TYPE nv_model_load_duration_secs gauge
+nv_model_load_duration_secs{model="input_all_optional",version="2"} 1.532738387
+nv_model_load_duration_secs{model="input_all_optional",version="1"} 11.68753265
+```
+
 ### Latencies
 
 Starting in 23.04, Triton exposes the ability to choose the types of metrics
diff --git a/qa/L0_metrics/general_metrics_test.py b/qa/L0_metrics/general_metrics_test.py
index 1c452515d0..9dd1b672eb 100644
--- a/qa/L0_metrics/general_metrics_test.py
+++ b/qa/L0_metrics/general_metrics_test.py
@@ -32,7 +32,7 @@
 import requests
 
 _tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
-MODEL_LOAD_TIME = "nv_model_load_time{model="
+MODEL_LOAD_TIME = "nv_model_load_duration_secs{model="
 
 
 def get_model_load_times():

From 3fcc6499be0ab452d0d779048efdb3643552dda8 Mon Sep 17 00:00:00 2001
From: Indrajit Bhosale <iamindrajitb@gmail.com>
Date: Fri, 18 Oct 2024 15:08:28 -0700
Subject: [PATCH 05/11] Remove logs causing test to fail

---
 qa/L0_metrics/general_metrics_test.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/qa/L0_metrics/general_metrics_test.py b/qa/L0_metrics/general_metrics_test.py
index 9dd1b672eb..f32739963d 100644
--- a/qa/L0_metrics/general_metrics_test.py
+++ b/qa/L0_metrics/general_metrics_test.py
@@ -70,9 +70,6 @@ def test_metrics_load_time_explicit_load(self):
         self.assertEqual(dict_size, 1, "Too many model_load_time entries found")
 
     def test_metrics_load_time_explicit_unload(self):
-        r = requests.get(f"http://localhost:8000/v2/repository/models/")
-        r.raise_for_status()
-        print(r.text)
         model_load_times = get_model_load_times()
         load_time = model_load_times.get(self.model_name)
 

From 748d3c59a1352cba915ace58e56bb8372f468f47 Mon Sep 17 00:00:00 2001
From: Indrajit Bhosale <iamindrajitb@gmail.com>
Date: Sat, 19 Oct 2024 01:44:03 -0700
Subject: [PATCH 06/11] Fix review comments add new test for versions

---
 qa/L0_metrics/general_metrics_test.py         | 108 ++++++++++++++++--
 qa/L0_metrics/test.sh                         |  12 ++
 .../input_all_optional/1/model.py             |  49 ++++++++
 .../input_all_optional/2/model.py             |  47 ++++++++
 .../input_all_optional/config.pbtxt           |  59 ++++++++++
 5 files changed, 265 insertions(+), 10 deletions(-)
 create mode 100644 qa/L0_metrics/version_models/input_all_optional/1/model.py
 create mode 100644 qa/L0_metrics/version_models/input_all_optional/2/model.py
 create mode 100644 qa/L0_metrics/version_models/input_all_optional/config.pbtxt

diff --git a/qa/L0_metrics/general_metrics_test.py b/qa/L0_metrics/general_metrics_test.py
index f32739963d..7f135cb344 100644
--- a/qa/L0_metrics/general_metrics_test.py
+++ b/qa/L0_metrics/general_metrics_test.py
@@ -27,6 +27,7 @@
 
 import os
 import re
+import time
 import unittest
 
 import requests
@@ -39,21 +40,60 @@ def get_model_load_times():
     r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics")
     r.raise_for_status()
     pattern = re.compile(rf'{MODEL_LOAD_TIME}"(.*?)".*?\ (\d+\.\d+)')
-    model_load_times = {}
-    matches = pattern.findall(r.text)
-    for match in matches:
-        model_name, load_time = match
-        model_load_times[model_name] = float(load_time)
-    return model_load_times
+    # Initialize an empty dictionary to store the data
+    model_data = {}
+    lines = r.text.strip().split("\n")
+    for line in lines:
+        # Use regex to extract model name, version, and load time
+        match = re.match(
+            r"nv_model_load_duration_secs\{model=\"(.*?)\",version=\"(.*?)\"\} (.*)",
+            line,
+        )
+        if match:
+            model_name = match.group(1)
+            model_version = match.group(2)
+            load_time = float(match.group(3))
+            # Store in dictionary
+            if model_name not in model_data:
+                model_data[model_name] = {}
+            model_data[model_name][model_version] = load_time
+    return model_data
+
+
+def load_model_explicit(model_name, server_url="http://localhost:8000"):
+    endpoint = f"{server_url}/v2/repository/models/{model_name}/load"
+    response = requests.post(endpoint)
+
+    if response.status_code == 200:
+        print(f"Model '{model_name}' loaded successfully.")
+    else:
+        print(
+            f"Failed to load model '{model_name}'. Status code: {response.status_code}"
+        )
+        print("Response:", response.text)
+
+
+def unload_model_explicit(model_name, server_url="http://localhost:8000"):
+    endpoint = f"{server_url}/v2/repository/models/{model_name}/unload"
+    response = requests.post(endpoint)
+
+    if response.status_code == 200:
+        print(f"Model '{model_name}' unloaded successfully.")
+    else:
+        print(
+            f"Failed to load model '{model_name}'. Status code: {response.status_code}"
+        )
+        print("Response:", response.text)
 
 
 class TestGeneralMetrics(unittest.TestCase):
     def setUp(self):
         self.model_name = "libtorch_float32_float32_float32"
+        self.model_name_multiple_versions = "input_all_optional"
 
     def test_metrics_load_time(self):
         model_load_times = get_model_load_times()
-        load_time = model_load_times.get(self.model_name)
+        load_time = model_load_times.get(self.model_name, {}).get("1")
 
         self.assertIsNotNone(load_time, "Model Load time not found")
 
@@ -62,7 +102,7 @@ def test_metrics_load_time(self):
 
     def test_metrics_load_time_explicit_load(self):
         model_load_times = get_model_load_times()
-        load_time = model_load_times.get(self.model_name)
+        load_time = model_load_times.get(self.model_name, {}).get("1")
 
         self.assertIsNotNone(load_time, "Model Load time not found")
 
@@ -71,10 +111,58 @@ def test_metrics_load_time_explicit_load(self):
 
     def test_metrics_load_time_explicit_unload(self):
         model_load_times = get_model_load_times()
-        load_time = model_load_times.get(self.model_name)
-
+        load_time = model_load_times.get(self.model_name, {}).get("1")
         self.assertIsNone(load_time, "Model Load time found even after unload")
 
+    def test_metrics_load_time_multiple_version_reload(self):
+        # Part 1 load multiple versions of the same model and check if slow and fast models reflect the metric correctly
+        load_model_explicit(self.model_name_multiple_versions)
+        model_load_times = get_model_load_times()
+        load_time_slow = model_load_times.get(
+            self.model_name_multiple_versions, {}
+        ).get("1")
+        load_time_fast = model_load_times.get(
+            self.model_name_multiple_versions, {}
+        ).get("2")
+        # Fail the test if load_time_slow is less than load_time_fast
+        self.assertGreaterEqual(
+            load_time_slow,
+            load_time_fast,
+            "Slow load time should be greater than or equal to fast load time",
+        )
+        # Fail the test if load_time_slow is less than 10 seconds as manual delay is 10 seconds
+        self.assertGreaterEqual(
+            load_time_slow,
+            10,
+            "Slow load time should be greater than or equal to fast load time",
+        )
+
+        # Part 2 load multiple versions AGAIN and compare with prev values expect to be the same
+        # as triton does not actually load the model again.
+        load_model_explicit(self.model_name_multiple_versions)
+        model_load_times_new = get_model_load_times()
+        load_time_slow_new = model_load_times_new.get(
+            self.model_name_multiple_versions, {}
+        ).get("1")
+        load_time_fast_new = model_load_times_new.get(
+            self.model_name_multiple_versions, {}
+        ).get("2")
+        self.assertEqual(load_time_fast_new, load_time_fast)
+        self.assertEqual(load_time_slow_new, load_time_slow)
+
+        # Part 3 unload the model and expect the metrics to go away as model is not loaded now
+        unload_model_explicit(self.model_name_multiple_versions)
+        time.sleep(1)
+        model_load_times_new = get_model_load_times()
+        load_time_slow_new = model_load_times_new.get(
+            self.model_name_multiple_versions, {}
+        ).get("1")
+        load_time_fast_new = model_load_times_new.get(
+            self.model_name_multiple_versions, {}
+        ).get("2")
+        self.assertIsNone(load_time_slow_new, "Model Load time found even after unload")
+        self.assertIsNone(load_time_fast_new, "Model Load time found even after unload")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_metrics/test.sh b/qa/L0_metrics/test.sh
index 1cb85f4004..4ea1971f5c 100755
--- a/qa/L0_metrics/test.sh
+++ b/qa/L0_metrics/test.sh
@@ -160,6 +160,18 @@ code=`curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/
 python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_explicit_unload.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_explicit_unload >> $CLIENT_LOG 2>&1
 kill_server
 
+# Test 4 for explicit mode LOAD and UNLOAD with multiple versions
+set +e
+CLIENT_PY="./general_metrics_test.py"
+CLIENT_LOG="general_metrics_test_client.log"
+SERVER_LOG="general_metrics_test_server.log"
+VERSION_DIR="${PWD}/version_models"
+SERVER_ARGS="$BASE_SERVER_ARGS --model-repository=${VERSION_DIR} --model-control-mode=explicit --log-verbose=1"
+run_and_check_server
+python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_multiple_version_reload.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_multiple_version_reload >> $CLIENT_LOG 2>&1
+
+kill_server
+
 ### Pinned memory metrics tests
 set +e
 CLIENT_PY="./pinned_memory_metrics_test.py"
diff --git a/qa/L0_metrics/version_models/input_all_optional/1/model.py b/qa/L0_metrics/version_models/input_all_optional/1/model.py
new file mode 100644
index 0000000000..fecf42b66f
--- /dev/null
+++ b/qa/L0_metrics/version_models/input_all_optional/1/model.py
@@ -0,0 +1,49 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import json
+
+import time
+import numpy as np
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    def initialize(self, args):
+        time.sleep(10)
+        self.model_config = json.loads(args["model_config"])
+
+    def execute(self, requests):
+        """This function is called on inference request."""
+
+        responses = []
+        for _ in requests:
+            # Return a constant single-element FP32 tensor for every request
+            out_0 = np.array([1], dtype=np.float32)
+            out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0)
+            responses.append(pb_utils.InferenceResponse([out_tensor_0]))
+
+        return responses
diff --git a/qa/L0_metrics/version_models/input_all_optional/2/model.py b/qa/L0_metrics/version_models/input_all_optional/2/model.py
new file mode 100644
index 0000000000..40f8b25579
--- /dev/null
+++ b/qa/L0_metrics/version_models/input_all_optional/2/model.py
@@ -0,0 +1,47 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import json
+
+import numpy as np
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    def initialize(self, args):
+        self.model_config = json.loads(args["model_config"])
+
+    def execute(self, requests):
+        """This function is called on inference request."""
+
+        responses = []
+        for _ in requests:
+            # Return a constant single-element FP32 tensor for every request
+            out_0 = np.array([1], dtype=np.float32)
+            out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0)
+            responses.append(pb_utils.InferenceResponse([out_tensor_0]))
+
+        return responses
diff --git a/qa/L0_metrics/version_models/input_all_optional/config.pbtxt b/qa/L0_metrics/version_models/input_all_optional/config.pbtxt
new file mode 100644
index 0000000000..e3653342b4
--- /dev/null
+++ b/qa/L0_metrics/version_models/input_all_optional/config.pbtxt
@@ -0,0 +1,59 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "input_all_optional"
+backend: "python"
+input [
+  {
+    name: "INPUT0"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+    optional: true
+  },
+  {
+    name: "INPUT1"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+    optional: true
+  },
+  {
+    name: "INPUT2"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+    optional: true
+  }
+]
+
+output [
+  {
+    name: "OUTPUT0"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+  }
+]
+
+instance_group [{ kind: KIND_CPU }]
+version_policy: { all { }}

From 9f3f57703d1aaba60521c6855135eaae3d1626e7 Mon Sep 17 00:00:00 2001
From: Indrajit Bhosale <iamindrajitb@gmail.com>
Date: Sat, 19 Oct 2024 01:48:45 -0700
Subject: [PATCH 07/11] Pre-Commit Fix

---
 qa/L0_metrics/version_models/input_all_optional/1/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qa/L0_metrics/version_models/input_all_optional/1/model.py b/qa/L0_metrics/version_models/input_all_optional/1/model.py
index fecf42b66f..86cd368fe0 100644
--- a/qa/L0_metrics/version_models/input_all_optional/1/model.py
+++ b/qa/L0_metrics/version_models/input_all_optional/1/model.py
@@ -25,8 +25,8 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import json
-
 import time
+
 import numpy as np
 import triton_python_backend_utils as pb_utils
 

From f74507387775f3824e698d55bbf15ccb9e54baf6 Mon Sep 17 00:00:00 2001
From: Indrajit Bhosale <iamindrajitb@gmail.com>
Date: Mon, 21 Oct 2024 17:53:31 -0700
Subject: [PATCH 08/11] Comments fixed

---
 qa/L0_metrics/general_metrics_test.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/qa/L0_metrics/general_metrics_test.py b/qa/L0_metrics/general_metrics_test.py
index 7f135cb344..f0002def65 100644
--- a/qa/L0_metrics/general_metrics_test.py
+++ b/qa/L0_metrics/general_metrics_test.py
@@ -39,7 +39,6 @@
 def get_model_load_times():
     r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics")
     r.raise_for_status()
-    pattern = re.compile(rf'{MODEL_LOAD_TIME}"(.*?)".*?\ (\d+\.\d+)')
     # Initialize an empty dictionary to store the data
     model_data = {}
     lines = r.text.strip().split("\n")
@@ -115,6 +114,11 @@ def test_metrics_load_time_explicit_unload(self):
         self.assertIsNone(load_time, "Model Load time found even after unload")
 
     def test_metrics_load_time_multiple_version_reload(self):
+        # Part 0: start condition - no load-time metric for any version of the multi-version model before it is loaded
+        model_load_times = get_model_load_times()
+        load_times = model_load_times.get(self.model_name_multiple_versions)
+        self.assertIsNone(load_times, "Model Load time found even before model load")
+
         # Part 1 load multiple versions of the same model and check if slow and fast models reflect the metric correctly
         load_model_explicit(self.model_name_multiple_versions)
         model_load_times = get_model_load_times()
@@ -136,6 +140,12 @@ def test_metrics_load_time_multiple_version_reload(self):
             10,
             "Slow load time should be greater than or equal to fast load time",
         )
+        # Fail the test if load_time_fast exceeds a generous 2-second limit
+        self.assertLess(
+            load_time_fast,
+            2,
+            "Model taking too much time to load",
+        )
 
         # Part 2 load multiple versions AGAIN and compare with prev values expect to be the same
         # as triton does not actually load the model again.

From f07f5efb2084cac624fa30c9b033b9aae83123c5 Mon Sep 17 00:00:00 2001
From: Indrajit Bhosale <iamindrajitb@gmail.com>
Date: Mon, 4 Nov 2024 15:47:01 -0800
Subject: [PATCH 09/11] Review Comments Fixed

---
 qa/L0_metrics/general_metrics_test.py | 19 ++++++++-----------
 qa/L0_metrics/test.sh                 | 12 +++++-------
 2 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/qa/L0_metrics/general_metrics_test.py b/qa/L0_metrics/general_metrics_test.py
index f0002def65..e2cbb74f63 100644
--- a/qa/L0_metrics/general_metrics_test.py
+++ b/qa/L0_metrics/general_metrics_test.py
@@ -62,25 +62,22 @@ def get_model_load_times():
 def load_model_explicit(model_name, server_url="http://localhost:8000"):
     endpoint = f"{server_url}/v2/repository/models/{model_name}/load"
     response = requests.post(endpoint)
-
-    if response.status_code == 200:
+    try:
+        self.assertEqual(response.status_code, 200)
         print(f"Model '{model_name}' loaded successfully.")
-    else:
-        print(
-            f"Failed to load model '{model_name}'. Status code: {response.status_code}"
-        )
+    except AssertionError:
+        print(f"Failed to load model '{model_name}'. Status code: {response.status_code}")
         print("Response:", response.text)
 
-
 def unload_model_explicit(model_name, server_url="http://localhost:8000"):
     endpoint = f"{server_url}/v2/repository/models/{model_name}/unload"
     response = requests.post(endpoint)
-
-    if response.status_code == 200:
+    try:
+        self.assertEqual(response.status_code, 200)
         print(f"Model '{model_name}' unloaded successfully.")
-    else:
+    except AssertionError:
         print(
-            f"Failed to load model '{model_name}'. Status code: {response.status_code}"
+            f"Failed to unload model '{model_name}'. Status code: {response.status_code}"
         )
         print("Response:", response.text)
 
diff --git a/qa/L0_metrics/test.sh b/qa/L0_metrics/test.sh
index 4ea1971f5c..c12814abc7 100755
--- a/qa/L0_metrics/test.sh
+++ b/qa/L0_metrics/test.sh
@@ -143,34 +143,32 @@ run_and_check_server
 # Test 1 for default model control mode (all models loaded at startup)
 python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time >> $CLIENT_LOG 2>&1
 kill_server
+set -e
 
 set +e
-CLIENT_PY="./general_metrics_test.py"
-CLIENT_LOG="general_metrics_test_client.log"
-SERVER_LOG="general_metrics_test_server.log"
 SERVER_ARGS="$BASE_SERVER_ARGS --model-control-mode=explicit --log-verbose=1"
 run_and_check_server
 MODEL_NAME='libtorch_float32_float32_float32'
-code=`curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/load`
+curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/load
 # Test 2 for explicit mode LOAD
 python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_explicit_load.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_explicit_load >> $CLIENT_LOG 2>&1
 
-code=`curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/unload`
+curl -s -w %{http_code} -X POST ${TRITONSERVER_IPADDR}:8000/v2/repository/models/${MODEL_NAME}/unload
 # Test 3 for explicit mode UNLOAD
 python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_explicit_unload.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_explicit_unload >> $CLIENT_LOG 2>&1
 kill_server
+set -e
 
 # Test 4 for explicit mode LOAD and UNLOAD with multiple versions
 set +e
 CLIENT_PY="./general_metrics_test.py"
-CLIENT_LOG="general_metrics_test_client.log"
-SERVER_LOG="general_metrics_test_server.log"
 VERSION_DIR="${PWD}/version_models"
 SERVER_ARGS="$BASE_SERVER_ARGS --model-repository=${VERSION_DIR} --model-control-mode=explicit --log-verbose=1"
 run_and_check_server
 python3 -m pytest --junitxml="general_metrics_test.test_metrics_load_time_multiple_version_reload.report.xml" $CLIENT_PY::TestGeneralMetrics::test_metrics_load_time_multiple_version_reload >> $CLIENT_LOG 2>&1
 
 kill_server
+set -e
 
 ### Pinned memory metrics tests
 set +e

From b752a5be60f33764d083897bca179f438dacd172 Mon Sep 17 00:00:00 2001
From: Indrajit Bhosale <iamindrajitb@gmail.com>
Date: Mon, 4 Nov 2024 15:49:29 -0800
Subject: [PATCH 10/11] Pre-Commit Fix

---
 qa/L0_metrics/general_metrics_test.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/qa/L0_metrics/general_metrics_test.py b/qa/L0_metrics/general_metrics_test.py
index e2cbb74f63..7877e34332 100644
--- a/qa/L0_metrics/general_metrics_test.py
+++ b/qa/L0_metrics/general_metrics_test.py
@@ -66,9 +66,12 @@ def load_model_explicit(model_name, server_url="http://localhost:8000"):
         self.assertEqual(response.status_code, 200)
         print(f"Model '{model_name}' loaded successfully.")
     except AssertionError:
-        print(f"Failed to load model '{model_name}'. Status code: {response.status_code}")
+        print(
+            f"Failed to load model '{model_name}'. Status code: {response.status_code}"
+        )
         print("Response:", response.text)
 
+
 def unload_model_explicit(model_name, server_url="http://localhost:8000"):
     endpoint = f"{server_url}/v2/repository/models/{model_name}/unload"
     response = requests.post(endpoint)

From 9329e5517385cfa156f6b00a11210233b54259f4 Mon Sep 17 00:00:00 2001
From: Indrajit Bhosale <iamindrajitb@gmail.com>
Date: Wed, 6 Nov 2024 11:59:24 -0800
Subject: [PATCH 11/11] Extra assignment removed

---
 qa/L0_metrics/test.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/qa/L0_metrics/test.sh b/qa/L0_metrics/test.sh
index c8fffd7a38..92b74036cf 100755
--- a/qa/L0_metrics/test.sh
+++ b/qa/L0_metrics/test.sh
@@ -161,7 +161,6 @@ set -e
 
 # Test 4 for explicit mode LOAD and UNLOAD with multiple versions
 set +e
-CLIENT_PY="./general_metrics_test.py"
 VERSION_DIR="${PWD}/version_models"
 SERVER_ARGS="$BASE_SERVER_ARGS --model-repository=${VERSION_DIR} --model-control-mode=explicit --log-verbose=1"
 run_and_check_server