[DO NOT MERGE] Build: Update Readme and versions for 24.09 #7607

Draft: wants to merge 7 commits into base branch `main`.
14 changes: 7 additions & 7 deletions Dockerfile.win10.min
@@ -37,9 +37,9 @@ RUN choco install unzip -y
 #
 # Installing TensorRT
 #
-ARG TENSORRT_VERSION=10.3.0.26
-ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows10.x86_64.cuda-12.5.zip"
-ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/zip/TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5.zip
+ARG TENSORRT_VERSION=10.4.0.26
+ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows.win10.cuda-12.6.zip"
+ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/zip/TensorRT-10.4.0.26.Windows.win10.cuda-12.6.zip
 # COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP}
 ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP}
 RUN unzip /tmp/%TENSORRT_ZIP%
@@ -51,9 +51,9 @@ LABEL TENSORRT_VERSION="${TENSORRT_VERSION}"
 #
 # Installing cuDNN
 #
-ARG CUDNN_VERSION=9.3.0.75
+ARG CUDNN_VERSION=9.4.0.58
 ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip
-ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.2.1.18_cuda12-archive.zip
+ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.4.0.58_cuda12-archive.zip
 ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP}
 RUN unzip /tmp/%CUDNN_ZIP%
 RUN move cudnn-* cudnn
@@ -175,15 +175,15 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi

 RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%"

-ARG CUDNN_VERSION=9.3.0.75
+ARG CUDNN_VERSION=9.4.0.58
 ENV CUDNN_VERSION ${CUDNN_VERSION}
 COPY --from=dependency_base /cudnn /cudnn
 RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\."
 RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\."
 RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\."
 LABEL CUDNN_VERSION="${CUDNN_VERSION}"

-ARG TENSORRT_VERSION=10.3.0.26
+ARG TENSORRT_VERSION=10.4.0.26
 ENV TRT_VERSION ${TENSORRT_VERSION}
 COPY --from=dependency_base /TensorRT /TensorRT
 RUN setx PATH "c:\TensorRT\lib;%PATH%"
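
For context on how the pinned versions above are consumed: the `ARG` defaults in `Dockerfile.win10.min` can be overridden at image build time with `--build-arg`. A minimal sketch, assuming a POSIX-style shell; the `win10-py3-min` tag is illustrative rather than mandated by this PR:

```bash
# Build the Windows "min" base image with the 24.09 dependency pins.
# The --build-arg values below simply restate the new defaults; note that
# TENSORRT_SOURCE and CUDNN_SOURCE are separate ARGs and would also need
# to be overridden to fetch a different archive.
docker build \
  -t win10-py3-min \
  -f Dockerfile.win10.min \
  --build-arg TENSORRT_VERSION=10.4.0.26 \
  --build-arg CUDNN_VERSION=9.4.0.58 \
  .
```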
41 changes: 12 additions & 29 deletions README.md
@@ -28,25 +28,8 @@

 # Triton Inference Server

-📣 **vLLM x Triton Meetup at Fort Mason on Sept 9th 4:00 - 9:00 pm**
-
-We are excited to announce that we will be hosting our Triton user meetup with the vLLM team at
-[Fort Mason](https://maps.app.goo.gl/9Lr3fxRssrpQCGK58) on Sept 9th 4:00 - 9:00 pm. Join us for this
-exclusive event where you will learn about the newest vLLM and Triton features, get a
-glimpse into the roadmaps, and connect with fellow users, the NVIDIA Triton and vLLM teams. Seating is limited and registration confirmation
-is required to attend - please register [here](https://lu.ma/87q3nvnh) to join
-the meetup.
-
-___
-
 [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)

-[!WARNING]
-
-##### LATEST RELEASE
-You are currently on the `main` branch which tracks under-development progress towards the next release.
-The current release is version [2.49.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.08 container release on NVIDIA GPU Cloud (NGC).
-
 Triton Inference Server is an open source inference serving software that
 streamlines AI inferencing. Triton enables teams to deploy any AI model from
 multiple deep learning and machine learning frameworks, including TensorRT,
@@ -74,7 +74,7 @@ Major features include:
 - Provides [Backend API](https://github.com/triton-inference-server/backend) that
 allows adding custom backends and pre/post processing operations
 - Supports writing custom backends in python, a.k.a.
-[Python-based backends.](https://github.com/triton-inference-server/backend/blob/main/docs/python_based_backends.md#python-based-backends)
+[Python-based backends.](https://github.com/triton-inference-server/backend/blob/r24.09/docs/python_based_backends.md#python-based-backends)
 - Model pipelines using
 [Ensembling](docs/user_guide/architecture.md#ensemble-models) or [Business
 Logic Scripting
@@ -103,16 +86,16 @@ Inference Server with the

 ```bash
 # Step 1: Create the example model repository
-git clone -b r24.08 https://github.com/triton-inference-server/server.git
+git clone -b r24.09 https://github.com/triton-inference-server/server.git
 cd server/docs/examples
 ./fetch_models.sh

 # Step 2: Launch triton from the NGC Triton container
-docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.08-py3 tritonserver --model-repository=/models
+docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.09-py3 tritonserver --model-repository=/models

 # Step 3: Sending an Inference Request
 # In a separate console, launch the image_client example from the NGC Triton SDK container
-docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.08-py3-sdk
+docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.09-py3-sdk
 /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg

 # Inference should return the following
@@ -187,10 +170,10 @@ configuration](docs/user_guide/model_configuration.md) for the model.
 [Python](https://github.com/triton-inference-server/python_backend), and more
 - Not all the above backends are supported on every platform supported by Triton.
 Look at the
-[Backend-Platform Support Matrix](https://github.com/triton-inference-server/backend/blob/main/docs/backend_platform_support_matrix.md)
+[Backend-Platform Support Matrix](https://github.com/triton-inference-server/backend/blob/r24.09/docs/backend_platform_support_matrix.md)
 to learn which backends are supported on your target platform.
 - Learn how to [optimize performance](docs/user_guide/optimization.md) using the
-[Performance Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md)
+[Performance Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/r24.09/README.md)
 and
 [Model Analyzer](https://github.com/triton-inference-server/model_analyzer)
 - Learn how to [manage loading and unloading models](docs/user_guide/model_management.md) in
@@ -204,14 +187,14 @@ A Triton *client* application sends inference and other requests to Triton. The
 [Python and C++ client libraries](https://github.com/triton-inference-server/client)
 provide APIs to simplify this communication.

-- Review client examples for [C++](https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/examples),
-[Python](https://github.com/triton-inference-server/client/blob/main/src/python/examples),
-and [Java](https://github.com/triton-inference-server/client/blob/main/src/java/src/main/java/triton/client/examples)
+- Review client examples for [C++](https://github.com/triton-inference-server/client/blob/r24.09/src/c%2B%2B/examples),
+[Python](https://github.com/triton-inference-server/client/blob/r24.09/src/python/examples),
+and [Java](https://github.com/triton-inference-server/client/blob/r24.09/src/java/src/main/java/triton/client/examples)
 - Configure [HTTP](https://github.com/triton-inference-server/client#http-options)
 and [gRPC](https://github.com/triton-inference-server/client#grpc-options)
 client options
 - Send input data (e.g. a jpeg image) directly to Triton in the [body of an HTTP
-request without any additional metadata](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_binary_data.md#raw-binary-request)
+request without any additional metadata](https://github.com/triton-inference-server/server/blob/r24.09/docs/protocol/extension_binary_data.md#raw-binary-request)

 ### Extend Triton

@@ -220,7 +203,7 @@ designed for modularity and flexibility

 - [Customize Triton Inference Server container](docs/customization_guide/compose.md) for your use case
 - [Create custom backends](https://github.com/triton-inference-server/backend)
-in either [C/C++](https://github.com/triton-inference-server/backend/blob/main/README.md#triton-backend-api)
+in either [C/C++](https://github.com/triton-inference-server/backend/blob/r24.09/README.md#triton-backend-api)
 or [Python](https://github.com/triton-inference-server/python_backend)
 - Create [decoupled backends and models](docs/user_guide/decoupled_models.md) that can send
 multiple responses for a request or not send any responses for a request
@@ -229,7 +212,7 @@ designed for modularity and flexibility
 decryption, or conversion
 - Deploy Triton on [Jetson and JetPack](docs/user_guide/jetson.md)
 - [Use Triton on AWS
-Inferentia](https://github.com/triton-inference-server/python_backend/tree/main/inferentia)
+Inferentia](https://github.com/triton-inference-server/python_backend/tree/r24.09/inferentia)

 ### Additional Documentation

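
Since the quickstart above now points at the 24.09 containers, one quick smoke test of the retagged commands is to probe the server's standard HTTP endpoints from the host. A minimal sketch, assuming the default HTTP port (8000) and the `--net=host` launch shown in the README:

```bash
# Wait for the 24.09 server started in Step 2 to report ready, then
# confirm the example model fetched by fetch_models.sh is loaded.
curl -fsS localhost:8000/v2/health/ready && echo "server is ready"
curl -fsS localhost:8000/v2/models/densenet_onnx | python3 -m json.tool
```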
2 changes: 1 addition & 1 deletion TRITON_VERSION
@@ -1 +1 @@
-2.50.0dev
+2.50.0
8 changes: 4 additions & 4 deletions build.py
@@ -70,10 +70,10 @@
 # incorrectly load the other version of the openvino libraries.
 #
 TRITON_VERSION_MAP = {
-    "2.50.0dev": (
-        "24.09dev", # triton container
-        "24.08", # upstream container
-        "1.18.1", # ORT
+    "2.50.0": (
+        "24.09", # triton container
+        "24.09", # upstream container
+        "1.19.2", # ORT
         "2024.0.0", # ORT OpenVINO
         "2024.0.0", # Standalone OpenVINO
         "3.2.6", # DCGM version
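
For reference, the "24.09" triton container entry added to `TRITON_VERSION_MAP` lines up with the NGC image tags that the README quickstart in this PR already uses. A hedged sketch for checking that those tags resolve, assuming network access to nvcr.io and that the 24.09 images have actually been published:

```bash
# Pull the release containers referenced by the 24.09 version bump.
# These tags come from the README changes in this PR; availability
# depends on the 24.09 images being live on NGC.
docker pull nvcr.io/nvidia/tritonserver:24.09-py3
docker pull nvcr.io/nvidia/tritonserver:24.09-py3-sdk
```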