diff --git a/CMakeLists.txt b/CMakeLists.txt
index fc4ac10b7669a..1a6a311e97633 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,7 +49,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
 # requirements.txt files and should be kept consistent. The ROCm torch
 # versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.5.0")
 set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
 
 #
@@ -507,7 +507,7 @@ else()
   FetchContent_Declare(
         vllm-flash-attn
         GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-        GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
+        GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9
         GIT_PROGRESS TRUE
         # Don't share the vllm-flash-attn build between build types
         BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 24bb7299338ac..40430dae10c5b 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -424,11 +424,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
   # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
   # dependencies that are not necessary and may not be installed.
   if (GPU_LANGUAGE STREQUAL "CUDA")
-    if ("${CUDA_CUDA_LIB}" STREQUAL "")
-      set(CUDA_CUDA_LIB "${CUDA_CUDA_LIBRARY}")
-    endif()
-    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
-      ${CUDA_LIBRARIES})
+    target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
   else()
     target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
   endif()
diff --git a/pyproject.toml b/pyproject.toml
index e0c56ab79cad0..e78f5652f486b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "packaging",
     "setuptools>=61",
     "setuptools-scm>=8.0",
-    "torch == 2.4.0",
+    "torch == 2.5.0",
     "wheel",
     "jinja2",
 ]
diff --git a/requirements-build.txt b/requirements-build.txt
index 6144a56da8c47..ea2b688bb3108 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -4,6 +4,6 @@ ninja
 packaging
 setuptools>=61
 setuptools-scm>=8
-torch==2.4.0
+torch==2.5.0
 wheel
 jinja2
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 3b3c2f876919e..92fa303d687a2 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -4,7 +4,7 @@
 # Dependencies for NVIDIA GPUs
 ray >= 2.9
 nvidia-ml-py # for pynvml package
-torch == 2.4.0
+torch == 2.5.0
 # These must be updated alongside torch
-torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0
+torchvision == 0.20 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+xformers == 0.0.28.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.0
diff --git a/requirements-openvino.txt b/requirements-openvino.txt
index ac54cf0c3288f..7ad0d1e7f704b 100644
--- a/requirements-openvino.txt
+++ b/requirements-openvino.txt
@@ -1,7 +1,7 @@
 # Common dependencies
 -r requirements-common.txt
 
-torch == 2.4.0 # should be aligned with "common" vLLM torch version
+torch == 2.5.0 # should be aligned with "common" vLLM torch version
 openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
 
 optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version
diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py
index 75625b35209ce..fcfc159e4f5a0 100644
--- a/tests/models/decoder_only/language/test_big_models.py
+++ b/tests/models/decoder_only/language/test_big_models.py
@@ -8,7 +8,7 @@
 
 from vllm.platforms import current_platform
 
-from ...utils import check_outputs_equal
+from ...utils import check_logprobs_close, check_outputs_equal
 
 MODELS = [
     "meta-llama/Llama-2-7b-hf",
@@ -43,18 +43,40 @@ def test_models(
     dtype: str,
     max_tokens: int,
 ) -> None:
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-    with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
+    if model == "openbmb/MiniCPM3-4B":
+        # the output becomes slightly different when upgrading to
+        # pytorch 2.5 . Changing to logprobs checks instead of exact
+        # output checks.
+        NUM_LOG_PROBS = 8
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy_logprobs_limit(
+                example_prompts, max_tokens, NUM_LOG_PROBS)
+
+        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy_logprobs(
+                example_prompts, max_tokens, NUM_LOG_PROBS)
+
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
+    else:
+        with hf_runner(model, dtype=dtype) as hf_model:
+            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
+        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                      max_tokens)
+
+        check_outputs_equal(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
 
 
 @pytest.mark.parametrize("model", MODELS)
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 30bbf5107475d..9c5212ace1346 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -7,6 +7,7 @@
 from typing import Callable, List, Tuple, TypeVar
 
 import pynvml
+import torch
 from typing_extensions import ParamSpec
 
 from vllm.logger import init_logger
@@ -26,6 +27,10 @@
         " and cause errors. See https://pypi.org/project/pynvml "
         "for more information.")
 
+# pytorch 2.5 uses cudnn sdpa by default, which will cause crash on some models
+# see https://github.com/huggingface/diffusers/issues/9704 for details
+torch.backends.cuda.enable_cudnn_sdp(False)
+
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
 # all the related functions work on real physical device ids.
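Reviewer note (not part of the patch): a minimal sketch of what the new module-level call in vllm/platforms/cuda.py does, assuming a CUDA build of PyTorch 2.5 on a machine with an NVIDIA GPU. The tensor shapes are arbitrary illustration values.

# Illustrative only -- not applied by the patch, just showing the toggle's effect.
import torch

# PyTorch 2.5 enables the cuDNN SDPA backend by default; the patch turns it
# off globally when vllm.platforms.cuda is imported.
print(torch.backends.cuda.cudnn_sdp_enabled())  # expected True on a stock 2.5 build

# Same call the patch adds at module import time:
torch.backends.cuda.enable_cudnn_sdp(False)
print(torch.backends.cuda.cudnn_sdp_enabled())  # False

# scaled_dot_product_attention still works; it now dispatches to the
# flash / memory-efficient / math backends instead of cuDNN.
if torch.cuda.is_available():
    q = k = v = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.float16)
    out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
    print(out.shape)  # torch.Size([1, 8, 128, 64])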