diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index d2ae926daa7c0..f4ead8d277736 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,12 +23,10 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " pip install pytest matplotlib einops transformers_stream_generator - pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \ - --ignore=tests/models/test_oot_registration.py \ - --ignore=tests/models/test_registry.py \ - --ignore=tests/models/test_fp8.py \ - --ignore=tests/models/test_jamba.py \ - --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported + pytest -v -s tests/models/decoder_only/language \ + --ignore=tests/models/test_fp8.py \ + --ignore=tests/models/decoder_only/language/test_jamba.py \ + --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported # Run compressed-tensor test docker exec cpu-test bash -c " diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d0732ec3fe2fb..9b0cb6663a55b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -94,7 +94,6 @@ steps: - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - - label: Distributed Tests (4 GPUs) # 10min working_dir: "/vllm-workspace/tests" num_gpus: 4 @@ -164,15 +163,6 @@ steps: - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference_encoder_decoder.py -- label: Models Test # 1hr10min - source_file_dependencies: - - vllm/ - - tests/models - commands: - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s models/test_oot_registration.py # it needs a clean process - - pytest -v -s models -m \"not vlm\" --ignore=models/test_oot_registration.py - - label: torch compile integration test source_file_dependencies: - vllm/ @@ -180,14 +170,6 @@ steps: - pytest -v -s ./compile/test_full_graph.py - pytest -v -s ./compile/test_wrapper.py - -- label: Vision Language Models Test # 42min - #mirror_hardwares: [amd] - source_file_dependencies: - - vllm/ - commands: - - pytest -v -s models -m vlm - - label: Prefix Caching Test # 7min #mirror_hardwares: [amd] source_file_dependencies: @@ -286,6 +268,45 @@ steps: commands: - pytest -v -s tool_use +##### models test ##### + +- label: Basic Models Test # 3min + source_file_dependencies: + - vllm/ + - tests/models + commands: + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s models/test_oot_registration.py # it needs a clean process + - pytest -v -s models/*.py --ignore=models/test_oot_registration.py + +- label: Decoder-only Language Models Test # 1h3min + #mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/language + commands: + - pytest -v -s models/decoder_only/language + +- label: Decoder-only Multi-Modal Models Test # 56min + #mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/audio_language + - tests/models/decoder_only/vision_language + commands: + - pytest -v -s models/decoder_only/audio_language + - pytest -v -s models/decoder_only/vision_language + +- label: Other Models Test # 5min + 
#mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/models/embedding/language + - tests/models/encoder_decoder/language + commands: + - pytest -v -s models/embedding/language + - pytest -v -s models/encoder_decoder/language + ##### 1 GPU test ##### ##### multi gpus test ##### @@ -311,11 +332,11 @@ steps: - tests/distributed/ commands: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed' - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed' - label: Distributed Tests (2 GPUs) # 28min #mirror_hardwares: [amd] @@ -328,11 +349,10 @@ steps: - vllm/model_executor/models/ - tests/distributed/ commands: - - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py - - pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py - - pytest -v -s distributed/test_chunked_prefill_distributed.py - - pytest -v -s distributed/test_multimodal_broadcast.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed' + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus + # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index faac2b97722b7..6c7f7f7d5d992 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -342,7 +342,7 @@ Note that, as an inference engine, vLLM does not introduce new models. Therefore We have the following levels of testing for models: -1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `test_models.py `_ and `test_big_models.py `_ for the models that have passed this test. +1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `models tests `_ for the models that have passed this test. 2. 
**Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test. 3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to `functionality tests `_ and `examples `_ for the models that have passed this test. 4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category. diff --git a/pyproject.toml b/pyproject.toml index d9e3278db4d19..6b682f5d4dd4d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -85,5 +85,6 @@ skip_gitignore = true [tool.pytest.ini_options] markers = [ "skip_global_cleanup", - "vlm: run tests for vision language models only", + "core_model: run this model test in each PR instead of just daily", + "distributed_2_gpus: run this test only in distributed tests for 2 GPUs", ] diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index b970cd48f9170..0fe88e792520a 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -15,12 +15,15 @@ from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata from ..models.utils import check_outputs_equal +from ..utils import multi_gpu_test MODELS = [ "facebook/opt-125m", "meta-llama/Llama-2-7b-hf", ] +TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4") + def test_vllm_gc_ed(): """Verify vllm instance is GC'ed when it is deleted""" @@ -70,6 +73,65 @@ def test_models( ) +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize( + "model, distributed_executor_backend, attention_backend, " + "test_suite", [ + ("facebook/opt-125m", "ray", "", "L4"), + ("facebook/opt-125m", "mp", "", "L4"), + ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"), + ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"), + ("facebook/opt-125m", "ray", "", "A100"), + ("facebook/opt-125m", "mp", "", "A100"), + ("facebook/opt-125m", "mp", "FLASHINFER", "A100"), + ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), + ]) +def test_models_distributed( + hf_runner, + vllm_runner, + example_prompts, + model: str, + distributed_executor_backend: str, + attention_backend: str, + test_suite: str, +) -> None: + + if test_suite != TARGET_TEST_SUITE: + pytest.skip(f"Skip test for {test_suite}") + + if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa + # test ray adag + os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" + os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" + + if attention_backend: + os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend + + dtype = "half" + max_tokens = 5 + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). 
+ with vllm_runner(model, + dtype=dtype, + tensor_parallel_size=2, + distributed_executor_backend=distributed_executor_backend + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + def test_model_with_failure(vllm_runner) -> None: try: with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward", diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 9c34b2a13fd53..14c5447680729 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -6,11 +6,13 @@ Run `pytest tests/models/test_chunked_prefill.py`. """ +import os from contextlib import nullcontext import pytest from ..models.utils import check_logprobs_close, check_outputs_equal +from ..utils import multi_gpu_test MODELS = [ "facebook/opt-125m", @@ -66,6 +68,59 @@ def test_models( ) +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) +@pytest.mark.parametrize("model", MODELS) +def test_models_distributed( + hf_runner, + vllm_runner, + example_prompts, + model: str, + distributed_executor_backend: str, +) -> None: + if (model == "meta-llama/Llama-2-7b-hf" + and distributed_executor_backend == "ray"): + # test ray adag + os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" + os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" + + dtype = "half" + max_tokens = 5 + chunked_prefill_token_size = 16 + + # Add a chunked prefill config. + max_num_seqs = min(chunked_prefill_token_size, 256) + assert chunked_prefill_token_size != -1 + enable_chunked_prefill = True + max_num_batched_tokens = chunked_prefill_token_size + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). + + with vllm_runner( + model, + dtype=dtype, + tensor_parallel_size=2, + max_num_seqs=max_num_seqs, + enable_chunked_prefill=enable_chunked_prefill, + max_num_batched_tokens=max_num_batched_tokens, + distributed_executor_backend=distributed_executor_backend, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + @pytest.mark.parametrize( "kv_cache_dtype,model", [("fp8_e4m3", diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 50d399bef1878..00806c3e129b1 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -19,10 +19,13 @@ "facebook/opt-125m", ] -assert ENABLE_ARTIFICIAL_PREEMPT is True, ( - "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. " - "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest " - "tests/basic_correctness/test_preemption.py`") + +@pytest.fixture(scope="module", autouse=True) +def check_settings(): + assert ENABLE_ARTIFICIAL_PREEMPT is True, ( + "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. 
" + "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest " + "tests/basic_correctness/test_preemption.py`") @pytest.fixture diff --git a/tests/conftest.py b/tests/conftest.py index 620f8b4983517..e4c7b96e82429 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,8 +6,8 @@ import tempfile from collections import UserList from enum import Enum -from typing import (Any, Callable, Dict, List, Optional, Tuple, TypedDict, - TypeVar, Union) +from typing import (Any, Callable, Dict, List, Optional, Tuple, Type, + TypedDict, TypeVar, Union) import numpy as np import pytest @@ -18,6 +18,7 @@ from PIL import Image from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding, BatchFeature) +from transformers.models.auto.auto_factory import _BaseAutoModelClass from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset @@ -260,7 +261,7 @@ def __init__( *, model_kwargs: Optional[Dict[str, Any]] = None, is_embedding_model: bool = False, - auto_cls=AutoModelForCausalLM, + auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM, postprocess_inputs: Callable[[BatchEncoding], BatchEncoding] = identity, ) -> None: @@ -292,20 +293,14 @@ def __init__( trust_remote_code=True, ) - try: - # don't put this import at the top level - # it will call torch.cuda.device_count() - from transformers import AutoProcessor # noqa: F401 - self.processor = AutoProcessor.from_pretrained( - model_name, - torch_dtype=torch_dtype, - trust_remote_code=True, - ) - except Exception as exc: - logger.warning( - "Unable to auto-load HuggingFace processor for model (%s). " - "Using tokenizer instead. Reason: %s", model_name, exc) - self.processor = self.tokenizer + # don't put this import at the top level + # it will call torch.cuda.device_count() + from transformers import AutoProcessor # noqa: F401 + self.processor = AutoProcessor.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=True, + ) self.postprocess_inputs = postprocess_inputs diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py deleted file mode 100644 index e254686f269b1..0000000000000 --- a/tests/distributed/test_basic_distributed_correctness.py +++ /dev/null @@ -1,80 +0,0 @@ -"""Compare the outputs of HF and distributed vLLM when using greedy sampling. 
- -Run: -```sh -cd $VLLM_PATH/tests - -pytest distributed/test_basic_distributed_correctness.py -``` -""" -import os - -import pytest - -from vllm.utils import cuda_device_count_stateless - -from ..models.utils import check_outputs_equal -from ..utils import fork_new_process_for_each_test - -TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4") - - -@pytest.mark.skipif(cuda_device_count_stateless() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize( - "model, distributed_executor_backend, attention_backend, " - "test_suite", [ - ("facebook/opt-125m", "ray", "", "L4"), - ("facebook/opt-125m", "mp", "", "L4"), - ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"), - ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"), - ("facebook/opt-125m", "ray", "", "A100"), - ("facebook/opt-125m", "mp", "", "A100"), - ("facebook/opt-125m", "mp", "FLASHINFER", "A100"), - ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), - ]) -@fork_new_process_for_each_test -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - distributed_executor_backend: str, - attention_backend: str, - test_suite: str, -) -> None: - - if test_suite != TARGET_TEST_SUITE: - pytest.skip(f"Skip test for {test_suite}") - - if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa - # test ray adag - os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" - os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" - - if attention_backend: - os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend - - dtype = "half" - max_tokens = 5 - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=2, - distributed_executor_backend=distributed_executor_backend - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) diff --git a/tests/distributed/test_basic_distributed_correctness_enc_dec.py b/tests/distributed/test_basic_distributed_correctness_enc_dec.py deleted file mode 100644 index f00d5ef584a2a..0000000000000 --- a/tests/distributed/test_basic_distributed_correctness_enc_dec.py +++ /dev/null @@ -1,102 +0,0 @@ -"""For encoder/decoder models only: -Compare the outputs of HF and distributed vLLM when using greedy sampling. 
- -Run: -```sh -cd $VLLM_PATH/tests - -pytest distributed/test_basic_distributed_correctness_enc_dec.py -``` -""" - -import pytest -from transformers import AutoModelForSeq2SeqLM - -from vllm.utils import cuda_device_count_stateless - -from ..conftest import DecoderPromptType -from ..models.utils import check_logprobs_close -from ..utils import fork_new_process_for_each_test - - -@pytest.mark.skipif(cuda_device_count_stateless() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize("model, distributed_executor_backend", [ - ("facebook/bart-large-cnn", "ray"), - ("facebook/bart-large-cnn", "mp"), -]) -@fork_new_process_for_each_test -def test_models( - model: str, - distributed_executor_backend: str, - hf_runner, - vllm_runner, - example_encoder_decoder_prompts, -) -> None: - ''' - Test vLLM BART inference on more than one GPU, comparing - outputs against HF as a baseline. - - Fork a new process for each test, to prevent CUDA from - being re-initialized by successive tests within the same - process. - - Arguments: - - * model: the HF ID of the specific BART variant under test - * distributed_executor_backend - * hf_runner: HuggingFace (HF) test model runner - * vllm_runner: vLLM test model runner - * example_encoder_decoder_prompts: test fixture which provides a - dictionary of dummy prompts - ''' - - dtype = "float" - max_tokens = 64 - num_logprobs = 5 - - # Example inputs with non-trivial (i.e. not None/empty) encoder & - # decoder prompts. - test_prompts = example_encoder_decoder_prompts[DecoderPromptType.CUSTOM] - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - with vllm_runner( - model, - dtype=dtype, - tensor_parallel_size=2, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True, - ) as vllm_model: - vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - test_prompts, max_tokens, num_logprobs) - - # Configuration settings for HF baseline - hf_kwargs = { - "top_k": None, - "num_beams": 1, - "repetition_penalty": 1.0, - "top_p": 1.0, - "length_penalty": 1.0, - "early_stopping": False, - "no_repeat_ngram_size": None, - "min_length": 0 - } - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit( - test_prompts, - max_tokens, - num_logprobs, - **hf_kwargs, - )) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py deleted file mode 100644 index 262845f19822f..0000000000000 --- a/tests/distributed/test_chunked_prefill_distributed.py +++ /dev/null @@ -1,75 +0,0 @@ -"""Compare the outputs of HF and distributed vLLM when using greedy sampling. 
- -Run: -```sh -pytest test_chunked_prefill_distributed.py -``` -""" - -import os - -import pytest - -from vllm.utils import cuda_device_count_stateless - -from ..models.utils import check_outputs_equal -from ..utils import fork_new_process_for_each_test - - -@pytest.mark.skipif(cuda_device_count_stateless() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize("model, distributed_executor_backend", [ - ("facebook/opt-125m", "ray"), - ("meta-llama/Llama-2-7b-hf", "ray"), - ("facebook/opt-125m", "mp"), - ("meta-llama/Llama-2-7b-hf", "mp"), -]) -@fork_new_process_for_each_test -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - distributed_executor_backend: str, -) -> None: - if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray": # noqa - assert distributed_executor_backend == "ray" - # test ray adag - os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" - os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" - - dtype = "half" - max_tokens = 5 - chunked_prefill_token_size = 16 - - # Add a chunked prefill config. - max_num_seqs = min(chunked_prefill_token_size, 256) - assert chunked_prefill_token_size != -1 - enable_chunked_prefill = True - max_num_batched_tokens = chunked_prefill_token_size - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - - with vllm_runner( - model, - dtype=dtype, - tensor_parallel_size=2, - max_num_seqs=max_num_seqs, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) diff --git a/tests/distributed/test_multimodal_broadcast.py b/tests/distributed/test_multimodal_broadcast.py deleted file mode 100644 index 73ef863c2f193..0000000000000 --- a/tests/distributed/test_multimodal_broadcast.py +++ /dev/null @@ -1,58 +0,0 @@ -"""Compare the outputs of HF and distributed vLLM when using greedy sampling. 
- -Run: -```sh -pytest -s -v test_multimodal_broadcast.py -``` -""" - -import pytest - -from vllm.utils import cuda_device_count_stateless - -from ..utils import fork_new_process_for_each_test - - -@pytest.mark.skipif(cuda_device_count_stateless() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize("model, distributed_executor_backend", [ - ("llava-hf/llava-1.5-7b-hf", "ray"), - ("llava-hf/llava-v1.6-mistral-7b-hf", "ray"), - ("facebook/chameleon-7b", "ray"), - ("llava-hf/llava-1.5-7b-hf", "mp"), - ("llava-hf/llava-v1.6-mistral-7b-hf", "mp"), - ("facebook/chameleon-7b", "mp"), -]) -@fork_new_process_for_each_test -def test_models(hf_runner, vllm_runner, image_assets, model: str, - distributed_executor_backend: str) -> None: - - dtype = "half" - max_tokens = 5 - num_logprobs = 5 - tensor_parallel_size = 2 - - if model.startswith("llava-hf/llava-1.5"): - from ..models.test_llava import models, run_test - elif model.startswith("llava-hf/llava-v1.6"): - from ..models.test_llava_next import run_test # type: ignore[no-redef] - from ..models.test_llava_next import models - elif model.startswith("facebook/chameleon"): - from ..models.test_chameleon import run_test # type: ignore[no-redef] - from ..models.test_chameleon import models - else: - raise NotImplementedError(f"Unsupported model: {model}") - - run_test( - hf_runner, - vllm_runner, - image_assets, - model=models[0], - # So that LLaVA-NeXT processor may return nested list - size_factors=[0.25, 0.5, 1.0], - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - ) diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py index 07e84d0ad54cd..defc4e23c8ce2 100644 --- a/tests/distributed/test_same_node.py +++ b/tests/distributed/test_same_node.py @@ -1,13 +1,13 @@ import os -import torch +import torch.distributed as dist from vllm.distributed.parallel_state import in_the_same_node_as -torch.distributed.init_process_group(backend="gloo") -test_result = all( - in_the_same_node_as(torch.distributed.group.WORLD, source_rank=0)) +if __name__ == "__main__": + dist.init_process_group(backend="gloo") + test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0)) -expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1" -assert test_result == expected, f"Expected {expected}, got {test_result}" -print("Same node test passed!") + expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1" + assert test_result == expected, f"Expected {expected}, got {test_result}" + print("Same node test passed!") diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index dbddd69c07dbc..5746932c30a45 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -10,7 +10,6 @@ import torch from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType -from vllm.attention.backends.xformers import XFormersBackend from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL, make_tensor_with_pad) @@ -521,6 +520,9 @@ def make_backend(backend_name: str) -> AttentionBackend: * Backend instance ''' if backend_name == STR_XFORMERS_ATTN_VAL: + # NOTE: xFormers backend cannot be imported for CPU and AMD GPUs. 
+ from vllm.attention.backends.xformers import XFormersBackend + return XFormersBackend() raise AssertionError( f"Unrecognized backend_name {backend_name} for unit test") diff --git a/tests/models/decoder_only/__init__.py b/tests/models/decoder_only/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/decoder_only/audio_language/__init__.py b/tests/models/decoder_only/audio_language/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py similarity index 98% rename from tests/models/test_ultravox.py rename to tests/models/decoder_only/audio_language/test_ultravox.py index e98db9b65f484..bfffd34d1142c 100644 --- a/tests/models/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -7,10 +7,8 @@ from vllm.sequence import SampleLogprobs from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from ..conftest import HfRunner, VllmRunner -from .utils import check_logprobs_close - -pytestmark = pytest.mark.vlm +from ....conftest import HfRunner, VllmRunner +from ...utils import check_logprobs_close MODEL_NAME = "fixie-ai/ultravox-v0_3" diff --git a/tests/models/decoder_only/language/__init__.py b/tests/models/decoder_only/language/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/test_aqlm.py b/tests/models/decoder_only/language/test_aqlm.py similarity index 100% rename from tests/models/test_aqlm.py rename to tests/models/decoder_only/language/test_aqlm.py diff --git a/tests/models/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py similarity index 97% rename from tests/models/test_big_models.py rename to tests/models/decoder_only/language/test_big_models.py index c3e48b56ee58f..c5783fe19dd3f 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/decoder_only/language/test_big_models.py @@ -7,7 +7,7 @@ import pytest import torch -from .utils import check_outputs_equal +from ...utils import check_outputs_equal MODELS = [ "meta-llama/Llama-2-7b-hf", diff --git a/tests/models/test_danube3_4b.py b/tests/models/decoder_only/language/test_danube3_4b.py similarity index 97% rename from tests/models/test_danube3_4b.py rename to tests/models/decoder_only/language/test_danube3_4b.py index bfaa275f73c19..bdd498edc293d 100644 --- a/tests/models/test_danube3_4b.py +++ b/tests/models/decoder_only/language/test_danube3_4b.py @@ -6,7 +6,7 @@ """ import pytest -from .utils import check_outputs_equal +from ...utils import check_outputs_equal MODELS = ["h2oai/h2o-danube3-4b-base"] diff --git a/tests/models/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py similarity index 98% rename from tests/models/test_fp8.py rename to tests/models/decoder_only/language/test_fp8.py index 17acdb52322fd..5a947ce62c785 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/decoder_only/language/test_fp8.py @@ -10,7 +10,7 @@ from tests.kernels.utils import override_backend_env_variable from tests.quantization.utils import is_quant_method_supported -from ..models.utils import check_logprobs_close +from ...utils import check_logprobs_close os.environ["TOKENIZERS_PARALLELISM"] = "true" diff --git a/tests/models/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py similarity index 98% rename from tests/models/test_gguf.py rename to tests/models/decoder_only/language/test_gguf.py index 196cd88e039a1..8fc64a10c84af 100644 --- a/tests/models/test_gguf.py +++ 
b/tests/models/decoder_only/language/test_gguf.py @@ -11,7 +11,7 @@ from tests.quantization.utils import is_quant_method_supported -from .utils import check_logprobs_close +from ...utils import check_logprobs_close os.environ["TOKENIZERS_PARALLELISM"] = "true" diff --git a/tests/models/test_gptq_marlin.py b/tests/models/decoder_only/language/test_gptq_marlin.py similarity index 98% rename from tests/models/test_gptq_marlin.py rename to tests/models/decoder_only/language/test_gptq_marlin.py index 4abbc41c9c287..2155e83dbe915 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/decoder_only/language/test_gptq_marlin.py @@ -15,7 +15,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT -from .utils import check_logprobs_close +from ...utils import check_logprobs_close os.environ["TOKENIZERS_PARALLELISM"] = "true" diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/decoder_only/language/test_gptq_marlin_24.py similarity index 97% rename from tests/models/test_gptq_marlin_24.py rename to tests/models/decoder_only/language/test_gptq_marlin_24.py index 60d9ae2f1c629..d65be05f141b4 100644 --- a/tests/models/test_gptq_marlin_24.py +++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py @@ -10,9 +10,10 @@ import pytest -from tests.models.utils import check_logprobs_close from tests.quantization.utils import is_quant_method_supported +from ...utils import check_logprobs_close + @dataclass class ModelPair: diff --git a/tests/models/test_granite.py b/tests/models/decoder_only/language/test_granite.py similarity index 97% rename from tests/models/test_granite.py rename to tests/models/decoder_only/language/test_granite.py index 2435b5dc3ff88..82c753855e714 100644 --- a/tests/models/test_granite.py +++ b/tests/models/decoder_only/language/test_granite.py @@ -6,7 +6,7 @@ import pytest -from .utils import check_logprobs_close +from ...utils import check_logprobs_close TRANSFORMERS_VERSION = tuple( map(int, diff --git a/tests/models/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py similarity index 99% rename from tests/models/test_jamba.py rename to tests/models/decoder_only/language/test_jamba.py index efb7b1c607721..36fa67a22b0f6 100644 --- a/tests/models/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -1,8 +1,9 @@ import pytest -from tests.models.utils import check_outputs_equal from vllm.worker.model_runner import _get_graph_batch_size +from ...utils import check_outputs_equal + MODELS = ["ai21labs/Jamba-tiny-random"] diff --git a/tests/models/test_marlin.py b/tests/models/decoder_only/language/test_marlin.py similarity index 98% rename from tests/models/test_marlin.py rename to tests/models/decoder_only/language/test_marlin.py index e86f6e29d1567..c802346dee8af 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/decoder_only/language/test_marlin.py @@ -16,7 +16,7 @@ from tests.quantization.utils import is_quant_method_supported -from .utils import check_logprobs_close +from ...utils import check_logprobs_close @dataclass diff --git a/tests/models/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py similarity index 98% rename from tests/models/test_mistral.py rename to tests/models/decoder_only/language/test_mistral.py index 0741174497e32..687ba6a03a691 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -4,7 +4,7 @@ """ import pytest -from .utils import check_logprobs_close 
+from ...utils import check_logprobs_close MODELS = [ "mistralai/Mistral-7B-Instruct-v0.1", diff --git a/tests/models/test_modelopt.py b/tests/models/decoder_only/language/test_modelopt.py similarity index 100% rename from tests/models/test_modelopt.py rename to tests/models/decoder_only/language/test_modelopt.py diff --git a/tests/models/test_models.py b/tests/models/decoder_only/language/test_models.py similarity index 97% rename from tests/models/test_models.py rename to tests/models/decoder_only/language/test_models.py index 4cd2cb665c8f0..68055cbe29095 100644 --- a/tests/models/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -7,7 +7,7 @@ """ import pytest -from .utils import check_outputs_equal +from ...utils import check_outputs_equal MODELS = [ "facebook/opt-125m", diff --git a/tests/models/test_phimoe.py b/tests/models/decoder_only/language/test_phimoe.py similarity index 98% rename from tests/models/test_phimoe.py rename to tests/models/decoder_only/language/test_phimoe.py index 2fb2eecc94672..dbdf5a1b934a6 100644 --- a/tests/models/test_phimoe.py +++ b/tests/models/decoder_only/language/test_phimoe.py @@ -7,7 +7,7 @@ from vllm.utils import is_cpu -from .utils import check_logprobs_close +from ...utils import check_logprobs_close MODELS = [ "microsoft/Phi-3.5-MoE-instruct", diff --git a/tests/models/decoder_only/vision_language/__init__.py b/tests/models/decoder_only/vision_language/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/test_blip2.py b/tests/models/decoder_only/vision_language/test_blip2.py similarity index 95% rename from tests/models/test_blip2.py rename to tests/models/decoder_only/vision_language/test_blip2.py index 5d48bad0d7b35..e1e32b96d89ac 100644 --- a/tests/models/test_blip2.py +++ b/tests/models/decoder_only/vision_language/test_blip2.py @@ -6,10 +6,8 @@ from vllm.multimodal.utils import rescale_image_size from vllm.sequence import SampleLogprobs -from ..conftest import IMAGE_ASSETS -from .utils import check_logprobs_close - -pytestmark = pytest.mark.vlm +from ....conftest import IMAGE_ASSETS +from ...utils import check_logprobs_close HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ "stop_sign": @@ -56,7 +54,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, dtype: str, max_tokens: int, num_logprobs: int) -> None: """Inference result should be the same between hf and vllm. - All the image fixtures for the test is under tests/images. + All the image fixtures for the test are from IMAGE_ASSETS. For huggingface runner, we provide the PIL images as input. For vllm runner, we provide MultiModalData objects and corresponding MultiModalConfig as input. 
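The renames above move the decoder-only language model tests from `tests/models/` into `tests/models/decoder_only/language/`, so the shared helpers in `tests/models/utils.py` are now two package levels up (`from ...utils import ...` instead of `from .utils import ...`). As a rough sketch of the layout a new test in that directory would follow — the file name and model are placeholders, while `hf_runner`, `vllm_runner`, `example_prompts`, and `check_outputs_equal` are the fixtures and helper already used throughout this diff:

```python
"""Hypothetical tests/models/decoder_only/language/test_example.py showing
the import depth after the reorganization."""
import pytest

from ...utils import check_outputs_equal  # resolves to tests/models/utils.py

MODELS = ["facebook/opt-125m"]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
def test_models(hf_runner, vllm_runner, example_prompts, model: str,
                dtype: str, max_tokens: int) -> None:
    # Compare greedy outputs between the HF and vLLM runners, mirroring the
    # neighboring tests in this directory.
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
```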
diff --git a/tests/models/decoder_only/vision_language/test_broadcast.py b/tests/models/decoder_only/vision_language/test_broadcast.py new file mode 100644 index 0000000000000..d01490d74bd4d --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_broadcast.py @@ -0,0 +1,42 @@ +import pytest + +from ....utils import multi_gpu_test + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) +@pytest.mark.parametrize("model", [ + "llava-hf/llava-1.5-7b-hf", + "llava-hf/llava-v1.6-mistral-7b-hf", + "facebook/chameleon-7b", +]) +def test_models(hf_runner, vllm_runner, image_assets, + distributed_executor_backend, model) -> None: + + dtype = "half" + max_tokens = 5 + num_logprobs = 5 + tensor_parallel_size = 2 + + if model.startswith("llava-hf/llava-1.5"): + from .test_llava import models, run_test + elif model.startswith("llava-hf/llava-v1.6"): + from .test_llava_next import models, run_test # type: ignore[no-redef] + elif model.startswith("facebook/chameleon"): + from .test_chameleon import models, run_test # type: ignore[no-redef] + else: + raise NotImplementedError(f"Unsupported model: {model}") + + run_test( + hf_runner, + vllm_runner, + image_assets, + model=models[0], + # So that LLaVA-NeXT processor may return nested list + size_factors=[0.25, 0.5, 1.0], + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + ) diff --git a/tests/models/test_chameleon.py b/tests/models/decoder_only/vision_language/test_chameleon.py similarity index 95% rename from tests/models/test_chameleon.py rename to tests/models/decoder_only/vision_language/test_chameleon.py index e02b4b1ed72bd..8334451970a4f 100644 --- a/tests/models/test_chameleon.py +++ b/tests/models/decoder_only/vision_language/test_chameleon.py @@ -6,10 +6,8 @@ from vllm.multimodal.utils import rescale_image_size from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_outputs_equal - -pytestmark = pytest.mark.vlm +from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from ...utils import check_outputs_equal HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ "stop_sign": @@ -36,7 +34,7 @@ def run_test( ): """Inference result should be the same between hf and vllm. - All the image fixtures for the test is under tests/images. + All the image fixtures for the test are from IMAGE_ASSETS. For huggingface runner, we provide the PIL images as input. For vllm runner, we provide MultiModalDataDict objects and corresponding vision language config as input. 
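The new `test_broadcast.py` above dispatches to whichever sibling module matches the model prefix, so it only needs each of those modules to expose a module-level `models` list and a `run_test` helper accepting the keyword arguments shown in its call. A stub sketch of that implied contract follows; the names and signature shape are illustrative, and the real implementations live in `test_llava.py`, `test_llava_next.py`, and `test_chameleon.py`:

```python
# Stub of the interface test_broadcast.py assumes from its sibling modules;
# the body is intentionally elided.
from typing import List, Optional

models: List[str] = ["llava-hf/llava-1.5-7b-hf"]


def run_test(
    hf_runner,
    vllm_runner,
    image_assets,
    *,
    model: str,
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
) -> None:
    # Real implementations run both runners over the image assets and compare
    # their outputs (e.g. via check_logprobs_close).
    ...
```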
diff --git a/tests/models/test_fuyu.py b/tests/models/decoder_only/vision_language/test_fuyu.py similarity index 95% rename from tests/models/test_fuyu.py rename to tests/models/decoder_only/vision_language/test_fuyu.py index 0d666d8f71a92..94b8431424db5 100644 --- a/tests/models/test_fuyu.py +++ b/tests/models/decoder_only/vision_language/test_fuyu.py @@ -6,10 +6,8 @@ from vllm.sequence import SampleLogprobs from vllm.utils import is_cpu -from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_logprobs_close - -pytestmark = pytest.mark.vlm +from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from ...utils import check_logprobs_close HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ "stop_sign": @@ -46,7 +44,7 @@ def run_test( ): """Inference result should be the same between hf and vllm. - All the image fixtures for the test is under tests/images. + All the image fixtures for the test are from IMAGE_ASSETS. For huggingface runner, we provide the PIL images as input. For vllm runner, we provide MultiModalDataDict objects and corresponding MultiModalConfig as input. diff --git a/tests/models/test_intern_vit.py b/tests/models/decoder_only/vision_language/test_intern_vit.py similarity index 97% rename from tests/models/test_intern_vit.py rename to tests/models/decoder_only/vision_language/test_intern_vit.py index 816f846f69bae..3c3b95b38baac 100644 --- a/tests/models/test_intern_vit.py +++ b/tests/models/decoder_only/vision_language/test_intern_vit.py @@ -6,9 +6,7 @@ from huggingface_hub import snapshot_download from transformers import AutoConfig, AutoModel, CLIPImageProcessor -from ..conftest import _ImageAssets, cleanup - -pytestmark = pytest.mark.vlm +from ....conftest import _ImageAssets, cleanup # we use snapshot_download to prevent conflicts between # dynamic_module and trust_remote_code for hf_runner diff --git a/tests/models/test_internvl.py b/tests/models/decoder_only/vision_language/test_internvl.py similarity index 98% rename from tests/models/test_internvl.py rename to tests/models/decoder_only/vision_language/test_internvl.py index 881068b3afe41..a756f8214edee 100644 --- a/tests/models/test_internvl.py +++ b/tests/models/decoder_only/vision_language/test_internvl.py @@ -9,11 +9,9 @@ from vllm.multimodal.utils import rescale_image_size from vllm.utils import is_cpu -from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _ImageAssets) -from .utils import check_logprobs_close - -pytestmark = pytest.mark.vlm +from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, + _ImageAssets) +from ...utils import check_logprobs_close HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ "stop_sign": @@ -78,7 +76,7 @@ def run_test( ): """Inference result should be the same between hf and vllm. - All the image fixtures for the test is under tests/images. + All the image fixtures for the test are from IMAGE_ASSETS. For huggingface runner, we provide the PIL images as input. For vllm runner, we provide MultiModalDataDict objects and corresponding MultiModalConfig as input. 
diff --git a/tests/models/test_llava.py b/tests/models/decoder_only/vision_language/test_llava.py similarity index 96% rename from tests/models/test_llava.py rename to tests/models/decoder_only/vision_language/test_llava.py index 84ca23f6222a9..fd28a9367b4b2 100644 --- a/tests/models/test_llava.py +++ b/tests/models/decoder_only/vision_language/test_llava.py @@ -8,11 +8,9 @@ from vllm.sequence import SampleLogprobs from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _ImageAssets) -from .utils import check_logprobs_close - -pytestmark = pytest.mark.vlm +from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, + _ImageAssets) +from ...utils import check_logprobs_close _LIMIT_IMAGE_PER_PROMPT = 4 @@ -143,7 +141,7 @@ def _run_test( ): """Inference result should be the same between hf and vllm. - All the image fixtures for the test is under tests/images. + All the image fixtures for the test are from IMAGE_ASSETS. For huggingface runner, we provide the PIL images as input. For vllm runner, we provide MultiModalDataDict objects and corresponding MultiModalConfig as input. @@ -239,7 +237,7 @@ def process(hf_inputs: BatchEncoding): @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: + dtype, max_tokens, num_logprobs) -> None: run_test( hf_runner, vllm_runner, diff --git a/tests/models/test_llava_image_embeds.py b/tests/models/decoder_only/vision_language/test_llava_image_embeds.py similarity index 96% rename from tests/models/test_llava_image_embeds.py rename to tests/models/decoder_only/vision_language/test_llava_image_embeds.py index cc444fe32e79b..66414032509ed 100644 --- a/tests/models/test_llava_image_embeds.py +++ b/tests/models/decoder_only/vision_language/test_llava_image_embeds.py @@ -5,10 +5,8 @@ from vllm.sequence import SampleLogprobs -from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_logprobs_close - -pytestmark = pytest.mark.vlm +from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from ...utils import check_logprobs_close HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ "stop_sign": @@ -62,7 +60,7 @@ def run_test( ): """Inference result should be the same between hf and vllm. - All the image fixtures for the test is under tests/images. + All the image fixtures for the test are from IMAGE_ASSETS. For huggingface runner, we provide the PIL images as input. For vllm runner, we provide MultiModalDataDict objects and corresponding vision language config as input. 
diff --git a/tests/models/test_llava_next.py b/tests/models/decoder_only/vision_language/test_llava_next.py similarity index 97% rename from tests/models/test_llava_next.py rename to tests/models/decoder_only/vision_language/test_llava_next.py index d5fe0cbe32880..f833fe0c8bbb4 100644 --- a/tests/models/test_llava_next.py +++ b/tests/models/decoder_only/vision_language/test_llava_next.py @@ -6,11 +6,9 @@ from vllm.multimodal.utils import rescale_image_size from vllm.sequence import SampleLogprobs -from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _ImageAssets) -from .utils import check_logprobs_close - -pytestmark = pytest.mark.vlm +from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, + _ImageAssets) +from ...utils import check_logprobs_close _LIMIT_IMAGE_PER_PROMPT = 4 @@ -197,7 +195,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, dtype, max_tokens, num_logprobs) -> None: """Inference result should be the same between hf and vllm. - All the image fixtures for the test is under tests/images. + All the image fixtures for the test are from IMAGE_ASSETS. For huggingface runner, we provide the PIL images as input. For vllm runner, we provide MultiModalDataDict objects and corresponding MultiModalConfig as input. diff --git a/tests/models/test_llava_next_video.py b/tests/models/decoder_only/vision_language/test_llava_next_video.py similarity index 98% rename from tests/models/test_llava_next_video.py rename to tests/models/decoder_only/vision_language/test_llava_next_video.py index 6856b15f22ec3..373c8964054cd 100644 --- a/tests/models/test_llava_next_video.py +++ b/tests/models/decoder_only/vision_language/test_llava_next_video.py @@ -8,10 +8,8 @@ sample_frames_from_video) from vllm.sequence import SampleLogprobs -from ..conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets -from .utils import check_logprobs_close - -pytestmark = pytest.mark.vlm +from ....conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets +from ...utils import check_logprobs_close _PREFACE = ( "A chat between a curious human and an artificial intelligence assistant. " diff --git a/tests/models/test_minicpmv.py b/tests/models/decoder_only/vision_language/test_minicpmv.py similarity index 97% rename from tests/models/test_minicpmv.py rename to tests/models/decoder_only/vision_language/test_minicpmv.py index 99e49c14f1f26..7bf5d75f400f9 100644 --- a/tests/models/test_minicpmv.py +++ b/tests/models/decoder_only/vision_language/test_minicpmv.py @@ -9,10 +9,8 @@ from vllm.multimodal.utils import rescale_image_size from vllm.sequence import SampleLogprobs -from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner -from .utils import check_logprobs_close - -pytestmark = pytest.mark.vlm +from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner +from ...utils import check_logprobs_close # The image token is placed before "user" on purpose so that the test can pass HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ @@ -65,7 +63,7 @@ def run_test( ): """Inference result should be the same between hf and vllm. - All the image fixtures for the test is under tests/images. + All the image fixtures for the test are from IMAGE_ASSETS. For huggingface runner, we provide the PIL images as input. For vllm runner, we provide MultiModalDataDict objects and corresponding MultiModalConfig as input. 
diff --git a/tests/models/test_paligemma.py b/tests/models/decoder_only/vision_language/test_paligemma.py similarity index 96% rename from tests/models/test_paligemma.py rename to tests/models/decoder_only/vision_language/test_paligemma.py index beddaaf608a18..d7e29ea76ba4e 100644 --- a/tests/models/test_paligemma.py +++ b/tests/models/decoder_only/vision_language/test_paligemma.py @@ -8,10 +8,8 @@ from vllm.sequence import SampleLogprobs from vllm.utils import is_hip -from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from .utils import check_logprobs_close - -pytestmark = pytest.mark.vlm +from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from ...utils import check_logprobs_close HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ "stop_sign": @@ -69,7 +67,7 @@ def run_test( ): """Inference result should be the same between hf and vllm. - All the image fixtures for the test is under tests/images. + All the image fixtures for the test are from IMAGE_ASSETS. For huggingface runner, we provide the PIL images as input. For vllm runner, we provide MultiModalDataDict objects and corresponding MultiModalConfig as input. diff --git a/tests/models/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py similarity index 97% rename from tests/models/test_phi3v.py rename to tests/models/decoder_only/vision_language/test_phi3v.py index 6ecbf07a08b7c..e248151c40a60 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/test_phi3v.py @@ -9,10 +9,8 @@ from vllm.sequence import SampleLogprobs from vllm.utils import is_cpu, is_hip -from ..conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner -from .utils import check_logprobs_close - -pytestmark = pytest.mark.vlm +from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner +from ...utils import check_logprobs_close HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ "stop_sign": @@ -71,7 +69,7 @@ def run_test( ): """Inference result should be the same between hf and vllm. - All the image fixtures for the test is under tests/images. + All the image fixtures for the test are from IMAGE_ASSETS. For huggingface runner, we provide the PIL images as input. For vllm runner, we provide MultiModalDataDict objects and corresponding MultiModalConfig as input. 
diff --git a/tests/models/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py similarity index 90% rename from tests/models/test_pixtral.py rename to tests/models/decoder_only/vision_language/test_pixtral.py index 1fbfd77218ca7..072bedfc01a1f 100644 --- a/tests/models/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -5,7 +5,7 @@ import json import uuid from dataclasses import asdict -from typing import Any, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple import pytest from mistral_common.protocol.instruct.messages import ImageURLChunk @@ -17,9 +17,11 @@ from vllm.multimodal import MultiModalDataBuiltins from vllm.sequence import Logprob, SampleLogprobs -from .utils import check_logprobs_close +from ....utils import VLLM_PATH +from ...utils import check_logprobs_close -pytestmark = pytest.mark.vlm +if TYPE_CHECKING: + from _typeshed import StrPath MODELS = ["mistralai/Pixtral-12B-2409"] IMG_URLS = [ @@ -83,14 +85,21 @@ def _create_engine_inputs(urls: List[str]) -> TokensPrompt: LIMIT_MM_PER_PROMPT = dict(image=4) MAX_MODEL_LEN = [8192, 65536] -FIXTURE_LOGPROBS_CHAT = "tests/models/fixtures/pixtral_chat.json" -FIXTURE_LOGPROBS_ENGINE = "tests/models/fixtures/pixtral_chat_engine.json" + +FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures" +assert FIXTURES_PATH.exists() + +FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json" +FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json" OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]] # For the test author to store golden output in JSON -def _dump_outputs_w_logprobs(outputs: OutputsLogprobs, filename: str) -> None: +def _dump_outputs_w_logprobs( + outputs: OutputsLogprobs, + filename: "StrPath", +) -> None: json_data = [(tokens, text, [{k: asdict(v) for k, v in token_logprobs.items()} @@ -101,7 +110,7 @@ def _dump_outputs_w_logprobs(outputs: OutputsLogprobs, filename: str) -> None: json.dump(json_data, f) -def load_outputs_w_logprobs(filename: str) -> OutputsLogprobs: +def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs: with open(filename, "rb") as f: json_data = json.load(f) diff --git a/tests/models/test_qwen.py b/tests/models/decoder_only/vision_language/test_qwen.py similarity index 98% rename from tests/models/test_qwen.py rename to tests/models/decoder_only/vision_language/test_qwen.py index 5e7f1de99d6c3..e4f79092b7606 100644 --- a/tests/models/test_qwen.py +++ b/tests/models/decoder_only/vision_language/test_qwen.py @@ -10,11 +10,9 @@ from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size -from ..conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput, - VllmRunner, _ImageAssets) -from .utils import check_logprobs_close - -pytestmark = pytest.mark.vlm +from ....conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput, + VllmRunner, _ImageAssets) +from ...utils import check_logprobs_close text_only_models = [ "Qwen/Qwen-7B-Chat" # Has no visual component diff --git a/tests/models/embedding/__init__.py b/tests/models/embedding/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/embedding/language/__init__.py b/tests/models/embedding/language/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/test_embedding.py b/tests/models/embedding/language/test_embedding.py similarity index 100% rename from 
tests/models/test_embedding.py rename to tests/models/embedding/language/test_embedding.py diff --git a/tests/models/encoder_decoder/__init__.py b/tests/models/encoder_decoder/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/encoder_decoder/language/__init__.py b/tests/models/encoder_decoder/language/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/test_bart.py b/tests/models/encoder_decoder/language/test_bart.py similarity index 69% rename from tests/models/test_bart.py rename to tests/models/encoder_decoder/language/test_bart.py index 660b61d1a7ade..758a9b743b397 100644 --- a/tests/models/test_bart.py +++ b/tests/models/encoder_decoder/language/test_bart.py @@ -1,8 +1,8 @@ """Compare the outputs of HF and vLLM for BART models using greedy sampling. -Run `pytest tests/models/test_bart.py`. +Run `pytest tests/models/encoder_decoder/language/test_bart.py`. """ -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Type from vllm.utils import is_cpu @@ -16,8 +16,10 @@ from vllm.sequence import SampleLogprobs - from ..conftest import DecoderPromptType - from .utils import check_logprobs_close + from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt, + HfRunner, VllmRunner) + from ....utils import multi_gpu_test + from ...utils import check_logprobs_close MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"] @@ -34,20 +36,18 @@ def vllm_to_hf_output( return output_ids, hf_output_str, out_logprobs - @pytest.mark.parametrize("model", MODELS) - @pytest.mark.parametrize("dtype", ["float", "bfloat16"]) - @pytest.mark.parametrize("max_tokens", [64]) - @pytest.mark.parametrize("num_logprobs", [5]) - @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) - def test_models( - hf_runner, - vllm_runner, - example_encoder_decoder_prompts, + def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + decoder_prompt_type: DecoderPromptType, model: str, + *, dtype: str, max_tokens: int, num_logprobs: int, - decoder_prompt_type: DecoderPromptType, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, ) -> None: ''' Test the vLLM BART model for a variety of encoder/decoder input prompts, @@ -116,8 +116,29 @@ def test_models( token during the process of validating the vLLM decoded output. ''' - test_case_prompts = example_encoder_decoder_prompts[ - decoder_prompt_type] + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default). + + # Note: currently encoder/decoder models are only compatible with + # enforce_eager=True. Normally this is not a problem because + # for encoder/decoder models vLLM will + # default to enforce_eager=True if enforce_eager + # is left unspecified. However, the + # VllmRunner test fixture (which wraps around the LLM class) defaults to + # enforce_eager=False (a behavior which a number of already-exisitng + # decoder-only unit tests expect), so when testing an encoder/decoder + # model we must explicitly specify enforce_eager=True in the VllmRunner + # constructor. 
+ with vllm_runner( + model, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( + prompts, max_tokens, num_logprobs) # Configuration settings for HF baseline hf_kwargs = { @@ -135,26 +156,12 @@ def test_models( auto_cls=AutoModelForSeq2SeqLM) as hf_model: hf_outputs = ( hf_model.generate_encoder_decoder_greedy_logprobs_limit( - test_case_prompts, + prompts, max_tokens, num_logprobs, **hf_kwargs, )) - # Note: currently encoder/decoder models are only compatible with - # enforce_eager=True. Normally this is not a problem because - # for encoder/decoder models vLLM will - # default to enforce_eager=True if enforce_eager - # is left unspecified. However, the - # VllmRunner test fixture (which wraps around the LLM class) defaults to - # enforce_eager=False (a behavior which a number of already-exisitng - # decoder-only unit tests expect), so when testing an encoder/decoder - # model we must explicitly specify enforce_eager=True in the VllmRunner - # constructor. - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - test_case_prompts, max_tokens, num_logprobs) - hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE else 0) @@ -168,3 +175,49 @@ def test_models( name_1="vllm", num_outputs_0_skip_tokens=hf_skip_tokens, ) + + @pytest.mark.parametrize("model", MODELS) + @pytest.mark.parametrize("dtype", ["float", "bfloat16"]) + @pytest.mark.parametrize("max_tokens", [64]) + @pytest.mark.parametrize("num_logprobs", [5]) + @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) + def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, + model, dtype, max_tokens, num_logprobs, + decoder_prompt_type) -> None: + + run_test( + hf_runner, + vllm_runner, + example_encoder_decoder_prompts[decoder_prompt_type], + decoder_prompt_type, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) + + @multi_gpu_test(num_gpus=2) + @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) + @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) + @pytest.mark.parametrize("dtype", ["float"]) + @pytest.mark.parametrize("max_tokens", [64]) + @pytest.mark.parametrize("num_logprobs", [5]) + @pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM]) + def test_models_distributed(hf_runner, vllm_runner, + example_encoder_decoder_prompts, + distributed_executor_backend, model, dtype, + max_tokens, num_logprobs, + decoder_prompt_type) -> None: + run_test( + hf_runner, + vllm_runner, + example_encoder_decoder_prompts[decoder_prompt_type], + decoder_prompt_type, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=2, + distributed_executor_backend=distributed_executor_backend, + ) diff --git a/tests/utils.py b/tests/utils.py index 3c519fb6e50e0..f6c2be17ebdcf 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -10,6 +10,7 @@ from typing import Any, Callable, Dict, List, Optional import openai +import pytest import requests from openai.types.completion import Completion from transformers import AutoTokenizer @@ -22,7 +23,8 @@ from vllm.entrypoints.openai.cli_args import make_arg_parser from vllm.model_executor.model_loader.loader import get_model_loader from vllm.platforms import 
current_platform -from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip +from vllm.utils import (FlexibleArgumentParser, cuda_device_count_stateless, + get_open_port, is_hip) if current_platform.is_rocm(): from amdsmi import (amdsmi_get_gpu_vram_usage, @@ -452,6 +454,22 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: return wrapper +def multi_gpu_test(*, num_gpus: int): + """ + Decorate a test to be run only when multiple GPUs are available. + """ + test_selector = getattr(pytest.mark, f"distributed_{num_gpus}_gpus") + test_skipif = pytest.mark.skipif( + cuda_device_count_stateless() < num_gpus, + reason=f"Need at least {num_gpus} GPUs to run the test.", + ) + + def wrapper(f: Callable[_P, None]) -> Callable[_P, None]: + return test_selector(test_skipif(fork_new_process_for_each_test(f))) + + return wrapper + + async def completions_with_server_args( prompts: List[str], model_name: str,
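The `multi_gpu_test` helper added above combines three behaviors: it skips the test when fewer than `num_gpus` devices are visible, applies the corresponding `distributed_<N>_gpus` marker registered in `pyproject.toml` (so CI can select these tests with `pytest -m distributed_2_gpus`), and wraps the test with `fork_new_process_for_each_test`. A minimal usage sketch, assuming the existing `vllm_runner` and `example_prompts` conftest fixtures and with an illustrative test name and relative import path:

```python
import pytest

from ..utils import multi_gpu_test  # adjust the relative path to the test's location


@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
def test_opt_distributed(vllm_runner, example_prompts,
                         distributed_executor_backend: str) -> None:
    # Tagged `distributed_2_gpus`, skipped below 2 GPUs, and run in a freshly
    # forked process (via fork_new_process_for_each_test).
    with vllm_runner("facebook/opt-125m",
                     dtype="half",
                     tensor_parallel_size=2,
                     distributed_executor_backend=distributed_executor_backend
                     ) as vllm_model:
        outputs = vllm_model.generate_greedy(example_prompts, 5)

    assert len(outputs) == len(example_prompts)
```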