From d26ee9be17b98c884e495d762c529dd80badfc91 Mon Sep 17 00:00:00 2001
From: Doug Lehr
Date: Wed, 7 Aug 2024 20:39:15 +0000
Subject: [PATCH] Add max-batch-size to benchmark_throughput.py

Currently, --hf-max-batch-size only lets HF runs specify the maximum
batch size. The vLLM engine also accepts a maximum batch size, in the
form of max_num_seqs. Expose this capability to users so the batch size
can be set externally for vLLM runs as well.
---
 benchmarks/benchmark_throughput.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 302746e316514..5d7e0905e77cf 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -79,12 +79,14 @@ def run_vllm(
     enable_prefix_caching: bool,
     enable_chunked_prefill: bool,
     max_num_batched_tokens: int,
+    max_batch_size: Optional[int],
     distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     worker_use_ray: bool = False,
     download_dir: Optional[str] = None,
 ) -> float:
     from vllm import LLM, SamplingParams
+    max_num_seqs = {'max_num_seqs': max_batch_size} if max_batch_size is not None else {}
     llm = LLM(
         model=model,
         tokenizer=tokenizer,
@@ -106,6 +108,7 @@ def run_vllm(
         enable_chunked_prefill=enable_chunked_prefill,
         max_num_batched_tokens=max_num_batched_tokens,
         distributed_executor_backend=distributed_executor_backend,
+        **max_num_seqs,
     )

     # Add the requests to the engine.
@@ -231,13 +234,13 @@ def main(args: argparse.Namespace):
             args.max_model_len, args.enforce_eager, args.kv_cache_dtype,
             args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.distributed_executor_backend,
+            args.max_num_batched_tokens, args.max_batch_size, args.distributed_executor_backend,
             args.gpu_memory_utilization, args.worker_use_ray,
             args.download_dir)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.use_beam_search, args.hf_max_batch_size,
+                              args.use_beam_search, args.max_batch_size,
                               args.trust_remote_code)
     elif args.backend == "mii":
         elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
@@ -298,10 +301,10 @@ def main(args: argparse.Namespace):
                         default=1000,
                         help="Number of prompts to process.")
     parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--hf-max-batch-size",
+    parser.add_argument("--max-batch-size",
                         type=int,
                         default=None,
-                        help="Maximum batch size for HF backend.")
+                        help="Maximum batch size for vLLM or HF.")
     parser.add_argument('--trust-remote-code',
                         action='store_true',
                         help='trust remote code from huggingface')
@@ -404,11 +407,8 @@ def main(args: argparse.Namespace):
     else:
         assert args.input_len is None

-    if args.backend == "vllm":
-        if args.hf_max_batch_size is not None:
-            raise ValueError("HF max batch size is only for HF backend.")
-    elif args.backend == "hf":
-        if args.hf_max_batch_size is None:
+    if args.backend == "hf":
+        if args.max_batch_size is None:
             raise ValueError("HF max batch size is required for HF backend.")
         if args.quantization is not None:
             raise ValueError("Quantization is only for vLLM backend.")
@@ -421,8 +421,8 @@ def main(args: argparse.Namespace):
             raise ValueError("Beam search is not supported for MII backend.")
         if args.quantization is not None:
             raise ValueError("Quantization is only for vLLM backend.")
-        if args.hf_max_batch_size is not None:
-            raise ValueError("HF max batch size is only for HF backend.")
+        if args.max_batch_size is not None:
+            raise ValueError("Max batch size is only for HF or vLLM backends.")
         if args.tokenizer != args.model:
             raise ValueError("Tokenizer must be the same as the model for MII "
                              "backend.")
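
Note: the core of the change is the conditional keyword-argument pattern in
run_vllm: --max-batch-size is forwarded to the engine as max_num_seqs only
when the user supplies it, so the engine's default batch size is preserved
otherwise. Below is a minimal, self-contained sketch of that pattern;
build_llm is a hypothetical helper used for illustration only and is not part
of benchmark_throughput.py.

    from typing import Optional

    from vllm import LLM


    def build_llm(model: str, max_batch_size: Optional[int]) -> LLM:
        """Sketch of how --max-batch-size maps onto the engine's max_num_seqs."""
        # Build the kwarg only when a batch size was given, so the engine's
        # default max_num_seqs applies when the flag is omitted.
        max_num_seqs = ({"max_num_seqs": max_batch_size}
                        if max_batch_size is not None else {})
        return LLM(model=model, **max_num_seqs)

With the patch applied, a vLLM run should be able to cap the batch size
directly, e.g.:

    python benchmarks/benchmark_throughput.py --backend vllm --model <model> \
        --input-len 128 --output-len 128 --max-batch-size 64

which reaches the engine as max_num_seqs=64; omitting --max-batch-size keeps
the engine default.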