From ddc56ee758c7167870d872f38730b563f14b9b5a Mon Sep 17 00:00:00 2001
From: Thomas Parnell
Date: Fri, 10 May 2024 15:33:52 +0200
Subject: [PATCH] Set TP argument correctly when instantiating PagedKVCacheManager (#94)

#### Motivation

Users are seeing runtime errors when trying to use TP>1 together with speculative decoding.

#### Modifications

Set the tensor-parallel argument to the engine's world size when instantiating PagedKVCacheManager, rather than hard-coding it to 1.

#### Result

I have verified that this change resolves the reported issue.

#### Related Issues

https://huggingface.co/ibm-fms/llama3-8b-accelerator/discussions/1

Signed-off-by: Thomas Parnell
---
 server/text_generation_server/models/paged_causal_lm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/text_generation_server/models/paged_causal_lm.py b/server/text_generation_server/models/paged_causal_lm.py
index 3524f738..c82fcd54 100644
--- a/server/text_generation_server/models/paged_causal_lm.py
+++ b/server/text_generation_server/models/paged_causal_lm.py
@@ -327,7 +327,7 @@ def __init__(
             model_config.num_attention_heads,
             model_config.hidden_size,
             kv_heads=model_config.num_key_value_heads,
-            tensor_parallel_size=1,
+            tensor_parallel_size=self.engine.world_size,
             dtype=dtype,
             device=self.device,
             total_num_gpu_blocks=total_num_gpu_blocks,
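
For illustration only (not part of the patch): a minimal, self-contained sketch of why hard-coding `tensor_parallel_size=1` presumably breaks TP>1. `FakePagedKVCacheManager`, its `kv_heads_per_rank` property, and the even-sharding rule below are hypothetical stand-ins; the real `PagedKVCacheManager` used by `paged_causal_lm.py` is not reproduced here. The idea is that passing `self.engine.world_size` keeps the cache manager's per-rank KV-head count consistent with the sharded attention weights.

```python
# Illustrative sketch only. It assumes (as is common for paged KV cache
# implementations) that the cache manager shards KV heads across
# tensor-parallel ranks, so tensor_parallel_size=1 on a TP>1 deployment
# sizes the per-rank cache for the wrong number of heads.
from dataclasses import dataclass


@dataclass
class FakePagedKVCacheManager:
    """Hypothetical stand-in for PagedKVCacheManager (not the real class)."""
    num_heads: int
    hidden_size: int
    kv_heads: int
    tensor_parallel_size: int = 1

    @property
    def kv_heads_per_rank(self) -> int:
        # Assumed sharding rule: each TP rank holds an equal slice of KV heads.
        return self.kv_heads // self.tensor_parallel_size


world_size = 2  # stands in for self.engine.world_size on a 2-GPU TP deployment

wrong = FakePagedKVCacheManager(num_heads=32, hidden_size=4096, kv_heads=8,
                                tensor_parallel_size=1)
right = FakePagedKVCacheManager(num_heads=32, hidden_size=4096, kv_heads=8,
                                tensor_parallel_size=world_size)

print(wrong.kv_heads_per_rank)  # 8 -- cache sized for all KV heads on every rank
print(right.kv_heads_per_rank)  # 4 -- matches the sharded model's per-rank KV heads
```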