From ddc56ee758c7167870d872f38730b563f14b9b5a Mon Sep 17 00:00:00 2001
From: Thomas Parnell
Date: Fri, 10 May 2024 15:33:52 +0200
Subject: [PATCH] Set TP argument correctly when instantiating PagedKVCacheManager (#94)

#### Motivation

Users are seeing runtime errors when trying to use TP>1 together with speculative decoding.

#### Modifications

Set the tensor-parallel argument to the engine's world size when instantiating PagedKVCacheManager, rather than hard-coding it to 1.

#### Result

I have verified that this change resolves the reported issue.

#### Related Issues

https://huggingface.co/ibm-fms/llama3-8b-accelerator/discussions/1

Signed-off-by: Thomas Parnell
---
 server/text_generation_server/models/paged_causal_lm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/text_generation_server/models/paged_causal_lm.py b/server/text_generation_server/models/paged_causal_lm.py
index 3524f738..c82fcd54 100644
--- a/server/text_generation_server/models/paged_causal_lm.py
+++ b/server/text_generation_server/models/paged_causal_lm.py
@@ -327,7 +327,7 @@ def __init__(
             model_config.num_attention_heads,
             model_config.hidden_size,
             kv_heads=model_config.num_key_value_heads,
-            tensor_parallel_size=1,
+            tensor_parallel_size=self.engine.world_size,
             dtype=dtype,
             device=self.device,
             total_num_gpu_blocks=total_num_gpu_blocks,
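
For illustration only (not part of the patch): a minimal, self-contained sketch of why hard-coding `tensor_parallel_size=1` presumably breaks TP>1. `FakePagedKVCacheManager`, its `kv_heads_per_rank` property, and the even-sharding rule below are hypothetical stand-ins; the real `PagedKVCacheManager` used by `paged_causal_lm.py` is not reproduced here. The idea is that passing `self.engine.world_size` keeps the cache manager's per-rank KV-head count consistent with the sharded attention weights.

```python
# Illustrative sketch only. It assumes (as is common for paged KV cache
# implementations) that the cache manager shards KV heads across
# tensor-parallel ranks, so tensor_parallel_size=1 on a TP>1 deployment
# sizes the per-rank cache for the wrong number of heads.
from dataclasses import dataclass


@dataclass
class FakePagedKVCacheManager:
    """Hypothetical stand-in for PagedKVCacheManager (not the real class)."""
    num_heads: int
    hidden_size: int
    kv_heads: int
    tensor_parallel_size: int = 1

    @property
    def kv_heads_per_rank(self) -> int:
        # Assumed sharding rule: each TP rank holds an equal slice of KV heads.
        return self.kv_heads // self.tensor_parallel_size


world_size = 2  # stands in for self.engine.world_size on a 2-GPU TP deployment

wrong = FakePagedKVCacheManager(num_heads=32, hidden_size=4096, kv_heads=8,
                                tensor_parallel_size=1)
right = FakePagedKVCacheManager(num_heads=32, hidden_size=4096, kv_heads=8,
                                tensor_parallel_size=world_size)

print(wrong.kv_heads_per_rank)  # 8 -- cache sized for all KV heads on every rank
print(right.kv_heads_per_rank)  # 4 -- matches the sharded model's per-rank KV heads
```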