From 43317d2ce88e30c5310aa700008c4e7cddbf0d2b Mon Sep 17 00:00:00 2001 From: Spycsh Date: Thu, 10 Oct 2024 21:37:35 -0700 Subject: [PATCH 1/9] draft mega --- ChatQnA/chatqna_no_wrapper_static_batching.py | 241 ++++++++++++++++++ .../compose_no_wrapper_static_batching.yaml | 215 ++++++++++++++++ 2 files changed, 456 insertions(+) create mode 100644 ChatQnA/chatqna_no_wrapper_static_batching.py create mode 100644 ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_static_batching.yaml diff --git a/ChatQnA/chatqna_no_wrapper_static_batching.py b/ChatQnA/chatqna_no_wrapper_static_batching.py new file mode 100644 index 000000000..55fa87f99 --- /dev/null +++ b/ChatQnA/chatqna_no_wrapper_static_batching.py @@ -0,0 +1,241 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +import re + +from comps import ChatQnAGateway, MicroService, ServiceOrchestrator, ServiceType +from langchain_core.prompts import PromptTemplate + + +class ChatTemplate: + @staticmethod + def generate_rag_prompt(question, documents): + context_str = "\n".join(documents) + if context_str and len(re.findall("[\u4E00-\u9FFF]", context_str)) / len(context_str) >= 0.3: + # chinese context + template = """ +### 你将扮演一个乐于助人、尊重他人并诚实的助手,你的目标是帮助用户解答问题。有效地利用来自本地知识库的搜索结果。确保你的回答中只包含相关信息。如果你不确定问题的答案,请避免分享不准确的信息。 +### 搜索结果:{context} +### 问题:{question} +### 回答: +""" + else: + template = """ +### You are a helpful, respectful and honest assistant to help the user with questions. \ +Please refer to the search results obtained from the local knowledge base. \ +But be careful to not incorporate the information that you think is not relevant to the question. \ +If you don't know the answer to a question, please don't share false information. \n +### Search results: {context} \n +### Question: {question} \n +### Answer: +""" + return template.format(context=context_str, question=question) + + +MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "0.0.0.0") +MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 8888)) +# EMBEDDING_SERVICE_HOST_IP = os.getenv("EMBEDDING_SERVICE_HOST_IP", "0.0.0.0") +# EMBEDDING_SERVICE_PORT = int(os.getenv("EMBEDDING_SERVICE_PORT", 6000)) +# RETRIEVER_SERVICE_HOST_IP = os.getenv("RETRIEVER_SERVICE_HOST_IP", "0.0.0.0") +# RETRIEVER_SERVICE_PORT = int(os.getenv("RETRIEVER_SERVICE_PORT", 7000)) +# RERANK_SERVICE_HOST_IP = os.getenv("RERANK_SERVICE_HOST_IP", "0.0.0.0") +# RERANK_SERVICE_PORT = int(os.getenv("RERANK_SERVICE_PORT", 8000)) +# LLM_SERVICE_HOST_IP = os.getenv("LLM_SERVICE_HOST_IP", "0.0.0.0") +# LLM_SERVICE_PORT = int(os.getenv("LLM_SERVICE_PORT", 9000)) +# EMBEDDING_SERVER_HOST_IP = os.getenv("EMBEDDING_SERVER_HOST_IP", "0.0.0.0") +# EMBEDDING_SERVER_PORT = int(os.getenv("EMBEDDING_SERVER_PORT", 6006)) +RETRIEVER_SERVICE_HOST_IP = os.getenv("RETRIEVER_SERVICE_HOST_IP", "0.0.0.0") +RETRIEVER_SERVICE_PORT = int(os.getenv("RETRIEVER_SERVICE_PORT", 7000)) +# RERANK_SERVER_HOST_IP = os.getenv("RERANK_SERVER_HOST_IP", "0.0.0.0") +# RERANK_SERVER_PORT = int(os.getenv("RERANK_SERVER_PORT", 8808)) +# Embed/Rerank use the same host:ip, diff routers +EMBEDDING_RERANK_SERVICE_HOST_IP = os.getenv("EMBEDDING_RERANK_SERVICE_HOST_IP", "0.0.0.0") +EMBEDDING_RERANK_SERVICE_PORT = os.getenv("EMBEDDING_RERANK_SERVICE_PORT", 6001) +LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0") +LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 9009)) + + +def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs): + # if self.services[cur_node].service_type == ServiceType.EMBEDDING: + # inputs["inputs"] = inputs["text"] + # del inputs["text"] + if self.services[cur_node].service_type == ServiceType.RETRIEVER: + # prepare the retriever params + retriever_parameters = kwargs.get("retriever_parameters", None) + if retriever_parameters: + inputs.update(retriever_parameters.dict()) + elif self.services[cur_node].service_type == ServiceType.LLM: + # convert TGI/vLLM to unified OpenAI /v1/chat/completions format + next_inputs = {} + next_inputs["model"] = "tgi" # specifically clarify the fake model to make the format unified + next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}] + next_inputs["max_tokens"] = llm_parameters_dict["max_new_tokens"] + next_inputs["top_p"] = llm_parameters_dict["top_p"] + next_inputs["stream"] = inputs["streaming"] + next_inputs["frequency_penalty"] = inputs["repetition_penalty"] + next_inputs["temperature"] = inputs["temperature"] + inputs = next_inputs + + return inputs + + +def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_dict, **kwargs): + next_data = {} + # if self.services[cur_node].service_type == ServiceType.EMBEDDING: + # assert isinstance(data, list) + # next_data = {"text": inputs["inputs"], "embedding": data[0]} + if self.services[cur_node].service_type == ServiceType.RETRIEVER: + # TODO align outputs!! + next_data = data + # docs = [doc["text"] for doc in data["retrieved_docs"]] + + # with_rerank = runtime_graph.downstream(cur_node)[0].startswith("rerank") + # if with_rerank and docs: + # # forward to rerank + # # prepare inputs for rerank + # next_data["query"] = data["initial_query"] + # next_data["texts"] = [doc["text"] for doc in data["retrieved_docs"]] + # else: + # # forward to llm + # if not docs: + # # delete the rerank from retriever -> rerank -> llm + # for ds in reversed(runtime_graph.downstream(cur_node)): + # for nds in runtime_graph.downstream(ds): + # runtime_graph.add_edge(cur_node, nds) + # runtime_graph.delete_node_if_exists(ds) + + # # handle template + # # if user provides template, then format the prompt with it + # # otherwise, use the default template + # prompt = data["initial_query"] + # chat_template = llm_parameters_dict["chat_template"] + # if chat_template: + # prompt_template = PromptTemplate.from_template(chat_template) + # input_variables = prompt_template.input_variables + # if sorted(input_variables) == ["context", "question"]: + # prompt = prompt_template.format(question=data["initial_query"], context="\n".join(docs)) + # elif input_variables == ["question"]: + # prompt = prompt_template.format(question=data["initial_query"]) + # else: + # print(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']") + # prompt = ChatTemplate.generate_rag_prompt(data["initial_query"], docs) + # else: + # prompt = ChatTemplate.generate_rag_prompt(data["initial_query"], docs) + + # next_data["inputs"] = prompt + + elif self.services[cur_node].service_type == ServiceType.RERANK: + # TODO align outputs!! + next_data = data + # # rerank the inputs with the scores + # reranker_parameters = kwargs.get("reranker_parameters", None) + # top_n = reranker_parameters.top_n if reranker_parameters else 1 + # docs = inputs["texts"] + # reranked_docs = [] + # for best_response in data[:top_n]: + # reranked_docs.append(docs[best_response["index"]]) + + # # handle template + # # if user provides template, then format the prompt with it + # # otherwise, use the default template + # prompt = inputs["query"] + # chat_template = llm_parameters_dict["chat_template"] + # if chat_template: + # prompt_template = PromptTemplate.from_template(chat_template) + # input_variables = prompt_template.input_variables + # if sorted(input_variables) == ["context", "question"]: + # prompt = prompt_template.format(question=prompt, context="\n".join(docs)) + # elif input_variables == ["question"]: + # prompt = prompt_template.format(question=prompt) + # else: + # print(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']") + # prompt = ChatTemplate.generate_rag_prompt(prompt, docs) + # else: + # prompt = ChatTemplate.generate_rag_prompt(prompt, docs) + + # next_data["inputs"] = prompt + else: + next_data = data + + return next_data + + +def align_generator(self, gen, **kwargs): + # openai reaponse format + # b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},"logprobs":null,"finish_reason":null}]}\n\n' + for line in gen: + line = line.decode("utf-8") + start = line.find("{") + end = line.rfind("}") + 1 + + json_str = line[start:end] + try: + # sometimes yield empty chunk, do a fallback here + json_data = json.loads(json_str) + if json_data["choices"][0]["finish_reason"] != "eos_token": + yield f"data: {repr(json_data['choices'][0]['delta']['content'].encode('utf-8'))}\n\n" + except Exception as e: + yield f"data: {repr(json_str.encode('utf-8'))}\n\n" + yield "data: [DONE]\n\n" + + +class ChatQnAService: + def __init__(self, host="0.0.0.0", port=8000): + self.host = host + self.port = port + ServiceOrchestrator.align_inputs = align_inputs + ServiceOrchestrator.align_outputs = align_outputs + ServiceOrchestrator.align_generator = align_generator + self.megaservice = ServiceOrchestrator() + + def add_remote_service(self): + + embedding = MicroService( + name="embedding", + host=EMBEDDING_RERANK_SERVICE_HOST_IP, + port=EMBEDDING_RERANK_SERVICE_PORT, + # endpoint="/embed", + endpoint="/v1/embeddings", + use_remote_service=True, + service_type=ServiceType.EMBEDDING, + ) + + retriever = MicroService( + name="retriever", + host=RETRIEVER_SERVICE_HOST_IP, + port=RETRIEVER_SERVICE_PORT, + endpoint="/v1/retrieval", + use_remote_service=True, + service_type=ServiceType.RETRIEVER, + ) + + rerank = MicroService( + name="rerank", + host=EMBEDDING_RERANK_SERVICE_HOST_IP, + port=EMBEDDING_RERANK_SERVICE_PORT, + # endpoint="/rerank", + endpoint="/v1/reranking", + use_remote_service=True, + service_type=ServiceType.RERANK, + ) + + llm = MicroService( + name="llm", + host=LLM_SERVER_HOST_IP, + port=LLM_SERVER_PORT, + endpoint="/v1/chat/completions", + use_remote_service=True, + service_type=ServiceType.LLM, + ) + self.megaservice.add(embedding).add(retriever).add(rerank).add(llm) + self.megaservice.flow_to(embedding, retriever) + self.megaservice.flow_to(retriever, rerank) + self.megaservice.flow_to(rerank, llm) + self.gateway = ChatQnAGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port) + + +if __name__ == "__main__": + chatqna = ChatQnAService(host=MEGA_SERVICE_HOST_IP, port=MEGA_SERVICE_PORT) + chatqna.add_remote_service() diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_static_batching.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_static_batching.yaml new file mode 100644 index 000000000..bc01f15ed --- /dev/null +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_static_batching.yaml @@ -0,0 +1,215 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + redis-vector-db: + image: redis/redis-stack:7.2.0-v9 + container_name: redis-vector-db + ports: + - "6379:6379" + - "8001:8001" + dataprep-redis-service: + image: ${REGISTRY:-opea}/dataprep-redis:${TAG:-latest} + container_name: dataprep-redis-server + depends_on: + - redis-vector-db + - tei-embedding-service + ports: + - "6007:6007" + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + REDIS_URL: ${REDIS_URL} + INDEX_NAME: ${INDEX_NAME} + TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + embedding-reranking-service: + image: ${REGISTRY:-opea}/embedding-reranking-local:${TAG:-latest} + container_name: embedding-reranking-server + ports: + - "6001:6001" + runtime: habana + cap_add: + - SYS_NICE + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HABANA_VISIBLE_DEVICES: all + OMPI_MCA_btl_vader_single_copy_mechanism: none + LOGFLAG=${LOGFLAG} + # tei-embedding-service: + # image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest} + # container_name: tei-embedding-gaudi-server + # ports: + # - "8090:80" + # volumes: + # - "./data:/data" + # runtime: habana + # cap_add: + # - SYS_NICE + # ipc: host + # environment: + # no_proxy: ${no_proxy} + # http_proxy: ${http_proxy} + # https_proxy: ${https_proxy} + # HABANA_VISIBLE_DEVICES: all + # OMPI_MCA_btl_vader_single_copy_mechanism: none + # MAX_WARMUP_SEQUENCE_LENGTH: 512 + # INIT_HCCL_ON_ACQUIRE: 0 + # ENABLE_EXPERIMENTAL_FLAGS: true + # command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate + # embedding: + # image: ${REGISTRY:-opea}/embedding-tei:${TAG:-latest} + # container_name: embedding-tei-server + # depends_on: + # - tei-embedding-service + # ports: + # - "6000:6000" + # ipc: host + # environment: + # no_proxy: ${no_proxy} + # http_proxy: ${http_proxy} + # https_proxy: ${https_proxy} + # TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} + # restart: unless-stopped + retriever: + image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} + container_name: retriever-redis-server + depends_on: + - redis-vector-db + ports: + - "7000:7000" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + REDIS_URL: ${REDIS_URL} + INDEX_NAME: ${INDEX_NAME} + restart: unless-stopped + # tei-reranking-service: + # image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + # container_name: tei-reranking-gaudi-server + # ports: + # - "8808:80" + # volumes: + # - "./data:/data" + # shm_size: 1g + # environment: + # no_proxy: ${no_proxy} + # http_proxy: ${http_proxy} + # https_proxy: ${https_proxy} + # HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + # HF_HUB_DISABLE_PROGRESS_BARS: 1 + # HF_HUB_ENABLE_HF_TRANSFER: 0 + # command: --model-id ${RERANK_MODEL_ID} --auto-truncate + # reranking: + # image: ${REGISTRY:-opea}/reranking-tei:${TAG:-latest} + # container_name: reranking-tei-gaudi-server + # depends_on: + # - tei-reranking-service + # ports: + # - "8000:8000" + # ipc: host + # environment: + # no_proxy: ${no_proxy} + # http_proxy: ${http_proxy} + # https_proxy: ${https_proxy} + # TEI_RERANKING_ENDPOINT: ${TEI_RERANKING_ENDPOINT} + # HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + # HF_HUB_DISABLE_PROGRESS_BARS: 1 + # HF_HUB_ENABLE_HF_TRANSFER: 0 + # restart: unless-stopped + tgi-service: + image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + container_name: tgi-gaudi-server + ports: + - "8005:80" + volumes: + - "./data:/data" + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + HABANA_VISIBLE_DEVICES: all + OMPI_MCA_btl_vader_single_copy_mechanism: none + runtime: habana + cap_add: + - SYS_NICE + ipc: host + command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192 + # llm: + # image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest} + # container_name: llm-tgi-gaudi-server + # depends_on: + # - tgi-service + # ports: + # - "9000:9000" + # ipc: host + # environment: + # no_proxy: ${no_proxy} + # http_proxy: ${http_proxy} + # https_proxy: ${https_proxy} + # TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} + # HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + # HF_HUB_DISABLE_PROGRESS_BARS: 1 + # HF_HUB_ENABLE_HF_TRANSFER: 0 + # restart: unless-stopped + chaqna-gaudi-backend-server: + image: ${REGISTRY:-opea}/chatqna-no-wrapper:${TAG:-latest} + container_name: chatqna-gaudi-backend-server + depends_on: + - redis-vector-db + # - tei-embedding-service + # - embedding + - retriever + # - tei-reranking-service + # - reranking + - tgi-service + # - llm + ports: + - "8888:8888" + environment: + - no_proxy=${no_proxy} + - https_proxy=${https_proxy} + - http_proxy=${http_proxy} + - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} + # - EMBEDDING_SERVER_HOST_IP=${EMBEDDING_SERVER_HOST_IP} + # - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-8090} + - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} + - EMBEDDING_RERANK_SERVICE_HOST_IP=${EMBEDDING_RERANK_SERVICE_HOST_IP} + - EMBEDDING_RERANK_SERVICE_PORT=${EMBEDDING_RERANK_SERVICE_HOST_IP:-6001} + # - RERANK_SERVER_HOST_IP=${RERANK_SERVER_HOST_IP} + # - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-8808} + - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP} + - LLM_SERVER_PORT=${LLM_SERVER_PORT:-8005} + - LOGFLAG=${LOGFLAG} + ipc: host + restart: always + chaqna-gaudi-ui-server: + image: ${REGISTRY:-opea}/chatqna-ui:${TAG:-latest} + container_name: chatqna-gaudi-ui-server + depends_on: + - chaqna-gaudi-backend-server + ports: + - "5173:5173" + environment: + - no_proxy=${no_proxy} + - https_proxy=${https_proxy} + - http_proxy=${http_proxy} + - CHAT_BASE_URL=${BACKEND_SERVICE_ENDPOINT} + - UPLOAD_FILE_BASE_URL=${DATAPREP_SERVICE_ENDPOINT} + - GET_FILE=${DATAPREP_GET_FILE_ENDPOINT} + - DELETE_FILE=${DATAPREP_DELETE_FILE_ENDPOINT} + ipc: host + restart: always + +networks: + default: + driver: bridge From f2e9866b9c942f20ea487f6ce0d791a5fbd9002a Mon Sep 17 00:00:00 2001 From: Spycsh Date: Tue, 15 Oct 2024 00:04:26 -0700 Subject: [PATCH 2/9] static -> dynamic --- ...=> chatqna_no_wrapper_dynamic_batching.py} | 0 ... compose_no_wrapper_dynamic_batching.yaml} | 107 ++---------------- 2 files changed, 9 insertions(+), 98 deletions(-) rename ChatQnA/{chatqna_no_wrapper_static_batching.py => chatqna_no_wrapper_dynamic_batching.py} (100%) rename ChatQnA/docker_compose/intel/hpu/gaudi/{compose_no_wrapper_static_batching.yaml => compose_no_wrapper_dynamic_batching.yaml} (51%) diff --git a/ChatQnA/chatqna_no_wrapper_static_batching.py b/ChatQnA/chatqna_no_wrapper_dynamic_batching.py similarity index 100% rename from ChatQnA/chatqna_no_wrapper_static_batching.py rename to ChatQnA/chatqna_no_wrapper_dynamic_batching.py diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_static_batching.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml similarity index 51% rename from ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_static_batching.yaml rename to ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml index bc01f15ed..376a1e82a 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_static_batching.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml @@ -13,7 +13,6 @@ services: container_name: dataprep-redis-server depends_on: - redis-vector-db - - tei-embedding-service ports: - "6007:6007" environment: @@ -37,44 +36,14 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HABANA_VISIBLE_DEVICES: all + HABANA_VISIBLE_DEVICES: all # only use 1 gaudi card OMPI_MCA_btl_vader_single_copy_mechanism: none - LOGFLAG=${LOGFLAG} - # tei-embedding-service: - # image: ${REGISTRY:-opea}/tei-gaudi:${TAG:-latest} - # container_name: tei-embedding-gaudi-server - # ports: - # - "8090:80" - # volumes: - # - "./data:/data" - # runtime: habana - # cap_add: - # - SYS_NICE - # ipc: host - # environment: - # no_proxy: ${no_proxy} - # http_proxy: ${http_proxy} - # https_proxy: ${https_proxy} - # HABANA_VISIBLE_DEVICES: all - # OMPI_MCA_btl_vader_single_copy_mechanism: none - # MAX_WARMUP_SEQUENCE_LENGTH: 512 - # INIT_HCCL_ON_ACQUIRE: 0 - # ENABLE_EXPERIMENTAL_FLAGS: true - # command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate - # embedding: - # image: ${REGISTRY:-opea}/embedding-tei:${TAG:-latest} - # container_name: embedding-tei-server - # depends_on: - # - tei-embedding-service - # ports: - # - "6000:6000" - # ipc: host - # environment: - # no_proxy: ${no_proxy} - # http_proxy: ${http_proxy} - # https_proxy: ${https_proxy} - # TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - # restart: unless-stopped + LOGFLAG: ${LOGFLAG} + DYNAMIC_BATCHING_TIMEOUT: 0.01 + DYNAMIC_BATCHING_MAX_BATCH_SIZE: 32 + PAD_SEQUENCE_TO_MULTIPLE_OF: 128 + EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5" + RERANK_MODEL_ID: "BAAI/bge-reranker-base" retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} container_name: retriever-redis-server @@ -90,39 +59,6 @@ services: REDIS_URL: ${REDIS_URL} INDEX_NAME: ${INDEX_NAME} restart: unless-stopped - # tei-reranking-service: - # image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - # container_name: tei-reranking-gaudi-server - # ports: - # - "8808:80" - # volumes: - # - "./data:/data" - # shm_size: 1g - # environment: - # no_proxy: ${no_proxy} - # http_proxy: ${http_proxy} - # https_proxy: ${https_proxy} - # HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - # HF_HUB_DISABLE_PROGRESS_BARS: 1 - # HF_HUB_ENABLE_HF_TRANSFER: 0 - # command: --model-id ${RERANK_MODEL_ID} --auto-truncate - # reranking: - # image: ${REGISTRY:-opea}/reranking-tei:${TAG:-latest} - # container_name: reranking-tei-gaudi-server - # depends_on: - # - tei-reranking-service - # ports: - # - "8000:8000" - # ipc: host - # environment: - # no_proxy: ${no_proxy} - # http_proxy: ${http_proxy} - # https_proxy: ${https_proxy} - # TEI_RERANKING_ENDPOINT: ${TEI_RERANKING_ENDPOINT} - # HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - # HF_HUB_DISABLE_PROGRESS_BARS: 1 - # HF_HUB_ENABLE_HF_TRANSFER: 0 - # restart: unless-stopped tgi-service: image: ghcr.io/huggingface/tgi-gaudi:2.0.1 container_name: tgi-gaudi-server @@ -144,35 +80,14 @@ services: - SYS_NICE ipc: host command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192 - # llm: - # image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest} - # container_name: llm-tgi-gaudi-server - # depends_on: - # - tgi-service - # ports: - # - "9000:9000" - # ipc: host - # environment: - # no_proxy: ${no_proxy} - # http_proxy: ${http_proxy} - # https_proxy: ${https_proxy} - # TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - # HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - # HF_HUB_DISABLE_PROGRESS_BARS: 1 - # HF_HUB_ENABLE_HF_TRANSFER: 0 - # restart: unless-stopped chaqna-gaudi-backend-server: - image: ${REGISTRY:-opea}/chatqna-no-wrapper:${TAG:-latest} + image: ${REGISTRY:-opea}/chatqna-no-wrapper-dynamic-batching:${TAG:-latest} container_name: chatqna-gaudi-backend-server depends_on: - redis-vector-db - # - tei-embedding-service - # - embedding - retriever - # - tei-reranking-service - # - reranking - tgi-service - # - llm + - embedding-reranking-server ports: - "8888:8888" environment: @@ -180,13 +95,9 @@ services: - https_proxy=${https_proxy} - http_proxy=${http_proxy} - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - # - EMBEDDING_SERVER_HOST_IP=${EMBEDDING_SERVER_HOST_IP} - # - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-8090} - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} - EMBEDDING_RERANK_SERVICE_HOST_IP=${EMBEDDING_RERANK_SERVICE_HOST_IP} - EMBEDDING_RERANK_SERVICE_PORT=${EMBEDDING_RERANK_SERVICE_HOST_IP:-6001} - # - RERANK_SERVER_HOST_IP=${RERANK_SERVER_HOST_IP} - # - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-8808} - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP} - LLM_SERVER_PORT=${LLM_SERVER_PORT:-8005} - LOGFLAG=${LOGFLAG} From 6d4f175b3a104920b185b50bc4d1e060f48f0327 Mon Sep 17 00:00:00 2001 From: Spycsh Date: Tue, 15 Oct 2024 00:07:04 -0700 Subject: [PATCH 3/9] fix --- .../intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml index 376a1e82a..3204d6e5b 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml @@ -87,7 +87,7 @@ services: - redis-vector-db - retriever - tgi-service - - embedding-reranking-server + - embedding-reranking-service ports: - "8888:8888" environment: From 02398611dc0e532db2cb683b9e289e50a0553daf Mon Sep 17 00:00:00 2001 From: Spycsh Date: Tue, 15 Oct 2024 19:59:59 -0700 Subject: [PATCH 4/9] mega --- .../Dockerfile.no_wrapper_dynamic_batching | 31 +++++ .../chatqna_no_wrapper_dynamic_batching.py | 120 ++++-------------- .../compose_no_wrapper_dynamic_batching.yaml | 3 +- 3 files changed, 58 insertions(+), 96 deletions(-) create mode 100644 ChatQnA/Dockerfile.no_wrapper_dynamic_batching diff --git a/ChatQnA/Dockerfile.no_wrapper_dynamic_batching b/ChatQnA/Dockerfile.no_wrapper_dynamic_batching new file mode 100644 index 000000000..fb1174acb --- /dev/null +++ b/ChatQnA/Dockerfile.no_wrapper_dynamic_batching @@ -0,0 +1,31 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + git \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +WORKDIR /home/user/ +RUN git clone https://github.com/opea-project/GenAIComps.git + +WORKDIR /home/user/GenAIComps +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt && \ + pip install --no-cache-dir langchain_core + +COPY ./chatqna_no_wrapper_dynamic_batching.py /home/user/chatqna_no_wrapper_dynamic_batching.py + +ENV PYTHONPATH=$PYTHONPATH:/home/user/GenAIComps + +USER user + +WORKDIR /home/user + +ENTRYPOINT ["python", "chatqna_no_wrapper_dynamic_batching.py"] diff --git a/ChatQnA/chatqna_no_wrapper_dynamic_batching.py b/ChatQnA/chatqna_no_wrapper_dynamic_batching.py index 55fa87f99..d3c6b52ea 100644 --- a/ChatQnA/chatqna_no_wrapper_dynamic_batching.py +++ b/ChatQnA/chatqna_no_wrapper_dynamic_batching.py @@ -36,20 +36,8 @@ def generate_rag_prompt(question, documents): MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "0.0.0.0") MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 8888)) -# EMBEDDING_SERVICE_HOST_IP = os.getenv("EMBEDDING_SERVICE_HOST_IP", "0.0.0.0") -# EMBEDDING_SERVICE_PORT = int(os.getenv("EMBEDDING_SERVICE_PORT", 6000)) -# RETRIEVER_SERVICE_HOST_IP = os.getenv("RETRIEVER_SERVICE_HOST_IP", "0.0.0.0") -# RETRIEVER_SERVICE_PORT = int(os.getenv("RETRIEVER_SERVICE_PORT", 7000)) -# RERANK_SERVICE_HOST_IP = os.getenv("RERANK_SERVICE_HOST_IP", "0.0.0.0") -# RERANK_SERVICE_PORT = int(os.getenv("RERANK_SERVICE_PORT", 8000)) -# LLM_SERVICE_HOST_IP = os.getenv("LLM_SERVICE_HOST_IP", "0.0.0.0") -# LLM_SERVICE_PORT = int(os.getenv("LLM_SERVICE_PORT", 9000)) -# EMBEDDING_SERVER_HOST_IP = os.getenv("EMBEDDING_SERVER_HOST_IP", "0.0.0.0") -# EMBEDDING_SERVER_PORT = int(os.getenv("EMBEDDING_SERVER_PORT", 6006)) RETRIEVER_SERVICE_HOST_IP = os.getenv("RETRIEVER_SERVICE_HOST_IP", "0.0.0.0") RETRIEVER_SERVICE_PORT = int(os.getenv("RETRIEVER_SERVICE_PORT", 7000)) -# RERANK_SERVER_HOST_IP = os.getenv("RERANK_SERVER_HOST_IP", "0.0.0.0") -# RERANK_SERVER_PORT = int(os.getenv("RERANK_SERVER_PORT", 8808)) # Embed/Rerank use the same host:ip, diff routers EMBEDDING_RERANK_SERVICE_HOST_IP = os.getenv("EMBEDDING_RERANK_SERVICE_HOST_IP", "0.0.0.0") EMBEDDING_RERANK_SERVICE_PORT = os.getenv("EMBEDDING_RERANK_SERVICE_PORT", 6001) @@ -66,11 +54,33 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k retriever_parameters = kwargs.get("retriever_parameters", None) if retriever_parameters: inputs.update(retriever_parameters.dict()) + elif self.services[cur_node].service_type == ServiceType.RERANK: + reranker_parameters = kwargs.get("reranker_parameters", None) + top_n = reranker_parameters.top_n if reranker_parameters else 1 + inputs["top_n"] = top_n elif self.services[cur_node].service_type == ServiceType.LLM: + prompt = inputs["query"] + docs: list[str] = inputs["documents"] + chat_template = llm_parameters_dict["chat_template"] + if chat_template: + prompt_template = PromptTemplate.from_template(chat_template) + input_variables = prompt_template.input_variables + if sorted(input_variables) == ["context", "question"]: + prompt = prompt_template.format(question=prompt, context="\n".join(docs)) + elif input_variables == ["question"]: + prompt = prompt_template.format(question=prompt) + else: + print(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']") + prompt = ChatTemplate.generate_rag_prompt(prompt, docs) + else: + prompt = ChatTemplate.generate_rag_prompt(prompt, docs) + + # inputs: LLMParamsDoc + # {'id': 'd52f75f7bd602526073d933dab541a8c', 'model': None, 'query': 'What is the revenue of Nike in 2023?', 'max_tokens': 1024, 'max_new_tokens': 1024, 'top_k': 10, 'top_p': 0.95, 'typical_p': 0.95, 'temperature': 0.01, 'frequency_penalty': 0.0, 'presence_penalty': 0.0, 'repetition_penalty': 1.0, 'streaming': True, 'chat_template': None, 'documents': []} # convert TGI/vLLM to unified OpenAI /v1/chat/completions format next_inputs = {} next_inputs["model"] = "tgi" # specifically clarify the fake model to make the format unified - next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}] + next_inputs["messages"] = [{"role": "user", "content": prompt}] next_inputs["max_tokens"] = llm_parameters_dict["max_new_tokens"] next_inputs["top_p"] = llm_parameters_dict["top_p"] next_inputs["stream"] = inputs["streaming"] @@ -78,88 +88,8 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k next_inputs["temperature"] = inputs["temperature"] inputs = next_inputs - return inputs - -def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_dict, **kwargs): - next_data = {} - # if self.services[cur_node].service_type == ServiceType.EMBEDDING: - # assert isinstance(data, list) - # next_data = {"text": inputs["inputs"], "embedding": data[0]} - if self.services[cur_node].service_type == ServiceType.RETRIEVER: - # TODO align outputs!! - next_data = data - # docs = [doc["text"] for doc in data["retrieved_docs"]] - - # with_rerank = runtime_graph.downstream(cur_node)[0].startswith("rerank") - # if with_rerank and docs: - # # forward to rerank - # # prepare inputs for rerank - # next_data["query"] = data["initial_query"] - # next_data["texts"] = [doc["text"] for doc in data["retrieved_docs"]] - # else: - # # forward to llm - # if not docs: - # # delete the rerank from retriever -> rerank -> llm - # for ds in reversed(runtime_graph.downstream(cur_node)): - # for nds in runtime_graph.downstream(ds): - # runtime_graph.add_edge(cur_node, nds) - # runtime_graph.delete_node_if_exists(ds) - - # # handle template - # # if user provides template, then format the prompt with it - # # otherwise, use the default template - # prompt = data["initial_query"] - # chat_template = llm_parameters_dict["chat_template"] - # if chat_template: - # prompt_template = PromptTemplate.from_template(chat_template) - # input_variables = prompt_template.input_variables - # if sorted(input_variables) == ["context", "question"]: - # prompt = prompt_template.format(question=data["initial_query"], context="\n".join(docs)) - # elif input_variables == ["question"]: - # prompt = prompt_template.format(question=data["initial_query"]) - # else: - # print(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']") - # prompt = ChatTemplate.generate_rag_prompt(data["initial_query"], docs) - # else: - # prompt = ChatTemplate.generate_rag_prompt(data["initial_query"], docs) - - # next_data["inputs"] = prompt - - elif self.services[cur_node].service_type == ServiceType.RERANK: - # TODO align outputs!! - next_data = data - # # rerank the inputs with the scores - # reranker_parameters = kwargs.get("reranker_parameters", None) - # top_n = reranker_parameters.top_n if reranker_parameters else 1 - # docs = inputs["texts"] - # reranked_docs = [] - # for best_response in data[:top_n]: - # reranked_docs.append(docs[best_response["index"]]) - - # # handle template - # # if user provides template, then format the prompt with it - # # otherwise, use the default template - # prompt = inputs["query"] - # chat_template = llm_parameters_dict["chat_template"] - # if chat_template: - # prompt_template = PromptTemplate.from_template(chat_template) - # input_variables = prompt_template.input_variables - # if sorted(input_variables) == ["context", "question"]: - # prompt = prompt_template.format(question=prompt, context="\n".join(docs)) - # elif input_variables == ["question"]: - # prompt = prompt_template.format(question=prompt) - # else: - # print(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']") - # prompt = ChatTemplate.generate_rag_prompt(prompt, docs) - # else: - # prompt = ChatTemplate.generate_rag_prompt(prompt, docs) - - # next_data["inputs"] = prompt - else: - next_data = data - - return next_data + return inputs def align_generator(self, gen, **kwargs): @@ -186,7 +116,7 @@ def __init__(self, host="0.0.0.0", port=8000): self.host = host self.port = port ServiceOrchestrator.align_inputs = align_inputs - ServiceOrchestrator.align_outputs = align_outputs + # ServiceOrchestrator.align_outputs = align_outputs ServiceOrchestrator.align_generator = align_generator self.megaservice = ServiceOrchestrator() diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml index 3204d6e5b..dcaadb11b 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml @@ -21,7 +21,8 @@ services: https_proxy: ${https_proxy} REDIS_URL: ${REDIS_URL} INDEX_NAME: ${INDEX_NAME} - TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} + # TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} + EMBED_MODEL: ${EMBED_MODEL} HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} embedding-reranking-service: image: ${REGISTRY:-opea}/embedding-reranking-local:${TAG:-latest} From deb1e0834051e9f7fc4d03640efd946b836c7c79 Mon Sep 17 00:00:00 2001 From: Spycsh Date: Tue, 15 Oct 2024 20:08:55 -0700 Subject: [PATCH 5/9] fix --- .../intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml index dcaadb11b..1a302f4c6 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml @@ -98,7 +98,7 @@ services: - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} - EMBEDDING_RERANK_SERVICE_HOST_IP=${EMBEDDING_RERANK_SERVICE_HOST_IP} - - EMBEDDING_RERANK_SERVICE_PORT=${EMBEDDING_RERANK_SERVICE_HOST_IP:-6001} + - EMBEDDING_RERANK_SERVICE_PORT=${EMBEDDING_RERANK_SERVICE_PORT:-6001} - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP} - LLM_SERVER_PORT=${LLM_SERVER_PORT:-8005} - LOGFLAG=${LOGFLAG} From 7795ca6b072781a195cfbfb76db16b057d1978e6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 16 Oct 2024 03:29:55 +0000 Subject: [PATCH 6/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ChatQnA/chatqna_no_wrapper_dynamic_batching.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ChatQnA/chatqna_no_wrapper_dynamic_batching.py b/ChatQnA/chatqna_no_wrapper_dynamic_batching.py index d3c6b52ea..9d4c6164f 100644 --- a/ChatQnA/chatqna_no_wrapper_dynamic_batching.py +++ b/ChatQnA/chatqna_no_wrapper_dynamic_batching.py @@ -88,7 +88,6 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k next_inputs["temperature"] = inputs["temperature"] inputs = next_inputs - return inputs From 4ba8b3d15fcba30657227b84f0d5afe492dcc365 Mon Sep 17 00:00:00 2001 From: Spycsh Date: Wed, 16 Oct 2024 00:09:34 -0700 Subject: [PATCH 7/9] rename --- ...rapper_dynamic_batching.yaml => compose_dynamic_batching.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename ChatQnA/docker_compose/intel/hpu/gaudi/{compose_no_wrapper_dynamic_batching.yaml => compose_dynamic_batching.yaml} (100%) diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_dynamic_batching.yaml similarity index 100% rename from ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper_dynamic_batching.yaml rename to ChatQnA/docker_compose/intel/hpu/gaudi/compose_dynamic_batching.yaml From f0f6e1cd8bd677b2199d16a60ac1b21b61a770f4 Mon Sep 17 00:00:00 2001 From: Spycsh Date: Wed, 16 Oct 2024 00:11:55 -0700 Subject: [PATCH 8/9] rename --- ...o_wrapper_dynamic_batching => Dockerfile.dynamic_batching} | 4 ++-- ...rapper_dynamic_batching.py => chatqna_dynamic_batching.py} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename ChatQnA/{Dockerfile.no_wrapper_dynamic_batching => Dockerfile.dynamic_batching} (81%) rename ChatQnA/{chatqna_no_wrapper_dynamic_batching.py => chatqna_dynamic_batching.py} (100%) diff --git a/ChatQnA/Dockerfile.no_wrapper_dynamic_batching b/ChatQnA/Dockerfile.dynamic_batching similarity index 81% rename from ChatQnA/Dockerfile.no_wrapper_dynamic_batching rename to ChatQnA/Dockerfile.dynamic_batching index fb1174acb..f6b4fda40 100644 --- a/ChatQnA/Dockerfile.no_wrapper_dynamic_batching +++ b/ChatQnA/Dockerfile.dynamic_batching @@ -20,7 +20,7 @@ RUN pip install --no-cache-dir --upgrade pip && \ pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt && \ pip install --no-cache-dir langchain_core -COPY ./chatqna_no_wrapper_dynamic_batching.py /home/user/chatqna_no_wrapper_dynamic_batching.py +COPY ./chatqna_dynamic_batching.py /home/user/chatqna_dynamic_batching.py ENV PYTHONPATH=$PYTHONPATH:/home/user/GenAIComps @@ -28,4 +28,4 @@ USER user WORKDIR /home/user -ENTRYPOINT ["python", "chatqna_no_wrapper_dynamic_batching.py"] +ENTRYPOINT ["python", "chatqna_dynamic_batching.py"] diff --git a/ChatQnA/chatqna_no_wrapper_dynamic_batching.py b/ChatQnA/chatqna_dynamic_batching.py similarity index 100% rename from ChatQnA/chatqna_no_wrapper_dynamic_batching.py rename to ChatQnA/chatqna_dynamic_batching.py From 0316fa58cd6d0a87cf7c99bca1708a81aa9d4aeb Mon Sep 17 00:00:00 2001 From: Spycsh Date: Wed, 16 Oct 2024 00:12:36 -0700 Subject: [PATCH 9/9] fix --- .../intel/hpu/gaudi/compose_dynamic_batching.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_dynamic_batching.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_dynamic_batching.yaml index 1a302f4c6..155b5918f 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_dynamic_batching.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_dynamic_batching.yaml @@ -82,7 +82,7 @@ services: ipc: host command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192 chaqna-gaudi-backend-server: - image: ${REGISTRY:-opea}/chatqna-no-wrapper-dynamic-batching:${TAG:-latest} + image: ${REGISTRY:-opea}/chatqna-dynamic-batching:${TAG:-latest} container_name: chatqna-gaudi-backend-server depends_on: - redis-vector-db