From f890269a1b83d13ffc47fca25ffe7496d591bbfe Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Mon, 21 Aug 2023 21:26:51 -0700
Subject: [PATCH] [LLM] better format of vllm openai api README (#2440)

* [LLM] better format of vllm openai api README

* Fix the way to fetch IP

* install fschat for chat completion

* Add missing dependency

* Update readme
---
 llm/vllm/README.md             | 64 ++++++++++++++++++++++++++++------
 llm/vllm/serve-openai-api.yaml | 11 +++---
 llm/vllm/serve.yaml            |  9 +++--
 3 files changed, 66 insertions(+), 18 deletions(-)

diff --git a/llm/vllm/README.md b/llm/vllm/README.md
index 309c99e8cc6..7a27ad61fc4 100644
--- a/llm/vllm/README.md
+++ b/llm/vllm/README.md
@@ -51,25 +51,27 @@ sky launch -c vllm-llama2 serving-openai-api.yaml
 2. Check the IP for the cluster with:
 ```
 sky status -a
+# Or get the IP with Python API:
+IP=$(python -c "import sky; print(sky.status('vllm-llama2')[0]['handle'].head_ip)")
 ```
 3. You can now use the OpenAI API to interact with the model.
   - Query the models hosted on the cluster:
 ```bash
-curl http://<IP>:8000/v1/models
+curl http://$IP:8000/v1/models
 ```
-  - Query a model with input prompts:
+  - Query a model with input prompts for text completion:
 ```bash
-curl http://<IP>:8000/v1/completions \
--H "Content-Type: application/json" \
--d '{
-"model": "meta-llama/Llama-2-7b-chat-hf",
-"prompt": "San Francisco is a",
-"max_tokens": 7,
-"temperature": 0
-}'
+curl http://$IP:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "meta-llama/Llama-2-7b-chat-hf",
+    "prompt": "San Francisco is a",
+    "max_tokens": 7,
+    "temperature": 0
+  }'
 ```
   You should get a similar response as the following:
-```
+```console
 {
     "id":"cmpl-50a231f7f06a4115a1e4bd38c589cd8f",
     "object":"text_completion","created":1692427390,
     "model":"meta-llama/Llama-2-7b-chat-hf",
     "choices":[{
         "index":0,
         "text":"city in Northern California that is known",
         "logprobs":null,"finish_reason":"length"
@@ -81,4 +83,44 @@ curl http://<IP>:8000/v1/completions \
     }],
     "usage":{"prompt_tokens":5,"total_tokens":12,"completion_tokens":7}
 }
+```
+  - Query a model with input prompts for chat completion:
+```bash
+curl http://$IP:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "meta-llama/Llama-2-7b-chat-hf",
+    "messages": [
+      {
+        "role": "system",
+        "content": "You are a helpful assistant."
+      },
+      {
+        "role": "user",
+        "content": "Who are you?"
+      }
+    ]
+  }'
+```
+  You should get a similar response as the following:
+```console
+{
+  "id": "cmpl-879a58992d704caf80771b4651ff8cb6",
+  "object": "chat.completion",
+  "created": 1692650569,
+  "model": "meta-llama/Llama-2-7b-chat-hf",
+  "choices": [{
+    "index": 0,
+    "message": {
+      "role": "assistant",
+      "content": " Hello! I'm just an AI assistant, here to help you"
+    },
+    "finish_reason": "length"
+  }],
+  "usage": {
+    "prompt_tokens": 31,
+    "total_tokens": 47,
+    "completion_tokens": 16
+  }
+}
 ```
diff --git a/llm/vllm/serve-openai-api.yaml b/llm/vllm/serve-openai-api.yaml
index 25049938d63..c469efc1a93 100644
--- a/llm/vllm/serve-openai-api.yaml
+++ b/llm/vllm/serve-openai-api.yaml
@@ -1,12 +1,12 @@
+envs:
+  MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
+  HF_TOKEN: # Change to your own huggingface token
+
 resources:
   accelerators: L4:1
   ports:
     - 8000
 
-envs:
-  MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
-  HF_TOKEN: # Change to your own huggingface token
-
 setup: |
   conda activate vllm
   if [ $? -ne 0 ]; then
@@ -15,6 +15,9 @@ setup: |
   fi
 
   git clone https://github.com/vllm-project/vllm.git || true
+  # Install fschat and accelerate for chat completion
+  pip install fschat
+  pip install accelerate
   cd vllm
   pip list | grep vllm || pip install .
diff --git a/llm/vllm/serve.yaml b/llm/vllm/serve.yaml
index 88ddc23ad2c..a3c6da90a61 100644
--- a/llm/vllm/serve.yaml
+++ b/llm/vllm/serve.yaml
@@ -1,9 +1,9 @@
-resources:
-  accelerators: A100-80GB:8
-
 envs:
   MODEL_NAME: decapoda-research/llama-65b-hf
 
+resources:
+  accelerators: A100-80GB:8
+
 setup: |
   conda activate vllm
   if [ $? -ne 0 ]; then
@@ -12,6 +12,9 @@ setup: |
   fi
 
   git clone https://github.com/vllm-project/vllm.git || true
+  # Install fschat and accelerate for chat completion
+  pip install fschat
+  pip install accelerate
   cd vllm
   pip list | grep vllm || pip install .
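
Note (not part of the patch): the README changes above can also be exercised from Python instead of curl. The snippet below is a minimal sketch under a few assumptions: the cluster launched from serve-openai-api.yaml is running, the head node IP has been exported to the `IP` environment variable as in the README step above, and a pre-1.0 `openai` client is installed (e.g. `pip install "openai<1"`). The model name and the `/v1/chat/completions` route mirror the curl example added in the README.

```python
# Minimal sketch: query the vLLM OpenAI-compatible server started by serve-openai-api.yaml.
# Assumptions: IP env var holds the cluster head IP; `pip install "openai<1"` was run.
import os
import openai

# Point the client at the vLLM server instead of api.openai.com.
openai.api_base = f"http://{os.environ['IP']}:8000/v1"
openai.api_key = "EMPTY"  # vLLM does not validate the API key by default.

resp = openai.ChatCompletion.create(
    model="meta-llama/Llama-2-7b-chat-hf",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who are you?"},
    ],
    max_tokens=16,
)
print(resp["choices"][0]["message"]["content"])
```

This hits the same chat completion endpoint as the curl example, so the response should resemble the JSON shown in the README diff.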