From f890269a1b83d13ffc47fca25ffe7496d591bbfe Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Mon, 21 Aug 2023 21:26:51 -0700
Subject: [PATCH] [LLM] better format of vllm openai api README (#2440)

* [LLM] better format of vllm openai api README

* Fix the way to fetch IP

* install fschat for chat completion

* Add missing dependency

* Update readme
---
 llm/vllm/README.md             | 64 ++++++++++++++++++++++++++++------
 llm/vllm/serve-openai-api.yaml | 11 +++---
 llm/vllm/serve.yaml            |  9 +++--
 3 files changed, 66 insertions(+), 18 deletions(-)

diff --git a/llm/vllm/README.md b/llm/vllm/README.md
index 309c99e8cc6..7a27ad61fc4 100644
--- a/llm/vllm/README.md
+++ b/llm/vllm/README.md
@@ -51,25 +51,27 @@ sky launch -c vllm-llama2 serving-openai-api.yaml
 2. Check the IP for the cluster with:
 ```
 sky status -a
+# Or get the IP with Python API:
+IP=$(python -c "import sky; print(sky.status('vllm-llama2')[0]['handle'].head_ip)")
 ```
 3. You can now use the OpenAI API to interact with the model.
   - Query the models hosted on the cluster:
 ```bash
-curl http://<IP>:8000/v1/models
+curl http://$IP:8000/v1/models
 ```
-  - Query a model with input prompts:
+  - Query a model with input prompts for text completion:
 ```bash
-curl http://<IP>:8000/v1/completions \
--H "Content-Type: application/json" \
--d '{
-"model": "meta-llama/Llama-2-7b-chat-hf",
-"prompt": "San Francisco is a",
-"max_tokens": 7,
-"temperature": 0
-}'
+curl http://$IP:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "meta-llama/Llama-2-7b-chat-hf",
+    "prompt": "San Francisco is a",
+    "max_tokens": 7,
+    "temperature": 0
+  }'
 ```
   You should get a similar response as the following:
-```
+```console
 {
     "id":"cmpl-50a231f7f06a4115a1e4bd38c589cd8f",
     "object":"text_completion","created":1692427390,
     "model":"meta-llama/Llama-2-7b-chat-hf",
     "choices":[{
         "index":0,
         "text":"city in Northern California that is known",
         "logprobs":null,"finish_reason":"length"
@@ -81,4 +83,44 @@ curl http://<IP>:8000/v1/completions \
     }],
     "usage":{"prompt_tokens":5,"total_tokens":12,"completion_tokens":7}
 }
+```
+  - Query a model with input prompts for chat completion:
+```bash
+curl http://$IP:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "meta-llama/Llama-2-7b-chat-hf",
+    "messages": [
+      {
+        "role": "system",
+        "content": "You are a helpful assistant."
+      },
+      {
+        "role": "user",
+        "content": "Who are you?"
+      }
+    ]
+  }'
+```
+  You should get a similar response as the following:
+```console
+{
+  "id": "cmpl-879a58992d704caf80771b4651ff8cb6",
+  "object": "chat.completion",
+  "created": 1692650569,
+  "model": "meta-llama/Llama-2-7b-chat-hf",
+  "choices": [{
+    "index": 0,
+    "message": {
+      "role": "assistant",
+      "content": " Hello! I'm just an AI assistant, here to help you"
+    },
+    "finish_reason": "length"
+  }],
+  "usage": {
+    "prompt_tokens": 31,
+    "total_tokens": 47,
+    "completion_tokens": 16
+  }
+}
 ```
diff --git a/llm/vllm/serve-openai-api.yaml b/llm/vllm/serve-openai-api.yaml
index 25049938d63..c469efc1a93 100644
--- a/llm/vllm/serve-openai-api.yaml
+++ b/llm/vllm/serve-openai-api.yaml
@@ -1,12 +1,12 @@
+envs:
+  MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
+  HF_TOKEN: # Change to your own huggingface token
+
 resources:
   accelerators: L4:1
   ports:
     - 8000
 
-envs:
-  MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
-  HF_TOKEN: # Change to your own huggingface token
-
 setup: |
   conda activate vllm
   if [ $? -ne 0 ]; then
@@ -15,6 +15,9 @@ setup: |
   fi
 
   git clone https://github.com/vllm-project/vllm.git || true
+  # Install fschat and accelerate for chat completion
+  pip install fschat
+  pip install accelerate
   cd vllm
   pip list | grep vllm || pip install .
diff --git a/llm/vllm/serve.yaml b/llm/vllm/serve.yaml
index 88ddc23ad2c..a3c6da90a61 100644
--- a/llm/vllm/serve.yaml
+++ b/llm/vllm/serve.yaml
@@ -1,9 +1,9 @@
-resources:
-  accelerators: A100-80GB:8
-
 envs:
   MODEL_NAME: decapoda-research/llama-65b-hf
 
+resources:
+  accelerators: A100-80GB:8
+
 setup: |
   conda activate vllm
   if [ $? -ne 0 ]; then
@@ -12,6 +12,9 @@ setup: |
   fi
 
   git clone https://github.com/vllm-project/vllm.git || true
+  # Install fschat and accelerate for chat completion
+  pip install fschat
+  pip install accelerate
   cd vllm
   pip list | grep vllm || pip install .
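
Note (not part of the patch): the README changes above can also be exercised from Python instead of curl. The snippet below is a minimal sketch under a few assumptions: the cluster launched from serve-openai-api.yaml is running, the head node IP has been exported to the `IP` environment variable as in the README step above, and a pre-1.0 `openai` client is installed (e.g. `pip install "openai<1"`). The model name and the `/v1/chat/completions` route mirror the curl example added in the README.

```python
# Minimal sketch: query the vLLM OpenAI-compatible server started by serve-openai-api.yaml.
# Assumptions: IP env var holds the cluster head IP; `pip install "openai<1"` was run.
import os
import openai

# Point the client at the vLLM server instead of api.openai.com.
openai.api_base = f"http://{os.environ['IP']}:8000/v1"
openai.api_key = "EMPTY"  # vLLM does not validate the API key by default.

resp = openai.ChatCompletion.create(
    model="meta-llama/Llama-2-7b-chat-hf",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who are you?"},
    ],
    max_tokens=16,
)
print(resp["choices"][0]["message"]["content"])
```

This hits the same chat completion endpoint as the curl example, so the response should resemble the JSON shown in the README diff.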