forked from intel-analytics/ipex-llm
-
Notifications
You must be signed in to change notification settings - Fork 0
135 lines (125 loc) · 5.55 KB
/
llm_performance_tests.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
name: LLM Performance Test
# Cancel previous runs in the PR when you push new commits
concurrency:
group: ${{ github.workflow }}-llm-performance-tests-${{ github.event.pull_request.number || github.run_id }}
cancel-in-progress: true
# Controls when the action will run.
on:
schedule:
- cron: "00 13 * * *" # GMT time, 13:00 GMT == 21:00 China
pull_request:
branches: [main]
paths:
- ".github/workflows/llm_performance_tests.yml"
- ".github/workflows/llm-binary-build.yml"
- ".github/actions/llm/setup-llm-env/action.yml"
- ".github/actions/llm/remove-llm-env/action.yml"
- ".github/actions/llm/download-llm-binary/action.yml"
- "python/llm/test/benchmark/**"
workflow_dispatch:
workflow_call:
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
llm-cpp-build:
uses: ./.github/workflows/llm-binary-build.yml
llm-performance-test:
needs: llm-cpp-build
strategy:
fail-fast: false
matrix:
python-version: ["3.9"]
instruction: ["AVX512"]
runs-on: [self-hosted, llm, perf]
env:
THREAD_NUM: 24
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install --upgrade setuptools==58.0.4
python -m pip install --upgrade wheel
- name: Download llm binary
uses: ./.github/actions/llm/download-llm-binary
- name: Run LLM install (all) test
uses: ./.github/actions/llm/setup-llm-env
env:
ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
- name: Run LLM Performance test
env:
ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
run: bash python/llm/dev/benchmark/run-benchmark-tests.sh
# - name: Clean up test environment
# uses: ./.github/actions/llm/remove-llm-env
# env:
# ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
llm-performance-test-on-arc:
needs: llm-cpp-build
strategy:
fail-fast: false
matrix:
python-version: ["3.9"]
runs-on: [self-hosted, llm, perf]
env:
OMP_NUM_THREADS: 16
THREAD_NUM: 16
ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
steps:
- name: Set model directories
shell: bash
run: |
echo "ORIGIN_DIR=/mnt/disk1/models" >> "$GITHUB_ENV"
- name: Set environment variables
shell: bash
run: |
echo "LLAMA2_7B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-7b-chat-hf" >> "$GITHUB_ENV"
echo "LLAMA2_13B_ORIGIN_PATH=${ORIGIN_DIR}/Llama-2-13b-chat-hf" >> "$GITHUB_ENV"
echo "CHATGLM2_6B_ORIGIN_PATH=${ORIGIN_DIR}/chatglm2-6b" >> "$GITHUB_ENV"
echo "WHISPER_MEDIUM_ORIGIN_PATH=${ORIGIN_DIR}/whisper-medium" >> "$GITHUB_ENV"
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
shell: bash
run: |
python -m pip install --upgrade pip
python -m pip install --upgrade setuptools
python -m pip install --upgrade wheel
- name: Download llm binary
uses: ./.github/actions/llm/download-llm-binary
- name: Run LLM install (all) test
uses: ./.github/actions/llm/setup-llm-env
with:
extra-dependency: "xpu"
- name: Test installed xpu version
shell: bash
run: |
source /opt/intel/oneapi/setvars.sh
bash python/llm/test/run-llm-install-tests.sh
- name: Test on xpu
shell: bash
run: |
source /opt/intel/oneapi/setvars.sh
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
cd python/llm/test/benchmark/gpu
export http_proxy=${HTTP_PROXY}
export https_proxy=${HTTPS_PROXY}
rm -rf test-result || true
mkdir test-result
taskset -c 0-$((THREAD_NUM - 1)) python llama2.py --model-dir="${LLAMA2_7B_ORIGIN_PATH}" --input-tokens=32 --max-new-tokens=32 > test-result/llama2_7b-32-32.log
taskset -c 0-$((THREAD_NUM - 1)) python llama2.py --model-dir="${LLAMA2_7B_ORIGIN_PATH}" --input-tokens=1024 --max-new-tokens=1024 > test-result/llama2_7b-1024-1024.log
taskset -c 0-$((THREAD_NUM - 1)) python llama2.py --model-dir="${LLAMA2_13B_ORIGIN_PATH}" --input-tokens=32 --max-new-tokens=32 > test-result/llama2_13b-32-32.log
taskset -c 0-$((THREAD_NUM - 1)) python llama2.py --model-dir="${LLAMA2_13B_ORIGIN_PATH}" --input-tokens=1024 --max-new-tokens=1024 > test-result/llama2_13b-1024-1024.log
taskset -c 0-$((THREAD_NUM - 1)) python chatglm2.py --model-dir="${CHATGLM2_6B_ORIGIN_PATH}" --input-tokens=32 --max-new-tokens=32 > test-result/chatglm2_6b-32-32.log
taskset -c 0-$((THREAD_NUM - 1)) python chatglm2.py --model-dir="${CHATGLM2_6B_ORIGIN_PATH}" --input-tokens=1024 --max-new-tokens=1024 > test-result/chatglm2_6b-1024-1024.log
taskset -c 0-$((THREAD_NUM - 1)) python whisper.py --model-dir="${WHISPER_MEDIUM_ORIGIN_PATH}" > test-result/whisper_medium-default-default.log
python ../analyze_log_dir.py --log-dir=./test-result --output-path=./xpu_latency.csv
timestamp=`date '+%Y%m%d'`
curl -T ./xpu_latency.csv ${LLM_FTP_URL}/llm/ggml-actions/perf/xpu_lantency_$timestamp.csv