llama : cache llama_token_to_piece (#7587) #1

Workflow file for this run

	# Benchmark
	#
	# Runs the llama.cpp server benchmark on a GPU runner. It can be started
	# manually, on pushes to master, on pull requests and on a nightly schedule.
	name: Benchmark

	on:
	  # Manual run: choose the runner series and, optionally, the commit to build
	  # and how long the bench should last.
	  workflow_dispatch:
	    inputs:
	      gpu-series:
	        description: 'Azure GPU series to run with'
	        required: true
	        type: choice
	        options:
	          - Standard_NC4as_T4_v3
	          - Standard_NC24ads_A100_v4
	          - Standard_NC80adis_H100_v5
	      sha:
	        description: 'Commit SHA1 to build'
	        required: false
	        type: string
	      duration:
	        description: 'Duration of the bench'
	        type: string
	        default: 10m

	  push:
	    branches:
	      - master
	    # Only performance-relevant sources trigger the bench. `**` is needed to
	    # match CUDA files in any directory; `*.h*` covers both .h and .hpp headers.
	    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
	  # NOTE(review): pull_request_target runs with the base repository's
	  # permissions, and the job in this workflow checks out the PR head commit.
	  # Confirm that building/running untrusted PR code on this runner is intended.
	  pull_request_target:
	    types: [opened, synchronize, reopened]
	    # Keep this filter identical to the `push` filter above.
	    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
	  # Nightly run at 02:04 (cron times are evaluated in UTC by GitHub).
	  schedule:
	    - cron: '04 2 * * *'

	# One run at a time per workflow/ref/PR-branch/requested-SHA combination.
	# `github.head_ref` is only set for pull request events, so other events fall
	# back to the unique run id and are never grouped with each other.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
	  # A newer run in the same group cancels the one still in progress.
	  cancel-in-progress: true

	jobs:
	  # Builds the CUDA server, runs the k6-driven benchmark from
	  # examples/server/bench for every model/ftype in the matrix, then publishes
	  # the results as an artifact, a commit status and (optionally) a PR comment.
	  bench-server-baseline:
	    runs-on: Standard_NC4as_T4_v3
	    env:
	      RUNNER_LABEL: Standard_NC4as_T4_v3  # FIXME Do not find a way to not duplicate it
	      N_USERS: 8
	      DURATION: 10m

	    strategy:
	      matrix:
	        model: [phi-2]
	        ftype: [q4_0, q8_0, f16]
	        include:
	          # Only this combination posts a PR comment (see the "Comment PR" step).
	          - model: phi-2
	            ftype: q4_0
	            pr_comment_enabled: "true"

	    # Run when: manually dispatched for this runner series, on the nightly
	    # schedule or a push to master in the ggerganov repository, or on any
	    # pull_request_target event.
	    if: |
	      inputs.gpu-series == 'Standard_NC4as_T4_v3'
	      || (
	        github.event_name == 'schedule'
	        && github.ref_name == 'master'
	        && github.repository_owner == 'ggerganov'
	      )
	      || github.event_name == 'pull_request_target'
	      || (
	        github.event_name == 'push'
	        && github.event.ref == 'refs/heads/master'
	        && github.repository_owner == 'ggerganov'
	      )
	    steps:
	      - name: Clone
	        id: checkout
	        uses: actions/checkout@v4
	        with:
	          fetch-depth: 0
	          # First non-empty value wins: manual SHA, PR head, pushed commit, branch.
	          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

	      - name: Install python env
	        id: pipenv
	        run: |
	          cd examples/server/bench
	          python3 -m venv venv
	          source venv/bin/activate
	          pip install -r requirements.txt

	      - name: Prometheus
	        id: install_prometheus
	        run: |
	          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
	          tar xzf prometheus*.tar.gz --strip-components=1
	          ./prometheus --config.file=examples/server/bench/prometheus.yml &
	          # Block until Prometheus accepts connections on its port.
	          while ! nc -z localhost 9090; do
	            sleep 0.1
	          done

	      - name: Set up Go
	        uses: actions/setup-go@v5
	        with:
	          go-version: '1.21'

	      - name: Install k6 and xk6-sse
	        id: k6_installation
	        run: |
	          cd examples/server/bench
	          go install go.k6.io/xk6/cmd/xk6@latest
	          xk6 build master \
	              --with github.com/phymbert/xk6-sse

	      - name: Build
	        id: cmake_build
	        run: |
	          set -eux
	          cmake -B build \
	              -DLLAMA_NATIVE=OFF \
	              -DLLAMA_BUILD_SERVER=ON \
	              -DLLAMA_CURL=ON \
	              -DLLAMA_CUBLAS=ON \
	              -DCUDAToolkit_ROOT=/usr/local/cuda \
	              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
	              -DCMAKE_CUDA_ARCHITECTURES=75 \
	              -DLLAMA_FATAL_WARNINGS=OFF \
	              -DLLAMA_ALL_WARNINGS=OFF \
	              -DCMAKE_BUILD_TYPE=Release;
	          cmake --build build --config Release -j $(nproc) --target server

	      - name: Download the dataset
	        id: download_dataset
	        run: |
	          cd examples/server/bench
	          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

	      - name: Server bench
	        id: server_bench
	        # Branch name, SHA and duration can be supplied by whoever opens the PR
	        # or dispatches the workflow. They are passed through the environment
	        # and quoted, never pasted into the script text, so they cannot inject
	        # shell commands.
	        env:
	          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
	          COMMIT_SHA: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }}
	          BENCH_DURATION: ${{ github.event.inputs.duration || env.DURATION }}
	        run: |
	          set -eux

	          cd examples/server/bench
	          source venv/bin/activate
	          python bench.py \
	              --runner-label ${{ env.RUNNER_LABEL }} \
	              --name ${{ github.job }} \
	              --branch "$BRANCH_NAME" \
	              --commit "$COMMIT_SHA" \
	              --scenario script.js \
	              --duration "$BENCH_DURATION" \
	              --hf-repo ggml-org/models \
	              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
	              --model-path-prefix /models \
	              --parallel ${{ env.N_USERS }} \
	              -ngl 33 \
	              --batch-size 2048 \
	              --ubatch-size 256 \
	              --ctx-size 16384 \
	              --n-prompts 1000 \
	              --max-prompt-tokens 1024 \
	              --max-tokens 2048

	          # Export the bench results to the following steps.
	          cat results.github.env >> $GITHUB_ENV

	          # Remove dataset as we do not want it in the artefact
	          rm ShareGPT_V3_unfiltered_cleaned_split.json

	      - uses: actions/upload-artifact@v4
	        with:
	          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
	          compression-level: 9
	          path: |
	            examples/server/bench/*.jpg
	            examples/server/bench/*.json
	            examples/server/bench/*.log

	      - name: Commit status
	        uses: Sibz/github-status-action@v1
	        with:
	          authToken: ${{secrets.GITHUB_TOKEN}}
	          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
	          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
	          description: |
	            ${{ env.BENCH_RESULTS }}
	          state: 'success'

	      - name: Upload benchmark images
	        uses: devicons/public-upload-to-imgur@v2.2.2
	        continue-on-error: true  # Important as it looks unstable: 503
	        id: imgur_step
	        with:
	          client_id: ${{secrets.IMGUR_CLIENT_ID}}
	          path: |
	            examples/server/bench/prompt_tokens_seconds.jpg
	            examples/server/bench/predicted_tokens_seconds.jpg
	            examples/server/bench/kv_cache_usage_ratio.jpg
	            examples/server/bench/requests_processing.jpg

	      - name: Extract mermaid
	        id: set_mermaid
	        # Each chart is multi-line, so it is exported with the
	        # NAME<<EOF ... EOF heredoc form of $GITHUB_ENV.
	        run: |
	          set -eux

	          cd examples/server/bench
	          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
	          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
	          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
	          echo "EOF" >> $GITHUB_ENV

	          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
	          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
	          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
	          echo "EOF" >> $GITHUB_ENV

	          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
	          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
	          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
	          echo "EOF" >> $GITHUB_ENV

	          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
	          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
	          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
	          echo "EOF" >> $GITHUB_ENV

	      - name: Extract image url
	        id: extract_image_url
	        # Tolerate failure: the imgur upload above may not have produced URLs.
	        continue-on-error: true
	        run: |
	          set -eux

	          echo "IMAGE_0=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
	          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
	          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
	          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV

	      - name: Comment PR
	        uses: mshick/add-pr-comment@v2
	        id: comment_pr
	        # Only for pull request events, and only for the matrix entry flagged
	        # with pr_comment_enabled.
	        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
	        with:
	          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
	          message: |
	            <p align="center">

	            📈 llama.cpp server for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: ${{ env.BENCH_ITERATIONS}} iterations 🚀

	            </p>

	            <details>

	            <summary>Expand details for performance related PR only</summary>

	            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
	            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
	            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
	            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
	            - ${{ env.BENCH_GRAPH_XLABEL }}


	            <p align="center">

	            <img width="100%" height="100%" src="${{ env.IMAGE_0 }}" alt="prompt_tokens_seconds" />

	            <details>

	            <summary>More</summary>

	            ```mermaid
	            ${{ env.PROMPT_TOKENS_SECONDS }}
	            ```

	            </details>

	            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>

	            <details>
	            <summary>More</summary>

	            ```mermaid
	            ${{ env.PREDICTED_TOKENS_SECONDS }}
	            ```

	            </details>

	            </p>

	            <details>

	            <summary>Details</summary>

	            <p align="center">

	            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

	            <details>
	            <summary>More</summary>

	            ```mermaid
	            ${{ env.KV_CACHE_USAGE_RATIO }}
	            ```

	            </details>

	            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>

	            <details>
	            <summary>More</summary>

	            ```mermaid
	            ${{ env.REQUESTS_PROCESSING }}
	            ```

	            </details>

	            </p>
	            </details>
	            </details>

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

llama : cache llama_token_to_piece (#7587) #1

Workflow file

llama : cache llama_token_to_piece (#7587) #1

Jobs

Run details

Workflow file for this run