Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Shared filesystem caching #272

Merged
merged 23 commits into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions .github/workflows/build-push-huggingface-model-loader.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
name: Build and Push huggingface-model-loader Docker image
on:
  push:
    branches:
      - main
    tags:
      - "v*.*.*"
    # Doc-only changes should not trigger an image build.
    paths-ignore:
      - '**/README.md'
  pull_request:

# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
env:
  REGISTRY: ghcr.io
  IMAGE_NAME: substratusai/huggingface-model-loader

jobs:
  huggingface-model-loader:
    runs-on: ubuntu-latest
    # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
    permissions:
      contents: read
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      # QEMU + Buildx enable the multi-platform (amd64/arm64) build below.
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      # Registry logins run only on push events; pull_request builds never push.
      - name: Log in to the Container registry
        if: github.event_name == 'push'
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Login to docker.io
        if: github.event_name == 'push'
        uses: docker/login-action@v3
        with:
          username: ${{ vars.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      # Generates tags/labels for both the ghcr.io and docker.io image names.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: |
            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
            ${{ env.IMAGE_NAME }}
      - name: Build and push Docker image
        uses: docker/build-push-action@v6
        with:
          context: ./components/huggingface-model-loader
          platforms: linux/amd64,linux/arm64
          # Push only on push events; PRs build (and warm the cache) without pushing.
          push: ${{ github.event_name == 'push' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Build and Push Docker image
name: Build and Push kubeai Docker image
on:
push:
branches:
Expand Down
38 changes: 35 additions & 3 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@ jobs:
- name: Run integration tests
run: make test-integration

e2e:
e2e-general:
runs-on: ubuntu-latest
# NOTE: Uncomment if we start getting limited on number of concurrent jobs
# (due to rapid pushes, etc).
#needs: unit-and-integration # No use in running e2e tests if integration tests fail.
strategy:
matrix:
testcase: ["quickstart", "openai-python-client", "faster-whisper", "autoscaler-restart"]
testcase: ["quickstart", "openai-python-client", "autoscaler-restart", "cache-shared-filesystem"]
steps:
- name: Checkout code
uses: actions/checkout@v2
Expand All @@ -48,4 +48,36 @@ jobs:
run: kind create cluster

- name: Run the e2e testcase
run: make test-e2e-${{ matrix.testcase }}
run: make test-e2e-${{ matrix.testcase }}

e2e-engines:
  runs-on: ubuntu-latest
  # NOTE: Uncomment if we start getting limited on number of concurrent jobs
  # (due to rapid pushes, etc).
  #needs: unit-and-integration # No use in running e2e tests if integration tests fail.
  strategy:
    matrix:
      engine: ["FasterWhisper"] # "VLLM", "Infinity", "OLlama"
      # Run each test case with and without caching.
      cacheProfile: ["", "e2e-test-kind-pv"]
  steps:
    - name: Checkout code
      # v4: matches the checkout version used by the repo's other workflows;
      # v2 runs on a deprecated Node.js runtime.
      uses: actions/checkout@v4

    - name: Install kind
      run: |
        curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.24.0/kind-linux-amd64
        chmod +x ./kind
        sudo mv ./kind /usr/local/bin/kind

    - name: Install helm
      run: |
        curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3
        chmod 700 get_helm.sh
        ./get_helm.sh

    - name: Start kind cluster
      run: kind create cluster

    - name: Run the e2e testcase
      run: make test-e2e-engine ENGINE=${{ matrix.engine }} CACHE_PROFILE=${{ matrix.cacheProfile }}
16 changes: 10 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,21 @@ test-integration: fmt vet envtest
test-e2e-quickstart: skaffold
./test/e2e/run.sh quickstart

.PHONY: test-e2e-faster-whisper
test-e2e-faster-whisper: skaffold
./test/e2e/run.sh faster-whisper --profile kubeai-only

.PHONY: test-e2e-openai-python-client
test-e2e-openai-python-client: skaffold
./test/e2e/run.sh openai-python-client --profile kubeai-only
./test/e2e/run.sh openai-python-client --profile e2e-test-default

.PHONY: test-e2e-autoscaler-restart
test-e2e-autoscaler-restart: skaffold
./test/e2e/run.sh autoscaler-restart --profile kubeai-only-rapid-scaling
./test/e2e/run.sh autoscaler-restart --profile e2e-test-autoscaler-restart

.PHONY: test-e2e-cache-shared-filesystem
test-e2e-cache-shared-filesystem: skaffold
./test/e2e/run.sh cache-shared-filesystem --profile e2e-test-default

.PHONY: test-e2e-engine
test-e2e-engine: skaffold
CACHE_PROFILE=$(CACHE_PROFILE) ./test/e2e/run.sh engine-$(ENGINE) --profile e2e-test-default

.PHONY: lint
lint: golangci-lint ## Run golangci-lint linter
Expand Down
6 changes: 6 additions & 0 deletions api/v1/constants.go → api/v1/metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,10 @@ const (
// Use in conjunction with --allow-pod-address-override for development purposes.
ModelPodIPAnnotation = "model-pod-ip"
ModelPodPortAnnotation = "model-pod-port"

ModelCacheEvictionFinalizer = "kubeai.org/cache-eviction"
)

// PVCModelAnnotation returns the PVC annotation key used to associate a
// PersistentVolumeClaim with the Model of the given name.
func PVCModelAnnotation(modelName string) string {
	const annotationPrefix = "models.kubeai.org/"
	return annotationPrefix + modelName
}
16 changes: 16 additions & 0 deletions api/v1/model_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,16 @@ import (
)

// ModelSpec defines the desired state of Model.
// +kubebuilder:validation:XValidation:rule="!has(self.cacheProfile) || self.url.startsWith(\"hf://\")", message="cacheProfile is only supported with a huggingface url (\"hf://...\") at the moment."
// +kubebuilder:validation:XValidation:rule="!has(self.maxReplicas) || self.minReplicas <= self.maxReplicas", message="minReplicas should be less than or equal to maxReplicas."
type ModelSpec struct {
// URL of the model to be served.
// Currently only the following formats are supported:
// For VLLM & FasterWhisper engines: "hf://<model-repo>/<model-name>"
// For OLlama engine: "ollama://<model>"
// +kubebuilder:validation:Required
// +kubebuilder:validation:XValidation:rule="self == oldSelf", message="url is immutable."
// +kubebuilder:validation:XValidation:rule="self.startsWith(\"hf://\") || self.startsWith(\"ollama://\")", message="url must start with \"hf://\" or \"ollama://\" and not be empty."
URL string `json:"url"`

// Features that the model supports.
Expand All @@ -34,6 +39,7 @@ type ModelSpec struct {

// Engine to be used for the server process.
// +kubebuilder:validation:Enum=OLlama;VLLM;FasterWhisper;Infinity
// +kubebuilder:validation:Required
Engine string `json:"engine"`

// ResourceProfile required to serve the model.
Expand All @@ -42,6 +48,11 @@ type ModelSpec struct {
// Must be a valid ResourceProfile defined in the system config.
ResourceProfile string `json:"resourceProfile,omitempty"`

// CacheProfile to be used for caching model artifacts.
// Must be a valid CacheProfile defined in the system config.
// +kubebuilder:validation:XValidation:rule="self == oldSelf", message="cacheProfile is immutable."
CacheProfile string `json:"cacheProfile,omitempty"`

// Image to be used for the server process.
// Will be set from ResourceProfile + Engine if not specified.
Image string `json:"image,omitempty"`
Expand Down Expand Up @@ -110,13 +121,18 @@ const (
// ModelStatus defines the observed state of Model.
type ModelStatus struct {
	// Replicas reports replica counts for the Model's server Pods.
	Replicas ModelStatusReplicas `json:"replicas,omitempty"`
	// Cache reports the state of the model-artifact cache.
	// Pointer + omitempty: omitted from status when unset — presumably when
	// the Model has no cacheProfile; confirm against the controller.
	Cache *ModelStatusCache `json:"cache,omitempty"`
}

// ModelStatusReplicas holds replica counts for a Model.
type ModelStatusReplicas struct {
	// All is the total number of replicas.
	All int32 `json:"all"`
	// Ready is the number of replicas that are ready.
	Ready int32 `json:"ready"`
}

// ModelStatusCache reports whether the model artifacts have been
// loaded into the cache.
type ModelStatusCache struct {
	Loaded bool `json:"loaded"`
}

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:subresource:scale:specpath=.spec.replicas,statuspath=.status.replicas.all
Expand Down
22 changes: 21 additions & 1 deletion api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions charts/kubeai/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,12 @@ data:
huggingface: {{ include "kubeai.huggingfaceSecretName" . }}
resourceProfiles:
{{- .Values.resourceProfiles | toYaml | nindent 6 }}
cacheProfiles:
{{- .Values.cacheProfiles | toYaml | nindent 6 }}
modelServers:
{{- .Values.modelServers | toYaml | nindent 6 }}
modelLoaders:
{{- .Values.modelLoaders | toYaml | nindent 6 }}
modelRollouts:
{{- .Values.modelRollouts | toYaml | nindent 6 }}
modelServerPods:
Expand Down
26 changes: 26 additions & 0 deletions charts/kubeai/templates/crds/kubeai.org_models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,14 @@ spec:
AutoscalingDisabled will stop the controller from managing the replicas
for the Model. When disabled, metrics will not be collected on server Pods.
type: boolean
cacheProfile:
description: |-
CacheProfile to be used for caching model artifacts.
Must be a valid CacheProfile defined in the system config.
type: string
x-kubernetes-validations:
- message: cacheProfile is immutable.
rule: self == oldSelf
engine:
description: Engine to be used for the server process.
enum:
Expand Down Expand Up @@ -134,16 +142,34 @@ spec:
For VLLM & FasterWhisper engines: "hf://<model-repo>/<model-name>"
For OLlama engine: "ollama://<model>"
type: string
x-kubernetes-validations:
- message: url is immutable.
rule: self == oldSelf
- message: url must start with "hf://" or "ollama://" and not be empty.
rule: self.startsWith("hf://") || self.startsWith("ollama://")
required:
- engine
- features
- scaleDownDelaySeconds
- targetRequests
- url
type: object
x-kubernetes-validations:
- message: cacheProfile is only supported with a huggingface url ("hf://...")
at the moment.
rule: '!has(self.cacheProfile) || self.url.startsWith("hf://")'
- message: minReplicas should be less than or equal to maxReplicas.
rule: '!has(self.maxReplicas) || self.minReplicas <= self.maxReplicas'
status:
description: ModelStatus defines the observed state of Model.
properties:
cache:
properties:
loaded:
type: boolean
required:
- loaded
type: object
replicas:
properties:
all:
Expand Down
26 changes: 26 additions & 0 deletions charts/kubeai/templates/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,32 @@ rules:
verbs:
- create
- delete
- deletecollection
- get
- list
- patch
- update
- watch
- apiGroups:
- "batch"
resources:
- jobs
verbs:
- create
- delete
- deletecollection
- get
- list
- patch
- update
- watch
- apiGroups:
- ""
resources:
- persistentvolumeclaims
verbs:
- create
- delete
- get
- list
- patch
Expand Down
8 changes: 8 additions & 0 deletions charts/kubeai/values-gke.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,11 @@ resourceProfiles:
nodeSelector:
cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
cloud.google.com/gke-tpu-topology: "2x4"

cacheProfiles:
standard-filestore:
sharedFilesystem:
storageClassName: "standard-rwx"
premium-filestore:
sharedFilesystem:
storageClassName: "premium-rwx"
7 changes: 7 additions & 0 deletions charts/kubeai/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ modelServers:
images:
default: "michaelf34/infinity:latest"

modelLoaders:
huggingface:
# TODO: Update image to the one built with GH Actions.
image: "us-central1-docker.pkg.dev/substratus-dev/default/huggingface-model-downloader:v0.0.1"

modelServerPods:
# Security Context for the model pods
# Needed for OpenShift
Expand Down Expand Up @@ -100,6 +105,8 @@ resourceProfiles:
value: "present"
effect: "NoSchedule"

cacheProfiles: {}

modelAutoscaling:
# Interval that the autoscaler will scrape model server metrics.
# and calculate the desired number of replicas.
Expand Down
Loading
Loading