Shared filesystem caching #272

Merged · 23 commits · Oct 18, 2024
Changes from 16 commits
61 changes: 61 additions & 0 deletions .github/workflows/build-push-huggingface-model-loader.yml
@@ -0,0 +1,61 @@
name: Build and Push huggingface-model-loader Docker image
on:
push:
branches:
- main
tags:
- "v*.*.*"
paths-ignore:
- '**/README.md'
pull_request:

# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
env:
REGISTRY: ghcr.io
IMAGE_NAME: substratusai/huggingface-model-loader

jobs:
huggingface-model-loader:
runs-on: ubuntu-latest
# Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to the Container registry
if: github.event_name == 'push'
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Login to docker.io
if: github.event_name == 'push'
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v5
with:
images: |
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
${{ env.IMAGE_NAME }}
- name: Build and push Docker image
uses: docker/build-push-action@v6
with:
context: ./components/huggingface-model-loader
platforms: linux/amd64,linux/arm64
push: ${{ github.event_name == 'push' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max
@@ -1,4 +1,4 @@
name: Build and Push Docker image
name: Build and Push kubeai Docker image
on:
push:
branches:
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -27,7 +27,7 @@ jobs:
#needs: unit-and-integration # No use in running e2e tests if integration tests fail.
strategy:
matrix:
testcase: ["quickstart", "openai-python-client", "faster-whisper", "autoscaler-restart"]
testcase: ["quickstart", "openai-python-client", "faster-whisper", "autoscaler-restart", "cache-shared-filesystem"]
steps:
- name: Checkout code
uses: actions/checkout@v2
4 changes: 4 additions & 0 deletions Makefile
@@ -91,6 +91,10 @@ test-e2e-openai-python-client: skaffold
test-e2e-autoscaler-restart: skaffold
./test/e2e/run.sh autoscaler-restart --profile kubeai-only-rapid-scaling

.PHONY: test-e2e-cache-shared-filesystem
test-e2e-cache-shared-filesystem: skaffold
./test/e2e/run.sh cache-shared-filesystem --profile e2e-test-cache-shared-filesystem

.PHONY: lint
lint: golangci-lint ## Run golangci-lint linter
$(GOLANGCI_LINT) run
6 changes: 6 additions & 0 deletions api/v1/constants.go → api/v1/metadata.go
@@ -14,4 +14,10 @@ const (
// Use in conjunction with --allow-pod-address-override for development purposes.
ModelPodIPAnnotation = "model-pod-ip"
ModelPodPortAnnotation = "model-pod-port"

ModelCacheEvictionFinalizer = "kubeai.org/cache-eviction"
)

func PVCModelAnnotation(modelName string) string {
return "models.kubeai.org/" + modelName
}
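A quick shell sketch of the annotation key this helper produces on a cache PVC (the Model name `my-model` is hypothetical):

```shell
model="my-model"                       # hypothetical Model name
key="models.kubeai.org/${model}"       # mirrors PVCModelAnnotation above
echo "$key"                            # annotation key the controller would set
```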
9 changes: 9 additions & 0 deletions api/v1/model_types.go
@@ -42,6 +42,10 @@ type ModelSpec struct {
// Must be a valid ResourceProfile defined in the system config.
ResourceProfile string `json:"resourceProfile,omitempty"`

// CacheProfile to be used for caching model artifacts.
// Must be a valid CacheProfile defined in the system config.
CacheProfile string `json:"cacheProfile,omitempty"`

// Image to be used for the server process.
// Will be set from ResourceProfile + Engine if not specified.
Image string `json:"image,omitempty"`
@@ -110,13 +114,18 @@ const (
// ModelStatus defines the observed state of Model.
type ModelStatus struct {
Replicas ModelStatusReplicas `json:"replicas,omitempty"`
Cache *ModelStatusCache `json:"cache,omitempty"`
}

type ModelStatusReplicas struct {
All int32 `json:"all"`
Ready int32 `json:"ready"`
}

type ModelStatusCache struct {
Loaded bool `json:"loaded"`
}
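With these fields, a Model's observed status might look like the following sketch (field names taken from the structs above; `cache.loaded` is presumably flipped by the controller once the model artifacts are present on the shared filesystem):

```yaml
status:
  replicas:
    all: 1
    ready: 1
  cache:
    loaded: true
```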

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:subresource:scale:specpath=.spec.replicas,statuspath=.status.replicas.all
22 changes: 21 additions & 1 deletion api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

4 changes: 4 additions & 0 deletions charts/kubeai/templates/configmap.yaml
@@ -10,8 +10,12 @@ data:
huggingface: {{ include "kubeai.huggingfaceSecretName" . }}
resourceProfiles:
{{- .Values.resourceProfiles | toYaml | nindent 6 }}
cacheProfiles:
{{- .Values.cacheProfiles | toYaml | nindent 6 }}
modelServers:
{{- .Values.modelServers | toYaml | nindent 6 }}
modelLoaders:
{{- .Values.modelLoaders | toYaml | nindent 6 }}
modelRollouts:
{{- .Values.modelRollouts | toYaml | nindent 6 }}
modelServerPods:
12 changes: 12 additions & 0 deletions charts/kubeai/templates/crds/kubeai.org_models.yaml
@@ -49,6 +49,11 @@ spec:
AutoscalingDisabled will stop the controller from managing the replicas
for the Model. When disabled, metrics will not be collected on server Pods.
type: boolean
cacheProfile:
description: |-
CacheProfile to be used for caching model artifacts.
Must be a valid CacheProfile defined in the system config.
type: string
engine:
description: Engine to be used for the server process.
enum:
@@ -144,6 +149,13 @@
status:
description: ModelStatus defines the observed state of Model.
properties:
cache:
properties:
loaded:
type: boolean
required:
- loaded
type: object
replicas:
properties:
all:
26 changes: 26 additions & 0 deletions charts/kubeai/templates/role.yaml
@@ -12,6 +12,32 @@ rules:
verbs:
- create
- delete
- deletecollection
- get
- list
- patch
- update
- watch
- apiGroups:
- "batch"
resources:
- jobs
verbs:
- create
- delete
- deletecollection
- get
- list
- patch
- update
- watch
- apiGroups:
- ""
resources:
- persistentvolumeclaims
verbs:
- create
- delete
- get
- list
- patch
8 changes: 8 additions & 0 deletions charts/kubeai/values-gke.yaml
@@ -36,3 +36,11 @@ resourceProfiles:
nodeSelector:
cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
cloud.google.com/gke-tpu-topology: "2x4"

cacheProfiles:
standard-filestore:
sharedFilesystem:
storageClassName: "standard-rwx"
premium-filestore:
sharedFilesystem:
storageClassName: "premium-rwx"
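With these profiles defined, a Model could opt into a Filestore-backed cache via `spec.cacheProfile`. A minimal sketch (the model name is hypothetical, other required spec fields are omitted, and the `cacheProfile` value must match a key under `cacheProfiles`):

```yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: example-model              # hypothetical
spec:
  cacheProfile: standard-filestore
```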
7 changes: 7 additions & 0 deletions charts/kubeai/values.yaml
@@ -32,6 +32,11 @@ modelServers:
images:
default: "michaelf34/infinity:latest"

modelLoaders:
huggingface:
# TODO: Update image to the one built with GH Actions.
image: "us-central1-docker.pkg.dev/substratus-dev/default/huggingface-model-downloader:v0.0.1"

modelServerPods:
# Security Context for the model pods
# Needed for OpenShift
@@ -100,6 +105,8 @@ resourceProfiles:
value: "present"
effect: "NoSchedule"

cacheProfiles: {}

modelAutoscaling:
# Interval that the autoscaler will scrape model server metrics.
# and calculate the desired number of replicas.
3 changes: 3 additions & 0 deletions charts/models/templates/models.yaml
@@ -35,5 +35,8 @@ spec:
{{- with $model.resourceProfile }}
resourceProfile: {{ . }}
{{- end}}
{{- with $model.cacheProfile }}
cacheProfile: {{ . }}
{{- end}}
{{- end}}
{{- end}}
17 changes: 17 additions & 0 deletions components/huggingface-model-loader/Dockerfile
@@ -0,0 +1,17 @@
FROM python:3.10-slim

# Set environment variables
ENV PYTHONUNBUFFERED=1

# Set the working directory in the container
WORKDIR /app

# Install Hugging Face CLI tool and other necessary dependencies
RUN pip install --no-cache-dir huggingface_hub

RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*

COPY download.sh /app/download.sh
RUN chmod +x /app/download.sh

CMD ["/app/download.sh"]
6 changes: 6 additions & 0 deletions components/huggingface-model-loader/download.sh
@@ -0,0 +1,6 @@
#!/bin/bash

set -euxo pipefail

huggingface-cli download --local-dir "$MODEL_DIR" "$MODEL_REPO"
rm -rf "$MODEL_DIR/.cache"
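The script's contract is two environment variables, `MODEL_DIR` and `MODEL_REPO`. A local sketch of that contract that mimics only the cleanup step (the real download is shown as a comment; the repo id is hypothetical):

```shell
MODEL_DIR=$(mktemp -d)       # in the real Job: the shared-filesystem PVC mount
MODEL_REPO="org/model"       # hypothetical Hugging Face repo id

# The container would run:
#   huggingface-cli download --local-dir "$MODEL_DIR" "$MODEL_REPO"
# which leaves a .cache directory of hub bookkeeping in the target dir;
# it should not be served alongside the model, hence the cleanup:
mkdir -p "$MODEL_DIR/.cache"
rm -rf "$MODEL_DIR/.cache"
```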
5 changes: 3 additions & 2 deletions docs/concepts/storage-caching.md
@@ -28,9 +28,10 @@ Building a model into a container image can provide a simple way to take advanta

## B. Model on shared filesystem (read-write-many)

**Status:** [Planned](https://github.com/substratusai/kubeai/blob/main/proposals/model-storage.md).
KubeAI can manage model caches on a shared filesystem (e.g. AWS [EFS](https://aws.amazon.com/efs/), GCP [Filestore](https://cloud.google.com/filestore/docs/overview), NFS). It manages the full lifecycle of a cached model: loading, serving, and cache eviction (on deletion of the Model).

Examples: [AWS EFS](https://aws.amazon.com/efs/)
<br>
<img src="/diagrams/caching-shared-filesystem.excalidraw.png" width="90%"></img>

## C. Model on read-only-many disk

1 change: 1 addition & 0 deletions docs/contributing/development-environment.md
@@ -46,6 +46,7 @@ helm upgrade --install kubeai ./charts/kubeai \

# OPTION B #
# For quick local iteration (run KubeAI outside of cluster)
kubectl create cm kubeai-autoscaler-state -oyaml --dry-run=client | kubectl apply -f -
CONFIG_PATH=./hack/dev-config.yaml POD_NAMESPACE=default go run ./cmd/main.go

# In another terminal:
Binary file modified docs/diagrams/arch.excalidraw.png
2 changes: 1 addition & 1 deletion docs/how-to/build-models-into-containers.md
@@ -14,7 +14,7 @@ Build and push image. Note: building (downloading base image & model) and pushin

```bash
git clone https://github.com/substratusai/kubeai
cd ./kubeai/images/ollama-builtin
cd ./kubeai/examples/ollama-builtin

docker build --build-arg MODEL_URL=$MODEL_URL -t $IMAGE .
docker push $IMAGE