From e382382fbeccb1ff8c91f63d7360743ba45e5153 Mon Sep 17 00:00:00 2001 From: samos123 Date: Fri, 25 Oct 2024 05:55:21 +0000 Subject: [PATCH] Deployed b1b579c with MkDocs version: 1.6.1 --- 404.html | 21 + benchmarks/llama-3.2-11b-vision/index.html | 21 + concepts/autoscaling/index.html | 21 + concepts/backend-servers/index.html | 21 + concepts/resource-profiles/index.html | 21 + concepts/storage-caching/index.html | 21 + .../development-environment/index.html | 21 + contributing/documentation/index.html | 21 + contributing/release-process/index.html | 21 + how-to/architect-for-multitenancy/index.html | 21 + .../build-models-into-containers/index.html | 23 +- how-to/cache-models-with-aws-efs/index.html | 1455 +++++++++++++++++ .../index.html | 23 +- how-to/configure-autoscaling/index.html | 21 + how-to/configure-embedding-models/index.html | 21 + how-to/configure-resource-profiles/index.html | 21 + how-to/configure-speech-to-text/index.html | 21 + how-to/install-models/index.html | 21 + index.html | 21 + installation/eks/index.html | 21 + installation/gke/index.html | 21 + reference/kubernetes-api/index.html | 21 + reference/openai-api-compatibility/index.html | 21 + search/search_index.json | 2 +- sitemap.xml | 4 + sitemap.xml.gz | Bin 489 -> 496 bytes tutorials/langchain/index.html | 21 + tutorials/langtrace/index.html | 21 + tutorials/weaviate/index.html | 21 + 29 files changed, 1987 insertions(+), 3 deletions(-) create mode 100644 how-to/cache-models-with-aws-efs/index.html diff --git a/404.html b/404.html index 7c1ec5b5..f0909f33 100644 --- a/404.html +++ b/404.html @@ -394,6 +394,27 @@ +
diff --git a/how-to/cache-models-with-aws-efs/index.html b/how-to/cache-models-with-aws-efs/index.html
new file mode 100644
index 00000000..2ceebcb0
--- /dev/null
+++ b/how-to/cache-models-with-aws-efs/index.html
@@ -0,0 +1,1455 @@
    Cache models with AWS EFS

    +

    KubeAI can manage model caches. AWS EFS is supported as a pluggable backend store.

    +


    +

    +

    Follow the EKS install guide.

    +

    1. Create an EFS File System

    +

    Set environment variables to match your environment.

    +
    export CLUSTER_NAME="cluster-with-karpenter"
    +export CLUSTER_REGION="us-west-2"
    +
    +

    Create an EFS file system in the same VPC as your EKS cluster.

    +
    vpc_id=$(aws eks describe-cluster \
    +    --name $CLUSTER_NAME \
    +    --query "cluster.resourcesVpcConfig.vpcId" \
    +    --output text)
    +
    +cidr_range=$(aws ec2 describe-vpcs \
    +    --vpc-ids $vpc_id \
    +    --query "Vpcs[].CidrBlock" \
    +    --output text \
    +    --region ${CLUSTER_REGION})
    +
    +security_group_id=$(aws ec2 create-security-group \
    +    --group-name MyEfsSecurityGroup \
    +    --description "My EFS security group" \
    +    --vpc-id $vpc_id \
    +    --output text)
    +
    +aws ec2 authorize-security-group-ingress \
    +    --group-id $security_group_id \
    +    --protocol tcp \
    +    --port 2049 \
    +    --cidr $cidr_range
    +
    +file_system_id=$(aws efs create-file-system \
    +    --region ${CLUSTER_REGION} \
    +    --performance-mode generalPurpose \
    +    --query 'FileSystemId' \
    +    --output text)
    +
    +

    Expose the EFS file system to the subnets used by your EKS cluster. +

SUBNETS=$(eksctl get cluster --region ${CLUSTER_REGION} ${CLUSTER_NAME} -o json | jq -r '.[0].ResourcesVpcConfig.SubnetIds[]')
    +
    +while IFS= read -r subnet; do
    +    echo "Creating EFS mount target in $subnet"
    +    aws efs create-mount-target --file-system-id $file_system_id \
    +      --subnet-id $subnet --security-groups $security_group_id --output text
    +done <<< "$SUBNETS"
    +
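Mount targets can take a minute or two to become available. Optionally, before continuing, you can confirm their state (a quick check, assuming the variables set above):

# All mount targets should eventually report "available".
aws efs describe-mount-targets \
    --file-system-id $file_system_id \
    --query 'MountTargets[].LifeCycleState' \
    --output text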

    +

    2. Install the EFS CSI driver

    +
    export ROLE_NAME=AmazonEKS_EFS_CSI_DriverRole
    +eksctl create iamserviceaccount \
    +    --name efs-csi-controller-sa \
    +    --namespace kube-system \
    +    --cluster ${CLUSTER_NAME} \
    +    --role-name ${ROLE_NAME} \
    +    --role-only \
    +    --attach-policy-arn arn:aws:iam::aws:policy/service-role/AmazonEFSCSIDriverPolicy \
    +    --approve
    +
    +TRUST_POLICY=$(aws iam get-role --role-name ${ROLE_NAME} \
    +    --query 'Role.AssumeRolePolicyDocument' --output json | \
    +    sed -e 's/efs-csi-controller-sa/efs-csi-*/' -e 's/StringEquals/StringLike/')
    +
    +aws iam update-assume-role-policy --role-name ${ROLE_NAME} --policy-document "$TRUST_POLICY"
    +
    +# Get the role ARN
    +EFS_ROLE_ARN=$(aws iam get-role --role-name AmazonEKS_EFS_CSI_DriverRole \
    +  --query 'Role.Arn' --output text)
    +
    +aws eks create-addon --cluster-name $CLUSTER_NAME --addon-name aws-efs-csi-driver \
    +  --service-account-role-arn $EFS_ROLE_ARN
    +
    +

Wait for the EKS add-on to become active. +

    aws eks wait addon-active --cluster-name $CLUSTER_NAME \
    +  --addon-name aws-efs-csi-driver
    +
    +Verify that the EFS CSI driver is running.

    +
    kubectl get daemonset efs-csi-node -n kube-system
    +
    +

Create a StorageClass for EFS dynamic provisioning.

    +
    kubectl apply -f - <<EOF
    +kind: StorageClass
    +apiVersion: storage.k8s.io/v1
    +metadata:
    +  name: efs-sc
    +provisioner: efs.csi.aws.com
    +parameters:
    +  provisioningMode: efs-ap
    +  fileSystemId: "${file_system_id}"
    +  directoryPerms: "700"
    +EOF
    +
    +

Make sure file_system_id is set to the ID of the EFS file system created in the first step; the shell substitutes it into the StorageClass manifest above.
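If in doubt, you can verify the value and the file system's state, and re-apply the StorageClass if needed:

echo $file_system_id

aws efs describe-file-systems \
    --file-system-id $file_system_id \
    --query 'FileSystems[0].LifeCycleState' \
    --output text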

    +

    3. Configure KubeAI with the EFS cache profile

    +

You can skip this step if you've already installed KubeAI using the EKS Helm values file (values-eks.yaml).

    +

    Configure KubeAI with the efs-dynamic cache profile. +

    helm upgrade --install kubeai kubeai/kubeai \
    +  --reuse-values -f - <<EOF
    +cacheProfiles:
    +  efs-dynamic:
    +    sharedFilesystem:
    +      storageClassName: "efs-sc"
    +  efs-static:
    +    sharedFilesystem:
    +      persistentVolumeName: "efs-pv"
    +EOF
    +
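The efs-static profile above expects a pre-existing PersistentVolume named efs-pv, which this guide does not create. If you want to use static provisioning instead of the efs-sc StorageClass, a minimal sketch of such a PersistentVolume (using the standard EFS CSI static-provisioning fields and the file system ID from step 1; adjust capacity and access modes to your needs) could look like this:

kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
  name: efs-pv
spec:
  capacity:
    storage: 100Gi     # required by Kubernetes, not enforced by EFS
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  storageClassName: ""
  csi:
    driver: efs.csi.aws.com
    volumeHandle: ${file_system_id}
EOF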

    +

    4. Configure a model to use the EFS cache

    +

    Apply a Model with cacheProfile set to efs-dynamic.

    +

NOTE: If you already installed the models chart, you will need to edit your values file and run helm upgrade.

    +
    helm install kubeai-models kubeai/models -f - <<EOF
    +catalog:
    +  llama-3.1-8b-instruct-fp8-l4:
    +    enabled: true
    +    cacheProfile: efs-dynamic
    +EOF
    +
    +

    Wait for the Model to be fully cached.

    +
    kubectl wait --timeout 10m --for=jsonpath='{.status.cache.loaded}'=true model/llama-3.1-8b-instruct-fp8-l4
    +
    +

This model will now be loaded from EFS when it is served.

    +
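To inspect the cache status directly (the same field polled by the wait command above):

kubectl get model llama-3.1-8b-instruct-fp8-l4 -o jsonpath='{.status.cache}'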

    Troubleshooting

    +

MountVolume.SetUp failed for volume: deadline exceeded

    +

    kubectl get events may show an error like this: +

    8s          Warning   FailedMount             pod/load-cache-llama-3.1-8b-instruct-fp8-l4-w7thh      MountVolume.SetUp failed for volume "pvc-ceedb563-1e68-47fa-9d12-c697ae153d04" : rpc error: code = DeadlineExceeded desc = context deadline exceeded
    +

    +

    Checking the logs of the EFS CSI DaemonSet may show an error like this: +

    kubectl logs -f efs-csi-node-4n75c -n kube-system
    +Output: Could not start amazon-efs-mount-watchdog, unrecognized init system "aws-efs-csi-dri"
    +Mount attempt 1/3 failed due to timeout after 15 sec, wait 0 sec before next attempt.
    +Mount attempt 2/3 failed due to timeout after 15 sec, wait 0 sec before next attempt.
    +b'mount.nfs4: Connection timed out'
    +

    +

This most likely means your mount targets aren't set up correctly, for example because the security group does not allow NFS traffic from the EKS cluster.

    +
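To narrow this down, it can help to verify that the mount targets are available and that the security group created in step 1 allows inbound NFS (TCP 2049) from the cluster's VPC CIDR, for example:

aws efs describe-mount-targets --file-system-id $file_system_id

aws ec2 describe-security-groups \
    --group-ids $security_group_id \
    --query 'SecurityGroups[0].IpPermissions'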

    Model Loading Job

    +

    Check to see if there is an ongoing model loader Job.

    +
    kubectl get jobs
    +
diff --git a/search/search_index.json b/search/search_index.json
index 044ebf21..e40ea32a 100644
--- a/search/search_index.json
+++ b/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"KubeAI: Private Open AI on Kubernetes","text":"

    Get inferencing running on Kubernetes: LLMs, Embeddings, Speech-to-Text.

    \u2705\ufe0f Drop-in replacement for OpenAI with API compatibility \ud83e\udde0 Serve top OSS models (LLMs, Whisper, etc.) \ud83d\ude80 Multi-platform: CPU-only, GPU, coming soon: TPU \u2696\ufe0f Scale from zero, autoscale based on load \ud83d\udee0\ufe0f Zero dependencies (does not depend on Istio, Knative, etc.) \ud83d\udcac Chat UI included (OpenWebUI) \ud83e\udd16 Operates OSS model servers (vLLM, Ollama, FasterWhisper, Infinity) \u2709 Stream/batch inference via messaging integrations (Kafka, PubSub, etc.)

    Quotes from the community:

    reusable, well abstracted solution to run LLMs - Mike Ensor

    "},{"location":"#architecture","title":"Architecture","text":"

    KubeAI serves an OpenAI compatible HTTP API. Admins can configure ML models via kind: Model Kubernetes Custom Resources. KubeAI can be thought of as a Model Operator (See Operator Pattern) that manages vLLM and Ollama servers.

    "},{"location":"#local-quickstart","title":"Local Quickstart","text":"

    Create a local cluster using kind or minikube.

    TIP: If you are using Podman for kind... Make sure your Podman machine can use up to 6G of memory (by default it is capped at 2G):
    # You might need to stop and remove the existing machine:\npodman machine stop\npodman machine rm\n\n# Init and start a new machine:\npodman machine init --memory 6144 --disk-size 120\npodman machine start\n
    kind create cluster # OR: minikube start\n

    Add the KubeAI Helm repository.

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\n

    Install KubeAI and wait for all components to be ready (may take a minute).

    helm install kubeai kubeai/kubeai --wait --timeout 10m\n

    Install some predefined models.

    cat <<EOF > kubeai-models.yaml\ncatalog:\n  gemma2-2b-cpu:\n    enabled: true\n    minReplicas: 1\n  qwen2-500m-cpu:\n    enabled: true\n  nomic-embed-text-cpu:\n    enabled: true\nEOF\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-models.yaml\n

    Before progressing to the next steps, start a watch on Pods in a standalone terminal to see how KubeAI deploys models.

    kubectl get pods --watch\n
    "},{"location":"#interact-with-gemma2","title":"Interact with Gemma2","text":"

    Because we set minReplicas: 1 for the Gemma model you should see a model Pod already coming up.

    Start a local port-forward to the bundled chat UI.

    kubectl port-forward svc/openwebui 8000:80\n

    Now open your browser to localhost:8000 and select the Gemma model to start chatting with.

    "},{"location":"#scale-up-qwen2-from-zero","title":"Scale up Qwen2 from Zero","text":"

    If you go back to the browser and start a chat with Qwen2, you will notice that it will take a while to respond at first. This is because we set minReplicas: 0 for this model and KubeAI needs to spin up a new Pod (you can verify with kubectl get models -oyaml qwen2-500m-cpu).

    "},{"location":"#documentation","title":"Documentation","text":"

Check out our documentation on kubeai.org to find info on:

    • Installing KubeAI in the cloud
    • How to guides (e.g. how to manage models and resource profiles).
    • Concepts (how the components of KubeAI work).
    • How to contribute
    "},{"location":"#adopters","title":"Adopters","text":"

    List of known adopters:

Name | Description | Link
Telescope | Telescope uses KubeAI for multi-region large scale batch LLM inference. | trytelescope.ai
Google Cloud Distributed Edge | KubeAI is included as a reference architecture for inferencing at the edge. | LinkedIn, GitLab

    If you are using KubeAI and would like to be listed as an adopter, please make a PR.

    "},{"location":"#openai-api-compatibility","title":"OpenAI API Compatibility","text":"
    # Implemented #\n/v1/chat/completions\n/v1/completions\n/v1/embeddings\n/v1/models\n/v1/audio/transcriptions\n\n# Planned #\n# /v1/assistants/*\n# /v1/batches/*\n# /v1/fine_tuning/*\n# /v1/images/*\n# /v1/vector_stores/*\n
    "},{"location":"#immediate-roadmap","title":"Immediate Roadmap","text":"
    • Model caching
    • LoRA finetuning (compatible with OpenAI finetuning API)
    • Image generation (compatible with OpenAI images API)

    NOTE: KubeAI was born out of a project called Lingo which was a simple Kubernetes LLM proxy with basic autoscaling. We relaunched the project as KubeAI (late August 2024) and expanded the roadmap to what it is today.

    \ud83c\udf1f Don't forget to drop us a star on GitHub and follow the repo to stay up to date!

    "},{"location":"#contact","title":"Contact","text":"

    Let us know about features you are interested in seeing or reach out with questions. Visit our Discord channel to join the discussion!

    Or just reach out on LinkedIn if you want to connect:

    • Nick Stogner
    • Sam Stoelinga
    "},{"location":"benchmarks/llama-3.2-11b-vision/","title":"Llama 3.2 11B Vision Instruct vLLM Benchmarks","text":"

    Single L4 GPU vLLM 0.6.2

    python3 benchmark_serving.py --backend openai \\\n    --base-url http://localhost:8000/openai \\\n    --dataset-name=sharegpt --dataset-path=ShareGPT_V3_unfiltered_cleaned_split.json \\\n    --model meta-llama-3.2-11b-vision-instruct \\\n    --seed 12345 --tokenizer neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic\n============ Serving Benchmark Result ============\nSuccessful requests:                     1000\nBenchmark duration (s):                  681.93\nTotal input tokens:                      230969\nTotal generated tokens:                  194523\nRequest throughput (req/s):              1.47\nOutput token throughput (tok/s):         285.25\nTotal Token throughput (tok/s):          623.95\n---------------Time to First Token----------------\nMean TTFT (ms):                          319146.12\nMedian TTFT (ms):                        322707.98\nP99 TTFT (ms):                           642512.79\n-----Time per Output Token (excl. 1st token)------\nMean TPOT (ms):                          54.84\nMedian TPOT (ms):                        53.66\nP99 TPOT (ms):                           83.75\n---------------Inter-token Latency----------------\nMean ITL (ms):                           54.09\nMedian ITL (ms):                         47.44\nP99 ITL (ms):                            216.77\n==================================================\n

    "},{"location":"concepts/autoscaling/","title":"Autoscaling","text":"

KubeAI proxies HTTP and messaging (e.g. Kafka) requests and messages to models. It will adjust the number of Pods serving a given model based on the average number of active requests. If no Pods are running when a request comes in, KubeAI will hold the request, scale up a Pod, and forward the request when the Pod is ready. This process happens in a manner that is transparent to the end client (other than the added delay from a cold-start).

    "},{"location":"concepts/autoscaling/#next","title":"Next","text":"

    Read about how to configure autoscaling.

    "},{"location":"concepts/backend-servers/","title":"Backend Servers","text":"

    KubeAI serves ML models by launching Pods on Kubernetes. The configuration and lifecycle of these Pods are managed by the KubeAI controller. Every model server Pod loads exactly one model on startup.

    In a Model manifest you can define what server to use for inference (VLLM, OLlama). Any model-specific settings can be passed to the server process via the args and env fields.
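For illustration only (my-model, the URL, and the values shown are placeholders; --max-model-len and VLLM_LOGGING_LEVEL are vLLM options), a Model that passes engine settings via args and env might look like this:

apiVersion: kubeai.org/v1\nkind: Model\nmetadata:\n  name: my-model\nspec:\n  features: [\"TextGeneration\"]\n  url: \"hf://me/my-model\"\n  engine: VLLM\n  resourceProfile: nvidia-gpu-l4:1\n  args:\n    - --max-model-len=4096\n  env:\n    VLLM_LOGGING_LEVEL: \"INFO\"\n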

    "},{"location":"concepts/backend-servers/#next","title":"Next","text":"

    Read about how to install models.

    "},{"location":"concepts/resource-profiles/","title":"Resource Profiles","text":"

A resource profile maps a type of compute resource (e.g. NVIDIA L4 GPU) to a collection of Kubernetes settings that are configured on inference server Pods. These profiles are defined in the KubeAI config.yaml file (via a ConfigMap). Each model specifies the resource profile that it requires.

    Kubernetes Model resources specify a resource profile and the count of that resource that they require (for example resourceProfile: nvidia-gpu-l4:2 - 2x L4 GPUs).

    A given profile might need to contain slightly different settings based on the cluster/cloud that KubeAI is deployed in.

    Example: A resource profile named nvidia-gpu-l4 might contain the following node selectors when installing KubeAI on a GKE Kubernetes cluster:

    cloud.google.com/gke-accelerator: \"nvidia-l4\"\ncloud.google.com/gke-spot: \"true\"\n

    and add the following resource requests to the model server Pods:

    nvidia.com/gpu: \"1\"\n

    In addition to node selectors and resource requirements, a resource profile may optionally specify an image name. This name maps to the container image that will be selected when serving a model on that resource.

    "},{"location":"concepts/resource-profiles/#next","title":"Next","text":"

    Read about how to configure resource profiles.

    "},{"location":"concepts/storage-caching/","title":"Storage / Caching","text":"

    With \"Large\" in the name, caching is a critical part of serving LLMs.

The best caching technique may vary depending on your environment:

    • What cloud features are available?
    • Is your cluster deployed in an air-gapped environment?
    "},{"location":"concepts/storage-caching/#a-model-built-into-container","title":"A. Model built into container","text":"

    Status: Supported

    Building a model into a container image can provide a simple way to take advantage of image-related optimizations built into Kubernetes:

    • Relaunching a model server on the same Node that it ran on before will likely be able to reuse the previously pulled image.

    • Secondary boot disks on GKE can be used to avoid needing to pull images.

    • Image streaming on GKE can allow for containers to startup before the entire image is present on the Node.

    • Container images can be pre-installed on Nodes in air-gapped environments (example: k3s airgap installation).

    Guides:

    • How to build models into container images
    "},{"location":"concepts/storage-caching/#b-model-on-shared-filesystem-read-write-many","title":"B. Model on shared filesystem (read-write-many)","text":"

KubeAI can manage model caches on a shared filesystem (e.g. AWS EFS, GCP Filestore, NFS). It manages the full lifecycle of a cached model: loading, serving, and cache eviction (on deletion of the Model).

    "},{"location":"concepts/storage-caching/#c-model-on-read-only-many-disk","title":"C. Model on read-only-many disk","text":"

    Status: Planned.

    Examples: GCP Hyperdisk ML

    "},{"location":"contributing/development-environment/","title":"Development environment","text":"

    This document provides instructions for setting up an environment for developing KubeAI.

    "},{"location":"contributing/development-environment/#optional-cloud-setup","title":"Optional: Cloud Setup","text":""},{"location":"contributing/development-environment/#gcp-pubsub","title":"GCP PubSub","text":"

If you are developing the PubSub messaging integration on GCP, set up test topics and subscriptions and uncomment the .messaging.streams section in ./hack/dev-config.yaml.

    gcloud auth login --update-adc\n\ngcloud pubsub topics create test-kubeai-requests\ngcloud pubsub subscriptions create test-kubeai-requests-sub --topic test-kubeai-requests\ngcloud pubsub topics create test-kubeai-responses\ngcloud pubsub subscriptions create test-kubeai-responses-sub --topic test-kubeai-responses\n
    "},{"location":"contributing/development-environment/#run-in-local-cluster","title":"Run in Local Cluster","text":"
    kind create cluster\n# OR\n#./hack/create-dev-gke-cluster.yaml\n\n# Generate CRDs from Go code.\nmake generate && make manifests\n\n# When CRDs are changed reapply using kubectl:\nkubectl apply -f ./charts/kubeai/charts/crds/crds\n\n# Model with special address annotations:\nkubectl apply -f ./hack/dev-model.yaml\n\n# OPTION A #\n# Run KubeAI inside cluster\n# Change `-f` based on the cluster environment.\nhelm upgrade --install kubeai ./charts/kubeai \\\n    --set openwebui.enabled=true \\\n    --set image.tag=latest \\\n    --set image.pullPolicy=Always \\\n    --set image.repository=us-central1-docker.pkg.dev/substratus-dev/default/kubeai \\\n    --set secrets.huggingface.token=$HUGGING_FACE_HUB_TOKEN \\\n    --set replicaCount=1 -f ./hack/dev-gke-helm-values.yaml\n\n# OPTION B #\n# For quick local interation (run KubeAI outside of cluster)\nkubectl create cm kubeai-autoscaler-state -oyaml --dry-run=client | kubectl apply -f -\nCONFIG_PATH=./hack/dev-config.yaml POD_NAMESPACE=default go run ./cmd/main.go\n\n# In another terminal:\nwhile true; do kubectl port-forward service/dev-model 7000:7000; done\n############\n
    "},{"location":"contributing/development-environment/#running","title":"Running","text":""},{"location":"contributing/development-environment/#completions-api","title":"Completions API","text":"
    # If you are running kubeai in-cluster:\n# kubectl port-forward svc/kubeai 8000:80\n\ncurl http://localhost:8000/openai/v1/completions -H \"Content-Type: application/json\" -d '{\"prompt\": \"Hi\", \"model\": \"dev\"}' -v\n
    "},{"location":"contributing/development-environment/#messaging-integration","title":"Messaging Integration","text":"
    gcloud pubsub topics publish test-kubeai-requests \\                  \n  --message='{\"path\":\"/v1/completions\", \"metadata\":{\"a\":\"b\"}, \"body\": {\"model\": \"dev\", \"prompt\": \"hi\"}}'\n\ngcloud pubsub subscriptions pull test-kubeai-responses-sub --auto-ack\n
    "},{"location":"contributing/documentation/","title":"Documentation","text":"

    We are grateful for anyone who takes the time to improve KubeAI documentation! In order to keep our docs clear and consistent we ask that you first read about the approach to documentation that we have standardized on...

    "},{"location":"contributing/documentation/#read-before-writing","title":"Read before writing!","text":"

    The KubeAI approach to documentation is loosely inspired by the Diataxis method.

    TLDR on how KubeAI docs are organized:

    • Installation: How-to guides specific to installing KubeAI.
    • How To: Directions that guide the reader through a problem or towards a result. How-to guides are goal-oriented. They assume the user is familiar with general concepts, tools, and has already installed KubeAI.
    • Concepts: A reflective explanation of KubeAI topics with a focus on giving the reader an understanding of the why.
    • Tutorials: Learning oriented experiences. Lessons that often guide a user from beginning to end. The goal is to help the reader learn something (compared to a how-to guide that is focused on helping the reader do something).
    • Contributing: The docs in here differ from the rest of the docs by audience: these docs are for anyone who will be contributing code or docs to the KubeAI project.
    "},{"location":"contributing/documentation/#how-to-serve-kubeaiorg-locally","title":"How to serve kubeai.org locally","text":"

    Make sure you have python3 installed and run:

    make docs\n
    "},{"location":"contributing/release-process/","title":"Release Process","text":"

    This document describes the process for releasing a new version of the project.

    "},{"location":"contributing/release-process/#docs","title":"Docs","text":"

    The docs are automatically published whenever a PR updates the docs and the PR is merged into the main branch. The docs are published to the gh-pages branch, which is the source for the Github Pages site.

    "},{"location":"contributing/release-process/#docker-images","title":"Docker images","text":"

    The Docker image latest tag always points to the latest released version. The main tag points to the latest commit on the main branch.

    If you push a tag vX.Y.Z to the repository, the Docker image with the tag vX.Y.Z is built and pushed to Docker Hub. Afterwards, the latest tag is updated to point to the new version.

    "},{"location":"contributing/release-process/#helm-chart","title":"Helm Chart","text":"

    The Helm chart only gets released when a git tag is pushed to the repository with the format helm-v*.

    The appVersion in the Helm chart does not have to point to the latest released version. This allows us to first publish a new version of the Docker image without updating the Helm chart. The Helm chart is updated when we are ready to release a new version.

This is important when a new appVersion isn't compatible with the current Helm chart. In those cases, we can first merge the PR, thoroughly test, release a new container image, and then in a separate PR update the Helm chart and the appVersion.

    "},{"location":"how-to/architect-for-multitenancy/","title":"Architect for Multitenancy","text":"

    KubeAI can support multitenancy by filtering the models that it serves via Kubernetes label selectors. These label selectors can be applied when accessing any of the OpenAI-compatible endpoints through the X-Label-Selector HTTP header and will match on labels specified on the kind: Model objects. The pattern is similar to using a WHERE clause in a SQL query.

    Example Models:

    kind: Model\nmetadata:\n  name: llama-3.2\n  labels:\n    tenancy: public\nspec:\n# ...\n---\nkind: Model\nmetadata:\n  name: custom-private-model\n  labels:\n    tenancy: org-abc\nspec:\n# ...\n

    Example HTTP requests:

    # The returned list of models will be filtered.\ncurl http://$KUBEAI_ENDPOINT/openai/v1/models \\\n    -H \"X-Label-Selector: tenancy in (org-abc, public)\"\n\n# When running inference, if the label selector does not match\n# a 404 will be returned.\ncurl http://$KUBEAI_ENDPOINT/openai/v1/completions \\\n    -H \"Content-Type: application/json\" \\\n    -H \"X-Label-Selector: tenancy in (org-abc, public)\" \\\n    -d '{\"prompt\": \"Hi\", \"model\": \"llama-3.2\"}'\n

    The header value can be any valid Kubernetes label selector. Some examples include:

    X-Label-Selector: tenancy=org-abc\nX-Label-Selector: tenancy in (org-abc, public)\nX-Label-Selector: tenancy!=private\n

    Multiple X-Label-Selector headers can be specified in the same HTTP request and will be treated as a logical AND. For example, the following request will only match Models that have a label tenant: org-abc and user: sam:

    curl http://$KUBEAI_ENDPOINT/openai/v1/completions \\\n    -H \"Content-Type: application/json\" \\\n    -H \"X-Label-Selector: tenant=org-abc\" \\\n    -H \"X-Label-Selector: user=sam\" \\\n    -d '{\"prompt\": \"Hi\", \"model\": \"llama-3.2\"}'\n

    Example architecture:

    "},{"location":"how-to/build-models-into-containers/","title":"Build models into containers","text":"

In this guide we will preload an LLM into a custom-built Ollama serving image. You can follow the same steps for other models and other serving engines.

    Define some values

    export MODEL_URL=ollama://qwen2:0.5b\n\n# Customize with your own image repo.\nexport IMAGE=us-central1-docker.pkg.dev/substratus-dev/default/ollama-builtin-qwen2-05b:latest\n

Build and push the image. Note: building (downloading the base image & model) and pushing (uploading the image & model) can take a while depending on the size of the model.

    git clone https://github.com/substratusai/kubeai\ncd ./kubeai/examples/ollama-builtin\n\ndocker build --build-arg MODEL_URL=$MODEL_URL -t $IMAGE .\ndocker push $IMAGE\n

Create a Model manifest & apply it to a cluster with KubeAI installed. NOTE: The only difference between a Model that uses a built-in image and one that does not is the addition of the image: field.

    kubectl apply -f - << EOF\napiVersion: kubeai.org/v1\nkind: Model\nmetadata:\n  name: builtin-model-example\nspec:\n  features: [\"TextGeneration\"]\n  owner: alibaba\n  image: $IMAGE # <-- The image with model built-in\n  url: \"$MODEL_URL\"\n  engine: OLlama\n  resourceProfile: cpu:1\nEOF\n
    "},{"location":"how-to/cache-models-with-gcp-filestore/","title":"Cache models with GCP Filestore","text":"

    KubeAI can manage model caches. GCP Filestore is supported as a pluggable backend store.

    Follow the GKE install guide.

    Ensure that the Filestore API is enabled.

    gcloud services enable file.googleapis.com\n

    Apply a Model with the cache profile set to standard-filestore (defined in the reference GKE Helm values file).

    TIP: If you want to use `premium-filestore` you will need to ensure you have quota.

    Open the cloud console quotas page: https://console.cloud.google.com/iam-admin/quotas. Make sure your project is selected in the top left.

Ensure that you have at least 2.5 TiB of PremiumStorageGbPerRegion quota in the region where your cluster is deployed.

NOTE: If you already installed the models chart, you will need to edit your values file and run helm upgrade.

    helm install kubeai-models kubeai/models -f - <<EOF\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\n    cacheProfile: standard-filestore\nEOF\n

    Wait for the Model to be fully cached. This may take a while if the Filestore instance needs to be created.

    kubectl wait --timeout 10m --for=jsonpath='{.status.cache.loaded}'=true model/llama-3.1-8b-instruct-fp8-l4\n

    This model will now be loaded from Filestore when it is served.

    "},{"location":"how-to/cache-models-with-gcp-filestore/#troubleshooting","title":"Troubleshooting","text":""},{"location":"how-to/cache-models-with-gcp-filestore/#filestore-csi-driver","title":"Filestore CSI Driver","text":"

Ensure that the Filestore CSI driver is enabled by checking for the existence of the Kubernetes storage classes. If they are not found, follow the GCP guide for enabling the CSI driver.

    kubectl get storageclass standard-rwx premium-rwx\n
    "},{"location":"how-to/cache-models-with-gcp-filestore/#persistentvolumes","title":"PersistentVolumes","text":"

    Check the PersistentVolumeClaim (that should be created by KubeAI).

    kubectl describe pvc shared-model-cache-\n
    Example: Out-of-quota error
      Warning  ProvisioningFailed    11m (x26 over 21m)  filestore.csi.storage.gke.io_gke-50826743a27a4d52bf5b-7fac-9607-vm_b4bdb2ec-b58b-4363-adec-15c270a14066  failed to provision volume with StorageClass \"premium-rwx\": rpc error: code = ResourceExhausted desc = googleapi: Error 429: Quota limit 'PremiumStorageGbPerRegion' has been exceeded. Limit: 0 in region us-central1.\nDetails:\n[\n  {\n    \"@type\": \"type.googleapis.com/google.rpc.QuotaFailure\",\n    \"violations\": [\n      {\n        \"description\": \"Quota 'PremiumStorageGbPerRegion' exhausted. Limit 0 in region us-central1\",\n        \"subject\": \"project:819220466562\"\n      }\n    ]\n  }\n]\n

    Check to see if the PersistentVolume has been fully provisioned.

    kubectl get pv\n# Find name of corresponding pv...\nkubectl describe pv <name>\n
    "},{"location":"how-to/cache-models-with-gcp-filestore/#model-loading-job","title":"Model Loading Job","text":"

    Check to see if there is an ongoing model loader Job.

    kubectl get jobs\n
    "},{"location":"how-to/configure-autoscaling/","title":"Configure autoscaling","text":"

    This guide will cover how to configure KubeAI autoscaling parameters.

    "},{"location":"how-to/configure-autoscaling/#system-settings","title":"System Settings","text":"

    KubeAI administrators can define system-wide autoscaling settings by setting the following Helm values (for the kubeai/kubeai chart):

    Example:

    # helm-values.yaml\nmodelAutoscaling:\n  interval: 15s\n  timeWindow: 10m\n# ...\n
    "},{"location":"how-to/configure-autoscaling/#model-settings","title":"Model Settings","text":"

    The following settings can be configured on a model-by-model basis.

    "},{"location":"how-to/configure-autoscaling/#model-settings-helm","title":"Model settings: helm","text":"

    If you are managing models via the kubeai/models Helm chart, you can use:

    # helm-values.yaml\ncatalog:\n  model-a:\n    # ...\n    minReplicas: 1\n    maxReplicas: 9\n    targetRequests: 250\n    scaleDownDelaySeconds: 45\n  model-b:\n    # ...\n    disableAutoscaling: true\n# ...\n

    Re-running helm upgrade with these additional parameters will update model settings in the cluster.

    "},{"location":"how-to/configure-autoscaling/#model-settings-kubectl","title":"Model settings: kubectl","text":"

    You can also specify the autoscaling profile directly via the Models custom resource in the Kubernetes API:

    apiVersion: kubeai.org/v1\nkind: Model\nmetadata:\n  name: my-model\nspec:\n  # ...\n  minReplicas: 1\n  maxReplicas: 9\n  targetRequests: 250\n  scaleDownDelaySeconds: 45\n

    If you are already managing models using Model manifest files, you can make the update to your file and reapply it using kubectl apply -f <filename>.yaml.

    "},{"location":"how-to/configure-embedding-models/","title":"Configure Embedding Models","text":"

    KubeAI supports the following engines for text embedding models:

    • Infinity
    • vLLM
    • Ollama

    Infinity supports any HuggingFace models listed as text-embedding. See the models, reranking or clip models on huggingface for reference.

    "},{"location":"how-to/configure-embedding-models/#install-baaibge-small-en-v15-model-using-infinity","title":"Install BAAI/bge-small-en-v1.5 model using Infinity","text":"

    Create a file named kubeai-models.yaml with the following content:

    catalog:\n  bge-embed-text-cpu:\n    enabled: true\n    features: [\"TextEmbedding\"]\n    owner: baai\n    url: \"hf://BAAI/bge-small-en-v1.5\"\n    engine: Infinity\n    resourceProfile: cpu:1\n    minReplicas: 1\n

    Apply the kubeai-models helm chart:

    helm install kubeai-models kubeai/models -f ./kubeai-models.yaml\n

    Once the pod is ready, you can use the OpenAI Python SDK to interact with the model:

    from openai import OpenAI\n# Assumes port-forward of kubeai service to localhost:8000.\nclient = OpenAI(api_key=\"ignored\", base_url=\"http://localhost:8000/openai/v1\")\nresponse = client.embeddings.create(\n    input=\"Your text goes here.\",\n    model=\"bge-embed-text-cpu\"\n)\n
    "},{"location":"how-to/configure-resource-profiles/","title":"Configure resource profiles","text":"

    This guide will cover modifying preconfigured resource profiles and adding your own.

    "},{"location":"how-to/configure-resource-profiles/#modifying-preconfigured-resource-profiles","title":"Modifying preconfigured resource profiles","text":"

    The KubeAI helm chart comes with preconfigured resource profiles for common resource types such as NVIDIA L4 GPUs. You can view these profiles in the default helm values file.

These profiles usually require some additional settings based on the cluster/cloud that KubeAI is installed into. You can modify a resource profile by setting custom helm values and running helm install or helm upgrade. For example, if you are installing KubeAI on GKE you will need to set GKE-specific node selectors:

    # helm-values.yaml\nresourceProfiles:\n  nvidia-gpu-l4:\n    nodeSelector:\n      cloud.google.com/gke-accelerator: \"nvidia-l4\"\n      cloud.google.com/gke-spot: \"true\"\n

    NOTE: See the cloud-specific installation guide for a comprehensive list of settings.

    "},{"location":"how-to/configure-resource-profiles/#adding-additional-resource-profiles","title":"Adding additional resource profiles","text":"

    If the preconfigured resource profiles do not meet your needs you can add additional profiles by appending to the .resourceProfiles object in the helm values file you use to install KubeAI.

    # helm-values.yaml\nresourceProfiles:\n  my-custom-gpu:\n    imageName: \"optional-custom-image-name\"\n    nodeSelector:\n      my-custom-node-pool: \"some-value\"\n    limits:\n      custom.com/gpu: \"1\"\n    requests:\n      custom.com/gpu: \"1\"\n      cpu: \"3\"\n      memory: \"12Gi\"\n    runtimeClassName: \"my-custom-runtime-class\"\n

    If you need to run custom model server images on your resource profile, make sure to also add those in the modelServers section:

    # helm-values.yaml\nmodelServers:\n  VLLM:\n    images:\n      optional-custom-image-name: \"my-repo/my-vllm-image:v1.2.3\"\n  OLlama:\n    images:\n      optional-custom-image-name: \"my-repo/my-ollama-image:v1.2.3\"\n
    "},{"location":"how-to/configure-resource-profiles/#next","title":"Next","text":"

    See the guide on how to install models which includes how to configure the resource profile to use for a given model.

    "},{"location":"how-to/configure-speech-to-text/","title":"Configure speech-to-text","text":"

    KubeAI provides a Speech to Text endpoint that can be used to transcribe audio files. This guide will walk you through the steps to enable this feature.

    "},{"location":"how-to/configure-speech-to-text/#enable-speech-to-text-model","title":"Enable Speech to Text model","text":"

    You can create new models by creating a Model CRD object or by enabling a model from the model catalog.

    "},{"location":"how-to/configure-speech-to-text/#enable-from-model-catalog","title":"Enable from model catalog","text":"

    KubeAI provides predefined models in the kubeai/models Helm chart. To enable the Speech to Text model, you can set the enabled flag to true in your values file.

    # models-helm-values.yaml\ncatalog:\n  faster-whisper-medium-en-cpu:\n    enabled: true\n    minReplicas: 1\n
    "},{"location":"how-to/configure-speech-to-text/#enable-by-creating-model-crd","title":"Enable by creating Model CRD","text":"

    You can also create a Model CRD object to enable the Speech to Text model. Here is an example of a Model CRD object for the Speech to Text model:

    apiVersion: kubeai.org/v1\nkind: Model\nmetadata:\n  name: faster-whisper-medium-en-cpu\nspec:\n  features: [SpeechToText]\n  owner: Systran\n  url: hf://Systran/faster-whisper-medium.en\n  engine: FasterWhisper\n  resourceProfile: cpu:1\n
    "},{"location":"how-to/configure-speech-to-text/#usage","title":"Usage","text":"

    The Speech to Text endpoint is available at /openai/v1/transcriptions.

    Example usage using curl:

    curl -L -o kubeai.mp4 https://github.com/user-attachments/assets/711d1279-6af9-4c6c-a052-e59e7730b757\ncurl http://localhost:8000/openai/v1/audio/transcriptions \\\n  -F \"file=@kubeai.mp4\" \\\n  -F \"language=en\" \\\n  -F \"model=faster-whisper-medium-en-cpu\"\n
    "},{"location":"how-to/install-models/","title":"Install models","text":"

    This guide provides instructions on how to configure KubeAI models.

    "},{"location":"how-to/install-models/#installing-models-with-helm","title":"Installing models with helm","text":"

    KubeAI provides a chart that contains preconfigured models.

    "},{"location":"how-to/install-models/#preconfigured-models-with-helm","title":"Preconfigured models with helm","text":"

    When you are defining Helm values for the kubeai/models chart you can install a preconfigured Model by setting enabled: true. You can view a list of all preconfigured models in the chart's default values file.

    # helm-values.yaml\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\n

    You can optionally override preconfigured settings, for example, resourceProfile:

    # helm-values.yaml\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\n    resourceProfile: nvidia-gpu-l4:2 # Require \"2 NVIDIA L4 GPUs\"\n
    "},{"location":"how-to/install-models/#custom-models-with-helm","title":"Custom models with helm","text":"

If you prefer to add a custom model via the same Helm chart you use for preconfigured KubeAI models, you can add your custom model entry into the .catalog array of your existing values file for the kubeai/models Helm chart:

    # helm-values.yaml\ncatalog:\n  my-custom-model-name:\n    enabled: true\n    features: [\"TextEmbedding\"]\n    owner: me\n    url: \"hf://me/my-custom-model\"\n    resourceProfile: CPU:1\n
    "},{"location":"how-to/install-models/#installing-models-with-kubectl","title":"Installing models with kubectl","text":"

    You can add your own model by defining a Model yaml file and applying it using kubectl apply -f model.yaml.

    If you have a running cluster with KubeAI installed you can inspect the schema for a Model using kubectl explain:

    kubectl explain models\nkubectl explain models.spec\nkubectl explain models.spec.engine\n
    "},{"location":"how-to/install-models/#programmatically-installing-models","title":"Programmatically installing models","text":"

    See the examples.

    "},{"location":"how-to/install-models/#feedback-welcome-a-model-management-ui","title":"Feedback welcome: A model management UI","text":"

    We are considering adding a UI for managing models in a running KubeAI instance. Give the GitHub Issue a thumbs up if you would be interested in this feature.

    "},{"location":"installation/eks/","title":"Install on EKS","text":"TIP: Make sure you have enough GPU quota in your AWS account.

    The default quotas for GPU instances are often 0. You will need to request a quota increase for the GPU instances you want to use.

The following quotas may require an increase if you wish to use GPUs in your EKS cluster (you can check your current limits as sketched below):
• All G and VT Spot Instance Requests
• All P5 Spot Instance Requests
• All P4, P3 and P2 Spot Instance Requests
• Running Dedicated p4d Hosts
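A quick way to inspect your current EC2 instance quotas (a sketch using the AWS Service Quotas CLI; adjust the filter to the quota names above):

aws service-quotas list-service-quotas \\\n    --service-code ec2 \\\n    --query 'Quotas[?contains(QuotaName, `Spot Instance Requests`)].[QuotaName,Value]' \\\n    --output table\n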

    "},{"location":"installation/eks/#1-create-eks-cluster-with-karpenter","title":"1. Create EKS cluster with Karpenter","text":"

    Set the environment variables used throughout this guide:

    export CLUSTER_NAME=\"cluster-with-karpenter\"\nexport AWS_DEFAULT_REGION=\"us-west-2\"\nexport K8S_VERSION=\"1.30\"\nexport GPU_AMI_ID=\"$(aws ssm get-parameter --name /aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2-gpu/recommended/image_id --query Parameter.Value --output text)\"\n

    Create the EKS cluster using eksctl:

    eksctl create cluster -f - <<EOF\n---\napiVersion: eksctl.io/v1alpha5\nkind: ClusterConfig\nmetadata:\n  name: \"${CLUSTER_NAME}\"\n  region: \"${AWS_DEFAULT_REGION}\"\n  version: \"${K8S_VERSION}\"\n  tags:\n    karpenter.sh/discovery: \"${CLUSTER_NAME}\" # here, it is set to the cluster name\n\niam:\n  withOIDC: true # required\n\nkarpenter:\n  version: '1.0.6' # Exact version must be specified\n\nmanagedNodeGroups:\n- instanceType: m5.large\n  amiFamily: AmazonLinux2\n  name: \"${CLUSTER_NAME}-m5-ng\"\n  desiredCapacity: 2\n  minSize: 1\n  maxSize: 10\nEOF\n

    "},{"location":"installation/eks/#2-configure-a-karpenter-gpu-nodepool","title":"2. Configure a Karpenter GPU NodePool","text":"

    Create the NodePool and EC2NodeClass objects:

    kubectl apply -f - <<EOF\napiVersion: karpenter.sh/v1\nkind: NodePool\nmetadata:\n  name: gpu\nspec:\n  template:\n    spec:\n      requirements:\n        - key: karpenter.sh/capacity-type\n          operator: In\n          values: [\"spot\", \"on-demand\"]\n        - key: karpenter.k8s.aws/instance-category\n          operator: In\n          values: [\"g\", \"p\"]\n      nodeClassRef:\n        group: karpenter.k8s.aws\n        kind: EC2NodeClass\n        name: gpu\n      expireAfter: 720h # 30 * 24h = 720h\n      taints:\n      - key: nvidia.com/gpu\n        value: \"true\"\n        effect: NoSchedule\n  limits:\n    cpu: 1000\n  disruption:\n    consolidationPolicy: WhenEmptyOrUnderutilized\n    consolidateAfter: 1m\n---\napiVersion: karpenter.k8s.aws/v1\nkind: EC2NodeClass\nmetadata:\n  name: gpu\nspec:\n  amiFamily: AL2 # Amazon Linux 2\n  role: \"eksctl-KarpenterNodeRole-${CLUSTER_NAME}\"\n  subnetSelectorTerms:\n    - tags:\n        karpenter.sh/discovery: \"${CLUSTER_NAME}\" # replace with your cluster name\n  securityGroupSelectorTerms:\n    - tags:\n        karpenter.sh/discovery: \"${CLUSTER_NAME}\" # replace with your cluster name\n  amiSelectorTerms:\n    - id: \"${GPU_AMI_ID}\" # <- GPU Optimized AMD AMI \n  blockDeviceMappings:\n    - deviceName: /dev/xvda\n      ebs:\n        volumeSize: 300Gi\n        volumeType: gp3\n        encrypted: true\nEOF\n

    Install the NVIDIA device plugin (needed for Karpenter nodes):

    kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.16.1/deployments/static/nvidia-device-plugin.yml\n
    "},{"location":"installation/eks/#3-install-kubeai","title":"3. Install KubeAI","text":"

    Add KubeAI Helm repository.

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\n

    Make sure you have a HuggingFace Hub token set in your environment (HUGGING_FACE_HUB_TOKEN).

    export HF_TOKEN=\"replace-with-your-huggingface-token\"\n

    Install KubeAI with Helm.

    curl -L -O https://raw.githubusercontent.com/substratusai/kubeai/refs/heads/main/charts/kubeai/values-eks.yaml\n# Please review the values-eks.yaml file and edit the nodeSelectors if needed.\ncat values-eks.yaml\nhelm upgrade --install kubeai kubeai/kubeai \\\n    -f values-eks.yaml \\\n    --set secrets.huggingface.token=$HF_TOKEN \\\n    --wait\n
    "},{"location":"installation/eks/#3-optionally-configure-models","title":"3. Optionally configure models","text":"

    Optionally install preconfigured models.

    cat <<EOF > kubeai-models.yaml\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\nEOF\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-models.yaml\n
    "},{"location":"installation/gke/","title":"Install on GKE","text":"TIP: Make sure you have enough quota in your GCP project.

    Open the cloud console quotas page: https://console.cloud.google.com/iam-admin/quotas. Make sure your project is selected in the top left.

You will need to verify that you have enough quota for the accelerators you want to use. Below is a table of common quotas you will have to increase depending on your needs.

Quota | Location | Min Value
Preemptible TPU v5 Lite Podslice chips | <your-region> | 8
Preemptible NVIDIA L4 GPUs | <your-region> | 2
GPUs (all regions) | - | 2
CPUs (all regions) | - | 24

    See the following screenshot examples of how these quotas appear in the console:

    "},{"location":"installation/gke/#1-create-a-cluster","title":"1. Create a cluster","text":""},{"location":"installation/gke/#option-gke-autopilot","title":"Option: GKE Autopilot","text":"

    Create an Autopilot cluster (replace us-central1 with a region that you have quota).

    gcloud container clusters create-auto cluster-1 \\\n    --location=us-central1\n
    "},{"location":"installation/gke/#option-gke-standard","title":"Option: GKE Standard","text":"

    TODO: Reference gcloud commands for creating a GKE standard cluster.

    "},{"location":"installation/gke/#2-install-kubeai","title":"2. Install KubeAI","text":"

    Add KubeAI Helm repository.

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\n

    Make sure you have a HuggingFace Hub token set in your environment (HUGGING_FACE_HUB_TOKEN).
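For example (using a placeholder value, mirroring the EKS guide above):

export HUGGING_FACE_HUB_TOKEN=\"replace-with-your-huggingface-token\"\n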

    Install KubeAI with Helm.

    curl -L -O https://raw.githubusercontent.com/substratusai/kubeai/refs/heads/main/charts/kubeai/values-gke.yaml\nhelm upgrade --install kubeai kubeai/kubeai \\\n    -f values-gke.yaml \\\n    --set secrets.huggingface.token=$HUGGING_FACE_HUB_TOKEN \\\n    --wait\n
    "},{"location":"installation/gke/#3-optionally-configure-models","title":"3. Optionally configure models","text":"

    Optionally install preconfigured models.

    cat <<EOF > kubeai-models.yaml\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\nEOF\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-models.yaml\n
    "},{"location":"reference/kubernetes-api/","title":"Kubernetes API","text":""},{"location":"reference/kubernetes-api/#packages","title":"Packages","text":"
    • kubeai.org/v1
    "},{"location":"reference/kubernetes-api/#kubeaiorgv1","title":"kubeai.org/v1","text":"

    Package v1 contains API Schema definitions for the kubeai v1 API group

    "},{"location":"reference/kubernetes-api/#resource-types","title":"Resource Types","text":"
    • Model
    "},{"location":"reference/kubernetes-api/#model","title":"Model","text":"

    Model resources define the ML models that will be served by KubeAI.

    Field Description Default Validation apiVersion string kubeai.org/v1 kind string Model metadata ObjectMeta Refer to Kubernetes API documentation for fields of metadata. spec ModelSpec status ModelStatus"},{"location":"reference/kubernetes-api/#modelfeature","title":"ModelFeature","text":"

    Underlying type: string

    Validation: - Enum: [TextGeneration TextEmbedding SpeechToText]

    Appears in: - ModelSpec

    "},{"location":"reference/kubernetes-api/#modelspec","title":"ModelSpec","text":"

    ModelSpec defines the desired state of Model.

    Appears in: - Model

    Field Description Default Validation url string URL of the model to be served.Currently only the following formats are supported:For VLLM & FasterWhisper engines: \"hf:///\"For OLlama engine: \"ollama:// Required: {} features ModelFeature array Features that the model supports.Dictates the APIs that are available for the model. Enum: [TextGeneration TextEmbedding SpeechToText] engine string Engine to be used for the server process. Enum: [OLlama VLLM FasterWhisper Infinity] Required: {} resourceProfile string ResourceProfile required to serve the model.Use the format \":\".Example: \"nvidia-gpu-l4:2\" - 2x NVIDIA L4 GPUs.Must be a valid ResourceProfile defined in the system config. cacheProfile string CacheProfile to be used for caching model artifacts.Must be a valid CacheProfile defined in the system config. image string Image to be used for the server process.Will be set from ResourceProfile + Engine if not specified. args string array Args to be added to the server process. env object (keys:string, values:string) Env variables to be added to the server process. replicas integer Replicas is the number of Pod replicas that should be activelyserving the model. KubeAI will manage this field unless AutoscalingDisabledis set to true. minReplicas integer MinReplicas is the minimum number of Pod replicas that the model can scale down to.Note: 0 is a valid value. Minimum: 0 Optional: {} maxReplicas integer MaxReplicas is the maximum number of Pod replicas that the model can scale up to.Empty value means no limit. Minimum: 1 autoscalingDisabled boolean AutoscalingDisabled will stop the controller from managing the replicasfor the Model. When disabled, metrics will not be collected on server Pods. targetRequests integer TargetRequests is average number of active requests that the autoscalerwill try to maintain on model server Pods. 100 Minimum: 1 scaleDownDelaySeconds integer ScaleDownDelay is the minimum time before a deployment is scaled down afterthe autoscaling algorithm determines that it should be scaled down. 30 owner string Owner of the model. Used solely to populate the owner field in theOpenAI /v1/models endpoint.DEPRECATED. Optional: {}"},{"location":"reference/kubernetes-api/#modelstatus","title":"ModelStatus","text":"

    ModelStatus defines the observed state of Model.

    Appears in: - Model

    Field Description Default Validation replicas ModelStatusReplicas cache ModelStatusCache"},{"location":"reference/kubernetes-api/#modelstatuscache","title":"ModelStatusCache","text":"

    Appears in: - ModelStatus

    Field Description Default Validation loaded boolean"},{"location":"reference/kubernetes-api/#modelstatusreplicas","title":"ModelStatusReplicas","text":"

    Appears in: - ModelStatus

    Field Description Default Validation all integer ready integer"},{"location":"reference/openai-api-compatibility/","title":"OpenAI API Compatibility","text":"

KubeAI provides an OpenAI API compatibility layer.

    "},{"location":"reference/openai-api-compatibility/#general","title":"General:","text":""},{"location":"reference/openai-api-compatibility/#models","title":"Models","text":"
    GET /v1/models\n
• Lists all kind: Model objects installed in the Kubernetes API Server.
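
For reference, a minimal sketch of calling this endpoint with the official OpenAI Python client (assuming a port-forward of the kubeai Service to localhost:8000, as used elsewhere in these docs):

from openai import OpenAI\n\n# Assumes: kubectl port-forward svc/kubeai 8000:80\nclient = OpenAI(api_key=\"ignored\", base_url=\"http://localhost:8000/openai/v1\")\n\n# Each returned entry corresponds to a kind: Model object in the cluster.\nfor model in client.models.list():\n    print(model.id)\n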
    "},{"location":"reference/openai-api-compatibility/#inference","title":"Inference","text":""},{"location":"reference/openai-api-compatibility/#text-generation","title":"Text Generation","text":"
    POST /v1/chat/completions\nPOST /v1/completions\n
    • Supported for Models with .spec.features: [\"TextGeneration\"].
    "},{"location":"reference/openai-api-compatibility/#embeddings","title":"Embeddings","text":"
    POST /v1/embeddings\n
    • Supported for Models with .spec.features: [\"TextEmbedding\"].
    "},{"location":"reference/openai-api-compatibility/#speech-to-text","title":"Speech-to-Text","text":"
    POST /v1/audio/transcriptions\n
    • Supported for Models with .spec.features: [\"SpeechToText\"].
    "},{"location":"reference/openai-api-compatibility/#openai-client-libaries","title":"OpenAI Client libaries","text":"

    You can use the official OpenAI client libraries by setting the base_url to the KubeAI endpoint.

    For example, you can use the Python client like this:

    from openai import OpenAI\nclient = OpenAI(api_key=\"ignored\",\n                base_url=\"http://kubeai/openai/v1\")\nresponse = client.chat.completions.create(\n  model=\"gemma2-2b-cpu\",\n  messages=[\n    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n    {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n    {\"role\": \"assistant\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n    {\"role\": \"user\", \"content\": \"Where was it played?\"}\n  ]\n)\n

    "},{"location":"tutorials/langchain/","title":"Using LangChain with KubeAI","text":"

    LangChain makes it easy to build applications powered by LLMs. KubeAI makes it easy to deploy and manage LLMs at scale. Together, they make it easy to build and deploy private and secure AI applications.

    In this tutorial, we'll show you how to use LangChain with KubeAI's OpenAI compatible API. The beauty of KubeAI's OpenAI compatibility is that you can use KubeAI with any framework that supports OpenAI.

    "},{"location":"tutorials/langchain/#prerequisites","title":"Prerequisites","text":"

    A K8s cluster. You can use a local cluster like kind.

    "},{"location":"tutorials/langchain/#installing-kubeai-with-gemma-2b","title":"Installing KubeAI with Gemma 2B","text":"

    Run the following command to install KubeAI with Gemma 2B:

helm repo add kubeai https://www.kubeai.org\nhelm repo update\n\ncat <<EOF > models-helm-values.yaml\ncatalog:\n  gemma2-2b-cpu:\n    enabled: true\n    minReplicas: 1\nEOF\n\nhelm install kubeai kubeai/kubeai \\\n    --wait --timeout 10m\n\nhelm install kubeai-models kubeai/models \\\n    -f ./models-helm-values.yaml\n
    "},{"location":"tutorials/langchain/#using-langchain","title":"Using LangChain","text":"

    Install the required Python packages:

    pip install langchain_openai\n

    Let's access the KubeAI OpenAI compatible API locally to make it easier.

    Run the following command to port-forward to the KubeAI service:

    kubectl port-forward svc/kubeai 8000:80\n
    Now the KubeAI OpenAI compatible API is available at http://localhost:8000/openai from your local machine.

    Let's create a simple Python script that uses LangChain and is connected to KubeAI.

    Create a file named test-langchain.py with the following content:

    from langchain_openai import ChatOpenAI\n\nllm = ChatOpenAI(\n    model=\"gemma2-2b-cpu\",\n    temperature=0,\n    max_tokens=None,\n    timeout=None,\n    max_retries=2,\n    api_key=\"thisIsIgnored\",\n    base_url=\"http://localhost:8000/openai/v1\",\n)\n\nmessages = [\n    (\n        \"system\",\n        \"You are a helpful assistant that translates English to French. Translate the user sentence.\",\n    ),\n    (\"human\", \"I love programming.\"),\n]\nai_msg = llm.invoke(messages)\nprint(ai_msg.content)\n

    Run the Python script:

    python test-langchain.py\n

Notice that we set base_url to http://localhost:8000/openai/v1. This tells LangChain to use our local KubeAI OpenAI compatible API instead of the default OpenAI public API.

If you run LangChain within the K8s cluster, you can use the following base_url instead: http://kubeai/openai/v1. So the code would look like this:

    llm = ChatOpenAI(\n    ...\n    base_url=\"http://kubeai/openai/v1\",\n)\n

    That's it! You've successfully used LangChain with KubeAI. Now you can build and deploy private and secure AI applications with ease.

    "},{"location":"tutorials/langtrace/","title":"Deploying KubeAI with Langtrace","text":"

Langtrace is an open source tool that helps you trace and monitor your AI calls. It includes a self-hosted UI that, for example, shows you the estimated costs of your LLM calls.

    KubeAI is used for deploying LLMs with an OpenAI compatible endpoint.

    In this tutorial you will learn how to deploy KubeAI and Langtrace end-to-end. Both KubeAI and Langtrace are installed in your Kubernetes cluster. No cloud services or external dependencies are required.

    If you don't have a K8s cluster yet, you can create one using kind or minikube.

    kind create cluster # OR: minikube start\n

    Install Langtrace:

    helm repo add langtrace https://Scale3-Labs.github.io/langtrace-helm-chart\nhelm repo update\nhelm install langtrace langtrace/langtrace\n

    Install KubeAI and wait for all components to be ready (may take a minute).

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\nhelm install kubeai kubeai/kubeai --wait --timeout 10m\n

    Install the gemma2-2b-cpu model:

    cat <<EOF > kubeai-models.yaml\ncatalog:\n  gemma2-2b-cpu:\n    enabled: true\n    minReplicas: 1\nEOF\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-models.yaml\n

    Create a local Python environment and install dependencies:

    python3 -m venv .venv\nsource .venv/bin/activate\npip install langtrace-python-sdk openai\n

    Expose the KubeAI service to your local port:

    kubectl port-forward service/kubeai 8000:80\n

    Expose the Langtrace service to your local port:

    kubectl port-forward service/langtrace-app 3000:3000\n

A Langtrace API key is required to use the Langtrace SDK. So let's get one by visiting your self-hosted Langtrace UI.

    Open your browser to http://localhost:3000, create a project and get the API keys for your langtrace project.

    In the Python script below, replace langtrace_api_key with your API key.

Create a file named langtrace-example.py with the following content:

# Replace this with your langtrace API key by visiting http://localhost:3000\nlangtrace_api_key=\"f7e003de19b9a628258531c17c264002e985604ca9fa561debcc85c41f357b09\"\n\nfrom langtrace_python_sdk import langtrace\nfrom langtrace_python_sdk.utils.with_root_span import with_langtrace_root_span\n\nfrom openai import OpenAI\n\nlangtrace.init(\n    api_key=langtrace_api_key,\n    api_host=\"http://localhost:3000/api/trace\",\n)\n\nbase_url = \"http://localhost:8000/openai/v1\"\nmodel = \"gemma2-2b-cpu\"\n\n@with_langtrace_root_span()\ndef example():\n    client = OpenAI(base_url=base_url, api_key=\"ignored-by-kubeai\")\n    response = client.chat.completions.create(\n        model=model,\n        messages=[\n            {\n                \"role\": \"system\",\n                \"content\": \"How many states of matter are there?\"\n            }\n        ],\n    )\n    print(response.choices[0].message.content)\n\nexample()\n

    Run the Python script:

    python3 langtrace-example.py\n

    Now you should see the trace in your Langtrace UI. Take a look by visiting http://localhost:3000.

    "},{"location":"tutorials/weaviate/","title":"Weaviate with local autoscaling embedding and generative models","text":"

    Weaviate is a vector search engine that can integrate seamlessly with KubeAI's embedding and generative models. This tutorial demonstrates how to deploy both KubeAI and Weaviate in a Kubernetes cluster, using KubeAI as the OpenAI endpoint for Weaviate.

    Why use KubeAI with Weaviate?

    • Security and privacy: KubeAI runs locally in your Kubernetes cluster, so your data never leaves your infrastructure.
    • Cost savings: KubeAI can run on your existing hardware, reducing the need for paying for embeddings and generative models.

This tutorial uses CPU-only models, so it should work even on your laptop.

As you go through this tutorial, you will learn how to:

    • Deploy KubeAI with embedding and generative models
    • Install Weaviate and connect it to KubeAI
    • Import data into Weaviate
    • Perform semantic search using the embedding model
    • Perform generative search using the generative model
    "},{"location":"tutorials/weaviate/#prerequisites","title":"Prerequisites","text":"

    A Kubernetes cluster. You can use kind or minikube.

    kind create cluster\n
    "},{"location":"tutorials/weaviate/#kubeai-configuration","title":"KubeAI Configuration","text":"

Let's start by deploying KubeAI with the models we want to use. The Nomic embedding model is used instead of text-embedding-ada-002, and Gemma 2 2B is used instead of gpt-3.5-turbo. You could choose bigger models depending on your available hardware.

    Create a file named kubeai-model-values.yaml with the following content:

    catalog:\n  text-embedding-ada-002:\n    enabled: true\n    minReplicas: 1\n    features: [\"TextEmbedding\"]\n    owner: nomic\n    url: \"ollama://nomic-embed-text\"\n    engine: OLlama\n    resourceProfile: cpu:1\n  gpt-3.5-turbo:\n    enabled: true\n    minReplicas: 1\n    features: [\"TextGeneration\"]\n    owner: google\n    url: \"ollama://gemma2:2b\"\n    engine: OLlama\n    resourceProfile: cpu:2\n

    Note: It's important that you name the models as text-embedding-ada-002 and gpt-3.5-turbo as Weaviate expects these names.

    Run the following command to deploy KubeAI and install the configured models:

    helm repo add kubeai https://www.kubeai.org && helm repo update\n\nhelm install kubeai kubeai/kubeai\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-model-values.yaml\n

    "},{"location":"tutorials/weaviate/#weaviate-installation","title":"Weaviate Installation","text":"

    For this tutorial, we will use the Weaviate Helm chart to deploy Weaviate.

    Let's enable the text2vec-openai and generative-openai modules in Weaviate. We will also set the default vectorizer module to text2vec-openai.

    The apiKey is ignored in this case as we are using KubeAI as the OpenAI endpoint.

    Create a file named weaviate-values.yaml with the following content:

    modules:\n  text2vec-openai:\n    enabled: true\n    apiKey: thisIsIgnored\n  generative-openai:\n    enabled: true\n    apiKey: thisIsIgnored\n  default_vectorizer_module: text2vec-openai\nservice:\n  # To prevent Weaviate being exposed publicly\n  type: ClusterIP\n

    Install Weaviate by running the following command:

    helm repo add weaviate https://weaviate.github.io/weaviate-helm && helm repo update\n\nhelm install \\\n  \"weaviate\" \\\n  weaviate/weaviate \\\n  -f weaviate-values.yaml\n

    "},{"location":"tutorials/weaviate/#usage","title":"Usage","text":"

We will be using Python to interact with Weaviate. The two use cases we will cover are: semantic search using the embedding model, and generative search using the generative model.

    "},{"location":"tutorials/weaviate/#connectivity","title":"Connectivity","text":"

The remaining steps require connectivity to the Weaviate service. However, Weaviate is not exposed publicly in this setup, so we set up local port forwards to access the Weaviate services.

Set up local port forwards to the Weaviate services by running:

    kubectl port-forward svc/weaviate 8080:80\nkubectl port-forward svc/weaviate-grpc 50051:50051\n

    "},{"location":"tutorials/weaviate/#weaviate-client-python-setup","title":"Weaviate client Python Setup","text":"

    Create a virtual environment and install the Weaviate client:

    python -m venv .venv\nsource .venv/bin/activate\npip install -U weaviate-client requests\n

    "},{"location":"tutorials/weaviate/#collection-and-data-import","title":"Collection and Data Import","text":"

    Create a file named create-collection.py with the following content:

    import json\nimport weaviate\nimport requests\nfrom weaviate.classes.config import Configure\n\n# This works due to port forward in previous step\nwith weaviate.connect_to_local(port=8080, grpc_port=50051) as client:\n\n    client.collections.create(\n        \"Question\",\n        vectorizer_config=Configure.Vectorizer.text2vec_openai(\n                model=\"text-embedding-ada-002\",\n                base_url=\"http://kubeai/openai\",\n        ),\n        generative_config=Configure.Generative.openai(\n            model=\"gpt-3.5-turbo\",\n            base_url=\"http://kubeai/openai\",\n        ),\n    )\n\n    # import data\n    resp = requests.get('https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json')\n    data = json.loads(resp.text)  # Load data\n\n    question_objs = list()\n    for i, d in enumerate(data):\n        question_objs.append({\n            \"answer\": d[\"Answer\"],\n            \"question\": d[\"Question\"],\n            \"category\": d[\"Category\"],\n        })\n\n    questions = client.collections.get(\"Question\")\n    questions.data.insert_many(question_objs)\n    print(\"Data imported successfully\")\n

Run the script to create a collection that uses KubeAI as the OpenAI endpoint:

    python create-collection.py\n
    You should see a message Data imported successfully.

    The collection is now created and data is imported. The vectors are generated by KubeAI and stored in Weaviate.

    "},{"location":"tutorials/weaviate/#semantic-search","title":"Semantic Search","text":"

    Now let's do semantic search, which uses the embeddings. Create a file named search.py with the following content:

    import weaviate\nfrom weaviate.classes.config import Configure\n\n# This works due to port forward in previous step\nwith weaviate.connect_to_local(port=8080, grpc_port=50051) as client:\n    questions = client.collections.get(\"Question\")\n    response = questions.query.near_text(\n        query=\"biology\",\n        limit=2\n    )\n    print(response.objects[0].properties)  # Inspect the first object\n

    Execute the python script:

    python search.py\n

    You should see the following output:

    {\n  \"answer\": \"DNA\",\n  \"question\": \"In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance\",\n  \"category\": \"SCIENCE\"\n}\n

    "},{"location":"tutorials/weaviate/#generative-search-rag","title":"Generative Search (RAG)","text":"

    Now let's do generative search, which uses the generative model (Text generation LLM). The generative model is run locally and managed by KubeAI.

    Create a file named generate.py with the following content:

    import weaviate\nfrom weaviate.classes.config import Configure\n\n# This works due to port forward in previous step\nwith weaviate.connect_to_local(port=8080, grpc_port=50051) as client:\n    questions = client.collections.get(\"Question\")\n\n    response = questions.generate.near_text(\n        query=\"biology\",\n        limit=2,\n        grouped_task=\"Write a tweet with emojis about these facts.\"\n    )\n\n    print(response.generated)  # Inspect the generated text\n

    Run the python script:

    python generate.py\n

    You should see something similar to this:

    \ud83e\uddec Watson & Crick cracked the code in 1953! \ud83e\udd2f They built a model of DNA, the blueprint of life. \ud83e\uddec \ud83e\udde0 Liver power! \ud83d\udcaa This organ keeps your blood sugar balanced by storing glucose as glycogen. \ud83e\ude78 #ScienceFacts #Biology

    "},{"location":"tutorials/weaviate/#conclusion","title":"Conclusion","text":"

    You've now successfully set up KubeAI with Weaviate for both embedding-based semantic search and generative tasks. You've also learned how to import data, perform searches, and generate content using KubeAI-managed models.

    "}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"KubeAI: Private Open AI on Kubernetes","text":"

    Get inferencing running on Kubernetes: LLMs, Embeddings, Speech-to-Text.

    \u2705\ufe0f Drop-in replacement for OpenAI with API compatibility \ud83e\udde0 Serve top OSS models (LLMs, Whisper, etc.) \ud83d\ude80 Multi-platform: CPU-only, GPU, coming soon: TPU \u2696\ufe0f Scale from zero, autoscale based on load \ud83d\udee0\ufe0f Zero dependencies (does not depend on Istio, Knative, etc.) \ud83d\udcac Chat UI included (OpenWebUI) \ud83e\udd16 Operates OSS model servers (vLLM, Ollama, FasterWhisper, Infinity) \u2709 Stream/batch inference via messaging integrations (Kafka, PubSub, etc.)

    Quotes from the community:

    reusable, well abstracted solution to run LLMs - Mike Ensor

    "},{"location":"#architecture","title":"Architecture","text":"

    KubeAI serves an OpenAI compatible HTTP API. Admins can configure ML models via kind: Model Kubernetes Custom Resources. KubeAI can be thought of as a Model Operator (See Operator Pattern) that manages vLLM and Ollama servers.

    "},{"location":"#local-quickstart","title":"Local Quickstart","text":"

    Create a local cluster using kind or minikube.

    TIP: If you are using Podman for kind... Make sure your Podman machine can use up to 6G of memory (by default it is capped at 2G):
    # You might need to stop and remove the existing machine:\npodman machine stop\npodman machine rm\n\n# Init and start a new machine:\npodman machine init --memory 6144 --disk-size 120\npodman machine start\n
    kind create cluster # OR: minikube start\n

    Add the KubeAI Helm repository.

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\n

    Install KubeAI and wait for all components to be ready (may take a minute).

    helm install kubeai kubeai/kubeai --wait --timeout 10m\n

    Install some predefined models.

    cat <<EOF > kubeai-models.yaml\ncatalog:\n  gemma2-2b-cpu:\n    enabled: true\n    minReplicas: 1\n  qwen2-500m-cpu:\n    enabled: true\n  nomic-embed-text-cpu:\n    enabled: true\nEOF\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-models.yaml\n

    Before progressing to the next steps, start a watch on Pods in a standalone terminal to see how KubeAI deploys models.

    kubectl get pods --watch\n
    "},{"location":"#interact-with-gemma2","title":"Interact with Gemma2","text":"

Because we set minReplicas: 1 for the Gemma model, you should see a model Pod already coming up.

    Start a local port-forward to the bundled chat UI.

    kubectl port-forward svc/openwebui 8000:80\n

    Now open your browser to localhost:8000 and select the Gemma model to start chatting with.

    "},{"location":"#scale-up-qwen2-from-zero","title":"Scale up Qwen2 from Zero","text":"

    If you go back to the browser and start a chat with Qwen2, you will notice that it will take a while to respond at first. This is because we set minReplicas: 0 for this model and KubeAI needs to spin up a new Pod (you can verify with kubectl get models -oyaml qwen2-500m-cpu).

    "},{"location":"#documentation","title":"Documentation","text":"

Check out our documentation on kubeai.org to find info on:

    • Installing KubeAI in the cloud
    • How to guides (e.g. how to manage models and resource profiles).
    • Concepts (how the components of KubeAI work).
    • How to contribute
    "},{"location":"#adopters","title":"Adopters","text":"

    List of known adopters:

    Name Description Link Telescope Telescope uses KubeAI for multi-region large scale batch LLM inference. trytelescope.ai Google Cloud Distributed Edge KubeAI is included as a reference architecture for inferencing at the edge. LinkedIn, GitLab

    If you are using KubeAI and would like to be listed as an adopter, please make a PR.

    "},{"location":"#openai-api-compatibility","title":"OpenAI API Compatibility","text":"
    # Implemented #\n/v1/chat/completions\n/v1/completions\n/v1/embeddings\n/v1/models\n/v1/audio/transcriptions\n\n# Planned #\n# /v1/assistants/*\n# /v1/batches/*\n# /v1/fine_tuning/*\n# /v1/images/*\n# /v1/vector_stores/*\n
    "},{"location":"#immediate-roadmap","title":"Immediate Roadmap","text":"
    • Model caching
    • LoRA finetuning (compatible with OpenAI finetuning API)
    • Image generation (compatible with OpenAI images API)

    NOTE: KubeAI was born out of a project called Lingo which was a simple Kubernetes LLM proxy with basic autoscaling. We relaunched the project as KubeAI (late August 2024) and expanded the roadmap to what it is today.

    \ud83c\udf1f Don't forget to drop us a star on GitHub and follow the repo to stay up to date!

    "},{"location":"#contact","title":"Contact","text":"

    Let us know about features you are interested in seeing or reach out with questions. Visit our Discord channel to join the discussion!

    Or just reach out on LinkedIn if you want to connect:

    • Nick Stogner
    • Sam Stoelinga
    "},{"location":"benchmarks/llama-3.2-11b-vision/","title":"Llama 3.2 11B Vision Instruct vLLM Benchmarks","text":"

    Single L4 GPU vLLM 0.6.2

    python3 benchmark_serving.py --backend openai \\\n    --base-url http://localhost:8000/openai \\\n    --dataset-name=sharegpt --dataset-path=ShareGPT_V3_unfiltered_cleaned_split.json \\\n    --model meta-llama-3.2-11b-vision-instruct \\\n    --seed 12345 --tokenizer neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic\n============ Serving Benchmark Result ============\nSuccessful requests:                     1000\nBenchmark duration (s):                  681.93\nTotal input tokens:                      230969\nTotal generated tokens:                  194523\nRequest throughput (req/s):              1.47\nOutput token throughput (tok/s):         285.25\nTotal Token throughput (tok/s):          623.95\n---------------Time to First Token----------------\nMean TTFT (ms):                          319146.12\nMedian TTFT (ms):                        322707.98\nP99 TTFT (ms):                           642512.79\n-----Time per Output Token (excl. 1st token)------\nMean TPOT (ms):                          54.84\nMedian TPOT (ms):                        53.66\nP99 TPOT (ms):                           83.75\n---------------Inter-token Latency----------------\nMean ITL (ms):                           54.09\nMedian ITL (ms):                         47.44\nP99 ITL (ms):                            216.77\n==================================================\n

    "},{"location":"concepts/autoscaling/","title":"Autoscaling","text":"

KubeAI proxies HTTP and messaging (e.g. Kafka) requests and messages to models. It adjusts the number of Pods serving a given model based on the average number of active requests. If no Pods are running when a request comes in, KubeAI will hold the request, scale up a Pod, and forward the request when the Pod is ready. This process happens in a manner that is transparent to the end client (other than the added delay from a cold-start).
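
As a rough illustration (not the actual controller code), the per-model scaling decision can be sketched as: divide the averaged number of active requests by the model's targetRequests, then clamp the result between minReplicas and maxReplicas.

import math\nfrom typing import Optional\n\n# Simplified sketch of the autoscaling decision. The real controller also\n# averages over a configurable time window and applies scaleDownDelaySeconds.\ndef desired_replicas(avg_active_requests: float, target_requests: int,\n                     min_replicas: int, max_replicas: Optional[int]) -> int:\n    desired = math.ceil(avg_active_requests / target_requests)\n    desired = max(desired, min_replicas)\n    if max_replicas is not None:\n        desired = min(desired, max_replicas)\n    return desired\n\n# Example: 250 active requests with targetRequests=100 -> 3 replicas.\nprint(desired_replicas(250, 100, min_replicas=0, max_replicas=None))\n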

    "},{"location":"concepts/autoscaling/#next","title":"Next","text":"

    Read about how to configure autoscaling.

    "},{"location":"concepts/backend-servers/","title":"Backend Servers","text":"

    KubeAI serves ML models by launching Pods on Kubernetes. The configuration and lifecycle of these Pods are managed by the KubeAI controller. Every model server Pod loads exactly one model on startup.

    In a Model manifest you can define what server to use for inference (VLLM, OLlama). Any model-specific settings can be passed to the server process via the args and env fields.

    "},{"location":"concepts/backend-servers/#next","title":"Next","text":"

    Read about how to install models.

    "},{"location":"concepts/resource-profiles/","title":"Resource Profiles","text":"

A resource profile maps a type of compute resource (e.g. NVIDIA L4 GPU) to a collection of Kubernetes settings that are configured on inference server Pods. These profiles are defined in the KubeAI config.yaml file (via a ConfigMap). Each model specifies the resource profile that it requires.

    Kubernetes Model resources specify a resource profile and the count of that resource that they require (for example resourceProfile: nvidia-gpu-l4:2 - 2x L4 GPUs).

    A given profile might need to contain slightly different settings based on the cluster/cloud that KubeAI is deployed in.

    Example: A resource profile named nvidia-gpu-l4 might contain the following node selectors when installing KubeAI on a GKE Kubernetes cluster:

    cloud.google.com/gke-accelerator: \"nvidia-l4\"\ncloud.google.com/gke-spot: \"true\"\n

    and add the following resource requests to the model server Pods:

    nvidia.com/gpu: \"1\"\n

    In addition to node selectors and resource requirements, a resource profile may optionally specify an image name. This name maps to the container image that will be selected when serving a model on that resource.

    "},{"location":"concepts/resource-profiles/#next","title":"Next","text":"

    Read about how to configure resource profiles.

    "},{"location":"concepts/storage-caching/","title":"Storage / Caching","text":"

    With \"Large\" in the name, caching is a critical part of serving LLMs.

The best caching technique may vary depending on your environment:

    • What cloud features are available?
    • Is your cluster deployed in an air-gapped environment?
    "},{"location":"concepts/storage-caching/#a-model-built-into-container","title":"A. Model built into container","text":"

    Status: Supported

    Building a model into a container image can provide a simple way to take advantage of image-related optimizations built into Kubernetes:

    • Relaunching a model server on the same Node that it ran on before will likely be able to reuse the previously pulled image.

    • Secondary boot disks on GKE can be used to avoid needing to pull images.

    • Image streaming on GKE can allow for containers to startup before the entire image is present on the Node.

    • Container images can be pre-installed on Nodes in air-gapped environments (example: k3s airgap installation).

    Guides:

    • How to build models into container images
    "},{"location":"concepts/storage-caching/#b-model-on-shared-filesystem-read-write-many","title":"B. Model on shared filesystem (read-write-many)","text":"

    KubeAI can manage model caches on a shared filesystem (i.e. AWS EFS, GCP Filestore, NFS). It manages the full lifecycle of a cached model: loading, serving, and cache eviction (on deletion of the Model).

    "},{"location":"concepts/storage-caching/#c-model-on-read-only-many-disk","title":"C. Model on read-only-many disk","text":"

    Status: Planned.

    Examples: GCP Hyperdisk ML

    "},{"location":"contributing/development-environment/","title":"Development environment","text":"

    This document provides instructions for setting up an environment for developing KubeAI.

    "},{"location":"contributing/development-environment/#optional-cloud-setup","title":"Optional: Cloud Setup","text":""},{"location":"contributing/development-environment/#gcp-pubsub","title":"GCP PubSub","text":"

If you are developing the PubSub messaging integration on GCP, set up test topics and subscriptions and uncomment .messaging.streams in ./hack/dev-config.yaml.

    gcloud auth login --update-adc\n\ngcloud pubsub topics create test-kubeai-requests\ngcloud pubsub subscriptions create test-kubeai-requests-sub --topic test-kubeai-requests\ngcloud pubsub topics create test-kubeai-responses\ngcloud pubsub subscriptions create test-kubeai-responses-sub --topic test-kubeai-responses\n
    "},{"location":"contributing/development-environment/#run-in-local-cluster","title":"Run in Local Cluster","text":"
kind create cluster\n# OR\n#./hack/create-dev-gke-cluster.yaml\n\n# Generate CRDs from Go code.\nmake generate && make manifests\n\n# When CRDs are changed reapply using kubectl:\nkubectl apply -f ./charts/kubeai/charts/crds/crds\n\n# Model with special address annotations:\nkubectl apply -f ./hack/dev-model.yaml\n\n# OPTION A #\n# Run KubeAI inside cluster\n# Change `-f` based on the cluster environment.\nhelm upgrade --install kubeai ./charts/kubeai \\\n    --set openwebui.enabled=true \\\n    --set image.tag=latest \\\n    --set image.pullPolicy=Always \\\n    --set image.repository=us-central1-docker.pkg.dev/substratus-dev/default/kubeai \\\n    --set secrets.huggingface.token=$HUGGING_FACE_HUB_TOKEN \\\n    --set replicaCount=1 -f ./hack/dev-gke-helm-values.yaml\n\n# OPTION B #\n# For quick local iteration (run KubeAI outside of cluster)\nkubectl create cm kubeai-autoscaler-state -oyaml --dry-run=client | kubectl apply -f -\nCONFIG_PATH=./hack/dev-config.yaml POD_NAMESPACE=default go run ./cmd/main.go\n\n# In another terminal:\nwhile true; do kubectl port-forward service/dev-model 7000:7000; done\n############\n
    "},{"location":"contributing/development-environment/#running","title":"Running","text":""},{"location":"contributing/development-environment/#completions-api","title":"Completions API","text":"
    # If you are running kubeai in-cluster:\n# kubectl port-forward svc/kubeai 8000:80\n\ncurl http://localhost:8000/openai/v1/completions -H \"Content-Type: application/json\" -d '{\"prompt\": \"Hi\", \"model\": \"dev\"}' -v\n
    "},{"location":"contributing/development-environment/#messaging-integration","title":"Messaging Integration","text":"
gcloud pubsub topics publish test-kubeai-requests \\\n  --message='{\"path\":\"/v1/completions\", \"metadata\":{\"a\":\"b\"}, \"body\": {\"model\": \"dev\", \"prompt\": \"hi\"}}'\n\ngcloud pubsub subscriptions pull test-kubeai-responses-sub --auto-ack\n
    "},{"location":"contributing/documentation/","title":"Documentation","text":"

    We are grateful for anyone who takes the time to improve KubeAI documentation! In order to keep our docs clear and consistent we ask that you first read about the approach to documentation that we have standardized on...

    "},{"location":"contributing/documentation/#read-before-writing","title":"Read before writing!","text":"

    The KubeAI approach to documentation is loosely inspired by the Diataxis method.

    TLDR on how KubeAI docs are organized:

    • Installation: How-to guides specific to installing KubeAI.
    • How To: Directions that guide the reader through a problem or towards a result. How-to guides are goal-oriented. They assume the user is familiar with general concepts, tools, and has already installed KubeAI.
    • Concepts: A reflective explanation of KubeAI topics with a focus on giving the reader an understanding of the why.
    • Tutorials: Learning oriented experiences. Lessons that often guide a user from beginning to end. The goal is to help the reader learn something (compared to a how-to guide that is focused on helping the reader do something).
    • Contributing: The docs in here differ from the rest of the docs by audience: these docs are for anyone who will be contributing code or docs to the KubeAI project.
    "},{"location":"contributing/documentation/#how-to-serve-kubeaiorg-locally","title":"How to serve kubeai.org locally","text":"

    Make sure you have python3 installed and run:

    make docs\n
    "},{"location":"contributing/release-process/","title":"Release Process","text":"

    This document describes the process for releasing a new version of the project.

    "},{"location":"contributing/release-process/#docs","title":"Docs","text":"

    The docs are automatically published whenever a PR updates the docs and the PR is merged into the main branch. The docs are published to the gh-pages branch, which is the source for the Github Pages site.

    "},{"location":"contributing/release-process/#docker-images","title":"Docker images","text":"

    The Docker image latest tag always points to the latest released version. The main tag points to the latest commit on the main branch.

    If you push a tag vX.Y.Z to the repository, the Docker image with the tag vX.Y.Z is built and pushed to Docker Hub. Afterwards, the latest tag is updated to point to the new version.

    "},{"location":"contributing/release-process/#helm-chart","title":"Helm Chart","text":"

    The Helm chart only gets released when a git tag is pushed to the repository with the format helm-v*.

    The appVersion in the Helm chart does not have to point to the latest released version. This allows us to first publish a new version of the Docker image without updating the Helm chart. The Helm chart is updated when we are ready to release a new version.

This is important when a new appVersion isn't compatible with the current Helm chart. In those cases, we can first merge the PR, thoroughly test, release a new container image, and then in a separate PR update the Helm chart and the appVersion.

    "},{"location":"how-to/architect-for-multitenancy/","title":"Architect for Multitenancy","text":"

    KubeAI can support multitenancy by filtering the models that it serves via Kubernetes label selectors. These label selectors can be applied when accessing any of the OpenAI-compatible endpoints through the X-Label-Selector HTTP header and will match on labels specified on the kind: Model objects. The pattern is similar to using a WHERE clause in a SQL query.

    Example Models:

    kind: Model\nmetadata:\n  name: llama-3.2\n  labels:\n    tenancy: public\nspec:\n# ...\n---\nkind: Model\nmetadata:\n  name: custom-private-model\n  labels:\n    tenancy: org-abc\nspec:\n# ...\n

    Example HTTP requests:

    # The returned list of models will be filtered.\ncurl http://$KUBEAI_ENDPOINT/openai/v1/models \\\n    -H \"X-Label-Selector: tenancy in (org-abc, public)\"\n\n# When running inference, if the label selector does not match\n# a 404 will be returned.\ncurl http://$KUBEAI_ENDPOINT/openai/v1/completions \\\n    -H \"Content-Type: application/json\" \\\n    -H \"X-Label-Selector: tenancy in (org-abc, public)\" \\\n    -d '{\"prompt\": \"Hi\", \"model\": \"llama-3.2\"}'\n

    The header value can be any valid Kubernetes label selector. Some examples include:

    X-Label-Selector: tenancy=org-abc\nX-Label-Selector: tenancy in (org-abc, public)\nX-Label-Selector: tenancy!=private\n

    Multiple X-Label-Selector headers can be specified in the same HTTP request and will be treated as a logical AND. For example, the following request will only match Models that have a label tenant: org-abc and user: sam:

    curl http://$KUBEAI_ENDPOINT/openai/v1/completions \\\n    -H \"Content-Type: application/json\" \\\n    -H \"X-Label-Selector: tenant=org-abc\" \\\n    -H \"X-Label-Selector: user=sam\" \\\n    -d '{\"prompt\": \"Hi\", \"model\": \"llama-3.2\"}'\n
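
The same filtering works from the OpenAI client libraries by attaching the header to every request. A minimal sketch using the Python client's default_headers option (endpoint and selector values are illustrative):

from openai import OpenAI\n\n# The X-Label-Selector header is sent with every request made by this client.\nclient = OpenAI(\n    api_key=\"ignored\",\n    base_url=\"http://kubeai/openai/v1\",\n    default_headers={\"X-Label-Selector\": \"tenancy in (org-abc, public)\"},\n)\n\n# Only Models matching the selector will be listed or served.\nprint([m.id for m in client.models.list()])\n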

    Example architecture:

    "},{"location":"how-to/build-models-into-containers/","title":"Build models into containers","text":"

In this guide we will preload an LLM into a custom-built Ollama serving image. You can follow the same steps for other models and other serving engines.

    Define some values

    export MODEL_URL=ollama://qwen2:0.5b\n\n# Customize with your own image repo.\nexport IMAGE=us-central1-docker.pkg.dev/substratus-dev/default/ollama-builtin-qwen2-05b:latest\n

Build and push the image. Note: building (downloading the base image & model) and pushing (uploading the image & model) can take a while depending on the size of the model.

    git clone https://github.com/substratusai/kubeai\ncd ./kubeai/examples/ollama-builtin\n\ndocker build --build-arg MODEL_URL=$MODEL_URL -t $IMAGE .\ndocker push $IMAGE\n

    Create a model manifest & apply into a cluster with KubeAI installed. NOTE: The only difference between an built-in model image and otherwise is the addition of the image: field.

    kubectl apply -f - << EOF\napiVersion: kubeai.org/v1\nkind: Model\nmetadata:\n  name: builtin-model-example\nspec:\n  features: [\"TextGeneration\"]\n  owner: alibaba\n  image: $IMAGE # <-- The image with model built-in\n  url: \"$MODEL_URL\"\n  engine: OLlama\n  resourceProfile: cpu:1\nEOF\n
    "},{"location":"how-to/cache-models-with-aws-efs/","title":"Cache models with AWS EFS","text":"

    KubeAI can manage model caches. AWS EFS is supported as a pluggable backend store.

    Follow the EKS install guide.

    "},{"location":"how-to/cache-models-with-aws-efs/#1-create-an-efs-file-system","title":"1. Create an EFS File System","text":"

    Set environment variables to match your environment.

    export CLUSTER_NAME=\"cluster-with-karpenter\"\nexport CLUSTER_REGION=\"us-west-2\"\n

    Create an EFS file system in the same VPC as your EKS cluster.

    vpc_id=$(aws eks describe-cluster \\\n    --name $CLUSTER_NAME \\\n    --query \"cluster.resourcesVpcConfig.vpcId\" \\\n    --output text)\n\ncidr_range=$(aws ec2 describe-vpcs \\\n    --vpc-ids $vpc_id \\\n    --query \"Vpcs[].CidrBlock\" \\\n    --output text \\\n    --region ${CLUSTER_REGION})\n\nsecurity_group_id=$(aws ec2 create-security-group \\\n    --group-name MyEfsSecurityGroup \\\n    --description \"My EFS security group\" \\\n    --vpc-id $vpc_id \\\n    --output text)\n\naws ec2 authorize-security-group-ingress \\\n    --group-id $security_group_id \\\n    --protocol tcp \\\n    --port 2049 \\\n    --cidr $cidr_range\n\nfile_system_id=$(aws efs create-file-system \\\n    --region ${CLUSTER_REGION} \\\n    --performance-mode generalPurpose \\\n    --query 'FileSystemId' \\\n    --output text)\n

    Expose the EFS file system to the subnets used by your EKS cluster.

    SUBNETS=$(eksctl get cluster --region us-west-2 ${CLUSTER_NAME} -o json | jq -r '.[0].ResourcesVpcConfig.SubnetIds[]')\n\nwhile IFS= read -r subnet; do\n    echo \"Creating EFS mount target in $subnet\"\n    aws efs create-mount-target --file-system-id $file_system_id \\\n      --subnet-id $subnet --security-groups $security_group_id --output text\ndone <<< \"$SUBNETS\"\n

    "},{"location":"how-to/cache-models-with-aws-efs/#2-install-the-efs-csi-driver","title":"2. Install the EFS CSI driver","text":"
    export ROLE_NAME=AmazonEKS_EFS_CSI_DriverRole\neksctl create iamserviceaccount \\\n    --name efs-csi-controller-sa \\\n    --namespace kube-system \\\n    --cluster ${CLUSTER_NAME} \\\n    --role-name ${ROLE_NAME} \\\n    --role-only \\\n    --attach-policy-arn arn:aws:iam::aws:policy/service-role/AmazonEFSCSIDriverPolicy \\\n    --approve\n\nTRUST_POLICY=$(aws iam get-role --role-name ${ROLE_NAME} \\\n    --query 'Role.AssumeRolePolicyDocument' --output json | \\\n    sed -e 's/efs-csi-controller-sa/efs-csi-*/' -e 's/StringEquals/StringLike/')\n\naws iam update-assume-role-policy --role-name ${ROLE_NAME} --policy-document \"$TRUST_POLICY\"\n\n# Get the role ARN\nEFS_ROLE_ARN=$(aws iam get-role --role-name AmazonEKS_EFS_CSI_DriverRole \\\n  --query 'Role.Arn' --output text)\n\naws eks create-addon --cluster-name $CLUSTER_NAME --addon-name aws-efs-csi-driver \\\n  --service-account-role-arn $EFS_ROLE_ARN\n

Wait for the EKS add-on to become active.

    aws eks wait addon-active --cluster-name $CLUSTER_NAME \\\n  --addon-name aws-efs-csi-driver\n
    Verify that the EFS CSI driver is running.

    kubectl get daemonset efs-csi-node -n kube-system\n

Create a storage class that uses EFS dynamic provisioning mode.

    kubectl apply -f - <<EOF\nkind: StorageClass\napiVersion: storage.k8s.io/v1\nmetadata:\n  name: efs-sc\nprovisioner: efs.csi.aws.com\nparameters:\n  provisioningMode: efs-ap\n  fileSystemId: \"${file_system_id}\"\n  directoryPerms: \"700\"\nEOF\n

Make sure file_system_id matches the EFS file system ID created in the first step.

    "},{"location":"how-to/cache-models-with-aws-efs/#3-configure-kubeai-with-the-efs-cache-profile","title":"3. Configure KubeAI with the EFS cache profile","text":"

You can skip this step if you've already installed KubeAI using the EKS Helm values file (values-eks.yaml).

    Configure KubeAI with the efs-dynamic cache profile.

    helm upgrade --install kubeai kubeai/kubeai \\\n  --reuse-values -f - <<EOF\ncacheProfiles:\n  efs-dynamic:\n    sharedFilesystem:\n      storageClassName: \"efs-sc\"\n  efs-static:\n    sharedFilesystem:\n      persistentVolumeName: \"efs-pv\"\nEOF\n

    "},{"location":"how-to/cache-models-with-aws-efs/#4-configure-a-model-to-use-the-efs-cache","title":"4. Configure a model to use the EFS cache","text":"

    Apply a Model with cacheProfile set to efs-dynamic.

NOTE: If you already installed the models chart, you will need to edit your values file and run helm upgrade.

    helm install kubeai-models kubeai/models -f - <<EOF\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\n    cacheProfile: efs-dynamic\nEOF\n

    Wait for the Model to be fully cached.

    kubectl wait --timeout 10m --for=jsonpath='{.status.cache.loaded}'=true model/llama-3.1-8b-instruct-fp8-l4\n

This model will now be loaded from EFS when it is served.

    "},{"location":"how-to/cache-models-with-aws-efs/#troubleshooting","title":"Troubleshooting","text":""},{"location":"how-to/cache-models-with-aws-efs/#mountvolesetup-failed-for-volume-pvc-deadline-exceeded","title":"MountVole.SetUp failed for volume pvc deadline exceeded","text":"

    kubectl get events may show an error like this:

    8s          Warning   FailedMount             pod/load-cache-llama-3.1-8b-instruct-fp8-l4-w7thh      MountVolume.SetUp failed for volume \"pvc-ceedb563-1e68-47fa-9d12-c697ae153d04\" : rpc error: code = DeadlineExceeded desc = context deadline exceeded\n

    Checking the logs of the EFS CSI DaemonSet may show an error like this:

    kubectl logs -f efs-csi-node-4n75c -n kube-system\nOutput: Could not start amazon-efs-mount-watchdog, unrecognized init system \"aws-efs-csi-dri\"\nMount attempt 1/3 failed due to timeout after 15 sec, wait 0 sec before next attempt.\nMount attempt 2/3 failed due to timeout after 15 sec, wait 0 sec before next attempt.\nb'mount.nfs4: Connection timed out'\n

This likely means your mount target isn't set up correctly. Possibly the security group is not allowing traffic from the EKS cluster.

    "},{"location":"how-to/cache-models-with-aws-efs/#model-loading-job","title":"Model Loading Job","text":"

    Check to see if there is an ongoing model loader Job.

    kubectl get jobs\n
    "},{"location":"how-to/cache-models-with-gcp-filestore/","title":"Cache models with GCP Filestore","text":"

    KubeAI can manage model caches. GCP Filestore is supported as a pluggable backend store.

    Follow the GKE install guide.

    Ensure that the Filestore API is enabled.

    gcloud services enable file.googleapis.com\n

    Apply a Model with the cache profile set to standard-filestore (defined in the reference GKE Helm values file).

    TIP: If you want to use `premium-filestore` you will need to ensure you have quota.

    Open the cloud console quotas page: https://console.cloud.google.com/iam-admin/quotas. Make sure your project is selected in the top left.

Ensure that you have at least 2.5 TiB of PremiumStorageGbPerRegion quota in the region where your cluster is deployed.

NOTE: If you already installed the models chart, you will need to edit your values file and run helm upgrade.

    helm install kubeai-models kubeai/models -f - <<EOF\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\n    cacheProfile: standard-filestore\nEOF\n

    Wait for the Model to be fully cached. This may take a while if the Filestore instance needs to be created.

    kubectl wait --timeout 10m --for=jsonpath='{.status.cache.loaded}'=true model/llama-3.1-8b-instruct-fp8-l4\n

    This model will now be loaded from Filestore when it is served.

    "},{"location":"how-to/cache-models-with-gcp-filestore/#troubleshooting","title":"Troubleshooting","text":""},{"location":"how-to/cache-models-with-gcp-filestore/#filestore-csi-driver","title":"Filestore CSI Driver","text":"

Ensure that the Filestore CSI driver is enabled by checking for the existence of its Kubernetes storage classes. If they are not found, follow the GCP guide for enabling the CSI driver.

    kubectl get storageclass standard-rwx premium-rwx\n
    "},{"location":"how-to/cache-models-with-gcp-filestore/#persistentvolumes","title":"PersistentVolumes","text":"

    Check the PersistentVolumeClaim (that should be created by KubeAI).

    kubectl describe pvc shared-model-cache-\n
    Example: Out-of-quota error
      Warning  ProvisioningFailed    11m (x26 over 21m)  filestore.csi.storage.gke.io_gke-50826743a27a4d52bf5b-7fac-9607-vm_b4bdb2ec-b58b-4363-adec-15c270a14066  failed to provision volume with StorageClass \"premium-rwx\": rpc error: code = ResourceExhausted desc = googleapi: Error 429: Quota limit 'PremiumStorageGbPerRegion' has been exceeded. Limit: 0 in region us-central1.\nDetails:\n[\n  {\n    \"@type\": \"type.googleapis.com/google.rpc.QuotaFailure\",\n    \"violations\": [\n      {\n        \"description\": \"Quota 'PremiumStorageGbPerRegion' exhausted. Limit 0 in region us-central1\",\n        \"subject\": \"project:819220466562\"\n      }\n    ]\n  }\n]\n

    Check to see if the PersistentVolume has been fully provisioned.

    kubectl get pv\n# Find name of corresponding pv...\nkubectl describe pv <name>\n
    "},{"location":"how-to/cache-models-with-gcp-filestore/#model-loading-job","title":"Model Loading Job","text":"

    Check to see if there is an ongoing model loader Job.

    kubectl get jobs\n
    "},{"location":"how-to/configure-autoscaling/","title":"Configure autoscaling","text":"

    This guide will cover how to configure KubeAI autoscaling parameters.

    "},{"location":"how-to/configure-autoscaling/#system-settings","title":"System Settings","text":"

    KubeAI administrators can define system-wide autoscaling settings by setting the following Helm values (for the kubeai/kubeai chart):

    Example:

    # helm-values.yaml\nmodelAutoscaling:\n  interval: 15s\n  timeWindow: 10m\n# ...\n
    "},{"location":"how-to/configure-autoscaling/#model-settings","title":"Model Settings","text":"

    The following settings can be configured on a model-by-model basis.

    "},{"location":"how-to/configure-autoscaling/#model-settings-helm","title":"Model settings: helm","text":"

    If you are managing models via the kubeai/models Helm chart, you can use:

    # helm-values.yaml\ncatalog:\n  model-a:\n    # ...\n    minReplicas: 1\n    maxReplicas: 9\n    targetRequests: 250\n    scaleDownDelaySeconds: 45\n  model-b:\n    # ...\n    disableAutoscaling: true\n# ...\n

    Re-running helm upgrade with these additional parameters will update model settings in the cluster.

    "},{"location":"how-to/configure-autoscaling/#model-settings-kubectl","title":"Model settings: kubectl","text":"

    You can also specify the autoscaling profile directly via the Models custom resource in the Kubernetes API:

    apiVersion: kubeai.org/v1\nkind: Model\nmetadata:\n  name: my-model\nspec:\n  # ...\n  minReplicas: 1\n  maxReplicas: 9\n  targetRequests: 250\n  scaleDownDelaySeconds: 45\n

    If you are already managing models using Model manifest files, you can make the update to your file and reapply it using kubectl apply -f <filename>.yaml.

    "},{"location":"how-to/configure-embedding-models/","title":"Configure Embedding Models","text":"

    KubeAI supports the following engines for text embedding models:

    • Infinity
    • vLLM
    • Ollama

Infinity supports any HuggingFace models listed as text-embedding. See the embedding, reranking, or CLIP models on Hugging Face for reference.

    "},{"location":"how-to/configure-embedding-models/#install-baaibge-small-en-v15-model-using-infinity","title":"Install BAAI/bge-small-en-v1.5 model using Infinity","text":"

    Create a file named kubeai-models.yaml with the following content:

    catalog:\n  bge-embed-text-cpu:\n    enabled: true\n    features: [\"TextEmbedding\"]\n    owner: baai\n    url: \"hf://BAAI/bge-small-en-v1.5\"\n    engine: Infinity\n    resourceProfile: cpu:1\n    minReplicas: 1\n

    Apply the kubeai-models helm chart:

    helm install kubeai-models kubeai/models -f ./kubeai-models.yaml\n

    Once the pod is ready, you can use the OpenAI Python SDK to interact with the model:

    from openai import OpenAI\n# Assumes port-forward of kubeai service to localhost:8000.\nclient = OpenAI(api_key=\"ignored\", base_url=\"http://localhost:8000/openai/v1\")\nresponse = client.embeddings.create(\n    input=\"Your text goes here.\",\n    model=\"bge-embed-text-cpu\"\n)\n
    "},{"location":"how-to/configure-resource-profiles/","title":"Configure resource profiles","text":"

    This guide will cover modifying preconfigured resource profiles and adding your own.

    "},{"location":"how-to/configure-resource-profiles/#modifying-preconfigured-resource-profiles","title":"Modifying preconfigured resource profiles","text":"

    The KubeAI helm chart comes with preconfigured resource profiles for common resource types such as NVIDIA L4 GPUs. You can view these profiles in the default helm values file.

These profiles usually require some additional settings based on the cluster/cloud that KubeAI is installed into. You can modify a resource profile by setting custom helm values and running helm install or helm upgrade. For example, if you are installing KubeAI on GKE you will need to set GKE-specific node selectors:

    # helm-values.yaml\nresourceProfiles:\n  nvidia-gpu-l4:\n    nodeSelector:\n      cloud.google.com/gke-accelerator: \"nvidia-l4\"\n      cloud.google.com/gke-spot: \"true\"\n

    NOTE: See the cloud-specific installation guide for a comprehensive list of settings.

    "},{"location":"how-to/configure-resource-profiles/#adding-additional-resource-profiles","title":"Adding additional resource profiles","text":"

    If the preconfigured resource profiles do not meet your needs you can add additional profiles by appending to the .resourceProfiles object in the helm values file you use to install KubeAI.

    # helm-values.yaml\nresourceProfiles:\n  my-custom-gpu:\n    imageName: \"optional-custom-image-name\"\n    nodeSelector:\n      my-custom-node-pool: \"some-value\"\n    limits:\n      custom.com/gpu: \"1\"\n    requests:\n      custom.com/gpu: \"1\"\n      cpu: \"3\"\n      memory: \"12Gi\"\n    runtimeClassName: \"my-custom-runtime-class\"\n

    If you need to run custom model server images on your resource profile, make sure to also add those in the modelServers section:

    # helm-values.yaml\nmodelServers:\n  VLLM:\n    images:\n      optional-custom-image-name: \"my-repo/my-vllm-image:v1.2.3\"\n  OLlama:\n    images:\n      optional-custom-image-name: \"my-repo/my-ollama-image:v1.2.3\"\n
    "},{"location":"how-to/configure-resource-profiles/#next","title":"Next","text":"

    See the guide on how to install models which includes how to configure the resource profile to use for a given model.

    "},{"location":"how-to/configure-speech-to-text/","title":"Configure speech-to-text","text":"

    KubeAI provides a Speech to Text endpoint that can be used to transcribe audio files. This guide will walk you through the steps to enable this feature.

    "},{"location":"how-to/configure-speech-to-text/#enable-speech-to-text-model","title":"Enable Speech to Text model","text":"

    You can create new models by creating a Model CRD object or by enabling a model from the model catalog.

    "},{"location":"how-to/configure-speech-to-text/#enable-from-model-catalog","title":"Enable from model catalog","text":"

    KubeAI provides predefined models in the kubeai/models Helm chart. To enable the Speech to Text model, you can set the enabled flag to true in your values file.

    # models-helm-values.yaml\ncatalog:\n  faster-whisper-medium-en-cpu:\n    enabled: true\n    minReplicas: 1\n
    "},{"location":"how-to/configure-speech-to-text/#enable-by-creating-model-crd","title":"Enable by creating Model CRD","text":"

    You can also create a Model CRD object to enable the Speech to Text model. Here is an example of a Model CRD object for the Speech to Text model:

    apiVersion: kubeai.org/v1\nkind: Model\nmetadata:\n  name: faster-whisper-medium-en-cpu\nspec:\n  features: [SpeechToText]\n  owner: Systran\n  url: hf://Systran/faster-whisper-medium.en\n  engine: FasterWhisper\n  resourceProfile: cpu:1\n
    "},{"location":"how-to/configure-speech-to-text/#usage","title":"Usage","text":"

The Speech to Text endpoint is available at /openai/v1/audio/transcriptions.

Example usage with curl:

    curl -L -o kubeai.mp4 https://github.com/user-attachments/assets/711d1279-6af9-4c6c-a052-e59e7730b757\ncurl http://localhost:8000/openai/v1/audio/transcriptions \\\n  -F \"file=@kubeai.mp4\" \\\n  -F \"language=en\" \\\n  -F \"model=faster-whisper-medium-en-cpu\"\n
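
The same request can also be made with the OpenAI Python SDK; a minimal sketch assuming the same port-forward and the kubeai.mp4 file downloaded above:

from openai import OpenAI\n\n# Assumes: kubectl port-forward svc/kubeai 8000:80\nclient = OpenAI(api_key=\"ignored\", base_url=\"http://localhost:8000/openai/v1\")\n\nwith open(\"kubeai.mp4\", \"rb\") as f:\n    transcript = client.audio.transcriptions.create(\n        model=\"faster-whisper-medium-en-cpu\",\n        file=f,\n        language=\"en\",\n    )\nprint(transcript.text)\n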
    "},{"location":"how-to/install-models/","title":"Install models","text":"

    This guide provides instructions on how to configure KubeAI models.

    "},{"location":"how-to/install-models/#installing-models-with-helm","title":"Installing models with helm","text":"

    KubeAI provides a chart that contains preconfigured models.

    "},{"location":"how-to/install-models/#preconfigured-models-with-helm","title":"Preconfigured models with helm","text":"

    When you are defining Helm values for the kubeai/models chart you can install a preconfigured Model by setting enabled: true. You can view a list of all preconfigured models in the chart's default values file.

    # helm-values.yaml\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\n

    You can optionally override preconfigured settings, for example, resourceProfile:

    # helm-values.yaml\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\n    resourceProfile: nvidia-gpu-l4:2 # Require \"2 NVIDIA L4 GPUs\"\n
    "},{"location":"how-to/install-models/#custom-models-with-helm","title":"Custom models with helm","text":"

If you prefer to add a custom model via the same Helm chart you use to install KubeAI models, you can add your custom model entry into the .catalog array of your existing values file for the kubeai/models Helm chart:

    # helm-values.yaml\ncatalog:\n  my-custom-model-name:\n    enabled: true\n    features: [\"TextEmbedding\"]\n    owner: me\n    url: \"hf://me/my-custom-model\"\n    resourceProfile: CPU:1\n
    "},{"location":"how-to/install-models/#installing-models-with-kubectl","title":"Installing models with kubectl","text":"

    You can add your own model by defining a Model yaml file and applying it using kubectl apply -f model.yaml.

    If you have a running cluster with KubeAI installed you can inspect the schema for a Model using kubectl explain:

    kubectl explain models\nkubectl explain models.spec\nkubectl explain models.spec.engine\n
    "},{"location":"how-to/install-models/#programmatically-installing-models","title":"Programmatically installing models","text":"

    See the examples.
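
For example, here is a hedged sketch that creates a Model custom resource with the official Kubernetes Python client; the model name and spec values are illustrative and can be swapped for any fields shown in the Kubernetes API reference:

from kubernetes import client, config\n\nconfig.load_kube_config()  # or config.load_incluster_config() inside a Pod\n\n# Illustrative Model object; adjust name, url, engine, and resourceProfile.\nmodel = {\n    \"apiVersion\": \"kubeai.org/v1\",\n    \"kind\": \"Model\",\n    \"metadata\": {\"name\": \"example-qwen2-500m-cpu\"},\n    \"spec\": {\n        \"features\": [\"TextGeneration\"],\n        \"url\": \"ollama://qwen2:0.5b\",\n        \"engine\": \"OLlama\",\n        \"resourceProfile\": \"cpu:1\",\n    },\n}\n\nclient.CustomObjectsApi().create_namespaced_custom_object(\n    group=\"kubeai.org\",\n    version=\"v1\",\n    namespace=\"default\",\n    plural=\"models\",\n    body=model,\n)\n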

    "},{"location":"how-to/install-models/#feedback-welcome-a-model-management-ui","title":"Feedback welcome: A model management UI","text":"

    We are considering adding a UI for managing models in a running KubeAI instance. Give the GitHub Issue a thumbs up if you would be interested in this feature.

    "},{"location":"installation/eks/","title":"Install on EKS","text":"TIP: Make sure you have enough GPU quota in your AWS account.

    The default quotas for GPU instances are often 0. You will need to request a quota increase for the GPU instances you want to use.

    The following quotas may require an increase if you wish to use GPUs in your EKS cluster:
    • All G and VT Spot Instance Requests
    • All P5 Spot Instance Requests
    • All P4, P3 and P2 Spot Instance Requests
    • Running Dedicated p4d Hosts
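
    If you prefer the CLI over the console, a quota increase can also be requested with the AWS CLI. This is only a sketch: the quota code below is an assumption for All G and VT Spot Instance Requests, so verify it in the Service Quotas console before running.

    # Quota code is assumed - confirm it under Service Quotas > Amazon EC2\naws service-quotas request-service-quota-increase \\\n  --service-code ec2 \\\n  --quota-code L-3819A6DF \\\n  --desired-value 8\n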

    "},{"location":"installation/eks/#1-create-eks-cluster-with-karpenter","title":"1. Create EKS cluster with Karpenter","text":"

    Set the environment variables used throughout this guide:

    export CLUSTER_NAME=\"cluster-with-karpenter\"\nexport AWS_DEFAULT_REGION=\"us-west-2\"\nexport K8S_VERSION=\"1.30\"\nexport GPU_AMI_ID=\"$(aws ssm get-parameter --name /aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2-gpu/recommended/image_id --query Parameter.Value --output text)\"\n

    Create the EKS cluster using eksctl:

    eksctl create cluster -f - <<EOF\n---\napiVersion: eksctl.io/v1alpha5\nkind: ClusterConfig\nmetadata:\n  name: \"${CLUSTER_NAME}\"\n  region: \"${AWS_DEFAULT_REGION}\"\n  version: \"${K8S_VERSION}\"\n  tags:\n    karpenter.sh/discovery: \"${CLUSTER_NAME}\" # here, it is set to the cluster name\n\niam:\n  withOIDC: true # required\n\nkarpenter:\n  version: '1.0.6' # Exact version must be specified\n\nmanagedNodeGroups:\n- instanceType: m5.large\n  amiFamily: AmazonLinux2\n  name: \"${CLUSTER_NAME}-m5-ng\"\n  desiredCapacity: 2\n  minSize: 1\n  maxSize: 10\nEOF\n

    "},{"location":"installation/eks/#2-configure-a-karpenter-gpu-nodepool","title":"2. Configure a Karpenter GPU NodePool","text":"

    Create the NodePool and EC2NodeClass objects:

    kubectl apply -f - <<EOF\napiVersion: karpenter.sh/v1\nkind: NodePool\nmetadata:\n  name: gpu\nspec:\n  template:\n    spec:\n      requirements:\n        - key: karpenter.sh/capacity-type\n          operator: In\n          values: [\"spot\", \"on-demand\"]\n        - key: karpenter.k8s.aws/instance-category\n          operator: In\n          values: [\"g\", \"p\"]\n      nodeClassRef:\n        group: karpenter.k8s.aws\n        kind: EC2NodeClass\n        name: gpu\n      expireAfter: 720h # 30 * 24h = 720h\n      taints:\n      - key: nvidia.com/gpu\n        value: \"true\"\n        effect: NoSchedule\n  limits:\n    cpu: 1000\n  disruption:\n    consolidationPolicy: WhenEmptyOrUnderutilized\n    consolidateAfter: 1m\n---\napiVersion: karpenter.k8s.aws/v1\nkind: EC2NodeClass\nmetadata:\n  name: gpu\nspec:\n  amiFamily: AL2 # Amazon Linux 2\n  role: \"eksctl-KarpenterNodeRole-${CLUSTER_NAME}\"\n  subnetSelectorTerms:\n    - tags:\n        karpenter.sh/discovery: \"${CLUSTER_NAME}\" # replace with your cluster name\n  securityGroupSelectorTerms:\n    - tags:\n        karpenter.sh/discovery: \"${CLUSTER_NAME}\" # replace with your cluster name\n  amiSelectorTerms:\n    - id: \"${GPU_AMI_ID}\" # <- GPU Optimized AMD AMI \n  blockDeviceMappings:\n    - deviceName: /dev/xvda\n      ebs:\n        volumeSize: 300Gi\n        volumeType: gp3\n        encrypted: true\nEOF\n

    Install the NVIDIA device plugin (needed for Karpenter nodes):

    kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.16.1/deployments/static/nvidia-device-plugin.yml\n
    "},{"location":"installation/eks/#3-install-kubeai","title":"3. Install KubeAI","text":"

    Add KubeAI Helm repository.

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\n

    Make sure you have a Hugging Face Hub token set in your environment (the commands below use HF_TOKEN).

    export HF_TOKEN=\"replace-with-your-huggingface-token\"\n

    Install KubeAI with Helm.

    curl -L -O https://raw.githubusercontent.com/substratusai/kubeai/refs/heads/main/charts/kubeai/values-eks.yaml\n# Please review the values-eks.yaml file and edit the nodeSelectors if needed.\ncat values-eks.yaml\nhelm upgrade --install kubeai kubeai/kubeai \\\n    -f values-eks.yaml \\\n    --set secrets.huggingface.token=$HF_TOKEN \\\n    --wait\n
    "},{"location":"installation/eks/#3-optionally-configure-models","title":"3. Optionally configure models","text":"

    Optionally install preconfigured models.

    cat <<EOF > kubeai-models.yaml\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\nEOF\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-models.yaml\n
    "},{"location":"installation/gke/","title":"Install on GKE","text":"TIP: Make sure you have enough quota in your GCP project.

    Open the cloud console quotas page: https://console.cloud.google.com/iam-admin/quotas. Make sure your project is selected in the top left.

    You will need to verify that you have enough quota for the accelerators you want to use. Below is a table of common quotas you will have to increase depending on your needs.

    • Preemptible TPU v5 Lite Podslice chips: location <your-region>, minimum value 8
    • Preemptible NVIDIA L4 GPUs: location <your-region>, minimum value 2
    • GPUs (all regions): minimum value 2
    • CPUs (all regions): minimum value 24

    See the following screenshot examples of how these quotas appear in the console:

    "},{"location":"installation/gke/#1-create-a-cluster","title":"1. Create a cluster","text":""},{"location":"installation/gke/#option-gke-autopilot","title":"Option: GKE Autopilot","text":"

    Create an Autopilot cluster (replace us-central1 with a region in which you have quota).

    gcloud container clusters create-auto cluster-1 \\\n    --location=us-central1\n
    "},{"location":"installation/gke/#option-gke-standard","title":"Option: GKE Standard","text":"

    TODO: Reference gcloud commands for creating a GKE standard cluster.

    "},{"location":"installation/gke/#2-install-kubeai","title":"2. Install KubeAI","text":"

    Add KubeAI Helm repository.

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\n

    Make sure you have a HuggingFace Hub token set in your environment (HUGGING_FACE_HUB_TOKEN).
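
    For example (replace the placeholder with your own token):

    export HUGGING_FACE_HUB_TOKEN=\"replace-with-your-huggingface-token\"\n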

    Install KubeAI with Helm.

    curl -L -O https://raw.githubusercontent.com/substratusai/kubeai/refs/heads/main/charts/kubeai/values-gke.yaml\nhelm upgrade --install kubeai kubeai/kubeai \\\n    -f values-gke.yaml \\\n    --set secrets.huggingface.token=$HUGGING_FACE_HUB_TOKEN \\\n    --wait\n
    "},{"location":"installation/gke/#3-optionally-configure-models","title":"3. Optionally configure models","text":"

    Optionally install preconfigured models.

    cat <<EOF > kubeai-models.yaml\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\nEOF\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-models.yaml\n
    "},{"location":"reference/kubernetes-api/","title":"Kubernetes API","text":""},{"location":"reference/kubernetes-api/#packages","title":"Packages","text":"
    • kubeai.org/v1
    "},{"location":"reference/kubernetes-api/#kubeaiorgv1","title":"kubeai.org/v1","text":"

    Package v1 contains API Schema definitions for the kubeai v1 API group

    "},{"location":"reference/kubernetes-api/#resource-types","title":"Resource Types","text":"
    • Model
    "},{"location":"reference/kubernetes-api/#model","title":"Model","text":"

    Model resources define the ML models that will be served by KubeAI.

    Field Description Default Validation apiVersion string kubeai.org/v1 kind string Model metadata ObjectMeta Refer to Kubernetes API documentation for fields of metadata. spec ModelSpec status ModelStatus"},{"location":"reference/kubernetes-api/#modelfeature","title":"ModelFeature","text":"

    Underlying type: string

    Validation: - Enum: [TextGeneration TextEmbedding SpeechToText]

    Appears in: - ModelSpec

    "},{"location":"reference/kubernetes-api/#modelspec","title":"ModelSpec","text":"

    ModelSpec defines the desired state of Model.

    Appears in: - Model

    • url (string, required): URL of the model to be served. Currently only the following formats are supported: for VLLM & FasterWhisper engines: \"hf:///\"; for OLlama engine: \"ollama://
    • features (ModelFeature array): Features that the model supports. Dictates the APIs that are available for the model. Enum: [TextGeneration TextEmbedding SpeechToText]
    • engine (string, required): Engine to be used for the server process. Enum: [OLlama VLLM FasterWhisper Infinity]
    • resourceProfile (string): ResourceProfile required to serve the model. Use the format \":\". Example: \"nvidia-gpu-l4:2\" - 2x NVIDIA L4 GPUs. Must be a valid ResourceProfile defined in the system config.
    • cacheProfile (string): CacheProfile to be used for caching model artifacts. Must be a valid CacheProfile defined in the system config.
    • image (string): Image to be used for the server process. Will be set from ResourceProfile + Engine if not specified.
    • args (string array): Args to be added to the server process.
    • env (object, keys: string, values: string): Env variables to be added to the server process.
    • replicas (integer): Replicas is the number of Pod replicas that should be actively serving the model. KubeAI will manage this field unless AutoscalingDisabled is set to true.
    • minReplicas (integer, optional, minimum 0): MinReplicas is the minimum number of Pod replicas that the model can scale down to. Note: 0 is a valid value.
    • maxReplicas (integer, minimum 1): MaxReplicas is the maximum number of Pod replicas that the model can scale up to. Empty value means no limit.
    • autoscalingDisabled (boolean): AutoscalingDisabled will stop the controller from managing the replicas for the Model. When disabled, metrics will not be collected on server Pods.
    • targetRequests (integer, default 100, minimum 1): TargetRequests is the average number of active requests that the autoscaler will try to maintain on model server Pods.
    • scaleDownDelaySeconds (integer, default 30): ScaleDownDelay is the minimum time before a deployment is scaled down after the autoscaling algorithm determines that it should be scaled down.
    • owner (string, optional): Owner of the model. Used solely to populate the owner field in the OpenAI /v1/models endpoint. DEPRECATED.
    "},{"location":"reference/kubernetes-api/#modelstatus","title":"ModelStatus","text":"

    ModelStatus defines the observed state of Model.

    Appears in: - Model

    Field Description Default Validation replicas ModelStatusReplicas cache ModelStatusCache"},{"location":"reference/kubernetes-api/#modelstatuscache","title":"ModelStatusCache","text":"

    Appears in: - ModelStatus

    Field Description Default Validation loaded boolean"},{"location":"reference/kubernetes-api/#modelstatusreplicas","title":"ModelStatusReplicas","text":"

    Appears in: - ModelStatus

    Field Description Default Validation all integer ready integer"},{"location":"reference/openai-api-compatibility/","title":"OpenAI API Compatibility","text":"

    KubeAI provides an OpenAI API compatibility layer.

    "},{"location":"reference/openai-api-compatibility/#general","title":"General:","text":""},{"location":"reference/openai-api-compatibility/#models","title":"Models","text":"
    GET /v1/models\n
    • Lists all kind: Model objects installed in the Kubernetes API Server.
    "},{"location":"reference/openai-api-compatibility/#inference","title":"Inference","text":""},{"location":"reference/openai-api-compatibility/#text-generation","title":"Text Generation","text":"
    POST /v1/chat/completions\nPOST /v1/completions\n
    • Supported for Models with .spec.features: [\"TextGeneration\"].
    "},{"location":"reference/openai-api-compatibility/#embeddings","title":"Embeddings","text":"
    POST /v1/embeddings\n
    • Supported for Models with .spec.features: [\"TextEmbedding\"].
    "},{"location":"reference/openai-api-compatibility/#speech-to-text","title":"Speech-to-Text","text":"
    POST /v1/audio/transcriptions\n
    • Supported for Models with .spec.features: [\"SpeechToText\"].
    "},{"location":"reference/openai-api-compatibility/#openai-client-libaries","title":"OpenAI Client libaries","text":"

    You can use the official OpenAI client libraries by setting the base_url to the KubeAI endpoint.

    For example, you can use the Python client like this:

    from openai import OpenAI\nclient = OpenAI(api_key=\"ignored\",\n                base_url=\"http://kubeai/openai/v1\")\nresponse = client.chat.completions.create(\n  model=\"gemma2-2b-cpu\",\n  messages=[\n    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n    {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n    {\"role\": \"assistant\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n    {\"role\": \"user\", \"content\": \"Where was it played?\"}\n  ]\n)\n

    "},{"location":"tutorials/langchain/","title":"Using LangChain with KubeAI","text":"

    LangChain makes it easy to build applications powered by LLMs. KubeAI makes it easy to deploy and manage LLMs at scale. Together, they make it easy to build and deploy private and secure AI applications.

    In this tutorial, we'll show you how to use LangChain with KubeAI's OpenAI compatible API. The beauty of KubeAI's OpenAI compatibility is that you can use KubeAI with any framework that supports OpenAI.

    "},{"location":"tutorials/langchain/#prerequisites","title":"Prerequisites","text":"

    A K8s cluster. You can use a local cluster like kind.

    "},{"location":"tutorials/langchain/#installing-kubeai-with-gemma-2b","title":"Installing KubeAI with Gemma 2B","text":"

    Run the following command to install KubeAI with Gemma 2B:

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\n\ncat <<EOF > models-helm-values.yaml\ncatalog:\n  gemma2-2b-cpu:\n    enabled: true\n    minReplicas: 1\nEOF\n\nhelm install kubeai kubeai/kubeai \\\n    --wait --timeout 10m\n\nhelm install kubeai-models kubeai/models \\\n    -f ./models-helm-values.yaml\n
    "},{"location":"tutorials/langchain/#using-langchain","title":"Using LangChain","text":"

    Install the required Python packages:

    pip install langchain_openai\n

    To make things easier, let's access the KubeAI OpenAI compatible API locally.

    Run the following command to port-forward to the KubeAI service:

    kubectl port-forward svc/kubeai 8000:80\n
    Now the KubeAI OpenAI compatible API is available at http://localhost:8000/openai from your local machine.
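
    To quickly verify the port-forward, you can list the installed models:

    curl http://localhost:8000/openai/v1/models\n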

    Let's create a simple Python script that uses LangChain and is connected to KubeAI.

    Create a file named test-langchain.py with the following content:

    from langchain_openai import ChatOpenAI\n\nllm = ChatOpenAI(\n    model=\"gemma2-2b-cpu\",\n    temperature=0,\n    max_tokens=None,\n    timeout=None,\n    max_retries=2,\n    api_key=\"thisIsIgnored\",\n    base_url=\"http://localhost:8000/openai/v1\",\n)\n\nmessages = [\n    (\n        \"system\",\n        \"You are a helpful assistant that translates English to French. Translate the user sentence.\",\n    ),\n    (\"human\", \"I love programming.\"),\n]\nai_msg = llm.invoke(messages)\nprint(ai_msg.content)\n

    Run the Python script:

    python test-langchain.py\n

    Notice that we set base_url to http://localhost:8000/openai/v1. This tells LangChain to use our local KubeAI OpenAI compatible API instead of the default OpenAI public API.

    If you run LangChain within the K8s cluster, you can use the following base_url instead: http://kubeai/openai/v1. So the code would look like this:

    llm = ChatOpenAI(\n    ...\n    base_url=\"http://kubeai/openai/v1\",\n)\n

    That's it! You've successfully used LangChain with KubeAI. Now you can build and deploy private and secure AI applications with ease.

    "},{"location":"tutorials/langtrace/","title":"Deploying KubeAI with Langtrace","text":"

    Langtrace is an open source tool that helps you with tracing and monitoring your AI calls. It includes a self-hosted UI that shows you, for example, the estimated cost of your LLM calls.

    KubeAI is used for deploying LLMs with an OpenAI compatible endpoint.

    In this tutorial you will learn how to deploy KubeAI and Langtrace end-to-end. Both KubeAI and Langtrace are installed in your Kubernetes cluster. No cloud services or external dependencies are required.

    If you don't have a K8s cluster yet, you can create one using kind or minikube.

    kind create cluster # OR: minikube start\n

    Install Langtrace:

    helm repo add langtrace https://Scale3-Labs.github.io/langtrace-helm-chart\nhelm repo update\nhelm install langtrace langtrace/langtrace\n

    Install KubeAI and wait for all components to be ready (may take a minute).

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\nhelm install kubeai kubeai/kubeai --wait --timeout 10m\n

    Install the gemma2-2b-cpu model:

    cat <<EOF > kubeai-models.yaml\ncatalog:\n  gemma2-2b-cpu:\n    enabled: true\n    minReplicas: 1\nEOF\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-models.yaml\n

    Create a local Python environment and install dependencies:

    python3 -m venv .venv\nsource .venv/bin/activate\npip install langtrace-python-sdk openai\n

    Expose the KubeAI service to your local port:

    kubectl port-forward service/kubeai 8000:80\n

    Expose the Langtrace service to your local port:

    kubectl port-forward service/langtrace-app 3000:3000\n

    A Langtrace API key is required to use the Langtrace SDK, so let's get one by visiting your self-hosted Langtrace UI.

    Open your browser to http://localhost:3000, create a project and get the API keys for your langtrace project.

    In the Python script below, replace langtrace_api_key with your API key.

    Create a file named langtrace-example.py with the following content:

    # Replace this with your langtrace API key by visiting http://localhost:3000\nlangtrace_api_key=\"f7e003de19b9a628258531c17c264002e985604ca9fa561debcc85c41f357b09\"\n\nfrom langtrace_python_sdk import langtrace\nfrom langtrace_python_sdk.utils.with_root_span import with_langtrace_root_span\nfrom openai import OpenAI\n\n# Initialize Langtrace before making any OpenAI calls\nlangtrace.init(\n    api_key=langtrace_api_key,\n    api_host=\"http://localhost:3000/api/trace\",\n)\n\nbase_url = \"http://localhost:8000/openai/v1\"\nmodel = \"gemma2-2b-cpu\"\n\n@with_langtrace_root_span()\ndef example():\n    client = OpenAI(base_url=base_url, api_key=\"ignored-by-kubeai\")\n    response = client.chat.completions.create(\n        model=model,\n        messages=[\n            {\n                \"role\": \"system\",\n                \"content\": \"How many states of matter are there?\"\n            }\n        ],\n    )\n    print(response.choices[0].message.content)\n\nexample()\n

    Run the Python script:

    python3 langtrace-example.py\n

    Now you should see the trace in your Langtrace UI. Take a look by visiting http://localhost:3000.

    "},{"location":"tutorials/weaviate/","title":"Weaviate with local autoscaling embedding and generative models","text":"

    Weaviate is a vector search engine that can integrate seamlessly with KubeAI's embedding and generative models. This tutorial demonstrates how to deploy both KubeAI and Weaviate in a Kubernetes cluster, using KubeAI as the OpenAI endpoint for Weaviate.

    Why use KubeAI with Weaviate?

    • Security and privacy: KubeAI runs locally in your Kubernetes cluster, so your data never leaves your infrastructure.
    • Cost savings: KubeAI can run on your existing hardware, reducing the need to pay for hosted embedding and generative model APIs.

    This tutorial uses CPU-only models, so it should work even on your laptop.

    As you go through this tutorial, you will learn how to:

    • Deploy KubeAI with embedding and generative models
    • Install Weaviate and connect it to KubeAI
    • Import data into Weaviate
    • Perform semantic search using the embedding model
    • Perform generative search using the generative model
    "},{"location":"tutorials/weaviate/#prerequisites","title":"Prerequisites","text":"

    A Kubernetes cluster. You can use kind or minikube.

    kind create cluster\n
    "},{"location":"tutorials/weaviate/#kubeai-configuration","title":"KubeAI Configuration","text":"

    Let's start by deploying KubeAI with the models we want to use. The Nomic embedding model is used instead of text-embedding-ada-002, and Gemma 2 2B is used instead of gpt-3.5-turbo. You could choose to use bigger models depending on your available hardware.

    Create a file named kubeai-model-values.yaml with the following content:

    catalog:\n  text-embedding-ada-002:\n    enabled: true\n    minReplicas: 1\n    features: [\"TextEmbedding\"]\n    owner: nomic\n    url: \"ollama://nomic-embed-text\"\n    engine: OLlama\n    resourceProfile: cpu:1\n  gpt-3.5-turbo:\n    enabled: true\n    minReplicas: 1\n    features: [\"TextGeneration\"]\n    owner: google\n    url: \"ollama://gemma2:2b\"\n    engine: OLlama\n    resourceProfile: cpu:2\n

    Note: It's important that you name the models as text-embedding-ada-002 and gpt-3.5-turbo as Weaviate expects these names.

    Run the following command to deploy KubeAI and install the configured models:

    helm repo add kubeai https://www.kubeai.org && helm repo update\n\nhelm install kubeai kubeai/kubeai\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-model-values.yaml\n

    "},{"location":"tutorials/weaviate/#weaviate-installation","title":"Weaviate Installation","text":"

    For this tutorial, we will use the Weaviate Helm chart to deploy Weaviate.

    Let's enable the text2vec-openai and generative-openai modules in Weaviate. We will also set the default vectorizer module to text2vec-openai.

    The apiKey is ignored in this case as we are using KubeAI as the OpenAI endpoint.

    Create a file named weaviate-values.yaml with the following content:

    modules:\n  text2vec-openai:\n    enabled: true\n    apiKey: thisIsIgnored\n  generative-openai:\n    enabled: true\n    apiKey: thisIsIgnored\n  default_vectorizer_module: text2vec-openai\nservice:\n  # To prevent Weaviate being exposed publicly\n  type: ClusterIP\n

    Install Weaviate by running the following command:

    helm repo add weaviate https://weaviate.github.io/weaviate-helm && helm repo update\n\nhelm install \\\n  \"weaviate\" \\\n  weaviate/weaviate \\\n  -f weaviate-values.yaml\n

    "},{"location":"tutorials/weaviate/#usage","title":"Usage","text":"

    We will be using Python to interact with Weaviate. The two use cases we will cover are:
    • Semantic search using the embedding model
    • Generative search using the generative model

    "},{"location":"tutorials/weaviate/#connectivity","title":"Connectivity","text":"

    The remaining steps require connectivity to the Weaviate service. However, Weaviate is not exposed publicly in this setup, so we set up local port forwards to access the Weaviate services.

    Set up local port forwards to the Weaviate services by running:

    kubectl port-forward svc/weaviate 8080:80\nkubectl port-forward svc/weaviate-grpc 50051:50051\n

    "},{"location":"tutorials/weaviate/#weaviate-client-python-setup","title":"Weaviate client Python Setup","text":"

    Create a virtual environment and install the Weaviate client:

    python -m venv .venv\nsource .venv/bin/activate\npip install -U weaviate-client requests\n

    "},{"location":"tutorials/weaviate/#collection-and-data-import","title":"Collection and Data Import","text":"

    Create a file named create-collection.py with the following content:

    import json\nimport weaviate\nimport requests\nfrom weaviate.classes.config import Configure\n\n# This works due to port forward in previous step\nwith weaviate.connect_to_local(port=8080, grpc_port=50051) as client:\n\n    client.collections.create(\n        \"Question\",\n        vectorizer_config=Configure.Vectorizer.text2vec_openai(\n                model=\"text-embedding-ada-002\",\n                base_url=\"http://kubeai/openai\",\n        ),\n        generative_config=Configure.Generative.openai(\n            model=\"gpt-3.5-turbo\",\n            base_url=\"http://kubeai/openai\",\n        ),\n    )\n\n    # import data\n    resp = requests.get('https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json')\n    data = json.loads(resp.text)  # Load data\n\n    question_objs = list()\n    for i, d in enumerate(data):\n        question_objs.append({\n            \"answer\": d[\"Answer\"],\n            \"question\": d[\"Question\"],\n            \"category\": d[\"Category\"],\n        })\n\n    questions = client.collections.get(\"Question\")\n    questions.data.insert_many(question_objs)\n    print(\"Data imported successfully\")\n

    Create a collection that uses KubeAI as the OpenAI endpoint:

    python create-collection.py\n
    You should see a message Data imported successfully.

    The collection is now created and data is imported. The vectors are generated by KubeAI and stored in Weaviate.

    "},{"location":"tutorials/weaviate/#semantic-search","title":"Semantic Search","text":"

    Now let's do semantic search, which uses the embeddings. Create a file named search.py with the following content:

    import weaviate\nfrom weaviate.classes.config import Configure\n\n# This works due to port forward in previous step\nwith weaviate.connect_to_local(port=8080, grpc_port=50051) as client:\n    questions = client.collections.get(\"Question\")\n    response = questions.query.near_text(\n        query=\"biology\",\n        limit=2\n    )\n    print(response.objects[0].properties)  # Inspect the first object\n

    Execute the python script:

    python search.py\n

    You should see the following output:

    {\n  \"answer\": \"DNA\",\n  \"question\": \"In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance\",\n  \"category\": \"SCIENCE\"\n}\n

    "},{"location":"tutorials/weaviate/#generative-search-rag","title":"Generative Search (RAG)","text":"

    Now let's do generative search, which uses the generative model (Text generation LLM). The generative model is run locally and managed by KubeAI.

    Create a file named generate.py with the following content:

    import weaviate\nfrom weaviate.classes.config import Configure\n\n# This works due to port forward in previous step\nwith weaviate.connect_to_local(port=8080, grpc_port=50051) as client:\n    questions = client.collections.get(\"Question\")\n\n    response = questions.generate.near_text(\n        query=\"biology\",\n        limit=2,\n        grouped_task=\"Write a tweet with emojis about these facts.\"\n    )\n\n    print(response.generated)  # Inspect the generated text\n

    Run the python script:

    python generate.py\n

    You should see something similar to this:

    \ud83e\uddec Watson & Crick cracked the code in 1953! \ud83e\udd2f They built a model of DNA, the blueprint of life. \ud83e\uddec \ud83e\udde0 Liver power! \ud83d\udcaa This organ keeps your blood sugar balanced by storing glucose as glycogen. \ud83e\ude78 #ScienceFacts #Biology

    "},{"location":"tutorials/weaviate/#conclusion","title":"Conclusion","text":"

    You've now successfully set up KubeAI with Weaviate for both embedding-based semantic search and generative tasks. You've also learned how to import data, perform searches, and generate content using KubeAI-managed models.

    "}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml index cc7c63c2..f962cedd 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -44,6 +44,10 @@ https://www.kubeai.org/how-to/build-models-into-containers/ 2024-10-25 + + https://www.kubeai.org/how-to/cache-models-with-aws-efs/ + 2024-10-25 + https://www.kubeai.org/how-to/cache-models-with-gcp-filestore/ 2024-10-25 diff --git a/sitemap.xml.gz b/sitemap.xml.gz index e53fd8112c32c7944371a6a17b1d1691119a848c..96d75eabfb4e70ebb580b973ab909efabce2d9ae 100644 GIT binary patch delta 484 zcmV#MfG%wS2FMzZGYFzdR-+eyqg*3n$T=U zZYn+eb1$-tzNxO~>qoep;rc0V+1DP%)i3LRp;O2A>_-d(D{FB?_)uQqa;af4$r;%= zw0D{&VTH4J)@Y;a^NZK02XYNCg%l0>0Rpj~gC>}QZ;hn`5LkS`jzFXCGarHkQ_qq_ z8(PTvX(T5=ZhtYsyEHxs#_Kt$qh!vA-w1&$A#Y)!iStE8?^h77FvM68(gLjyFwR!u z%%Rgia+B1vu?<99WEqST56MqxT+VdeSyG13PnOk454bW=`yYUGdV;Nf#AK%IINr6U zn*+fS#rz!vjWso?mG2%pj^w!N{tksFBHhO|5PFr|99_n-ptZRVWS4!^aT*R1?uL9B z8K|WYKk-PV#o$PY0Vi_+fS+Q=FiAX?>~V^|sEsv}uY*V|M!{f=H;b;*eQX*BX-hy& a2WYQ^i$VDZ>FpN(fvPu@w~2-`4FCYR2I&a^ delta 477 zcmV<30V4kJ1L*^h7JtQ8%Z{8d5WMeKjJP+RnQWvanqhDI1<40ArUxvHZMhrR{QcT9 zc{t2_)@(zPnS7bCOM?d zj<3}q622+5u4~OkXu`m?kG*0&qy?)jsE+m5s(Cu3kpUjF`+vAtubYg*8_YPjgys%% zQTgWIdy!@GU2{I3pW$?b^XIf>i#@>AZ}flQY14BK6ULF1wKySss?TsbwXm3+jAA;p zcbbB*!dX0Pv?=xF$y?MTxel0Ou7>ggBC(&NCJ52rjHMkUu=t2Qfkr(PJ_L!Toh6fY zw2<{-A}2s@F@Mp!Jl{!-*K=k^Dcli%Bt)`=d@l=4T&^kxzk+y$G1Y>Q7P$Ec(`+TJ z96J58w32oSkT(i2UfDtKJ1gmorwFfTt`N_QA}S} zq;fDi5@Nt$b`ao0GHR2_lT%J1<)Ss#NIBz{WJb|osyB +
    + + + + Cache models with AWS EFS + + + + +
  • + + + + + + + + + +
  • diff --git a/tutorials/langtrace/index.html b/tutorials/langtrace/index.html index 0645e785..51812a59 100644 --- a/tutorials/langtrace/index.html +++ b/tutorials/langtrace/index.html @@ -405,6 +405,27 @@ +
  • + + + + + Cache models with AWS EFS + + + + +
  • + + + + + + + + + +
  • diff --git a/tutorials/weaviate/index.html b/tutorials/weaviate/index.html index 97ae8e7c..5cca5a93 100644 --- a/tutorials/weaviate/index.html +++ b/tutorials/weaviate/index.html @@ -405,6 +405,27 @@ +
  • + + + + + Cache models with AWS EFS + + + + +
  • + + + + + + + + + +