From e382382fbeccb1ff8c91f63d7360743ba45e5153 Mon Sep 17 00:00:00 2001 From: samos123 Date: Fri, 25 Oct 2024 05:55:21 +0000 Subject: [PATCH] Deployed b1b579c with MkDocs version: 1.6.1 --- 404.html | 21 + benchmarks/llama-3.2-11b-vision/index.html | 21 + concepts/autoscaling/index.html | 21 + concepts/backend-servers/index.html | 21 + concepts/resource-profiles/index.html | 21 + concepts/storage-caching/index.html | 21 + .../development-environment/index.html | 21 + contributing/documentation/index.html | 21 + contributing/release-process/index.html | 21 + how-to/architect-for-multitenancy/index.html | 21 + .../build-models-into-containers/index.html | 23 +- how-to/cache-models-with-aws-efs/index.html | 1455 +++++++++++++++++ .../index.html | 23 +- how-to/configure-autoscaling/index.html | 21 + how-to/configure-embedding-models/index.html | 21 + how-to/configure-resource-profiles/index.html | 21 + how-to/configure-speech-to-text/index.html | 21 + how-to/install-models/index.html | 21 + index.html | 21 + installation/eks/index.html | 21 + installation/gke/index.html | 21 + reference/kubernetes-api/index.html | 21 + reference/openai-api-compatibility/index.html | 21 + search/search_index.json | 2 +- sitemap.xml | 4 + sitemap.xml.gz | Bin 489 -> 496 bytes tutorials/langchain/index.html | 21 + tutorials/langtrace/index.html | 21 + tutorials/weaviate/index.html | 21 + 29 files changed, 1987 insertions(+), 3 deletions(-) create mode 100644 how-to/cache-models-with-aws-efs/index.html diff --git a/404.html b/404.html index 7c1ec5b5..f0909f33 100644 --- a/404.html +++ b/404.html @@ -394,6 +394,27 @@ +
diff --git a/how-to/cache-models-with-aws-efs/index.html b/how-to/cache-models-with-aws-efs/index.html
new file mode 100644
index 00000000..2ceebcb0
--- /dev/null
+++ b/how-to/cache-models-with-aws-efs/index.html
@@ -0,0 +1,1455 @@
    Cache models with AWS EFS

    +

    KubeAI can manage model caches. AWS EFS is supported as a pluggable backend store.

    +


    +

    +

    Follow the EKS install guide.

    +

    1. Create an EFS File System

    +

    Set environment variables to match your environment.

    +
    export CLUSTER_NAME="cluster-with-karpenter"
    +export CLUSTER_REGION="us-west-2"
    +
    +

    Create an EFS file system in the same VPC as your EKS cluster.

    +
    vpc_id=$(aws eks describe-cluster \
    +    --name $CLUSTER_NAME \
    +    --query "cluster.resourcesVpcConfig.vpcId" \
    +    --output text)
    +
    +cidr_range=$(aws ec2 describe-vpcs \
    +    --vpc-ids $vpc_id \
    +    --query "Vpcs[].CidrBlock" \
    +    --output text \
    +    --region ${CLUSTER_REGION})
    +
    +security_group_id=$(aws ec2 create-security-group \
    +    --group-name MyEfsSecurityGroup \
    +    --description "My EFS security group" \
    +    --vpc-id $vpc_id \
    +    --output text)
    +
    +aws ec2 authorize-security-group-ingress \
    +    --group-id $security_group_id \
    +    --protocol tcp \
    +    --port 2049 \
    +    --cidr $cidr_range
    +
    +file_system_id=$(aws efs create-file-system \
    +    --region ${CLUSTER_REGION} \
    +    --performance-mode generalPurpose \
    +    --query 'FileSystemId' \
    +    --output text)
    +
    +

    Expose the EFS file system to the subnets used by your EKS cluster. +

SUBNETS=$(eksctl get cluster --region ${CLUSTER_REGION} ${CLUSTER_NAME} -o json | jq -r '.[0].ResourcesVpcConfig.SubnetIds[]')
    +
    +while IFS= read -r subnet; do
    +    echo "Creating EFS mount target in $subnet"
    +    aws efs create-mount-target --file-system-id $file_system_id \
    +      --subnet-id $subnet --security-groups $security_group_id --output text
    +done <<< "$SUBNETS"
    +
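Mount targets can take a minute or two to become available. Optionally, before continuing, you can confirm their state (a quick check, assuming the variables set above):

# All mount targets should eventually report "available".
aws efs describe-mount-targets \
    --file-system-id $file_system_id \
    --query 'MountTargets[].LifeCycleState' \
    --output text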

    +

    2. Install the EFS CSI driver

    +
    export ROLE_NAME=AmazonEKS_EFS_CSI_DriverRole
    +eksctl create iamserviceaccount \
    +    --name efs-csi-controller-sa \
    +    --namespace kube-system \
    +    --cluster ${CLUSTER_NAME} \
    +    --role-name ${ROLE_NAME} \
    +    --role-only \
    +    --attach-policy-arn arn:aws:iam::aws:policy/service-role/AmazonEFSCSIDriverPolicy \
    +    --approve
    +
    +TRUST_POLICY=$(aws iam get-role --role-name ${ROLE_NAME} \
    +    --query 'Role.AssumeRolePolicyDocument' --output json | \
    +    sed -e 's/efs-csi-controller-sa/efs-csi-*/' -e 's/StringEquals/StringLike/')
    +
    +aws iam update-assume-role-policy --role-name ${ROLE_NAME} --policy-document "$TRUST_POLICY"
    +
    +# Get the role ARN
    +EFS_ROLE_ARN=$(aws iam get-role --role-name AmazonEKS_EFS_CSI_DriverRole \
    +  --query 'Role.Arn' --output text)
    +
    +aws eks create-addon --cluster-name $CLUSTER_NAME --addon-name aws-efs-csi-driver \
    +  --service-account-role-arn $EFS_ROLE_ARN
    +
    +

Wait for the EKS add-on to become active. +

    aws eks wait addon-active --cluster-name $CLUSTER_NAME \
    +  --addon-name aws-efs-csi-driver
    +
    +Verify that the EFS CSI driver is running.

    +
    kubectl get daemonset efs-csi-node -n kube-system
    +
    +

Create a StorageClass for EFS dynamic provisioning.

    +
    kubectl apply -f - <<EOF
    +kind: StorageClass
    +apiVersion: storage.k8s.io/v1
    +metadata:
    +  name: efs-sc
    +provisioner: efs.csi.aws.com
    +parameters:
    +  provisioningMode: efs-ap
    +  fileSystemId: "${file_system_id}"
    +  directoryPerms: "700"
    +EOF
    +
    +

Make sure file_system_id is set to the ID of the EFS file system created in the first step; the shell substitutes it into the StorageClass manifest above.
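If in doubt, you can verify the value and the file system's state, and re-apply the StorageClass if needed:

echo $file_system_id

aws efs describe-file-systems \
    --file-system-id $file_system_id \
    --query 'FileSystems[0].LifeCycleState' \
    --output text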

    +

    3. Configure KubeAI with the EFS cache profile

    +

You can skip this step if you've already installed KubeAI using the EKS Helm values file (values-eks.yaml).

    +

    Configure KubeAI with the efs-dynamic cache profile. +

    helm upgrade --install kubeai kubeai/kubeai \
    +  --reuse-values -f - <<EOF
    +cacheProfiles:
    +  efs-dynamic:
    +    sharedFilesystem:
    +      storageClassName: "efs-sc"
    +  efs-static:
    +    sharedFilesystem:
    +      persistentVolumeName: "efs-pv"
    +EOF
    +
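The efs-static profile above expects a pre-existing PersistentVolume named efs-pv, which this guide does not create. If you want to use static provisioning instead of the efs-sc StorageClass, a minimal sketch of such a PersistentVolume (using the standard EFS CSI static-provisioning fields and the file system ID from step 1; adjust capacity and access modes to your needs) could look like this:

kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
  name: efs-pv
spec:
  capacity:
    storage: 100Gi     # required by Kubernetes, not enforced by EFS
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  storageClassName: ""
  csi:
    driver: efs.csi.aws.com
    volumeHandle: ${file_system_id}
EOF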

    +

    4. Configure a model to use the EFS cache

    +

    Apply a Model with cacheProfile set to efs-dynamic.

    +

NOTE: If you already installed the models chart, you will need to edit your values file and run helm upgrade.

    +
    helm install kubeai-models kubeai/models -f - <<EOF
    +catalog:
    +  llama-3.1-8b-instruct-fp8-l4:
    +    enabled: true
    +    cacheProfile: efs-dynamic
    +EOF
    +
    +

    Wait for the Model to be fully cached.

    +
    kubectl wait --timeout 10m --for=jsonpath='{.status.cache.loaded}'=true model/llama-3.1-8b-instruct-fp8-l4
    +
    +

This model will now be loaded from EFS when it is served.

    +
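To inspect the cache status directly (the same field polled by the wait command above):

kubectl get model llama-3.1-8b-instruct-fp8-l4 -o jsonpath='{.status.cache}'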

    Troubleshooting

    +

MountVolume.SetUp failed for volume: deadline exceeded

    +

    kubectl get events may show an error like this: +

    8s          Warning   FailedMount             pod/load-cache-llama-3.1-8b-instruct-fp8-l4-w7thh      MountVolume.SetUp failed for volume "pvc-ceedb563-1e68-47fa-9d12-c697ae153d04" : rpc error: code = DeadlineExceeded desc = context deadline exceeded
    +

    +

    Checking the logs of the EFS CSI DaemonSet may show an error like this: +

    kubectl logs -f efs-csi-node-4n75c -n kube-system
    +Output: Could not start amazon-efs-mount-watchdog, unrecognized init system "aws-efs-csi-dri"
    +Mount attempt 1/3 failed due to timeout after 15 sec, wait 0 sec before next attempt.
    +Mount attempt 2/3 failed due to timeout after 15 sec, wait 0 sec before next attempt.
    +b'mount.nfs4: Connection timed out'
    +

    +

This most likely means your mount targets aren't set up correctly, for example because the security group does not allow NFS traffic from the EKS cluster.

    +
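To narrow this down, it can help to verify that the mount targets are available and that the security group created in step 1 allows inbound NFS (TCP 2049) from the cluster's VPC CIDR, for example:

aws efs describe-mount-targets --file-system-id $file_system_id

aws ec2 describe-security-groups \
    --group-ids $security_group_id \
    --query 'SecurityGroups[0].IpPermissions'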

    Model Loading Job

    +

    Check to see if there is an ongoing model loader Job.

    +
    kubectl get jobs
    +
diff --git a/search/search_index.json b/search/search_index.json
index 044ebf21..e40ea32a 100644
--- a/search/search_index.json
+++ b/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"KubeAI: Private Open AI on Kubernetes","text":"

    Get inferencing running on Kubernetes: LLMs, Embeddings, Speech-to-Text.

    \u2705\ufe0f Drop-in replacement for OpenAI with API compatibility \ud83e\udde0 Serve top OSS models (LLMs, Whisper, etc.) \ud83d\ude80 Multi-platform: CPU-only, GPU, coming soon: TPU \u2696\ufe0f Scale from zero, autoscale based on load \ud83d\udee0\ufe0f Zero dependencies (does not depend on Istio, Knative, etc.) \ud83d\udcac Chat UI included (OpenWebUI) \ud83e\udd16 Operates OSS model servers (vLLM, Ollama, FasterWhisper, Infinity) \u2709 Stream/batch inference via messaging integrations (Kafka, PubSub, etc.)

    Quotes from the community:

    reusable, well abstracted solution to run LLMs - Mike Ensor

    "},{"location":"#architecture","title":"Architecture","text":"

    KubeAI serves an OpenAI compatible HTTP API. Admins can configure ML models via kind: Model Kubernetes Custom Resources. KubeAI can be thought of as a Model Operator (See Operator Pattern) that manages vLLM and Ollama servers.

    "},{"location":"#local-quickstart","title":"Local Quickstart","text":"

    Create a local cluster using kind or minikube.

    TIP: If you are using Podman for kind... Make sure your Podman machine can use up to 6G of memory (by default it is capped at 2G):
    # You might need to stop and remove the existing machine:\npodman machine stop\npodman machine rm\n\n# Init and start a new machine:\npodman machine init --memory 6144 --disk-size 120\npodman machine start\n
    kind create cluster # OR: minikube start\n

    Add the KubeAI Helm repository.

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\n

    Install KubeAI and wait for all components to be ready (may take a minute).

    helm install kubeai kubeai/kubeai --wait --timeout 10m\n

    Install some predefined models.

    cat <<EOF > kubeai-models.yaml\ncatalog:\n  gemma2-2b-cpu:\n    enabled: true\n    minReplicas: 1\n  qwen2-500m-cpu:\n    enabled: true\n  nomic-embed-text-cpu:\n    enabled: true\nEOF\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-models.yaml\n

    Before progressing to the next steps, start a watch on Pods in a standalone terminal to see how KubeAI deploys models.

    kubectl get pods --watch\n
    "},{"location":"#interact-with-gemma2","title":"Interact with Gemma2","text":"

    Because we set minReplicas: 1 for the Gemma model you should see a model Pod already coming up.

    Start a local port-forward to the bundled chat UI.

    kubectl port-forward svc/openwebui 8000:80\n

    Now open your browser to localhost:8000 and select the Gemma model to start chatting with.

    "},{"location":"#scale-up-qwen2-from-zero","title":"Scale up Qwen2 from Zero","text":"

    If you go back to the browser and start a chat with Qwen2, you will notice that it will take a while to respond at first. This is because we set minReplicas: 0 for this model and KubeAI needs to spin up a new Pod (you can verify with kubectl get models -oyaml qwen2-500m-cpu).

    "},{"location":"#documentation","title":"Documentation","text":"

Check out our documentation on kubeai.org to find info on:

    • Installing KubeAI in the cloud
    • How to guides (e.g. how to manage models and resource profiles).
    • Concepts (how the components of KubeAI work).
    • How to contribute
    "},{"location":"#adopters","title":"Adopters","text":"

    List of known adopters:

Name | Description | Link
Telescope | Telescope uses KubeAI for multi-region large scale batch LLM inference. | trytelescope.ai
Google Cloud Distributed Edge | KubeAI is included as a reference architecture for inferencing at the edge. | LinkedIn, GitLab

    If you are using KubeAI and would like to be listed as an adopter, please make a PR.

    "},{"location":"#openai-api-compatibility","title":"OpenAI API Compatibility","text":"
    # Implemented #\n/v1/chat/completions\n/v1/completions\n/v1/embeddings\n/v1/models\n/v1/audio/transcriptions\n\n# Planned #\n# /v1/assistants/*\n# /v1/batches/*\n# /v1/fine_tuning/*\n# /v1/images/*\n# /v1/vector_stores/*\n
    "},{"location":"#immediate-roadmap","title":"Immediate Roadmap","text":"
    • Model caching
    • LoRA finetuning (compatible with OpenAI finetuning API)
    • Image generation (compatible with OpenAI images API)

    NOTE: KubeAI was born out of a project called Lingo which was a simple Kubernetes LLM proxy with basic autoscaling. We relaunched the project as KubeAI (late August 2024) and expanded the roadmap to what it is today.

    \ud83c\udf1f Don't forget to drop us a star on GitHub and follow the repo to stay up to date!

    "},{"location":"#contact","title":"Contact","text":"

    Let us know about features you are interested in seeing or reach out with questions. Visit our Discord channel to join the discussion!

    Or just reach out on LinkedIn if you want to connect:

    • Nick Stogner
    • Sam Stoelinga
    "},{"location":"benchmarks/llama-3.2-11b-vision/","title":"Llama 3.2 11B Vision Instruct vLLM Benchmarks","text":"

    Single L4 GPU vLLM 0.6.2

    python3 benchmark_serving.py --backend openai \\\n    --base-url http://localhost:8000/openai \\\n    --dataset-name=sharegpt --dataset-path=ShareGPT_V3_unfiltered_cleaned_split.json \\\n    --model meta-llama-3.2-11b-vision-instruct \\\n    --seed 12345 --tokenizer neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic\n============ Serving Benchmark Result ============\nSuccessful requests:                     1000\nBenchmark duration (s):                  681.93\nTotal input tokens:                      230969\nTotal generated tokens:                  194523\nRequest throughput (req/s):              1.47\nOutput token throughput (tok/s):         285.25\nTotal Token throughput (tok/s):          623.95\n---------------Time to First Token----------------\nMean TTFT (ms):                          319146.12\nMedian TTFT (ms):                        322707.98\nP99 TTFT (ms):                           642512.79\n-----Time per Output Token (excl. 1st token)------\nMean TPOT (ms):                          54.84\nMedian TPOT (ms):                        53.66\nP99 TPOT (ms):                           83.75\n---------------Inter-token Latency----------------\nMean ITL (ms):                           54.09\nMedian ITL (ms):                         47.44\nP99 ITL (ms):                            216.77\n==================================================\n

    "},{"location":"concepts/autoscaling/","title":"Autoscaling","text":"

KubeAI proxies HTTP and messaging (e.g. Kafka) requests and messages to models. It will adjust the number of Pods serving a given model based on the average number of active requests. If no Pods are running when a request comes in, KubeAI will hold the request, scale up a Pod, and forward the request when the Pod is ready. This process happens in a manner that is transparent to the end client (other than the added delay from a cold-start).

    "},{"location":"concepts/autoscaling/#next","title":"Next","text":"

    Read about how to configure autoscaling.

    "},{"location":"concepts/backend-servers/","title":"Backend Servers","text":"

    KubeAI serves ML models by launching Pods on Kubernetes. The configuration and lifecycle of these Pods are managed by the KubeAI controller. Every model server Pod loads exactly one model on startup.

    In a Model manifest you can define what server to use for inference (VLLM, OLlama). Any model-specific settings can be passed to the server process via the args and env fields.
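For illustration only (my-model, the URL, and the values shown are placeholders; --max-model-len and VLLM_LOGGING_LEVEL are vLLM options), a Model that passes engine settings via args and env might look like this:

apiVersion: kubeai.org/v1\nkind: Model\nmetadata:\n  name: my-model\nspec:\n  features: [\"TextGeneration\"]\n  url: \"hf://me/my-model\"\n  engine: VLLM\n  resourceProfile: nvidia-gpu-l4:1\n  args:\n    - --max-model-len=4096\n  env:\n    VLLM_LOGGING_LEVEL: \"INFO\"\n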

    "},{"location":"concepts/backend-servers/#next","title":"Next","text":"

    Read about how to install models.

    "},{"location":"concepts/resource-profiles/","title":"Resource Profiles","text":"

A resource profile maps a type of compute resource (e.g. NVIDIA L4 GPU) to a collection of Kubernetes settings that are configured on inference server Pods. These profiles are defined in the KubeAI config.yaml file (via a ConfigMap). Each model specifies the resource profile that it requires.

    Kubernetes Model resources specify a resource profile and the count of that resource that they require (for example resourceProfile: nvidia-gpu-l4:2 - 2x L4 GPUs).

    A given profile might need to contain slightly different settings based on the cluster/cloud that KubeAI is deployed in.

    Example: A resource profile named nvidia-gpu-l4 might contain the following node selectors when installing KubeAI on a GKE Kubernetes cluster:

    cloud.google.com/gke-accelerator: \"nvidia-l4\"\ncloud.google.com/gke-spot: \"true\"\n

    and add the following resource requests to the model server Pods:

    nvidia.com/gpu: \"1\"\n

    In addition to node selectors and resource requirements, a resource profile may optionally specify an image name. This name maps to the container image that will be selected when serving a model on that resource.

    "},{"location":"concepts/resource-profiles/#next","title":"Next","text":"

    Read about how to configure resource profiles.

    "},{"location":"concepts/storage-caching/","title":"Storage / Caching","text":"

    With \"Large\" in the name, caching is a critical part of serving LLMs.

The best caching technique may vary depending on your environment:

    • What cloud features are available?
    • Is your cluster deployed in an air-gapped environment?
    "},{"location":"concepts/storage-caching/#a-model-built-into-container","title":"A. Model built into container","text":"

    Status: Supported

    Building a model into a container image can provide a simple way to take advantage of image-related optimizations built into Kubernetes:

    • Relaunching a model server on the same Node that it ran on before will likely be able to reuse the previously pulled image.

    • Secondary boot disks on GKE can be used to avoid needing to pull images.

    • Image streaming on GKE can allow for containers to startup before the entire image is present on the Node.

    • Container images can be pre-installed on Nodes in air-gapped environments (example: k3s airgap installation).

    Guides:

    • How to build models into container images
    "},{"location":"concepts/storage-caching/#b-model-on-shared-filesystem-read-write-many","title":"B. Model on shared filesystem (read-write-many)","text":"

KubeAI can manage model caches on a shared filesystem (e.g. AWS EFS, GCP Filestore, NFS). It manages the full lifecycle of a cached model: loading, serving, and cache eviction (on deletion of the Model).

    "},{"location":"concepts/storage-caching/#c-model-on-read-only-many-disk","title":"C. Model on read-only-many disk","text":"

    Status: Planned.

    Examples: GCP Hyperdisk ML

    "},{"location":"contributing/development-environment/","title":"Development environment","text":"

    This document provides instructions for setting up an environment for developing KubeAI.

    "},{"location":"contributing/development-environment/#optional-cloud-setup","title":"Optional: Cloud Setup","text":""},{"location":"contributing/development-environment/#gcp-pubsub","title":"GCP PubSub","text":"

If you are developing the PubSub messaging integration on GCP, set up test topics and subscriptions and uncomment the .messaging.streams section in ./hack/dev-config.yaml.

    gcloud auth login --update-adc\n\ngcloud pubsub topics create test-kubeai-requests\ngcloud pubsub subscriptions create test-kubeai-requests-sub --topic test-kubeai-requests\ngcloud pubsub topics create test-kubeai-responses\ngcloud pubsub subscriptions create test-kubeai-responses-sub --topic test-kubeai-responses\n
    "},{"location":"contributing/development-environment/#run-in-local-cluster","title":"Run in Local Cluster","text":"
    kind create cluster\n# OR\n#./hack/create-dev-gke-cluster.yaml\n\n# Generate CRDs from Go code.\nmake generate && make manifests\n\n# When CRDs are changed reapply using kubectl:\nkubectl apply -f ./charts/kubeai/charts/crds/crds\n\n# Model with special address annotations:\nkubectl apply -f ./hack/dev-model.yaml\n\n# OPTION A #\n# Run KubeAI inside cluster\n# Change `-f` based on the cluster environment.\nhelm upgrade --install kubeai ./charts/kubeai \\\n    --set openwebui.enabled=true \\\n    --set image.tag=latest \\\n    --set image.pullPolicy=Always \\\n    --set image.repository=us-central1-docker.pkg.dev/substratus-dev/default/kubeai \\\n    --set secrets.huggingface.token=$HUGGING_FACE_HUB_TOKEN \\\n    --set replicaCount=1 -f ./hack/dev-gke-helm-values.yaml\n\n# OPTION B #\n# For quick local interation (run KubeAI outside of cluster)\nkubectl create cm kubeai-autoscaler-state -oyaml --dry-run=client | kubectl apply -f -\nCONFIG_PATH=./hack/dev-config.yaml POD_NAMESPACE=default go run ./cmd/main.go\n\n# In another terminal:\nwhile true; do kubectl port-forward service/dev-model 7000:7000; done\n############\n
    "},{"location":"contributing/development-environment/#running","title":"Running","text":""},{"location":"contributing/development-environment/#completions-api","title":"Completions API","text":"
    # If you are running kubeai in-cluster:\n# kubectl port-forward svc/kubeai 8000:80\n\ncurl http://localhost:8000/openai/v1/completions -H \"Content-Type: application/json\" -d '{\"prompt\": \"Hi\", \"model\": \"dev\"}' -v\n
    "},{"location":"contributing/development-environment/#messaging-integration","title":"Messaging Integration","text":"
    gcloud pubsub topics publish test-kubeai-requests \\                  \n  --message='{\"path\":\"/v1/completions\", \"metadata\":{\"a\":\"b\"}, \"body\": {\"model\": \"dev\", \"prompt\": \"hi\"}}'\n\ngcloud pubsub subscriptions pull test-kubeai-responses-sub --auto-ack\n
    "},{"location":"contributing/documentation/","title":"Documentation","text":"

    We are grateful for anyone who takes the time to improve KubeAI documentation! In order to keep our docs clear and consistent we ask that you first read about the approach to documentation that we have standardized on...

    "},{"location":"contributing/documentation/#read-before-writing","title":"Read before writing!","text":"

    The KubeAI approach to documentation is loosely inspired by the Diataxis method.

    TLDR on how KubeAI docs are organized:

    • Installation: How-to guides specific to installing KubeAI.
    • How To: Directions that guide the reader through a problem or towards a result. How-to guides are goal-oriented. They assume the user is familiar with general concepts, tools, and has already installed KubeAI.
    • Concepts: A reflective explanation of KubeAI topics with a focus on giving the reader an understanding of the why.
    • Tutorials: Learning oriented experiences. Lessons that often guide a user from beginning to end. The goal is to help the reader learn something (compared to a how-to guide that is focused on helping the reader do something).
    • Contributing: The docs in here differ from the rest of the docs by audience: these docs are for anyone who will be contributing code or docs to the KubeAI project.
    "},{"location":"contributing/documentation/#how-to-serve-kubeaiorg-locally","title":"How to serve kubeai.org locally","text":"

    Make sure you have python3 installed and run:

    make docs\n
    "},{"location":"contributing/release-process/","title":"Release Process","text":"

    This document describes the process for releasing a new version of the project.

    "},{"location":"contributing/release-process/#docs","title":"Docs","text":"

    The docs are automatically published whenever a PR updates the docs and the PR is merged into the main branch. The docs are published to the gh-pages branch, which is the source for the Github Pages site.

    "},{"location":"contributing/release-process/#docker-images","title":"Docker images","text":"

    The Docker image latest tag always points to the latest released version. The main tag points to the latest commit on the main branch.

    If you push a tag vX.Y.Z to the repository, the Docker image with the tag vX.Y.Z is built and pushed to Docker Hub. Afterwards, the latest tag is updated to point to the new version.

    "},{"location":"contributing/release-process/#helm-chart","title":"Helm Chart","text":"

    The Helm chart only gets released when a git tag is pushed to the repository with the format helm-v*.

    The appVersion in the Helm chart does not have to point to the latest released version. This allows us to first publish a new version of the Docker image without updating the Helm chart. The Helm chart is updated when we are ready to release a new version.

This is important when a new appVersion isn't compatible with the current Helm chart. In those cases, we can first merge the PR, thoroughly test, release a new container image, and then in a separate PR update the Helm chart and the appVersion.

    "},{"location":"how-to/architect-for-multitenancy/","title":"Architect for Multitenancy","text":"

    KubeAI can support multitenancy by filtering the models that it serves via Kubernetes label selectors. These label selectors can be applied when accessing any of the OpenAI-compatible endpoints through the X-Label-Selector HTTP header and will match on labels specified on the kind: Model objects. The pattern is similar to using a WHERE clause in a SQL query.

    Example Models:

    kind: Model\nmetadata:\n  name: llama-3.2\n  labels:\n    tenancy: public\nspec:\n# ...\n---\nkind: Model\nmetadata:\n  name: custom-private-model\n  labels:\n    tenancy: org-abc\nspec:\n# ...\n

    Example HTTP requests:

    # The returned list of models will be filtered.\ncurl http://$KUBEAI_ENDPOINT/openai/v1/models \\\n    -H \"X-Label-Selector: tenancy in (org-abc, public)\"\n\n# When running inference, if the label selector does not match\n# a 404 will be returned.\ncurl http://$KUBEAI_ENDPOINT/openai/v1/completions \\\n    -H \"Content-Type: application/json\" \\\n    -H \"X-Label-Selector: tenancy in (org-abc, public)\" \\\n    -d '{\"prompt\": \"Hi\", \"model\": \"llama-3.2\"}'\n

    The header value can be any valid Kubernetes label selector. Some examples include:

    X-Label-Selector: tenancy=org-abc\nX-Label-Selector: tenancy in (org-abc, public)\nX-Label-Selector: tenancy!=private\n

    Multiple X-Label-Selector headers can be specified in the same HTTP request and will be treated as a logical AND. For example, the following request will only match Models that have a label tenant: org-abc and user: sam:

    curl http://$KUBEAI_ENDPOINT/openai/v1/completions \\\n    -H \"Content-Type: application/json\" \\\n    -H \"X-Label-Selector: tenant=org-abc\" \\\n    -H \"X-Label-Selector: user=sam\" \\\n    -d '{\"prompt\": \"Hi\", \"model\": \"llama-3.2\"}'\n

    Example architecture:

    "},{"location":"how-to/build-models-into-containers/","title":"Build models into containers","text":"

In this guide we will preload an LLM into a custom-built Ollama serving image. You can follow the same steps for other models and other serving engines.

    Define some values

    export MODEL_URL=ollama://qwen2:0.5b\n\n# Customize with your own image repo.\nexport IMAGE=us-central1-docker.pkg.dev/substratus-dev/default/ollama-builtin-qwen2-05b:latest\n

Build and push the image. Note: building (downloading the base image & model) and pushing (uploading the image & model) can take a while depending on the size of the model.

    git clone https://github.com/substratusai/kubeai\ncd ./kubeai/examples/ollama-builtin\n\ndocker build --build-arg MODEL_URL=$MODEL_URL -t $IMAGE .\ndocker push $IMAGE\n

Create a Model manifest & apply it to a cluster with KubeAI installed. NOTE: The only difference between a Model that uses a built-in image and one that does not is the addition of the image: field.

    kubectl apply -f - << EOF\napiVersion: kubeai.org/v1\nkind: Model\nmetadata:\n  name: builtin-model-example\nspec:\n  features: [\"TextGeneration\"]\n  owner: alibaba\n  image: $IMAGE # <-- The image with model built-in\n  url: \"$MODEL_URL\"\n  engine: OLlama\n  resourceProfile: cpu:1\nEOF\n
    "},{"location":"how-to/cache-models-with-gcp-filestore/","title":"Cache models with GCP Filestore","text":"

    KubeAI can manage model caches. GCP Filestore is supported as a pluggable backend store.

    Follow the GKE install guide.

    Ensure that the Filestore API is enabled.

    gcloud services enable file.googleapis.com\n

    Apply a Model with the cache profile set to standard-filestore (defined in the reference GKE Helm values file).

    TIP: If you want to use `premium-filestore` you will need to ensure you have quota.

    Open the cloud console quotas page: https://console.cloud.google.com/iam-admin/quotas. Make sure your project is selected in the top left.

Ensure that you have at least 2.5 TiB of PremiumStorageGbPerRegion quota in the region where your cluster is deployed.

NOTE: If you already installed the models chart, you will need to edit your values file and run helm upgrade.

    helm install kubeai-models kubeai/models -f - <<EOF\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\n    cacheProfile: standard-filestore\nEOF\n

    Wait for the Model to be fully cached. This may take a while if the Filestore instance needs to be created.

    kubectl wait --timeout 10m --for=jsonpath='{.status.cache.loaded}'=true model/llama-3.1-8b-instruct-fp8-l4\n

    This model will now be loaded from Filestore when it is served.

    "},{"location":"how-to/cache-models-with-gcp-filestore/#troubleshooting","title":"Troubleshooting","text":""},{"location":"how-to/cache-models-with-gcp-filestore/#filestore-csi-driver","title":"Filestore CSI Driver","text":"

Ensure that the Filestore CSI driver is enabled by checking for the existence of the Kubernetes storage classes. If they are not found, follow the GCP guide for enabling the CSI driver.

    kubectl get storageclass standard-rwx premium-rwx\n
    "},{"location":"how-to/cache-models-with-gcp-filestore/#persistentvolumes","title":"PersistentVolumes","text":"

    Check the PersistentVolumeClaim (that should be created by KubeAI).

    kubectl describe pvc shared-model-cache-\n
    Example: Out-of-quota error
      Warning  ProvisioningFailed    11m (x26 over 21m)  filestore.csi.storage.gke.io_gke-50826743a27a4d52bf5b-7fac-9607-vm_b4bdb2ec-b58b-4363-adec-15c270a14066  failed to provision volume with StorageClass \"premium-rwx\": rpc error: code = ResourceExhausted desc = googleapi: Error 429: Quota limit 'PremiumStorageGbPerRegion' has been exceeded. Limit: 0 in region us-central1.\nDetails:\n[\n  {\n    \"@type\": \"type.googleapis.com/google.rpc.QuotaFailure\",\n    \"violations\": [\n      {\n        \"description\": \"Quota 'PremiumStorageGbPerRegion' exhausted. Limit 0 in region us-central1\",\n        \"subject\": \"project:819220466562\"\n      }\n    ]\n  }\n]\n

    Check to see if the PersistentVolume has been fully provisioned.

    kubectl get pv\n# Find name of corresponding pv...\nkubectl describe pv <name>\n
    "},{"location":"how-to/cache-models-with-gcp-filestore/#model-loading-job","title":"Model Loading Job","text":"

    Check to see if there is an ongoing model loader Job.

    kubectl get jobs\n
    "},{"location":"how-to/configure-autoscaling/","title":"Configure autoscaling","text":"

    This guide will cover how to configure KubeAI autoscaling parameters.

    "},{"location":"how-to/configure-autoscaling/#system-settings","title":"System Settings","text":"

    KubeAI administrators can define system-wide autoscaling settings by setting the following Helm values (for the kubeai/kubeai chart):

    Example:

    # helm-values.yaml\nmodelAutoscaling:\n  interval: 15s\n  timeWindow: 10m\n# ...\n
    "},{"location":"how-to/configure-autoscaling/#model-settings","title":"Model Settings","text":"

    The following settings can be configured on a model-by-model basis.

    "},{"location":"how-to/configure-autoscaling/#model-settings-helm","title":"Model settings: helm","text":"

    If you are managing models via the kubeai/models Helm chart, you can use:

    # helm-values.yaml\ncatalog:\n  model-a:\n    # ...\n    minReplicas: 1\n    maxReplicas: 9\n    targetRequests: 250\n    scaleDownDelaySeconds: 45\n  model-b:\n    # ...\n    disableAutoscaling: true\n# ...\n

    Re-running helm upgrade with these additional parameters will update model settings in the cluster.

    "},{"location":"how-to/configure-autoscaling/#model-settings-kubectl","title":"Model settings: kubectl","text":"

    You can also specify the autoscaling profile directly via the Models custom resource in the Kubernetes API:

    apiVersion: kubeai.org/v1\nkind: Model\nmetadata:\n  name: my-model\nspec:\n  # ...\n  minReplicas: 1\n  maxReplicas: 9\n  targetRequests: 250\n  scaleDownDelaySeconds: 45\n

    If you are already managing models using Model manifest files, you can make the update to your file and reapply it using kubectl apply -f <filename>.yaml.

    "},{"location":"how-to/configure-embedding-models/","title":"Configure Embedding Models","text":"

    KubeAI supports the following engines for text embedding models:

    • Infinity
    • vLLM
    • Ollama

    Infinity supports any HuggingFace models listed as text-embedding. See the models, reranking or clip models on huggingface for reference.

    "},{"location":"how-to/configure-embedding-models/#install-baaibge-small-en-v15-model-using-infinity","title":"Install BAAI/bge-small-en-v1.5 model using Infinity","text":"

    Create a file named kubeai-models.yaml with the following content:

    catalog:\n  bge-embed-text-cpu:\n    enabled: true\n    features: [\"TextEmbedding\"]\n    owner: baai\n    url: \"hf://BAAI/bge-small-en-v1.5\"\n    engine: Infinity\n    resourceProfile: cpu:1\n    minReplicas: 1\n

    Apply the kubeai-models helm chart:

    helm install kubeai-models kubeai/models -f ./kubeai-models.yaml\n

    Once the pod is ready, you can use the OpenAI Python SDK to interact with the model:

    from openai import OpenAI\n# Assumes port-forward of kubeai service to localhost:8000.\nclient = OpenAI(api_key=\"ignored\", base_url=\"http://localhost:8000/openai/v1\")\nresponse = client.embeddings.create(\n    input=\"Your text goes here.\",\n    model=\"bge-embed-text-cpu\"\n)\n
    "},{"location":"how-to/configure-resource-profiles/","title":"Configure resource profiles","text":"

    This guide will cover modifying preconfigured resource profiles and adding your own.

    "},{"location":"how-to/configure-resource-profiles/#modifying-preconfigured-resource-profiles","title":"Modifying preconfigured resource profiles","text":"

    The KubeAI helm chart comes with preconfigured resource profiles for common resource types such as NVIDIA L4 GPUs. You can view these profiles in the default helm values file.

These profiles usually require some additional settings based on the cluster/cloud that KubeAI is installed into. You can modify a resource profile by setting custom helm values and running helm install or helm upgrade. For example, if you are installing KubeAI on GKE you will need to set GKE-specific node selectors:

    # helm-values.yaml\nresourceProfiles:\n  nvidia-gpu-l4:\n    nodeSelector:\n      cloud.google.com/gke-accelerator: \"nvidia-l4\"\n      cloud.google.com/gke-spot: \"true\"\n

    NOTE: See the cloud-specific installation guide for a comprehensive list of settings.

    "},{"location":"how-to/configure-resource-profiles/#adding-additional-resource-profiles","title":"Adding additional resource profiles","text":"

    If the preconfigured resource profiles do not meet your needs you can add additional profiles by appending to the .resourceProfiles object in the helm values file you use to install KubeAI.

    # helm-values.yaml\nresourceProfiles:\n  my-custom-gpu:\n    imageName: \"optional-custom-image-name\"\n    nodeSelector:\n      my-custom-node-pool: \"some-value\"\n    limits:\n      custom.com/gpu: \"1\"\n    requests:\n      custom.com/gpu: \"1\"\n      cpu: \"3\"\n      memory: \"12Gi\"\n    runtimeClassName: \"my-custom-runtime-class\"\n

    If you need to run custom model server images on your resource profile, make sure to also add those in the modelServers section:

    # helm-values.yaml\nmodelServers:\n  VLLM:\n    images:\n      optional-custom-image-name: \"my-repo/my-vllm-image:v1.2.3\"\n  OLlama:\n    images:\n      optional-custom-image-name: \"my-repo/my-ollama-image:v1.2.3\"\n
    "},{"location":"how-to/configure-resource-profiles/#next","title":"Next","text":"

    See the guide on how to install models which includes how to configure the resource profile to use for a given model.

    "},{"location":"how-to/configure-speech-to-text/","title":"Configure speech-to-text","text":"

    KubeAI provides a Speech to Text endpoint that can be used to transcribe audio files. This guide will walk you through the steps to enable this feature.

    "},{"location":"how-to/configure-speech-to-text/#enable-speech-to-text-model","title":"Enable Speech to Text model","text":"

    You can create new models by creating a Model CRD object or by enabling a model from the model catalog.

    "},{"location":"how-to/configure-speech-to-text/#enable-from-model-catalog","title":"Enable from model catalog","text":"

    KubeAI provides predefined models in the kubeai/models Helm chart. To enable the Speech to Text model, you can set the enabled flag to true in your values file.

    # models-helm-values.yaml\ncatalog:\n  faster-whisper-medium-en-cpu:\n    enabled: true\n    minReplicas: 1\n
    "},{"location":"how-to/configure-speech-to-text/#enable-by-creating-model-crd","title":"Enable by creating Model CRD","text":"

    You can also create a Model CRD object to enable the Speech to Text model. Here is an example of a Model CRD object for the Speech to Text model:

    apiVersion: kubeai.org/v1\nkind: Model\nmetadata:\n  name: faster-whisper-medium-en-cpu\nspec:\n  features: [SpeechToText]\n  owner: Systran\n  url: hf://Systran/faster-whisper-medium.en\n  engine: FasterWhisper\n  resourceProfile: cpu:1\n
    "},{"location":"how-to/configure-speech-to-text/#usage","title":"Usage","text":"

    The Speech to Text endpoint is available at /openai/v1/transcriptions.

    Example usage using curl:

    curl -L -o kubeai.mp4 https://github.com/user-attachments/assets/711d1279-6af9-4c6c-a052-e59e7730b757\ncurl http://localhost:8000/openai/v1/audio/transcriptions \\\n  -F \"file=@kubeai.mp4\" \\\n  -F \"language=en\" \\\n  -F \"model=faster-whisper-medium-en-cpu\"\n
    "},{"location":"how-to/install-models/","title":"Install models","text":"

    This guide provides instructions on how to configure KubeAI models.

    "},{"location":"how-to/install-models/#installing-models-with-helm","title":"Installing models with helm","text":"

    KubeAI provides a chart that contains preconfigured models.

    "},{"location":"how-to/install-models/#preconfigured-models-with-helm","title":"Preconfigured models with helm","text":"

    When you are defining Helm values for the kubeai/models chart you can install a preconfigured Model by setting enabled: true. You can view a list of all preconfigured models in the chart's default values file.

    # helm-values.yaml\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\n

    You can optionally override preconfigured settings, for example, resourceProfile:

    # helm-values.yaml\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\n    resourceProfile: nvidia-gpu-l4:2 # Require \"2 NVIDIA L4 GPUs\"\n
    "},{"location":"how-to/install-models/#custom-models-with-helm","title":"Custom models with helm","text":"

If you prefer to add a custom model via the same Helm chart you use for preconfigured KubeAI models, you can add your custom model entry into the .catalog array of your existing values file for the kubeai/models Helm chart:

    # helm-values.yaml\ncatalog:\n  my-custom-model-name:\n    enabled: true\n    features: [\"TextEmbedding\"]\n    owner: me\n    url: \"hf://me/my-custom-model\"\n    resourceProfile: CPU:1\n
    "},{"location":"how-to/install-models/#installing-models-with-kubectl","title":"Installing models with kubectl","text":"

    You can add your own model by defining a Model yaml file and applying it using kubectl apply -f model.yaml.

    If you have a running cluster with KubeAI installed you can inspect the schema for a Model using kubectl explain:

    kubectl explain models\nkubectl explain models.spec\nkubectl explain models.spec.engine\n
    "},{"location":"how-to/install-models/#programmatically-installing-models","title":"Programmatically installing models","text":"

    See the examples.

    "},{"location":"how-to/install-models/#feedback-welcome-a-model-management-ui","title":"Feedback welcome: A model management UI","text":"

    We are considering adding a UI for managing models in a running KubeAI instance. Give the GitHub Issue a thumbs up if you would be interested in this feature.

    "},{"location":"installation/eks/","title":"Install on EKS","text":"TIP: Make sure you have enough GPU quota in your AWS account.

    The default quotas for GPU instances are often 0. You will need to request a quota increase for the GPU instances you want to use.

The following quotas may require an increase if you wish to use GPUs in your EKS cluster (you can check your current limits as sketched below):
• All G and VT Spot Instance Requests
• All P5 Spot Instance Requests
• All P4, P3 and P2 Spot Instance Requests
• Running Dedicated p4d Hosts
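A quick way to inspect your current EC2 instance quotas (a sketch using the AWS Service Quotas CLI; adjust the filter to the quota names above):

aws service-quotas list-service-quotas \\\n    --service-code ec2 \\\n    --query 'Quotas[?contains(QuotaName, `Spot Instance Requests`)].[QuotaName,Value]' \\\n    --output table\n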

    "},{"location":"installation/eks/#1-create-eks-cluster-with-karpenter","title":"1. Create EKS cluster with Karpenter","text":"

    Set the environment variables used throughout this guide:

    export CLUSTER_NAME=\"cluster-with-karpenter\"\nexport AWS_DEFAULT_REGION=\"us-west-2\"\nexport K8S_VERSION=\"1.30\"\nexport GPU_AMI_ID=\"$(aws ssm get-parameter --name /aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2-gpu/recommended/image_id --query Parameter.Value --output text)\"\n

    Create the EKS cluster using eksctl:

    eksctl create cluster -f - <<EOF\n---\napiVersion: eksctl.io/v1alpha5\nkind: ClusterConfig\nmetadata:\n  name: \"${CLUSTER_NAME}\"\n  region: \"${AWS_DEFAULT_REGION}\"\n  version: \"${K8S_VERSION}\"\n  tags:\n    karpenter.sh/discovery: \"${CLUSTER_NAME}\" # here, it is set to the cluster name\n\niam:\n  withOIDC: true # required\n\nkarpenter:\n  version: '1.0.6' # Exact version must be specified\n\nmanagedNodeGroups:\n- instanceType: m5.large\n  amiFamily: AmazonLinux2\n  name: \"${CLUSTER_NAME}-m5-ng\"\n  desiredCapacity: 2\n  minSize: 1\n  maxSize: 10\nEOF\n

    "},{"location":"installation/eks/#2-configure-a-karpenter-gpu-nodepool","title":"2. Configure a Karpenter GPU NodePool","text":"

    Create the NodePool and EC2NodeClass objects:

    kubectl apply -f - <<EOF\napiVersion: karpenter.sh/v1\nkind: NodePool\nmetadata:\n  name: gpu\nspec:\n  template:\n    spec:\n      requirements:\n        - key: karpenter.sh/capacity-type\n          operator: In\n          values: [\"spot\", \"on-demand\"]\n        - key: karpenter.k8s.aws/instance-category\n          operator: In\n          values: [\"g\", \"p\"]\n      nodeClassRef:\n        group: karpenter.k8s.aws\n        kind: EC2NodeClass\n        name: gpu\n      expireAfter: 720h # 30 * 24h = 720h\n      taints:\n      - key: nvidia.com/gpu\n        value: \"true\"\n        effect: NoSchedule\n  limits:\n    cpu: 1000\n  disruption:\n    consolidationPolicy: WhenEmptyOrUnderutilized\n    consolidateAfter: 1m\n---\napiVersion: karpenter.k8s.aws/v1\nkind: EC2NodeClass\nmetadata:\n  name: gpu\nspec:\n  amiFamily: AL2 # Amazon Linux 2\n  role: \"eksctl-KarpenterNodeRole-${CLUSTER_NAME}\"\n  subnetSelectorTerms:\n    - tags:\n        karpenter.sh/discovery: \"${CLUSTER_NAME}\" # replace with your cluster name\n  securityGroupSelectorTerms:\n    - tags:\n        karpenter.sh/discovery: \"${CLUSTER_NAME}\" # replace with your cluster name\n  amiSelectorTerms:\n    - id: \"${GPU_AMI_ID}\" # <- GPU Optimized AMD AMI \n  blockDeviceMappings:\n    - deviceName: /dev/xvda\n      ebs:\n        volumeSize: 300Gi\n        volumeType: gp3\n        encrypted: true\nEOF\n

    Install the NVIDIA device plugin (needed for Karpenter nodes):

    kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.16.1/deployments/static/nvidia-device-plugin.yml\n
    "},{"location":"installation/eks/#3-install-kubeai","title":"3. Install KubeAI","text":"

    Add KubeAI Helm repository.

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\n

    Make sure you have a HuggingFace Hub token set in your environment (HUGGING_FACE_HUB_TOKEN).

    export HF_TOKEN=\"replace-with-your-huggingface-token\"\n

    Install KubeAI with Helm.

    curl -L -O https://raw.githubusercontent.com/substratusai/kubeai/refs/heads/main/charts/kubeai/values-eks.yaml\n# Please review the values-eks.yaml file and edit the nodeSelectors if needed.\ncat values-eks.yaml\nhelm upgrade --install kubeai kubeai/kubeai \\\n    -f values-eks.yaml \\\n    --set secrets.huggingface.token=$HF_TOKEN \\\n    --wait\n
    "},{"location":"installation/eks/#3-optionally-configure-models","title":"3. Optionally configure models","text":"

    Optionally install preconfigured models.

    cat <<EOF > kubeai-models.yaml\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\nEOF\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-models.yaml\n
    "},{"location":"installation/gke/","title":"Install on GKE","text":"TIP: Make sure you have enough quota in your GCP project.

    Open the cloud console quotas page: https://console.cloud.google.com/iam-admin/quotas. Make sure your project is selected in the top left.

You will need to verify that you have enough quota for the accelerators you want to use. Below is a table of common quotas you will have to increase depending on your needs.

Quota | Location | Min Value
Preemptible TPU v5 Lite Podslice chips | <your-region> | 8
Preemptible NVIDIA L4 GPUs | <your-region> | 2
GPUs (all regions) | - | 2
CPUs (all regions) | - | 24

    See the following screenshot examples of how these quotas appear in the console:

    "},{"location":"installation/gke/#1-create-a-cluster","title":"1. Create a cluster","text":""},{"location":"installation/gke/#option-gke-autopilot","title":"Option: GKE Autopilot","text":"

    Create an Autopilot cluster (replace us-central1 with a region that you have quota).

    gcloud container clusters create-auto cluster-1 \\\n    --location=us-central1\n
    "},{"location":"installation/gke/#option-gke-standard","title":"Option: GKE Standard","text":"

    TODO: Reference gcloud commands for creating a GKE standard cluster.

    "},{"location":"installation/gke/#2-install-kubeai","title":"2. Install KubeAI","text":"

    Add KubeAI Helm repository.

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\n

    Make sure you have a HuggingFace Hub token set in your environment (HUGGING_FACE_HUB_TOKEN).
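For example (using a placeholder value, mirroring the EKS guide above):

export HUGGING_FACE_HUB_TOKEN=\"replace-with-your-huggingface-token\"\n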

    Install KubeAI with Helm.

    curl -L -O https://raw.githubusercontent.com/substratusai/kubeai/refs/heads/main/charts/kubeai/values-gke.yaml\nhelm upgrade --install kubeai kubeai/kubeai \\\n    -f values-gke.yaml \\\n    --set secrets.huggingface.token=$HUGGING_FACE_HUB_TOKEN \\\n    --wait\n
    "},{"location":"installation/gke/#3-optionally-configure-models","title":"3. Optionally configure models","text":"

    Optionally install preconfigured models.

    cat <<EOF > kubeai-models.yaml\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\nEOF\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-models.yaml\n
    "},{"location":"reference/kubernetes-api/","title":"Kubernetes API","text":""},{"location":"reference/kubernetes-api/#packages","title":"Packages","text":"
    • kubeai.org/v1
    "},{"location":"reference/kubernetes-api/#kubeaiorgv1","title":"kubeai.org/v1","text":"

    Package v1 contains API Schema definitions for the kubeai v1 API group

    "},{"location":"reference/kubernetes-api/#resource-types","title":"Resource Types","text":"
    • Model
    "},{"location":"reference/kubernetes-api/#model","title":"Model","text":"

    Model resources define the ML models that will be served by KubeAI.

    Field Description Default Validation apiVersion string kubeai.org/v1 kind string Model metadata ObjectMeta Refer to Kubernetes API documentation for fields of metadata. spec ModelSpec status ModelStatus"},{"location":"reference/kubernetes-api/#modelfeature","title":"ModelFeature","text":"

    Underlying type: string

    Validation: - Enum: [TextGeneration TextEmbedding SpeechToText]

    Appears in: - ModelSpec

    "},{"location":"reference/kubernetes-api/#modelspec","title":"ModelSpec","text":"

    ModelSpec defines the desired state of Model.

    Appears in: - Model

    Field Description Default Validation url string URL of the model to be served.Currently only the following formats are supported:For VLLM & FasterWhisper engines: \"hf:///\"For OLlama engine: \"ollama:// Required: {} features ModelFeature array Features that the model supports.Dictates the APIs that are available for the model. Enum: [TextGeneration TextEmbedding SpeechToText] engine string Engine to be used for the server process. Enum: [OLlama VLLM FasterWhisper Infinity] Required: {} resourceProfile string ResourceProfile required to serve the model.Use the format \":\".Example: \"nvidia-gpu-l4:2\" - 2x NVIDIA L4 GPUs.Must be a valid ResourceProfile defined in the system config. cacheProfile string CacheProfile to be used for caching model artifacts.Must be a valid CacheProfile defined in the system config. image string Image to be used for the server process.Will be set from ResourceProfile + Engine if not specified. args string array Args to be added to the server process. env object (keys:string, values:string) Env variables to be added to the server process. replicas integer Replicas is the number of Pod replicas that should be activelyserving the model. KubeAI will manage this field unless AutoscalingDisabledis set to true. minReplicas integer MinReplicas is the minimum number of Pod replicas that the model can scale down to.Note: 0 is a valid value. Minimum: 0 Optional: {} maxReplicas integer MaxReplicas is the maximum number of Pod replicas that the model can scale up to.Empty value means no limit. Minimum: 1 autoscalingDisabled boolean AutoscalingDisabled will stop the controller from managing the replicasfor the Model. When disabled, metrics will not be collected on server Pods. targetRequests integer TargetRequests is average number of active requests that the autoscalerwill try to maintain on model server Pods. 100 Minimum: 1 scaleDownDelaySeconds integer ScaleDownDelay is the minimum time before a deployment is scaled down afterthe autoscaling algorithm determines that it should be scaled down. 30 owner string Owner of the model. Used solely to populate the owner field in theOpenAI /v1/models endpoint.DEPRECATED. Optional: {}"},{"location":"reference/kubernetes-api/#modelstatus","title":"ModelStatus","text":"

    ModelStatus defines the observed state of Model.

    Appears in: - Model

    Field Description Default Validation replicas ModelStatusReplicas cache ModelStatusCache"},{"location":"reference/kubernetes-api/#modelstatuscache","title":"ModelStatusCache","text":"

    Appears in: - ModelStatus

    Field Description Default Validation loaded boolean"},{"location":"reference/kubernetes-api/#modelstatusreplicas","title":"ModelStatusReplicas","text":"

    Appears in: - ModelStatus

    Field Description Default Validation all integer ready integer"},{"location":"reference/openai-api-compatibility/","title":"OpenAI API Compatibility","text":"

KubeAI provides an OpenAI API compatibility layer.

    "},{"location":"reference/openai-api-compatibility/#general","title":"General:","text":""},{"location":"reference/openai-api-compatibility/#models","title":"Models","text":"
    GET /v1/models\n
• Lists all kind: Model objects installed in the Kubernetes API Server.
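
For reference, a minimal sketch of calling this endpoint with the official OpenAI Python client (assuming a port-forward of the kubeai Service to localhost:8000, as used elsewhere in these docs):

from openai import OpenAI\n\n# Assumes: kubectl port-forward svc/kubeai 8000:80\nclient = OpenAI(api_key=\"ignored\", base_url=\"http://localhost:8000/openai/v1\")\n\n# Each returned entry corresponds to a kind: Model object in the cluster.\nfor model in client.models.list():\n    print(model.id)\n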
    "},{"location":"reference/openai-api-compatibility/#inference","title":"Inference","text":""},{"location":"reference/openai-api-compatibility/#text-generation","title":"Text Generation","text":"
    POST /v1/chat/completions\nPOST /v1/completions\n
    • Supported for Models with .spec.features: [\"TextGeneration\"].
    "},{"location":"reference/openai-api-compatibility/#embeddings","title":"Embeddings","text":"
    POST /v1/embeddings\n
    • Supported for Models with .spec.features: [\"TextEmbedding\"].
    "},{"location":"reference/openai-api-compatibility/#speech-to-text","title":"Speech-to-Text","text":"
    POST /v1/audio/transcriptions\n
    • Supported for Models with .spec.features: [\"SpeechToText\"].
    "},{"location":"reference/openai-api-compatibility/#openai-client-libaries","title":"OpenAI Client libaries","text":"

    You can use the official OpenAI client libraries by setting the base_url to the KubeAI endpoint.

    For example, you can use the Python client like this:

    from openai import OpenAI\nclient = OpenAI(api_key=\"ignored\",\n                base_url=\"http://kubeai/openai/v1\")\nresponse = client.chat.completions.create(\n  model=\"gemma2-2b-cpu\",\n  messages=[\n    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n    {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n    {\"role\": \"assistant\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n    {\"role\": \"user\", \"content\": \"Where was it played?\"}\n  ]\n)\n

    "},{"location":"tutorials/langchain/","title":"Using LangChain with KubeAI","text":"

    LangChain makes it easy to build applications powered by LLMs. KubeAI makes it easy to deploy and manage LLMs at scale. Together, they make it easy to build and deploy private and secure AI applications.

    In this tutorial, we'll show you how to use LangChain with KubeAI's OpenAI compatible API. The beauty of KubeAI's OpenAI compatibility is that you can use KubeAI with any framework that supports OpenAI.

    "},{"location":"tutorials/langchain/#prerequisites","title":"Prerequisites","text":"

    A K8s cluster. You can use a local cluster like kind.

    "},{"location":"tutorials/langchain/#installing-kubeai-with-gemma-2b","title":"Installing KubeAI with Gemma 2B","text":"

    Run the following command to install KubeAI with Gemma 2B:

helm repo add kubeai https://www.kubeai.org\nhelm repo update\n\ncat <<EOF > models-helm-values.yaml\ncatalog:\n  gemma2-2b-cpu:\n    enabled: true\n    minReplicas: 1\nEOF\n\nhelm install kubeai kubeai/kubeai \\\n    --wait --timeout 10m\n\nhelm install kubeai-models kubeai/models \\\n    -f ./models-helm-values.yaml\n
    "},{"location":"tutorials/langchain/#using-langchain","title":"Using LangChain","text":"

    Install the required Python packages:

    pip install langchain_openai\n

    Let's access the KubeAI OpenAI compatible API locally to make it easier.

    Run the following command to port-forward to the KubeAI service:

    kubectl port-forward svc/kubeai 8000:80\n
    Now the KubeAI OpenAI compatible API is available at http://localhost:8000/openai from your local machine.

    Let's create a simple Python script that uses LangChain and is connected to KubeAI.

    Create a file named test-langchain.py with the following content:

    from langchain_openai import ChatOpenAI\n\nllm = ChatOpenAI(\n    model=\"gemma2-2b-cpu\",\n    temperature=0,\n    max_tokens=None,\n    timeout=None,\n    max_retries=2,\n    api_key=\"thisIsIgnored\",\n    base_url=\"http://localhost:8000/openai/v1\",\n)\n\nmessages = [\n    (\n        \"system\",\n        \"You are a helpful assistant that translates English to French. Translate the user sentence.\",\n    ),\n    (\"human\", \"I love programming.\"),\n]\nai_msg = llm.invoke(messages)\nprint(ai_msg.content)\n

    Run the Python script:

    python test-langchain.py\n

Notice that we set base_url to http://localhost:8000/openai/v1. This tells LangChain to use our local KubeAI OpenAI compatible API instead of the default OpenAI public API.

If you run LangChain within the K8s cluster, you can use the following base_url instead: http://kubeai/openai/v1. So the code would look like this:

    llm = ChatOpenAI(\n    ...\n    base_url=\"http://kubeai/openai/v1\",\n)\n

    That's it! You've successfully used LangChain with KubeAI. Now you can build and deploy private and secure AI applications with ease.

    "},{"location":"tutorials/langtrace/","title":"Deploying KubeAI with Langtrace","text":"

Langtrace is an open source tool that helps you trace and monitor your AI calls. It includes a self-hosted UI that, for example, shows you the estimated costs of your LLM calls.

    KubeAI is used for deploying LLMs with an OpenAI compatible endpoint.

    In this tutorial you will learn how to deploy KubeAI and Langtrace end-to-end. Both KubeAI and Langtrace are installed in your Kubernetes cluster. No cloud services or external dependencies are required.

    If you don't have a K8s cluster yet, you can create one using kind or minikube.

    kind create cluster # OR: minikube start\n

    Install Langtrace:

    helm repo add langtrace https://Scale3-Labs.github.io/langtrace-helm-chart\nhelm repo update\nhelm install langtrace langtrace/langtrace\n

    Install KubeAI and wait for all components to be ready (may take a minute).

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\nhelm install kubeai kubeai/kubeai --wait --timeout 10m\n

    Install the gemma2-2b-cpu model:

    cat <<EOF > kubeai-models.yaml\ncatalog:\n  gemma2-2b-cpu:\n    enabled: true\n    minReplicas: 1\nEOF\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-models.yaml\n

    Create a local Python environment and install dependencies:

    python3 -m venv .venv\nsource .venv/bin/activate\npip install langtrace-python-sdk openai\n

    Expose the KubeAI service to your local port:

    kubectl port-forward service/kubeai 8000:80\n

    Expose the Langtrace service to your local port:

    kubectl port-forward service/langtrace-app 3000:3000\n

A Langtrace API key is required to use the Langtrace SDK. So let's get one by visiting your self-hosted Langtrace UI.

    Open your browser to http://localhost:3000, create a project and get the API keys for your langtrace project.

    In the Python script below, replace langtrace_api_key with your API key.

Create a file named langtrace-example.py with the following content:

# Replace this with your langtrace API key by visiting http://localhost:3000\nlangtrace_api_key=\"f7e003de19b9a628258531c17c264002e985604ca9fa561debcc85c41f357b09\"\n\nfrom langtrace_python_sdk import langtrace\nfrom langtrace_python_sdk.utils.with_root_span import with_langtrace_root_span\n\nfrom openai import OpenAI\n\nlangtrace.init(\n    api_key=langtrace_api_key,\n    api_host=\"http://localhost:3000/api/trace\",\n)\n\nbase_url = \"http://localhost:8000/openai/v1\"\nmodel = \"gemma2-2b-cpu\"\n\n@with_langtrace_root_span()\ndef example():\n    client = OpenAI(base_url=base_url, api_key=\"ignored-by-kubeai\")\n    response = client.chat.completions.create(\n        model=model,\n        messages=[\n            {\n                \"role\": \"system\",\n                \"content\": \"How many states of matter are there?\"\n            }\n        ],\n    )\n    print(response.choices[0].message.content)\n\nexample()\n

    Run the Python script:

    python3 langtrace-example.py\n

    Now you should see the trace in your Langtrace UI. Take a look by visiting http://localhost:3000.

    "},{"location":"tutorials/weaviate/","title":"Weaviate with local autoscaling embedding and generative models","text":"

    Weaviate is a vector search engine that can integrate seamlessly with KubeAI's embedding and generative models. This tutorial demonstrates how to deploy both KubeAI and Weaviate in a Kubernetes cluster, using KubeAI as the OpenAI endpoint for Weaviate.

    Why use KubeAI with Weaviate?

    • Security and privacy: KubeAI runs locally in your Kubernetes cluster, so your data never leaves your infrastructure.
    • Cost savings: KubeAI can run on your existing hardware, reducing the need for paying for embeddings and generative models.

This tutorial uses CPU-only models, so it should work even on your laptop.

As you go through this tutorial, you will learn how to:

    • Deploy KubeAI with embedding and generative models
    • Install Weaviate and connect it to KubeAI
    • Import data into Weaviate
    • Perform semantic search using the embedding model
    • Perform generative search using the generative model
    "},{"location":"tutorials/weaviate/#prerequisites","title":"Prerequisites","text":"

    A Kubernetes cluster. You can use kind or minikube.

    kind create cluster\n
    "},{"location":"tutorials/weaviate/#kubeai-configuration","title":"KubeAI Configuration","text":"

Let's start by deploying KubeAI with the models we want to use. The Nomic embedding model is used instead of text-embedding-ada-002, and Gemma 2 2B is used instead of gpt-3.5-turbo. You could choose bigger models depending on your available hardware.

    Create a file named kubeai-model-values.yaml with the following content:

    catalog:\n  text-embedding-ada-002:\n    enabled: true\n    minReplicas: 1\n    features: [\"TextEmbedding\"]\n    owner: nomic\n    url: \"ollama://nomic-embed-text\"\n    engine: OLlama\n    resourceProfile: cpu:1\n  gpt-3.5-turbo:\n    enabled: true\n    minReplicas: 1\n    features: [\"TextGeneration\"]\n    owner: google\n    url: \"ollama://gemma2:2b\"\n    engine: OLlama\n    resourceProfile: cpu:2\n

    Note: It's important that you name the models as text-embedding-ada-002 and gpt-3.5-turbo as Weaviate expects these names.

    Run the following command to deploy KubeAI and install the configured models:

    helm repo add kubeai https://www.kubeai.org && helm repo update\n\nhelm install kubeai kubeai/kubeai\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-model-values.yaml\n

    "},{"location":"tutorials/weaviate/#weaviate-installation","title":"Weaviate Installation","text":"

    For this tutorial, we will use the Weaviate Helm chart to deploy Weaviate.

    Let's enable the text2vec-openai and generative-openai modules in Weaviate. We will also set the default vectorizer module to text2vec-openai.

    The apiKey is ignored in this case as we are using KubeAI as the OpenAI endpoint.

    Create a file named weaviate-values.yaml with the following content:

    modules:\n  text2vec-openai:\n    enabled: true\n    apiKey: thisIsIgnored\n  generative-openai:\n    enabled: true\n    apiKey: thisIsIgnored\n  default_vectorizer_module: text2vec-openai\nservice:\n  # To prevent Weaviate being exposed publicly\n  type: ClusterIP\n

    Install Weaviate by running the following command:

    helm repo add weaviate https://weaviate.github.io/weaviate-helm && helm repo update\n\nhelm install \\\n  \"weaviate\" \\\n  weaviate/weaviate \\\n  -f weaviate-values.yaml\n

    "},{"location":"tutorials/weaviate/#usage","title":"Usage","text":"

We will be using Python to interact with Weaviate. The two use cases we will cover are: semantic search using the embedding model, and generative search using the generative model.

    "},{"location":"tutorials/weaviate/#connectivity","title":"Connectivity","text":"

The remaining steps require connectivity to the Weaviate service. However, Weaviate is not exposed publicly in this setup, so we set up local port forwards to access the Weaviate services.

Set up local port forwards to the Weaviate services by running:

    kubectl port-forward svc/weaviate 8080:80\nkubectl port-forward svc/weaviate-grpc 50051:50051\n

    "},{"location":"tutorials/weaviate/#weaviate-client-python-setup","title":"Weaviate client Python Setup","text":"

    Create a virtual environment and install the Weaviate client:

    python -m venv .venv\nsource .venv/bin/activate\npip install -U weaviate-client requests\n

    "},{"location":"tutorials/weaviate/#collection-and-data-import","title":"Collection and Data Import","text":"

    Create a file named create-collection.py with the following content:

    import json\nimport weaviate\nimport requests\nfrom weaviate.classes.config import Configure\n\n# This works due to port forward in previous step\nwith weaviate.connect_to_local(port=8080, grpc_port=50051) as client:\n\n    client.collections.create(\n        \"Question\",\n        vectorizer_config=Configure.Vectorizer.text2vec_openai(\n                model=\"text-embedding-ada-002\",\n                base_url=\"http://kubeai/openai\",\n        ),\n        generative_config=Configure.Generative.openai(\n            model=\"gpt-3.5-turbo\",\n            base_url=\"http://kubeai/openai\",\n        ),\n    )\n\n    # import data\n    resp = requests.get('https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json')\n    data = json.loads(resp.text)  # Load data\n\n    question_objs = list()\n    for i, d in enumerate(data):\n        question_objs.append({\n            \"answer\": d[\"Answer\"],\n            \"question\": d[\"Question\"],\n            \"category\": d[\"Category\"],\n        })\n\n    questions = client.collections.get(\"Question\")\n    questions.data.insert_many(question_objs)\n    print(\"Data imported successfully\")\n

Run the script to create a collection that uses KubeAI as the OpenAI endpoint:

    python create-collection.py\n
    You should see a message Data imported successfully.

    The collection is now created and data is imported. The vectors are generated by KubeAI and stored in Weaviate.

    "},{"location":"tutorials/weaviate/#semantic-search","title":"Semantic Search","text":"

    Now let's do semantic search, which uses the embeddings. Create a file named search.py with the following content:

    import weaviate\nfrom weaviate.classes.config import Configure\n\n# This works due to port forward in previous step\nwith weaviate.connect_to_local(port=8080, grpc_port=50051) as client:\n    questions = client.collections.get(\"Question\")\n    response = questions.query.near_text(\n        query=\"biology\",\n        limit=2\n    )\n    print(response.objects[0].properties)  # Inspect the first object\n

    Execute the python script:

    python search.py\n

    You should see the following output:

    {\n  \"answer\": \"DNA\",\n  \"question\": \"In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance\",\n  \"category\": \"SCIENCE\"\n}\n

    "},{"location":"tutorials/weaviate/#generative-search-rag","title":"Generative Search (RAG)","text":"

    Now let's do generative search, which uses the generative model (Text generation LLM). The generative model is run locally and managed by KubeAI.

    Create a file named generate.py with the following content:

    import weaviate\nfrom weaviate.classes.config import Configure\n\n# This works due to port forward in previous step\nwith weaviate.connect_to_local(port=8080, grpc_port=50051) as client:\n    questions = client.collections.get(\"Question\")\n\n    response = questions.generate.near_text(\n        query=\"biology\",\n        limit=2,\n        grouped_task=\"Write a tweet with emojis about these facts.\"\n    )\n\n    print(response.generated)  # Inspect the generated text\n

    Run the python script:

    python generate.py\n

    You should see something similar to this:

    \ud83e\uddec Watson & Crick cracked the code in 1953! \ud83e\udd2f They built a model of DNA, the blueprint of life. \ud83e\uddec \ud83e\udde0 Liver power! \ud83d\udcaa This organ keeps your blood sugar balanced by storing glucose as glycogen. \ud83e\ude78 #ScienceFacts #Biology

    "},{"location":"tutorials/weaviate/#conclusion","title":"Conclusion","text":"

    You've now successfully set up KubeAI with Weaviate for both embedding-based semantic search and generative tasks. You've also learned how to import data, perform searches, and generate content using KubeAI-managed models.

    "}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"KubeAI: Private Open AI on Kubernetes","text":"

    Get inferencing running on Kubernetes: LLMs, Embeddings, Speech-to-Text.

    \u2705\ufe0f Drop-in replacement for OpenAI with API compatibility \ud83e\udde0 Serve top OSS models (LLMs, Whisper, etc.) \ud83d\ude80 Multi-platform: CPU-only, GPU, coming soon: TPU \u2696\ufe0f Scale from zero, autoscale based on load \ud83d\udee0\ufe0f Zero dependencies (does not depend on Istio, Knative, etc.) \ud83d\udcac Chat UI included (OpenWebUI) \ud83e\udd16 Operates OSS model servers (vLLM, Ollama, FasterWhisper, Infinity) \u2709 Stream/batch inference via messaging integrations (Kafka, PubSub, etc.)

    Quotes from the community:

    reusable, well abstracted solution to run LLMs - Mike Ensor

    "},{"location":"#architecture","title":"Architecture","text":"

    KubeAI serves an OpenAI compatible HTTP API. Admins can configure ML models via kind: Model Kubernetes Custom Resources. KubeAI can be thought of as a Model Operator (See Operator Pattern) that manages vLLM and Ollama servers.

    "},{"location":"#local-quickstart","title":"Local Quickstart","text":"

    Create a local cluster using kind or minikube.

    TIP: If you are using Podman for kind... Make sure your Podman machine can use up to 6G of memory (by default it is capped at 2G):
    # You might need to stop and remove the existing machine:\npodman machine stop\npodman machine rm\n\n# Init and start a new machine:\npodman machine init --memory 6144 --disk-size 120\npodman machine start\n
    kind create cluster # OR: minikube start\n

    Add the KubeAI Helm repository.

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\n

    Install KubeAI and wait for all components to be ready (may take a minute).

    helm install kubeai kubeai/kubeai --wait --timeout 10m\n

    Install some predefined models.

    cat <<EOF > kubeai-models.yaml\ncatalog:\n  gemma2-2b-cpu:\n    enabled: true\n    minReplicas: 1\n  qwen2-500m-cpu:\n    enabled: true\n  nomic-embed-text-cpu:\n    enabled: true\nEOF\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-models.yaml\n

    Before progressing to the next steps, start a watch on Pods in a standalone terminal to see how KubeAI deploys models.

    kubectl get pods --watch\n
    "},{"location":"#interact-with-gemma2","title":"Interact with Gemma2","text":"

Because we set minReplicas: 1 for the Gemma model, you should see a model Pod already coming up.

    Start a local port-forward to the bundled chat UI.

    kubectl port-forward svc/openwebui 8000:80\n

    Now open your browser to localhost:8000 and select the Gemma model to start chatting with.

    "},{"location":"#scale-up-qwen2-from-zero","title":"Scale up Qwen2 from Zero","text":"

    If you go back to the browser and start a chat with Qwen2, you will notice that it will take a while to respond at first. This is because we set minReplicas: 0 for this model and KubeAI needs to spin up a new Pod (you can verify with kubectl get models -oyaml qwen2-500m-cpu).

    "},{"location":"#documentation","title":"Documentation","text":"

Check out our documentation on kubeai.org to find info on:

    • Installing KubeAI in the cloud
    • How to guides (e.g. how to manage models and resource profiles).
    • Concepts (how the components of KubeAI work).
    • How to contribute
    "},{"location":"#adopters","title":"Adopters","text":"

    List of known adopters:

    Name Description Link Telescope Telescope uses KubeAI for multi-region large scale batch LLM inference. trytelescope.ai Google Cloud Distributed Edge KubeAI is included as a reference architecture for inferencing at the edge. LinkedIn, GitLab

    If you are using KubeAI and would like to be listed as an adopter, please make a PR.

    "},{"location":"#openai-api-compatibility","title":"OpenAI API Compatibility","text":"
    # Implemented #\n/v1/chat/completions\n/v1/completions\n/v1/embeddings\n/v1/models\n/v1/audio/transcriptions\n\n# Planned #\n# /v1/assistants/*\n# /v1/batches/*\n# /v1/fine_tuning/*\n# /v1/images/*\n# /v1/vector_stores/*\n
    "},{"location":"#immediate-roadmap","title":"Immediate Roadmap","text":"
    • Model caching
    • LoRA finetuning (compatible with OpenAI finetuning API)
    • Image generation (compatible with OpenAI images API)

    NOTE: KubeAI was born out of a project called Lingo which was a simple Kubernetes LLM proxy with basic autoscaling. We relaunched the project as KubeAI (late August 2024) and expanded the roadmap to what it is today.

    \ud83c\udf1f Don't forget to drop us a star on GitHub and follow the repo to stay up to date!

    "},{"location":"#contact","title":"Contact","text":"

    Let us know about features you are interested in seeing or reach out with questions. Visit our Discord channel to join the discussion!

    Or just reach out on LinkedIn if you want to connect:

    • Nick Stogner
    • Sam Stoelinga
    "},{"location":"benchmarks/llama-3.2-11b-vision/","title":"Llama 3.2 11B Vision Instruct vLLM Benchmarks","text":"

    Single L4 GPU vLLM 0.6.2

    python3 benchmark_serving.py --backend openai \\\n    --base-url http://localhost:8000/openai \\\n    --dataset-name=sharegpt --dataset-path=ShareGPT_V3_unfiltered_cleaned_split.json \\\n    --model meta-llama-3.2-11b-vision-instruct \\\n    --seed 12345 --tokenizer neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic\n============ Serving Benchmark Result ============\nSuccessful requests:                     1000\nBenchmark duration (s):                  681.93\nTotal input tokens:                      230969\nTotal generated tokens:                  194523\nRequest throughput (req/s):              1.47\nOutput token throughput (tok/s):         285.25\nTotal Token throughput (tok/s):          623.95\n---------------Time to First Token----------------\nMean TTFT (ms):                          319146.12\nMedian TTFT (ms):                        322707.98\nP99 TTFT (ms):                           642512.79\n-----Time per Output Token (excl. 1st token)------\nMean TPOT (ms):                          54.84\nMedian TPOT (ms):                        53.66\nP99 TPOT (ms):                           83.75\n---------------Inter-token Latency----------------\nMean ITL (ms):                           54.09\nMedian ITL (ms):                         47.44\nP99 ITL (ms):                            216.77\n==================================================\n

    "},{"location":"concepts/autoscaling/","title":"Autoscaling","text":"

KubeAI proxies HTTP and messaging (e.g. Kafka) requests and messages to models. It adjusts the number of Pods serving a given model based on the average number of active requests. If no Pods are running when a request comes in, KubeAI will hold the request, scale up a Pod, and forward the request when the Pod is ready. This process happens in a manner that is transparent to the end client (other than the added delay from a cold-start).
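
As a rough illustration (not the actual controller code), the per-model scaling decision can be sketched as: divide the averaged number of active requests by the model's targetRequests, then clamp the result between minReplicas and maxReplicas.

import math\nfrom typing import Optional\n\n# Simplified sketch of the autoscaling decision. The real controller also\n# averages over a configurable time window and applies scaleDownDelaySeconds.\ndef desired_replicas(avg_active_requests: float, target_requests: int,\n                     min_replicas: int, max_replicas: Optional[int]) -> int:\n    desired = math.ceil(avg_active_requests / target_requests)\n    desired = max(desired, min_replicas)\n    if max_replicas is not None:\n        desired = min(desired, max_replicas)\n    return desired\n\n# Example: 250 active requests with targetRequests=100 -> 3 replicas.\nprint(desired_replicas(250, 100, min_replicas=0, max_replicas=None))\n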

    "},{"location":"concepts/autoscaling/#next","title":"Next","text":"

    Read about how to configure autoscaling.

    "},{"location":"concepts/backend-servers/","title":"Backend Servers","text":"

    KubeAI serves ML models by launching Pods on Kubernetes. The configuration and lifecycle of these Pods are managed by the KubeAI controller. Every model server Pod loads exactly one model on startup.

    In a Model manifest you can define what server to use for inference (VLLM, OLlama). Any model-specific settings can be passed to the server process via the args and env fields.

    "},{"location":"concepts/backend-servers/#next","title":"Next","text":"

    Read about how to install models.

    "},{"location":"concepts/resource-profiles/","title":"Resource Profiles","text":"

A resource profile maps a type of compute resource (e.g. NVIDIA L4 GPU) to a collection of Kubernetes settings that are configured on inference server Pods. These profiles are defined in the KubeAI config.yaml file (via a ConfigMap). Each model specifies the resource profile that it requires.

    Kubernetes Model resources specify a resource profile and the count of that resource that they require (for example resourceProfile: nvidia-gpu-l4:2 - 2x L4 GPUs).

    A given profile might need to contain slightly different settings based on the cluster/cloud that KubeAI is deployed in.

    Example: A resource profile named nvidia-gpu-l4 might contain the following node selectors when installing KubeAI on a GKE Kubernetes cluster:

    cloud.google.com/gke-accelerator: \"nvidia-l4\"\ncloud.google.com/gke-spot: \"true\"\n

    and add the following resource requests to the model server Pods:

    nvidia.com/gpu: \"1\"\n

    In addition to node selectors and resource requirements, a resource profile may optionally specify an image name. This name maps to the container image that will be selected when serving a model on that resource.

    "},{"location":"concepts/resource-profiles/#next","title":"Next","text":"

    Read about how to configure resource profiles.

    "},{"location":"concepts/storage-caching/","title":"Storage / Caching","text":"

    With \"Large\" in the name, caching is a critical part of serving LLMs.

The best caching technique may vary depending on your environment:

    • What cloud features are available?
    • Is your cluster deployed in an air-gapped environment?
    "},{"location":"concepts/storage-caching/#a-model-built-into-container","title":"A. Model built into container","text":"

    Status: Supported

    Building a model into a container image can provide a simple way to take advantage of image-related optimizations built into Kubernetes:

    • Relaunching a model server on the same Node that it ran on before will likely be able to reuse the previously pulled image.

    • Secondary boot disks on GKE can be used to avoid needing to pull images.

    • Image streaming on GKE can allow for containers to startup before the entire image is present on the Node.

    • Container images can be pre-installed on Nodes in air-gapped environments (example: k3s airgap installation).

    Guides:

    • How to build models into container images
    "},{"location":"concepts/storage-caching/#b-model-on-shared-filesystem-read-write-many","title":"B. Model on shared filesystem (read-write-many)","text":"

    KubeAI can manage model caches on a shared filesystem (i.e. AWS EFS, GCP Filestore, NFS). It manages the full lifecycle of a cached model: loading, serving, and cache eviction (on deletion of the Model).

    "},{"location":"concepts/storage-caching/#c-model-on-read-only-many-disk","title":"C. Model on read-only-many disk","text":"

    Status: Planned.

    Examples: GCP Hyperdisk ML

    "},{"location":"contributing/development-environment/","title":"Development environment","text":"

    This document provides instructions for setting up an environment for developing KubeAI.

    "},{"location":"contributing/development-environment/#optional-cloud-setup","title":"Optional: Cloud Setup","text":""},{"location":"contributing/development-environment/#gcp-pubsub","title":"GCP PubSub","text":"

If you are developing the PubSub messaging integration on GCP, set up test topics and subscriptions and uncomment .messaging.streams in ./hack/dev-config.yaml.

    gcloud auth login --update-adc\n\ngcloud pubsub topics create test-kubeai-requests\ngcloud pubsub subscriptions create test-kubeai-requests-sub --topic test-kubeai-requests\ngcloud pubsub topics create test-kubeai-responses\ngcloud pubsub subscriptions create test-kubeai-responses-sub --topic test-kubeai-responses\n
    "},{"location":"contributing/development-environment/#run-in-local-cluster","title":"Run in Local Cluster","text":"
kind create cluster\n# OR\n#./hack/create-dev-gke-cluster.yaml\n\n# Generate CRDs from Go code.\nmake generate && make manifests\n\n# When CRDs are changed reapply using kubectl:\nkubectl apply -f ./charts/kubeai/charts/crds/crds\n\n# Model with special address annotations:\nkubectl apply -f ./hack/dev-model.yaml\n\n# OPTION A #\n# Run KubeAI inside cluster\n# Change `-f` based on the cluster environment.\nhelm upgrade --install kubeai ./charts/kubeai \\\n    --set openwebui.enabled=true \\\n    --set image.tag=latest \\\n    --set image.pullPolicy=Always \\\n    --set image.repository=us-central1-docker.pkg.dev/substratus-dev/default/kubeai \\\n    --set secrets.huggingface.token=$HUGGING_FACE_HUB_TOKEN \\\n    --set replicaCount=1 -f ./hack/dev-gke-helm-values.yaml\n\n# OPTION B #\n# For quick local iteration (run KubeAI outside of cluster)\nkubectl create cm kubeai-autoscaler-state -oyaml --dry-run=client | kubectl apply -f -\nCONFIG_PATH=./hack/dev-config.yaml POD_NAMESPACE=default go run ./cmd/main.go\n\n# In another terminal:\nwhile true; do kubectl port-forward service/dev-model 7000:7000; done\n############\n
    "},{"location":"contributing/development-environment/#running","title":"Running","text":""},{"location":"contributing/development-environment/#completions-api","title":"Completions API","text":"
    # If you are running kubeai in-cluster:\n# kubectl port-forward svc/kubeai 8000:80\n\ncurl http://localhost:8000/openai/v1/completions -H \"Content-Type: application/json\" -d '{\"prompt\": \"Hi\", \"model\": \"dev\"}' -v\n
    "},{"location":"contributing/development-environment/#messaging-integration","title":"Messaging Integration","text":"
gcloud pubsub topics publish test-kubeai-requests \\\n  --message='{\"path\":\"/v1/completions\", \"metadata\":{\"a\":\"b\"}, \"body\": {\"model\": \"dev\", \"prompt\": \"hi\"}}'\n\ngcloud pubsub subscriptions pull test-kubeai-responses-sub --auto-ack\n
    "},{"location":"contributing/documentation/","title":"Documentation","text":"

    We are grateful for anyone who takes the time to improve KubeAI documentation! In order to keep our docs clear and consistent we ask that you first read about the approach to documentation that we have standardized on...

    "},{"location":"contributing/documentation/#read-before-writing","title":"Read before writing!","text":"

    The KubeAI approach to documentation is loosely inspired by the Diataxis method.

    TLDR on how KubeAI docs are organized:

    • Installation: How-to guides specific to installing KubeAI.
    • How To: Directions that guide the reader through a problem or towards a result. How-to guides are goal-oriented. They assume the user is familiar with general concepts, tools, and has already installed KubeAI.
    • Concepts: A reflective explanation of KubeAI topics with a focus on giving the reader an understanding of the why.
    • Tutorials: Learning oriented experiences. Lessons that often guide a user from beginning to end. The goal is to help the reader learn something (compared to a how-to guide that is focused on helping the reader do something).
    • Contributing: The docs in here differ from the rest of the docs by audience: these docs are for anyone who will be contributing code or docs to the KubeAI project.
    "},{"location":"contributing/documentation/#how-to-serve-kubeaiorg-locally","title":"How to serve kubeai.org locally","text":"

    Make sure you have python3 installed and run:

    make docs\n
    "},{"location":"contributing/release-process/","title":"Release Process","text":"

    This document describes the process for releasing a new version of the project.

    "},{"location":"contributing/release-process/#docs","title":"Docs","text":"

    The docs are automatically published whenever a PR updates the docs and the PR is merged into the main branch. The docs are published to the gh-pages branch, which is the source for the Github Pages site.

    "},{"location":"contributing/release-process/#docker-images","title":"Docker images","text":"

    The Docker image latest tag always points to the latest released version. The main tag points to the latest commit on the main branch.

    If you push a tag vX.Y.Z to the repository, the Docker image with the tag vX.Y.Z is built and pushed to Docker Hub. Afterwards, the latest tag is updated to point to the new version.

    "},{"location":"contributing/release-process/#helm-chart","title":"Helm Chart","text":"

    The Helm chart only gets released when a git tag is pushed to the repository with the format helm-v*.

    The appVersion in the Helm chart does not have to point to the latest released version. This allows us to first publish a new version of the Docker image without updating the Helm chart. The Helm chart is updated when we are ready to release a new version.

This is important when a new appVersion isn't compatible with the current Helm chart. In those cases, we can first merge the PR, thoroughly test, release a new container image, and then in a separate PR update the Helm chart and the appVersion.

    "},{"location":"how-to/architect-for-multitenancy/","title":"Architect for Multitenancy","text":"

    KubeAI can support multitenancy by filtering the models that it serves via Kubernetes label selectors. These label selectors can be applied when accessing any of the OpenAI-compatible endpoints through the X-Label-Selector HTTP header and will match on labels specified on the kind: Model objects. The pattern is similar to using a WHERE clause in a SQL query.

    Example Models:

    kind: Model\nmetadata:\n  name: llama-3.2\n  labels:\n    tenancy: public\nspec:\n# ...\n---\nkind: Model\nmetadata:\n  name: custom-private-model\n  labels:\n    tenancy: org-abc\nspec:\n# ...\n

    Example HTTP requests:

    # The returned list of models will be filtered.\ncurl http://$KUBEAI_ENDPOINT/openai/v1/models \\\n    -H \"X-Label-Selector: tenancy in (org-abc, public)\"\n\n# When running inference, if the label selector does not match\n# a 404 will be returned.\ncurl http://$KUBEAI_ENDPOINT/openai/v1/completions \\\n    -H \"Content-Type: application/json\" \\\n    -H \"X-Label-Selector: tenancy in (org-abc, public)\" \\\n    -d '{\"prompt\": \"Hi\", \"model\": \"llama-3.2\"}'\n

    The header value can be any valid Kubernetes label selector. Some examples include:

    X-Label-Selector: tenancy=org-abc\nX-Label-Selector: tenancy in (org-abc, public)\nX-Label-Selector: tenancy!=private\n

    Multiple X-Label-Selector headers can be specified in the same HTTP request and will be treated as a logical AND. For example, the following request will only match Models that have a label tenant: org-abc and user: sam:

    curl http://$KUBEAI_ENDPOINT/openai/v1/completions \\\n    -H \"Content-Type: application/json\" \\\n    -H \"X-Label-Selector: tenant=org-abc\" \\\n    -H \"X-Label-Selector: user=sam\" \\\n    -d '{\"prompt\": \"Hi\", \"model\": \"llama-3.2\"}'\n
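
The same filtering works from the OpenAI client libraries by attaching the header to every request. A minimal sketch using the Python client's default_headers option (endpoint and selector values are illustrative):

from openai import OpenAI\n\n# The X-Label-Selector header is sent with every request made by this client.\nclient = OpenAI(\n    api_key=\"ignored\",\n    base_url=\"http://kubeai/openai/v1\",\n    default_headers={\"X-Label-Selector\": \"tenancy in (org-abc, public)\"},\n)\n\n# Only Models matching the selector will be listed or served.\nprint([m.id for m in client.models.list()])\n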

    Example architecture:

    "},{"location":"how-to/build-models-into-containers/","title":"Build models into containers","text":"

In this guide we will preload an LLM into a custom-built Ollama serving image. You can follow the same steps for other models and other serving engines.

    Define some values

    export MODEL_URL=ollama://qwen2:0.5b\n\n# Customize with your own image repo.\nexport IMAGE=us-central1-docker.pkg.dev/substratus-dev/default/ollama-builtin-qwen2-05b:latest\n

Build and push the image. Note: building (downloading the base image & model) and pushing (uploading the image & model) can take a while depending on the size of the model.

    git clone https://github.com/substratusai/kubeai\ncd ./kubeai/examples/ollama-builtin\n\ndocker build --build-arg MODEL_URL=$MODEL_URL -t $IMAGE .\ndocker push $IMAGE\n

    Create a model manifest & apply into a cluster with KubeAI installed. NOTE: The only difference between an built-in model image and otherwise is the addition of the image: field.

    kubectl apply -f - << EOF\napiVersion: kubeai.org/v1\nkind: Model\nmetadata:\n  name: builtin-model-example\nspec:\n  features: [\"TextGeneration\"]\n  owner: alibaba\n  image: $IMAGE # <-- The image with model built-in\n  url: \"$MODEL_URL\"\n  engine: OLlama\n  resourceProfile: cpu:1\nEOF\n
    "},{"location":"how-to/cache-models-with-aws-efs/","title":"Cache models with AWS EFS","text":"

    KubeAI can manage model caches. AWS EFS is supported as a pluggable backend store.

    Follow the EKS install guide.

    "},{"location":"how-to/cache-models-with-aws-efs/#1-create-an-efs-file-system","title":"1. Create an EFS File System","text":"

    Set environment variables to match your environment.

    export CLUSTER_NAME=\"cluster-with-karpenter\"\nexport CLUSTER_REGION=\"us-west-2\"\n

    Create an EFS file system in the same VPC as your EKS cluster.

    vpc_id=$(aws eks describe-cluster \\\n    --name $CLUSTER_NAME \\\n    --query \"cluster.resourcesVpcConfig.vpcId\" \\\n    --output text)\n\ncidr_range=$(aws ec2 describe-vpcs \\\n    --vpc-ids $vpc_id \\\n    --query \"Vpcs[].CidrBlock\" \\\n    --output text \\\n    --region ${CLUSTER_REGION})\n\nsecurity_group_id=$(aws ec2 create-security-group \\\n    --group-name MyEfsSecurityGroup \\\n    --description \"My EFS security group\" \\\n    --vpc-id $vpc_id \\\n    --output text)\n\naws ec2 authorize-security-group-ingress \\\n    --group-id $security_group_id \\\n    --protocol tcp \\\n    --port 2049 \\\n    --cidr $cidr_range\n\nfile_system_id=$(aws efs create-file-system \\\n    --region ${CLUSTER_REGION} \\\n    --performance-mode generalPurpose \\\n    --query 'FileSystemId' \\\n    --output text)\n

    Expose the EFS file system to the subnets used by your EKS cluster.

    SUBNETS=$(eksctl get cluster --region us-west-2 ${CLUSTER_NAME} -o json | jq -r '.[0].ResourcesVpcConfig.SubnetIds[]')\n\nwhile IFS= read -r subnet; do\n    echo \"Creating EFS mount target in $subnet\"\n    aws efs create-mount-target --file-system-id $file_system_id \\\n      --subnet-id $subnet --security-groups $security_group_id --output text\ndone <<< \"$SUBNETS\"\n

    "},{"location":"how-to/cache-models-with-aws-efs/#2-install-the-efs-csi-driver","title":"2. Install the EFS CSI driver","text":"
    export ROLE_NAME=AmazonEKS_EFS_CSI_DriverRole\neksctl create iamserviceaccount \\\n    --name efs-csi-controller-sa \\\n    --namespace kube-system \\\n    --cluster ${CLUSTER_NAME} \\\n    --role-name ${ROLE_NAME} \\\n    --role-only \\\n    --attach-policy-arn arn:aws:iam::aws:policy/service-role/AmazonEFSCSIDriverPolicy \\\n    --approve\n\nTRUST_POLICY=$(aws iam get-role --role-name ${ROLE_NAME} \\\n    --query 'Role.AssumeRolePolicyDocument' --output json | \\\n    sed -e 's/efs-csi-controller-sa/efs-csi-*/' -e 's/StringEquals/StringLike/')\n\naws iam update-assume-role-policy --role-name ${ROLE_NAME} --policy-document \"$TRUST_POLICY\"\n\n# Get the role ARN\nEFS_ROLE_ARN=$(aws iam get-role --role-name AmazonEKS_EFS_CSI_DriverRole \\\n  --query 'Role.Arn' --output text)\n\naws eks create-addon --cluster-name $CLUSTER_NAME --addon-name aws-efs-csi-driver \\\n  --service-account-role-arn $EFS_ROLE_ARN\n

Wait for the EKS add-on to become active.

    aws eks wait addon-active --cluster-name $CLUSTER_NAME \\\n  --addon-name aws-efs-csi-driver\n
    Verify that the EFS CSI driver is running.

    kubectl get daemonset efs-csi-node -n kube-system\n

Create a storage class that uses EFS dynamic provisioning mode.

    kubectl apply -f - <<EOF\nkind: StorageClass\napiVersion: storage.k8s.io/v1\nmetadata:\n  name: efs-sc\nprovisioner: efs.csi.aws.com\nparameters:\n  provisioningMode: efs-ap\n  fileSystemId: \"${file_system_id}\"\n  directoryPerms: \"700\"\nEOF\n

Make sure file_system_id matches the EFS file system ID created in the first step.

    "},{"location":"how-to/cache-models-with-aws-efs/#3-configure-kubeai-with-the-efs-cache-profile","title":"3. Configure KubeAI with the EFS cache profile","text":"

You can skip this step if you've already installed KubeAI using the EKS Helm values file (values-eks.yaml).

    Configure KubeAI with the efs-dynamic cache profile.

    helm upgrade --install kubeai kubeai/kubeai \\\n  --reuse-values -f - <<EOF\ncacheProfiles:\n  efs-dynamic:\n    sharedFilesystem:\n      storageClassName: \"efs-sc\"\n  efs-static:\n    sharedFilesystem:\n      persistentVolumeName: \"efs-pv\"\nEOF\n

    "},{"location":"how-to/cache-models-with-aws-efs/#4-configure-a-model-to-use-the-efs-cache","title":"4. Configure a model to use the EFS cache","text":"

    Apply a Model with cacheProfile set to efs-dynamic.

NOTE: If you already installed the models chart, you will need to edit your values file and run helm upgrade.

    helm install kubeai-models kubeai/models -f - <<EOF\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\n    cacheProfile: efs-dynamic\nEOF\n

    Wait for the Model to be fully cached.

    kubectl wait --timeout 10m --for=jsonpath='{.status.cache.loaded}'=true model/llama-3.1-8b-instruct-fp8-l4\n

This model will now be loaded from EFS when it is served.

    "},{"location":"how-to/cache-models-with-aws-efs/#troubleshooting","title":"Troubleshooting","text":""},{"location":"how-to/cache-models-with-aws-efs/#mountvolesetup-failed-for-volume-pvc-deadline-exceeded","title":"MountVole.SetUp failed for volume pvc deadline exceeded","text":"

    kubectl get events may show an error like this:

    8s          Warning   FailedMount             pod/load-cache-llama-3.1-8b-instruct-fp8-l4-w7thh      MountVolume.SetUp failed for volume \"pvc-ceedb563-1e68-47fa-9d12-c697ae153d04\" : rpc error: code = DeadlineExceeded desc = context deadline exceeded\n

    Checking the logs of the EFS CSI DaemonSet may show an error like this:

    kubectl logs -f efs-csi-node-4n75c -n kube-system\nOutput: Could not start amazon-efs-mount-watchdog, unrecognized init system \"aws-efs-csi-dri\"\nMount attempt 1/3 failed due to timeout after 15 sec, wait 0 sec before next attempt.\nMount attempt 2/3 failed due to timeout after 15 sec, wait 0 sec before next attempt.\nb'mount.nfs4: Connection timed out'\n

This likely means your mount target isn't set up correctly. Possibly the security group is not allowing traffic from the EKS cluster.

    "},{"location":"how-to/cache-models-with-aws-efs/#model-loading-job","title":"Model Loading Job","text":"

    Check to see if there is an ongoing model loader Job.

    kubectl get jobs\n
    "},{"location":"how-to/cache-models-with-gcp-filestore/","title":"Cache models with GCP Filestore","text":"

    KubeAI can manage model caches. GCP Filestore is supported as a pluggable backend store.

    Follow the GKE install guide.

    Ensure that the Filestore API is enabled.

    gcloud services enable file.googleapis.com\n

    Apply a Model with the cache profile set to standard-filestore (defined in the reference GKE Helm values file).

    TIP: If you want to use `premium-filestore` you will need to ensure you have quota.

    Open the cloud console quotas page: https://console.cloud.google.com/iam-admin/quotas. Make sure your project is selected in the top left.

Ensure that you have at least 2.5 TiB of PremiumStorageGbPerRegion quota in the region where your cluster is deployed.

NOTE: If you already installed the models chart, you will need to edit your values file and run helm upgrade.

    helm install kubeai-models kubeai/models -f - <<EOF\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\n    cacheProfile: standard-filestore\nEOF\n

    Wait for the Model to be fully cached. This may take a while if the Filestore instance needs to be created.

    kubectl wait --timeout 10m --for=jsonpath='{.status.cache.loaded}'=true model/llama-3.1-8b-instruct-fp8-l4\n

    This model will now be loaded from Filestore when it is served.

    "},{"location":"how-to/cache-models-with-gcp-filestore/#troubleshooting","title":"Troubleshooting","text":""},{"location":"how-to/cache-models-with-gcp-filestore/#filestore-csi-driver","title":"Filestore CSI Driver","text":"

Ensure that the Filestore CSI driver is enabled by checking for the existence of its Kubernetes storage classes. If they are not found, follow the GCP guide for enabling the CSI driver.

    kubectl get storageclass standard-rwx premium-rwx\n
    "},{"location":"how-to/cache-models-with-gcp-filestore/#persistentvolumes","title":"PersistentVolumes","text":"

    Check the PersistentVolumeClaim (that should be created by KubeAI).

    kubectl describe pvc shared-model-cache-\n
    Example: Out-of-quota error
      Warning  ProvisioningFailed    11m (x26 over 21m)  filestore.csi.storage.gke.io_gke-50826743a27a4d52bf5b-7fac-9607-vm_b4bdb2ec-b58b-4363-adec-15c270a14066  failed to provision volume with StorageClass \"premium-rwx\": rpc error: code = ResourceExhausted desc = googleapi: Error 429: Quota limit 'PremiumStorageGbPerRegion' has been exceeded. Limit: 0 in region us-central1.\nDetails:\n[\n  {\n    \"@type\": \"type.googleapis.com/google.rpc.QuotaFailure\",\n    \"violations\": [\n      {\n        \"description\": \"Quota 'PremiumStorageGbPerRegion' exhausted. Limit 0 in region us-central1\",\n        \"subject\": \"project:819220466562\"\n      }\n    ]\n  }\n]\n

    Check to see if the PersistentVolume has been fully provisioned.

    kubectl get pv\n# Find name of corresponding pv...\nkubectl describe pv <name>\n
    "},{"location":"how-to/cache-models-with-gcp-filestore/#model-loading-job","title":"Model Loading Job","text":"

    Check to see if there is an ongoing model loader Job.

    kubectl get jobs\n
    "},{"location":"how-to/configure-autoscaling/","title":"Configure autoscaling","text":"

    This guide will cover how to configure KubeAI autoscaling parameters.

    "},{"location":"how-to/configure-autoscaling/#system-settings","title":"System Settings","text":"

    KubeAI administrators can define system-wide autoscaling settings by setting the following Helm values (for the kubeai/kubeai chart):

    Example:

    # helm-values.yaml\nmodelAutoscaling:\n  interval: 15s\n  timeWindow: 10m\n# ...\n
    "},{"location":"how-to/configure-autoscaling/#model-settings","title":"Model Settings","text":"

    The following settings can be configured on a model-by-model basis.

    "},{"location":"how-to/configure-autoscaling/#model-settings-helm","title":"Model settings: helm","text":"

    If you are managing models via the kubeai/models Helm chart, you can use:

    # helm-values.yaml\ncatalog:\n  model-a:\n    # ...\n    minReplicas: 1\n    maxReplicas: 9\n    targetRequests: 250\n    scaleDownDelaySeconds: 45\n  model-b:\n    # ...\n    disableAutoscaling: true\n# ...\n

    Re-running helm upgrade with these additional parameters will update model settings in the cluster.

    "},{"location":"how-to/configure-autoscaling/#model-settings-kubectl","title":"Model settings: kubectl","text":"

    You can also specify the autoscaling profile directly via the Models custom resource in the Kubernetes API:

    apiVersion: kubeai.org/v1\nkind: Model\nmetadata:\n  name: my-model\nspec:\n  # ...\n  minReplicas: 1\n  maxReplicas: 9\n  targetRequests: 250\n  scaleDownDelaySeconds: 45\n

    If you are already managing models using Model manifest files, you can make the update to your file and reapply it using kubectl apply -f <filename>.yaml.

    "},{"location":"how-to/configure-embedding-models/","title":"Configure Embedding Models","text":"

    KubeAI supports the following engines for text embedding models:

    • Infinity
    • vLLM
    • Ollama

Infinity supports any HuggingFace models listed as text-embedding. See the embedding, reranking, or CLIP models on Hugging Face for reference.

    "},{"location":"how-to/configure-embedding-models/#install-baaibge-small-en-v15-model-using-infinity","title":"Install BAAI/bge-small-en-v1.5 model using Infinity","text":"

    Create a file named kubeai-models.yaml with the following content:

    catalog:\n  bge-embed-text-cpu:\n    enabled: true\n    features: [\"TextEmbedding\"]\n    owner: baai\n    url: \"hf://BAAI/bge-small-en-v1.5\"\n    engine: Infinity\n    resourceProfile: cpu:1\n    minReplicas: 1\n

    Apply the kubeai-models helm chart:

    helm install kubeai-models kubeai/models -f ./kubeai-models.yaml\n

    Once the pod is ready, you can use the OpenAI Python SDK to interact with the model:

    from openai import OpenAI\n# Assumes port-forward of kubeai service to localhost:8000.\nclient = OpenAI(api_key=\"ignored\", base_url=\"http://localhost:8000/openai/v1\")\nresponse = client.embeddings.create(\n    input=\"Your text goes here.\",\n    model=\"bge-embed-text-cpu\"\n)\n
    "},{"location":"how-to/configure-resource-profiles/","title":"Configure resource profiles","text":"

    This guide will cover modifying preconfigured resource profiles and adding your own.

    "},{"location":"how-to/configure-resource-profiles/#modifying-preconfigured-resource-profiles","title":"Modifying preconfigured resource profiles","text":"

    The KubeAI helm chart comes with preconfigured resource profiles for common resource types such as NVIDIA L4 GPUs. You can view these profiles in the default helm values file.

These profiles usually require some additional settings based on the cluster/cloud that KubeAI is installed into. You can modify a resource profile by setting custom helm values and running helm install or helm upgrade. For example, if you are installing KubeAI on GKE you will need to set GKE-specific node selectors:

    # helm-values.yaml\nresourceProfiles:\n  nvidia-gpu-l4:\n    nodeSelector:\n      cloud.google.com/gke-accelerator: \"nvidia-l4\"\n      cloud.google.com/gke-spot: \"true\"\n

    NOTE: See the cloud-specific installation guide for a comprehensive list of settings.

    "},{"location":"how-to/configure-resource-profiles/#adding-additional-resource-profiles","title":"Adding additional resource profiles","text":"

    If the preconfigured resource profiles do not meet your needs you can add additional profiles by appending to the .resourceProfiles object in the helm values file you use to install KubeAI.

    # helm-values.yaml\nresourceProfiles:\n  my-custom-gpu:\n    imageName: \"optional-custom-image-name\"\n    nodeSelector:\n      my-custom-node-pool: \"some-value\"\n    limits:\n      custom.com/gpu: \"1\"\n    requests:\n      custom.com/gpu: \"1\"\n      cpu: \"3\"\n      memory: \"12Gi\"\n    runtimeClassName: \"my-custom-runtime-class\"\n

    If you need to run custom model server images on your resource profile, make sure to also add those in the modelServers section:

    # helm-values.yaml\nmodelServers:\n  VLLM:\n    images:\n      optional-custom-image-name: \"my-repo/my-vllm-image:v1.2.3\"\n  OLlama:\n    images:\n      optional-custom-image-name: \"my-repo/my-ollama-image:v1.2.3\"\n
    "},{"location":"how-to/configure-resource-profiles/#next","title":"Next","text":"

    See the guide on how to install models which includes how to configure the resource profile to use for a given model.

    "},{"location":"how-to/configure-speech-to-text/","title":"Configure speech-to-text","text":"

    KubeAI provides a Speech to Text endpoint that can be used to transcribe audio files. This guide will walk you through the steps to enable this feature.

    "},{"location":"how-to/configure-speech-to-text/#enable-speech-to-text-model","title":"Enable Speech to Text model","text":"

    You can create new models by creating a Model CRD object or by enabling a model from the model catalog.

    "},{"location":"how-to/configure-speech-to-text/#enable-from-model-catalog","title":"Enable from model catalog","text":"

    KubeAI provides predefined models in the kubeai/models Helm chart. To enable the Speech to Text model, you can set the enabled flag to true in your values file.

    # models-helm-values.yaml\ncatalog:\n  faster-whisper-medium-en-cpu:\n    enabled: true\n    minReplicas: 1\n
    "},{"location":"how-to/configure-speech-to-text/#enable-by-creating-model-crd","title":"Enable by creating Model CRD","text":"

    You can also create a Model CRD object to enable the Speech to Text model. Here is an example of a Model CRD object for the Speech to Text model:

    apiVersion: kubeai.org/v1\nkind: Model\nmetadata:\n  name: faster-whisper-medium-en-cpu\nspec:\n  features: [SpeechToText]\n  owner: Systran\n  url: hf://Systran/faster-whisper-medium.en\n  engine: FasterWhisper\n  resourceProfile: cpu:1\n
    "},{"location":"how-to/configure-speech-to-text/#usage","title":"Usage","text":"

The Speech to Text endpoint is available at /openai/v1/audio/transcriptions.

Example usage with curl:

    curl -L -o kubeai.mp4 https://github.com/user-attachments/assets/711d1279-6af9-4c6c-a052-e59e7730b757\ncurl http://localhost:8000/openai/v1/audio/transcriptions \\\n  -F \"file=@kubeai.mp4\" \\\n  -F \"language=en\" \\\n  -F \"model=faster-whisper-medium-en-cpu\"\n
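
The same request can also be made with the OpenAI Python SDK; a minimal sketch assuming the same port-forward and the kubeai.mp4 file downloaded above:

from openai import OpenAI\n\n# Assumes: kubectl port-forward svc/kubeai 8000:80\nclient = OpenAI(api_key=\"ignored\", base_url=\"http://localhost:8000/openai/v1\")\n\nwith open(\"kubeai.mp4\", \"rb\") as f:\n    transcript = client.audio.transcriptions.create(\n        model=\"faster-whisper-medium-en-cpu\",\n        file=f,\n        language=\"en\",\n    )\nprint(transcript.text)\n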
    "},{"location":"how-to/install-models/","title":"Install models","text":"

    This guide provides instructions on how to configure KubeAI models.

    "},{"location":"how-to/install-models/#installing-models-with-helm","title":"Installing models with helm","text":"

    KubeAI provides a chart that contains preconfigured models.

    "},{"location":"how-to/install-models/#preconfigured-models-with-helm","title":"Preconfigured models with helm","text":"

    When you are defining Helm values for the kubeai/models chart you can install a preconfigured Model by setting enabled: true. You can view a list of all preconfigured models in the chart's default values file.

    # helm-values.yaml\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\n

    You can optionally override preconfigured settings, for example, resourceProfile:

    # helm-values.yaml\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\n    resourceProfile: nvidia-gpu-l4:2 # Require \"2 NVIDIA L4 GPUs\"\n
    "},{"location":"how-to/install-models/#custom-models-with-helm","title":"Custom models with helm","text":"

If you prefer to add a custom model via the same Helm chart you use to install KubeAI models, you can add your custom model entry into the .catalog array of your existing values file for the kubeai/models Helm chart:

    # helm-values.yaml\ncatalog:\n  my-custom-model-name:\n    enabled: true\n    features: [\"TextEmbedding\"]\n    owner: me\n    url: \"hf://me/my-custom-model\"\n    resourceProfile: CPU:1\n
    "},{"location":"how-to/install-models/#installing-models-with-kubectl","title":"Installing models with kubectl","text":"

    You can add your own model by defining a Model yaml file and applying it using kubectl apply -f model.yaml.

    If you have a running cluster with KubeAI installed you can inspect the schema for a Model using kubectl explain:

    kubectl explain models\nkubectl explain models.spec\nkubectl explain models.spec.engine\n
    "},{"location":"how-to/install-models/#programmatically-installing-models","title":"Programmatically installing models","text":"

    See the examples.
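
For example, here is a hedged sketch that creates a Model custom resource with the official Kubernetes Python client; the model name and spec values are illustrative and can be swapped for any fields shown in the Kubernetes API reference:

from kubernetes import client, config\n\nconfig.load_kube_config()  # or config.load_incluster_config() inside a Pod\n\n# Illustrative Model object; adjust name, url, engine, and resourceProfile.\nmodel = {\n    \"apiVersion\": \"kubeai.org/v1\",\n    \"kind\": \"Model\",\n    \"metadata\": {\"name\": \"example-qwen2-500m-cpu\"},\n    \"spec\": {\n        \"features\": [\"TextGeneration\"],\n        \"url\": \"ollama://qwen2:0.5b\",\n        \"engine\": \"OLlama\",\n        \"resourceProfile\": \"cpu:1\",\n    },\n}\n\nclient.CustomObjectsApi().create_namespaced_custom_object(\n    group=\"kubeai.org\",\n    version=\"v1\",\n    namespace=\"default\",\n    plural=\"models\",\n    body=model,\n)\n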

    "},{"location":"how-to/install-models/#feedback-welcome-a-model-management-ui","title":"Feedback welcome: A model management UI","text":"

    We are considering adding a UI for managing models in a running KubeAI instance. Give the GitHub Issue a thumbs up if you would be interested in this feature.

    "},{"location":"installation/eks/","title":"Install on EKS","text":"TIP: Make sure you have enough GPU quota in your AWS account.

    The default quotas for GPU instances are often 0. You will need to request a quota increase for the GPU instances you want to use.

    The following quotas may require an increase if you wish to use GPUs in your EKS cluster:
    • All G and VT Spot Instance Requests
    • All P5 Spot Instance Requests
    • All P4, P3 and P2 Spot Instance Requests
    • Running Dedicated p4d Hosts
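
    If you prefer the CLI over the console, a quota increase can also be requested with the AWS CLI. This is only a sketch: the quota code below is an assumption for All G and VT Spot Instance Requests, so verify it in the Service Quotas console before running.

    # Quota code is assumed - confirm it under Service Quotas > Amazon EC2\naws service-quotas request-service-quota-increase \\\n  --service-code ec2 \\\n  --quota-code L-3819A6DF \\\n  --desired-value 8\n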

    "},{"location":"installation/eks/#1-create-eks-cluster-with-karpenter","title":"1. Create EKS cluster with Karpenter","text":"

    Set the environment variables used throughout this guide:

    export CLUSTER_NAME=\"cluster-with-karpenter\"\nexport AWS_DEFAULT_REGION=\"us-west-2\"\nexport K8S_VERSION=\"1.30\"\nexport GPU_AMI_ID=\"$(aws ssm get-parameter --name /aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2-gpu/recommended/image_id --query Parameter.Value --output text)\"\n

    Create the EKS cluster using eksctl:

    eksctl create cluster -f - <<EOF\n---\napiVersion: eksctl.io/v1alpha5\nkind: ClusterConfig\nmetadata:\n  name: \"${CLUSTER_NAME}\"\n  region: \"${AWS_DEFAULT_REGION}\"\n  version: \"${K8S_VERSION}\"\n  tags:\n    karpenter.sh/discovery: \"${CLUSTER_NAME}\" # here, it is set to the cluster name\n\niam:\n  withOIDC: true # required\n\nkarpenter:\n  version: '1.0.6' # Exact version must be specified\n\nmanagedNodeGroups:\n- instanceType: m5.large\n  amiFamily: AmazonLinux2\n  name: \"${CLUSTER_NAME}-m5-ng\"\n  desiredCapacity: 2\n  minSize: 1\n  maxSize: 10\nEOF\n

    "},{"location":"installation/eks/#2-configure-a-karpenter-gpu-nodepool","title":"2. Configure a Karpenter GPU NodePool","text":"

    Create the NodePool and EC2NodeClass objects:

    kubectl apply -f - <<EOF\napiVersion: karpenter.sh/v1\nkind: NodePool\nmetadata:\n  name: gpu\nspec:\n  template:\n    spec:\n      requirements:\n        - key: karpenter.sh/capacity-type\n          operator: In\n          values: [\"spot\", \"on-demand\"]\n        - key: karpenter.k8s.aws/instance-category\n          operator: In\n          values: [\"g\", \"p\"]\n      nodeClassRef:\n        group: karpenter.k8s.aws\n        kind: EC2NodeClass\n        name: gpu\n      expireAfter: 720h # 30 * 24h = 720h\n      taints:\n      - key: nvidia.com/gpu\n        value: \"true\"\n        effect: NoSchedule\n  limits:\n    cpu: 1000\n  disruption:\n    consolidationPolicy: WhenEmptyOrUnderutilized\n    consolidateAfter: 1m\n---\napiVersion: karpenter.k8s.aws/v1\nkind: EC2NodeClass\nmetadata:\n  name: gpu\nspec:\n  amiFamily: AL2 # Amazon Linux 2\n  role: \"eksctl-KarpenterNodeRole-${CLUSTER_NAME}\"\n  subnetSelectorTerms:\n    - tags:\n        karpenter.sh/discovery: \"${CLUSTER_NAME}\" # replace with your cluster name\n  securityGroupSelectorTerms:\n    - tags:\n        karpenter.sh/discovery: \"${CLUSTER_NAME}\" # replace with your cluster name\n  amiSelectorTerms:\n    - id: \"${GPU_AMI_ID}\" # <- GPU Optimized AMD AMI \n  blockDeviceMappings:\n    - deviceName: /dev/xvda\n      ebs:\n        volumeSize: 300Gi\n        volumeType: gp3\n        encrypted: true\nEOF\n

    Install the NVIDIA device plugin (needed for Karpenter nodes):

    kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.16.1/deployments/static/nvidia-device-plugin.yml\n
    "},{"location":"installation/eks/#3-install-kubeai","title":"3. Install KubeAI","text":"

    Add KubeAI Helm repository.

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\n

    Make sure you have a Hugging Face Hub token set in your environment (the commands below use HF_TOKEN).

    export HF_TOKEN=\"replace-with-your-huggingface-token\"\n

    Install KubeAI with Helm.

    curl -L -O https://raw.githubusercontent.com/substratusai/kubeai/refs/heads/main/charts/kubeai/values-eks.yaml\n# Please review the values-eks.yaml file and edit the nodeSelectors if needed.\ncat values-eks.yaml\nhelm upgrade --install kubeai kubeai/kubeai \\\n    -f values-eks.yaml \\\n    --set secrets.huggingface.token=$HF_TOKEN \\\n    --wait\n
    "},{"location":"installation/eks/#3-optionally-configure-models","title":"3. Optionally configure models","text":"

    Optionally install preconfigured models.

    cat <<EOF > kubeai-models.yaml\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\nEOF\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-models.yaml\n
    "},{"location":"installation/gke/","title":"Install on GKE","text":"TIP: Make sure you have enough quota in your GCP project.

    Open the cloud console quotas page: https://console.cloud.google.com/iam-admin/quotas. Make sure your project is selected in the top left.

    You will need to verify that you have enough quota for the accelerators you want to use. Below is a table of common quotas you will have to increase depending on your needs.

    • Preemptible TPU v5 Lite Podslice chips: location <your-region>, minimum value 8
    • Preemptible NVIDIA L4 GPUs: location <your-region>, minimum value 2
    • GPUs (all regions): minimum value 2
    • CPUs (all regions): minimum value 24

    See the following screenshot examples of how these quotas appear in the console:

    "},{"location":"installation/gke/#1-create-a-cluster","title":"1. Create a cluster","text":""},{"location":"installation/gke/#option-gke-autopilot","title":"Option: GKE Autopilot","text":"

    Create an Autopilot cluster (replace us-central1 with a region in which you have quota).

    gcloud container clusters create-auto cluster-1 \\\n    --location=us-central1\n
    "},{"location":"installation/gke/#option-gke-standard","title":"Option: GKE Standard","text":"

    TODO: Reference gcloud commands for creating a GKE standard cluster.

    "},{"location":"installation/gke/#2-install-kubeai","title":"2. Install KubeAI","text":"

    Add KubeAI Helm repository.

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\n

    Make sure you have a HuggingFace Hub token set in your environment (HUGGING_FACE_HUB_TOKEN).
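
    For example (replace the placeholder with your own token):

    export HUGGING_FACE_HUB_TOKEN=\"replace-with-your-huggingface-token\"\n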

    Install KubeAI with Helm.

    curl -L -O https://raw.githubusercontent.com/substratusai/kubeai/refs/heads/main/charts/kubeai/values-gke.yaml\nhelm upgrade --install kubeai kubeai/kubeai \\\n    -f values-gke.yaml \\\n    --set secrets.huggingface.token=$HUGGING_FACE_HUB_TOKEN \\\n    --wait\n
    "},{"location":"installation/gke/#3-optionally-configure-models","title":"3. Optionally configure models","text":"

    Optionally install preconfigured models.

    cat <<EOF > kubeai-models.yaml\ncatalog:\n  llama-3.1-8b-instruct-fp8-l4:\n    enabled: true\nEOF\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-models.yaml\n
    "},{"location":"reference/kubernetes-api/","title":"Kubernetes API","text":""},{"location":"reference/kubernetes-api/#packages","title":"Packages","text":"
    • kubeai.org/v1
    "},{"location":"reference/kubernetes-api/#kubeaiorgv1","title":"kubeai.org/v1","text":"

    Package v1 contains API Schema definitions for the kubeai v1 API group

    "},{"location":"reference/kubernetes-api/#resource-types","title":"Resource Types","text":"
    • Model
    "},{"location":"reference/kubernetes-api/#model","title":"Model","text":"

    Model resources define the ML models that will be served by KubeAI.

    Field Description Default Validation apiVersion string kubeai.org/v1 kind string Model metadata ObjectMeta Refer to Kubernetes API documentation for fields of metadata. spec ModelSpec status ModelStatus"},{"location":"reference/kubernetes-api/#modelfeature","title":"ModelFeature","text":"

    Underlying type: string

    Validation: - Enum: [TextGeneration TextEmbedding SpeechToText]

    Appears in: - ModelSpec

    "},{"location":"reference/kubernetes-api/#modelspec","title":"ModelSpec","text":"

    ModelSpec defines the desired state of Model.

    Appears in: - Model

    • url (string, required): URL of the model to be served. Currently only the following formats are supported: for VLLM & FasterWhisper engines: \"hf:///\"; for OLlama engine: \"ollama://
    • features (ModelFeature array): Features that the model supports. Dictates the APIs that are available for the model. Enum: [TextGeneration TextEmbedding SpeechToText]
    • engine (string, required): Engine to be used for the server process. Enum: [OLlama VLLM FasterWhisper Infinity]
    • resourceProfile (string): ResourceProfile required to serve the model. Use the format \":\". Example: \"nvidia-gpu-l4:2\" - 2x NVIDIA L4 GPUs. Must be a valid ResourceProfile defined in the system config.
    • cacheProfile (string): CacheProfile to be used for caching model artifacts. Must be a valid CacheProfile defined in the system config.
    • image (string): Image to be used for the server process. Will be set from ResourceProfile + Engine if not specified.
    • args (string array): Args to be added to the server process.
    • env (object, keys: string, values: string): Env variables to be added to the server process.
    • replicas (integer): Replicas is the number of Pod replicas that should be actively serving the model. KubeAI will manage this field unless AutoscalingDisabled is set to true.
    • minReplicas (integer, optional, minimum 0): MinReplicas is the minimum number of Pod replicas that the model can scale down to. Note: 0 is a valid value.
    • maxReplicas (integer, minimum 1): MaxReplicas is the maximum number of Pod replicas that the model can scale up to. Empty value means no limit.
    • autoscalingDisabled (boolean): AutoscalingDisabled will stop the controller from managing the replicas for the Model. When disabled, metrics will not be collected on server Pods.
    • targetRequests (integer, default 100, minimum 1): TargetRequests is the average number of active requests that the autoscaler will try to maintain on model server Pods.
    • scaleDownDelaySeconds (integer, default 30): ScaleDownDelay is the minimum time before a deployment is scaled down after the autoscaling algorithm determines that it should be scaled down.
    • owner (string, optional): Owner of the model. Used solely to populate the owner field in the OpenAI /v1/models endpoint. DEPRECATED.
    "},{"location":"reference/kubernetes-api/#modelstatus","title":"ModelStatus","text":"

    ModelStatus defines the observed state of Model.

    Appears in: - Model

    Field Description Default Validation replicas ModelStatusReplicas cache ModelStatusCache"},{"location":"reference/kubernetes-api/#modelstatuscache","title":"ModelStatusCache","text":"

    Appears in: - ModelStatus

    Field Description Default Validation loaded boolean"},{"location":"reference/kubernetes-api/#modelstatusreplicas","title":"ModelStatusReplicas","text":"

    Appears in: - ModelStatus

    Field Description Default Validation all integer ready integer"},{"location":"reference/openai-api-compatibility/","title":"OpenAI API Compatibility","text":"

    KubeAI provides an OpenAI API compatibility layer.

    "},{"location":"reference/openai-api-compatibility/#general","title":"General:","text":""},{"location":"reference/openai-api-compatibility/#models","title":"Models","text":"
    GET /v1/models\n
    • Lists all kind: Model objects installed in the Kubernetes API Server.
    "},{"location":"reference/openai-api-compatibility/#inference","title":"Inference","text":""},{"location":"reference/openai-api-compatibility/#text-generation","title":"Text Generation","text":"
    POST /v1/chat/completions\nPOST /v1/completions\n
    • Supported for Models with .spec.features: [\"TextGeneration\"].
    "},{"location":"reference/openai-api-compatibility/#embeddings","title":"Embeddings","text":"
    POST /v1/embeddings\n
    • Supported for Models with .spec.features: [\"TextEmbedding\"].
    "},{"location":"reference/openai-api-compatibility/#speech-to-text","title":"Speech-to-Text","text":"
    POST /v1/audio/transcriptions\n
    • Supported for Models with .spec.features: [\"SpeechToText\"].
    "},{"location":"reference/openai-api-compatibility/#openai-client-libaries","title":"OpenAI Client libaries","text":"

    You can use the official OpenAI client libraries by setting the base_url to the KubeAI endpoint.

    For example, you can use the Python client like this:

    from openai import OpenAI\nclient = OpenAI(api_key=\"ignored\",\n                base_url=\"http://kubeai/openai/v1\")\nresponse = client.chat.completions.create(\n  model=\"gemma2-2b-cpu\",\n  messages=[\n    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n    {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n    {\"role\": \"assistant\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n    {\"role\": \"user\", \"content\": \"Where was it played?\"}\n  ]\n)\n

    "},{"location":"tutorials/langchain/","title":"Using LangChain with KubeAI","text":"

    LangChain makes it easy to build applications powered by LLMs. KubeAI makes it easy to deploy and manage LLMs at scale. Together, they make it easy to build and deploy private and secure AI applications.

    In this tutorial, we'll show you how to use LangChain with KubeAI's OpenAI compatible API. The beauty of KubeAI's OpenAI compatibility is that you can use KubeAI with any framework that supports OpenAI.

    "},{"location":"tutorials/langchain/#prerequisites","title":"Prerequisites","text":"

    A K8s cluster. You can use a local cluster like kind.

    "},{"location":"tutorials/langchain/#installing-kubeai-with-gemma-2b","title":"Installing KubeAI with Gemma 2B","text":"

    Run the following command to install KubeAI with Gemma 2B:

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\n\ncat <<EOF > models-helm-values.yaml\ncatalog:\n  gemma2-2b-cpu:\n    enabled: true\n    minReplicas: 1\nEOF\n\nhelm install kubeai kubeai/kubeai \\\n    --wait --timeout 10m\n\nhelm install kubeai-models kubeai/models \\\n    -f ./models-helm-values.yaml\n
    "},{"location":"tutorials/langchain/#using-langchain","title":"Using LangChain","text":"

    Install the required Python packages:

    pip install langchain_openai\n

    To make things easier, let's access the KubeAI OpenAI compatible API locally.

    Run the following command to port-forward to the KubeAI service:

    kubectl port-forward svc/kubeai 8000:80\n
    Now the KubeAI OpenAI compatible API is available at http://localhost:8000/openai from your local machine.
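
    To quickly verify the port-forward, you can list the installed models:

    curl http://localhost:8000/openai/v1/models\n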

    Let's create a simple Python script that uses LangChain and is connected to KubeAI.

    Create a file named test-langchain.py with the following content:

    from langchain_openai import ChatOpenAI\n\nllm = ChatOpenAI(\n    model=\"gemma2-2b-cpu\",\n    temperature=0,\n    max_tokens=None,\n    timeout=None,\n    max_retries=2,\n    api_key=\"thisIsIgnored\",\n    base_url=\"http://localhost:8000/openai/v1\",\n)\n\nmessages = [\n    (\n        \"system\",\n        \"You are a helpful assistant that translates English to French. Translate the user sentence.\",\n    ),\n    (\"human\", \"I love programming.\"),\n]\nai_msg = llm.invoke(messages)\nprint(ai_msg.content)\n

    Run the Python script:

    python test-langchain.py\n

    Notice that we set base_url to http://localhost:8000/openai/v1. This tells LangChain to use our local KubeAI OpenAI compatible API instead of the default OpenAI public API.

    If you run LangChain within the K8s cluster, you can use the following base_url instead: http://kubeai/openai/v1. So the code would look like this:

    llm = ChatOpenAI(\n    ...\n    base_url=\"http://kubeai/openai/v1\",\n)\n

    That's it! You've successfully used LangChain with KubeAI. Now you can build and deploy private and secure AI applications with ease.

    "},{"location":"tutorials/langtrace/","title":"Deploying KubeAI with Langtrace","text":"

    Langtrace is an open source tool that helps you with tracing and monitoring your AI calls. It includes a self-hosted UI that shows you, for example, the estimated cost of your LLM calls.

    KubeAI is used for deploying LLMs with an OpenAI compatible endpoint.

    In this tutorial you will learn how to deploy KubeAI and Langtrace end-to-end. Both KubeAI and Langtrace are installed in your Kubernetes cluster. No cloud services or external dependencies are required.

    If you don't have a K8s cluster yet, you can create one using kind or minikube.

    kind create cluster # OR: minikube start\n

    Install Langtrace:

    helm repo add langtrace https://Scale3-Labs.github.io/langtrace-helm-chart\nhelm repo update\nhelm install langtrace langtrace/langtrace\n

    Install KubeAI and wait for all components to be ready (may take a minute).

    helm repo add kubeai https://www.kubeai.org\nhelm repo update\nhelm install kubeai kubeai/kubeai --wait --timeout 10m\n

    Install the gemma2-2b-cpu model:

    cat <<EOF > kubeai-models.yaml\ncatalog:\n  gemma2-2b-cpu:\n    enabled: true\n    minReplicas: 1\nEOF\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-models.yaml\n

    Create a local Python environment and install dependencies:

    python3 -m venv .venv\nsource .venv/bin/activate\npip install langtrace-python-sdk openai\n

    Expose the KubeAI service to your local port:

    kubectl port-forward service/kubeai 8000:80\n

    Expose the Langtrace service to your local port:

    kubectl port-forward service/langtrace-app 3000:3000\n

    A Langtrace API key is required to use the Langtrace SDK, so let's get one by visiting your self-hosted Langtrace UI.

    Open your browser to http://localhost:3000, create a project and get the API keys for your langtrace project.

    In the Python script below, replace langtrace_api_key with your API key.

    Create a file named langtrace-example.py with the following content:

    # Replace this with your langtrace API key by visiting http://localhost:3000\nlangtrace_api_key=\"f7e003de19b9a628258531c17c264002e985604ca9fa561debcc85c41f357b09\"\n\nfrom langtrace_python_sdk import langtrace\nfrom langtrace_python_sdk.utils.with_root_span import with_langtrace_root_span\nfrom openai import OpenAI\n\n# Initialize Langtrace before making any OpenAI calls\nlangtrace.init(\n    api_key=langtrace_api_key,\n    api_host=\"http://localhost:3000/api/trace\",\n)\n\nbase_url = \"http://localhost:8000/openai/v1\"\nmodel = \"gemma2-2b-cpu\"\n\n@with_langtrace_root_span()\ndef example():\n    client = OpenAI(base_url=base_url, api_key=\"ignored-by-kubeai\")\n    response = client.chat.completions.create(\n        model=model,\n        messages=[\n            {\n                \"role\": \"system\",\n                \"content\": \"How many states of matter are there?\"\n            }\n        ],\n    )\n    print(response.choices[0].message.content)\n\nexample()\n

    Run the Python script:

    python3 langtrace-example.py\n

    Now you should see the trace in your Langtrace UI. Take a look by visiting http://localhost:3000.

    "},{"location":"tutorials/weaviate/","title":"Weaviate with local autoscaling embedding and generative models","text":"

    Weaviate is a vector search engine that can integrate seamlessly with KubeAI's embedding and generative models. This tutorial demonstrates how to deploy both KubeAI and Weaviate in a Kubernetes cluster, using KubeAI as the OpenAI endpoint for Weaviate.

    Why use KubeAI with Weaviate?

    • Security and privacy: KubeAI runs locally in your Kubernetes cluster, so your data never leaves your infrastructure.
    • Cost savings: KubeAI can run on your existing hardware, reducing the need to pay for hosted embedding and generative model APIs.

    This tutorial uses CPU-only models, so it should work even on your laptop.

    As you go through this tutorial, you will learn how to:

    • Deploy KubeAI with embedding and generative models
    • Install Weaviate and connect it to KubeAI
    • Import data into Weaviate
    • Perform semantic search using the embedding model
    • Perform generative search using the generative model
    "},{"location":"tutorials/weaviate/#prerequisites","title":"Prerequisites","text":"

    A Kubernetes cluster. You can use kind or minikube.

    kind create cluster\n
    "},{"location":"tutorials/weaviate/#kubeai-configuration","title":"KubeAI Configuration","text":"

    Let's start by deploying KubeAI with the models we want to use. The Nomic embedding model is used instead of text-embedding-ada-002, and Gemma 2 2B is used instead of gpt-3.5-turbo. You could choose to use bigger models depending on your available hardware.

    Create a file named kubeai-model-values.yaml with the following content:

    catalog:\n  text-embedding-ada-002:\n    enabled: true\n    minReplicas: 1\n    features: [\"TextEmbedding\"]\n    owner: nomic\n    url: \"ollama://nomic-embed-text\"\n    engine: OLlama\n    resourceProfile: cpu:1\n  gpt-3.5-turbo:\n    enabled: true\n    minReplicas: 1\n    features: [\"TextGeneration\"]\n    owner: google\n    url: \"ollama://gemma2:2b\"\n    engine: OLlama\n    resourceProfile: cpu:2\n

    Note: It's important that you name the models as text-embedding-ada-002 and gpt-3.5-turbo as Weaviate expects these names.

    Run the following command to deploy KubeAI and install the configured models:

    helm repo add kubeai https://www.kubeai.org && helm repo update\n\nhelm install kubeai kubeai/kubeai\n\nhelm install kubeai-models kubeai/models \\\n    -f ./kubeai-model-values.yaml\n

    "},{"location":"tutorials/weaviate/#weaviate-installation","title":"Weaviate Installation","text":"

    For this tutorial, we will use the Weaviate Helm chart to deploy Weaviate.

    Let's enable the text2vec-openai and generative-openai modules in Weaviate. We will also set the default vectorizer module to text2vec-openai.

    The apiKey is ignored in this case as we are using KubeAI as the OpenAI endpoint.

    Create a file named weaviate-values.yaml with the following content:

    modules:\n  text2vec-openai:\n    enabled: true\n    apiKey: thisIsIgnored\n  generative-openai:\n    enabled: true\n    apiKey: thisIsIgnored\n  default_vectorizer_module: text2vec-openai\nservice:\n  # To prevent Weaviate being exposed publicly\n  type: ClusterIP\n

    Install Weaviate by running the following command:

    helm repo add weaviate https://weaviate.github.io/weaviate-helm && helm repo update\n\nhelm install \\\n  \"weaviate\" \\\n  weaviate/weaviate \\\n  -f weaviate-values.yaml\n

    "},{"location":"tutorials/weaviate/#usage","title":"Usage","text":"

    We will be using Python to interact with Weaviate. The two use cases we will cover are:
    • Semantic search using the embedding model
    • Generative search using the generative model

    "},{"location":"tutorials/weaviate/#connectivity","title":"Connectivity","text":"

    The remaining steps require connectivity to the Weaviate service. However, Weaviate is not exposed publicly in this setup, so we set up local port forwards to access the Weaviate services.

    Set up local port forwards to the Weaviate services by running:

    kubectl port-forward svc/weaviate 8080:80\nkubectl port-forward svc/weaviate-grpc 50051:50051\n

    "},{"location":"tutorials/weaviate/#weaviate-client-python-setup","title":"Weaviate client Python Setup","text":"

    Create a virtual environment and install the Weaviate client:

    python -m venv .venv\nsource .venv/bin/activate\npip install -U weaviate-client requests\n

    "},{"location":"tutorials/weaviate/#collection-and-data-import","title":"Collection and Data Import","text":"

    Create a file named create-collection.py with the following content:

    import json\nimport weaviate\nimport requests\nfrom weaviate.classes.config import Configure\n\n# This works due to port forward in previous step\nwith weaviate.connect_to_local(port=8080, grpc_port=50051) as client:\n\n    client.collections.create(\n        \"Question\",\n        vectorizer_config=Configure.Vectorizer.text2vec_openai(\n                model=\"text-embedding-ada-002\",\n                base_url=\"http://kubeai/openai\",\n        ),\n        generative_config=Configure.Generative.openai(\n            model=\"gpt-3.5-turbo\",\n            base_url=\"http://kubeai/openai\",\n        ),\n    )\n\n    # import data\n    resp = requests.get('https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json')\n    data = json.loads(resp.text)  # Load data\n\n    question_objs = list()\n    for i, d in enumerate(data):\n        question_objs.append({\n            \"answer\": d[\"Answer\"],\n            \"question\": d[\"Question\"],\n            \"category\": d[\"Category\"],\n        })\n\n    questions = client.collections.get(\"Question\")\n    questions.data.insert_many(question_objs)\n    print(\"Data imported successfully\")\n

    Create a collection that uses KubeAI as the OpenAI endpoint:

    python create-collection.py\n
    You should see a message Data imported successfully.

    The collection is now created and data is imported. The vectors are generated by KubeAI and stored in Weaviate.

    "},{"location":"tutorials/weaviate/#semantic-search","title":"Semantic Search","text":"

    Now let's do semantic search, which uses the embeddings. Create a file named search.py with the following content:

    import weaviate\nfrom weaviate.classes.config import Configure\n\n# This works due to port forward in previous step\nwith weaviate.connect_to_local(port=8080, grpc_port=50051) as client:\n    questions = client.collections.get(\"Question\")\n    response = questions.query.near_text(\n        query=\"biology\",\n        limit=2\n    )\n    print(response.objects[0].properties)  # Inspect the first object\n

    Execute the python script:

    python search.py\n

    You should see the following output:

    {\n  \"answer\": \"DNA\",\n  \"question\": \"In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance\",\n  \"category\": \"SCIENCE\"\n}\n

    "},{"location":"tutorials/weaviate/#generative-search-rag","title":"Generative Search (RAG)","text":"

    Now let's do generative search, which uses the generative model (Text generation LLM). The generative model is run locally and managed by KubeAI.

    Create a file named generate.py with the following content:

    import weaviate\nfrom weaviate.classes.config import Configure\n\n# This works due to port forward in previous step\nwith weaviate.connect_to_local(port=8080, grpc_port=50051) as client:\n    questions = client.collections.get(\"Question\")\n\n    response = questions.generate.near_text(\n        query=\"biology\",\n        limit=2,\n        grouped_task=\"Write a tweet with emojis about these facts.\"\n    )\n\n    print(response.generated)  # Inspect the generated text\n

    Run the python script:

    python generate.py\n

    You should see something similar to this:

    \ud83e\uddec Watson & Crick cracked the code in 1953! \ud83e\udd2f They built a model of DNA, the blueprint of life. \ud83e\uddec \ud83e\udde0 Liver power! \ud83d\udcaa This organ keeps your blood sugar balanced by storing glucose as glycogen. \ud83e\ude78 #ScienceFacts #Biology

    "},{"location":"tutorials/weaviate/#conclusion","title":"Conclusion","text":"

    You've now successfully set up KubeAI with Weaviate for both embedding-based semantic search and generative tasks. You've also learned how to import data, perform searches, and generate content using KubeAI-managed models.

    "}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml index cc7c63c2..f962cedd 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -44,6 +44,10 @@ https://www.kubeai.org/how-to/build-models-into-containers/ 2024-10-25 + + https://www.kubeai.org/how-to/cache-models-with-aws-efs/ + 2024-10-25 + https://www.kubeai.org/how-to/cache-models-with-gcp-filestore/ 2024-10-25 diff --git a/sitemap.xml.gz b/sitemap.xml.gz index e53fd8112c32c7944371a6a17b1d1691119a848c..96d75eabfb4e70ebb580b973ab909efabce2d9ae 100644 GIT binary patch delta 484 zcmV#MfG%wS2FMzZGYFzdR-+eyqg*3n$T=U zZYn+eb1$-tzNxO~>qoep;rc0V+1DP%)i3LRp;O2A>_-d(D{FB?_)uQqa;af4$r;%= zw0D{&VTH4J)@Y;a^NZK02XYNCg%l0>0Rpj~gC>}QZ;hn`5LkS`jzFXCGarHkQ_qq_ z8(PTvX(T5=ZhtYsyEHxs#_Kt$qh!vA-w1&$A#Y)!iStE8?^h77FvM68(gLjyFwR!u z%%Rgia+B1vu?<99WEqST56MqxT+VdeSyG13PnOk454bW=`yYUGdV;Nf#AK%IINr6U zn*+fS#rz!vjWso?mG2%pj^w!N{tksFBHhO|5PFr|99_n-ptZRVWS4!^aT*R1?uL9B z8K|WYKk-PV#o$PY0Vi_+fS+Q=FiAX?>~V^|sEsv}uY*V|M!{f=H;b;*eQX*BX-hy& a2WYQ^i$VDZ>FpN(fvPu@w~2-`4FCYR2I&a^ delta 477 zcmV<30V4kJ1L*^h7JtQ8%Z{8d5WMeKjJP+RnQWvanqhDI1<40ArUxvHZMhrR{QcT9 zc{t2_)@(zPnS7bCOM?d zj<3}q622+5u4~OkXu`m?kG*0&qy?)jsE+m5s(Cu3kpUjF`+vAtubYg*8_YPjgys%% zQTgWIdy!@GU2{I3pW$?b^XIf>i#@>AZ}flQY14BK6ULF1wKySss?TsbwXm3+jAA;p zcbbB*!dX0Pv?=xF$y?MTxel0Ou7>ggBC(&NCJ52rjHMkUu=t2Qfkr(PJ_L!Toh6fY zw2<{-A}2s@F@Mp!Jl{!-*K=k^Dcli%Bt)`=d@l=4T&^kxzk+y$G1Y>Q7P$Ec(`+TJ z96J58w32oSkT(i2UfDtKJ1gmorwFfTt`N_QA}S} zq;fDi5@Nt$b`ao0GHR2_lT%J1<)Ss#NIBz{WJb|osyB +
    + + + + Cache models with AWS EFS + + + + +
  • + + + + + + + + + +
  • diff --git a/tutorials/langtrace/index.html b/tutorials/langtrace/index.html index 0645e785..51812a59 100644 --- a/tutorials/langtrace/index.html +++ b/tutorials/langtrace/index.html @@ -405,6 +405,27 @@ +
  • + + + + + Cache models with AWS EFS + + + + +
  • + + + + + + + + + +
  • diff --git a/tutorials/weaviate/index.html b/tutorials/weaviate/index.html index 97ae8e7c..5cca5a93 100644 --- a/tutorials/weaviate/index.html +++ b/tutorials/weaviate/index.html @@ -405,6 +405,27 @@ +
  • + + + + + Cache models with AWS EFS + + + + +
  • + + + + + + + + + +