From f0170c5d3a735fd0413efa1837e799b3b86a4b7e Mon Sep 17 00:00:00 2001
From: Raghotham Murthy <rsm@meta.com>
Date: Thu, 5 Jun 2025 02:03:21 -0700
Subject: [PATCH 1/2] chore: remove explicit sqlite dependency

---
 llama_stack/distribution/store/registry.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llama_stack/distribution/store/registry.py b/llama_stack/distribution/store/registry.py
index 0e84854c2..36a86a921 100644
--- a/llama_stack/distribution/store/registry.py
+++ b/llama_stack/distribution/store/registry.py
@@ -14,7 +14,6 @@ from llama_stack.distribution.datatypes import KVStoreConfig, RoutableObjectWith
 from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl
-from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
 
 logger = get_logger(__name__, category="core")
 
@@ -193,12 +192,13 @@ async def create_dist_registry(
     image_name: str,
 ) -> tuple[CachedDiskDistributionRegistry, KVStore]:
     # instantiate kvstore for storing and retrieving distribution metadata
-    if metadata_store:
-        dist_kvstore = await kvstore_impl(metadata_store)
-    else:
-        dist_kvstore = await kvstore_impl(
-            SqliteKVStoreConfig(db_path=(DISTRIBS_BASE_DIR / image_name / "kvstore.db").as_posix())
+    if not metadata_store:
+        metadata_store = KVStoreConfig(
+            type="sqlite",
+            namespace=None,
+            db_path=(DISTRIBS_BASE_DIR / image_name / "kvstore.db").as_posix()
         )
+    dist_kvstore = await kvstore_impl(metadata_store)
     dist_registry = CachedDiskDistributionRegistry(dist_kvstore)
     await dist_registry.initialize()
     return dist_registry, dist_kvstore

From f99ca37f91d43d86c450ba15a33a7f5988befc56 Mon Sep 17 00:00:00 2001
From: Raghotham Murthy <rsm@meta.com>
Date: Tue, 24 Jun 2025 12:20:04 -0700
Subject: [PATCH 2/2] chore(k8s): run vLLM on GPU nodes and install NVIDIA device plugin

---
 docs/source/distributions/k8s/apply.sh         |  8 ++++++++
 .../distributions/k8s/chroma-k8s.yaml.template |  3 +++
 .../k8s/postgres-k8s.yaml.template             |  3 +++
 .../distributions/k8s/stack-k8s.yaml.template  |  5 ++++-
 .../distributions/k8s/ui-k8s.yaml.template     |  2 ++
 .../distributions/k8s/vllm-k8s.yaml.template   | 18 ++++++++----------
 .../k8s/vllm-safety-k8s.yaml.template          | 12 ++----------
 7 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/docs/source/distributions/k8s/apply.sh b/docs/source/distributions/k8s/apply.sh
index 7ff7d28eb..b29e18d2a 100755
--- a/docs/source/distributions/k8s/apply.sh
+++ b/docs/source/distributions/k8s/apply.sh
@@ -16,6 +16,14 @@ export SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
 set -euo pipefail
 set -x
 
+# Install NVIDIA device plugin for GPU support
+echo "Installing NVIDIA device plugin..."
+kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/refs/tags/v0.17.2/deployments/static/nvidia-device-plugin.yml
+
+# Wait for NVIDIA device plugin to be ready
+echo "Waiting for NVIDIA device plugin to be ready..."
+kubectl wait --for=condition=ready pod -l name=nvidia-device-plugin-ds -n kube-system --timeout=300s
+
 envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
 envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
 envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
diff --git a/docs/source/distributions/k8s/chroma-k8s.yaml.template b/docs/source/distributions/k8s/chroma-k8s.yaml.template
index a2a5e3be3..2083a566b 100644
--- a/docs/source/distributions/k8s/chroma-k8s.yaml.template
+++ b/docs/source/distributions/k8s/chroma-k8s.yaml.template
@@ -5,6 +5,7 @@ metadata:
 spec:
   accessModes:
     - ReadWriteOnce
+  storageClassName: gp2
   resources:
     requests:
       storage: 20Gi
@@ -23,6 +24,8 @@ spec:
       labels:
         app: chromadb
     spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: cpu
       containers:
       - name: chromadb
         image: chromadb/chroma:latest
diff --git a/docs/source/distributions/k8s/postgres-k8s.yaml.template b/docs/source/distributions/k8s/postgres-k8s.yaml.template
index 86a765652..66e197b15 100644
--- a/docs/source/distributions/k8s/postgres-k8s.yaml.template
+++ b/docs/source/distributions/k8s/postgres-k8s.yaml.template
@@ -5,6 +5,7 @@ metadata:
 spec:
   accessModes:
     - ReadWriteOnce
+  storageClassName: gp2
   resources:
     requests:
       storage: 10Gi
@@ -23,6 +24,8 @@ spec:
       labels:
         app.kubernetes.io/name: postgres
     spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: cpu
       containers:
       - name: postgres
         image: postgres:15
diff --git a/docs/source/distributions/k8s/stack-k8s.yaml.template b/docs/source/distributions/k8s/stack-k8s.yaml.template
index 1cfc63ef5..44f69f69f 100644
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@@ -5,6 +5,7 @@ metadata:
 spec:
   accessModes:
     - ReadWriteOnce
+  storageClassName: gp2
   resources:
     requests:
       storage: 1Gi
@@ -25,9 +26,11 @@ spec:
         app.kubernetes.io/name: llama-stack
         app.kubernetes.io/component: server
     spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: cpu
       containers:
       - name: llama-stack
-        image: llamastack/distribution-remote-vllm:latest
+        image: llamastack/distribution-postgres-demo:latest
         imagePullPolicy: Always # since we have specified latest instead of a version
         env:
         - name: ENABLE_CHROMADB
diff --git a/docs/source/distributions/k8s/ui-k8s.yaml.template b/docs/source/distributions/k8s/ui-k8s.yaml.template
index ef1bf0c55..ca429c029 100644
--- a/docs/source/distributions/k8s/ui-k8s.yaml.template
+++ b/docs/source/distributions/k8s/ui-k8s.yaml.template
@@ -17,6 +17,8 @@ spec:
         app.kubernetes.io/name: llama-stack
         app.kubernetes.io/component: ui
     spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: cpu
       containers:
       - name: llama-stack-ui
         image: node:18-alpine
diff --git a/docs/source/distributions/k8s/vllm-k8s.yaml.template b/docs/source/distributions/k8s/vllm-k8s.yaml.template
index 6256cc7e1..3988066b2 100644
--- a/docs/source/distributions/k8s/vllm-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template
@@ -6,6 +6,7 @@ spec:
   accessModes:
     - ReadWriteOnce
   volumeMode: Filesystem
+  storageClassName: gp2
   resources:
     requests:
       storage: 50Gi
@@ -25,16 +26,8 @@ spec:
         app.kubernetes.io/name: vllm
         workload-type: inference
     spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-              - key: workload-type
-                operator: In
-                values:
-                - inference
-            topologyKey: kubernetes.io/hostname  # Ensures no two inference pods on same node
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
       containers:
       - name: vllm
         image: vllm/vllm-openai:latest
@@ -49,6 +42,11 @@ spec:
               key: token
         ports:
           - containerPort: 8000
+        resources:
+          requests:
+            nvidia.com/gpu: 1
+          limits:
+            nvidia.com/gpu: 1
         volumeMounts:
           - name: llama-storage
             mountPath: /root/.cache/huggingface
diff --git a/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
index 8857e83b6..9bce4aa95 100644
--- a/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
@@ -26,16 +26,8 @@ spec:
         app.kubernetes.io/name: vllm-safety
         workload-type: inference
     spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-              - key: workload-type
-                operator: In
-                values:
-                - inference
-            topologyKey: kubernetes.io/hostname  # Ensures no two inference pods on same node
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
       containers:
       - name: vllm-safety
         image: vllm/vllm-openai:latest