mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-06-27 18:50:41 +00:00
Merge f99ca37f91
into 40fdce79b3
This commit is contained in:
commit
79147a554a
8 changed files with 36 additions and 27 deletions
|
@ -16,6 +16,14 @@ export SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
set -x
|
set -x
|
||||||
|
|
||||||
|
# Install NVIDIA device plugin for GPU support
|
||||||
|
echo "Installing NVIDIA device plugin..."
|
||||||
|
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/refs/tags/v0.17.2/deployments/static/nvidia-device-plugin.yml
|
||||||
|
|
||||||
|
# Wait for NVIDIA device plugin to be ready
|
||||||
|
echo "Waiting for NVIDIA device plugin to be ready..."
|
||||||
|
kubectl wait --for=condition=ready pod -l name=nvidia-device-plugin-ds -n kube-system --timeout=300s
|
||||||
|
|
||||||
envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
|
envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
|
||||||
envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
|
envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
|
||||||
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
|
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
|
||||||
|
|
|
@ -5,6 +5,7 @@ metadata:
|
||||||
spec:
|
spec:
|
||||||
accessModes:
|
accessModes:
|
||||||
- ReadWriteOnce
|
- ReadWriteOnce
|
||||||
|
storageClassName: gp2
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
storage: 20Gi
|
storage: 20Gi
|
||||||
|
@ -23,6 +24,8 @@ spec:
|
||||||
labels:
|
labels:
|
||||||
app: chromadb
|
app: chromadb
|
||||||
spec:
|
spec:
|
||||||
|
nodeSelector:
|
||||||
|
eks.amazonaws.com/nodegroup: cpu
|
||||||
containers:
|
containers:
|
||||||
- name: chromadb
|
- name: chromadb
|
||||||
image: chromadb/chroma:latest
|
image: chromadb/chroma:latest
|
||||||
|
|
|
@ -5,6 +5,7 @@ metadata:
|
||||||
spec:
|
spec:
|
||||||
accessModes:
|
accessModes:
|
||||||
- ReadWriteOnce
|
- ReadWriteOnce
|
||||||
|
storageClassName: gp2
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
storage: 10Gi
|
storage: 10Gi
|
||||||
|
@ -23,6 +24,8 @@ spec:
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/name: postgres
|
app.kubernetes.io/name: postgres
|
||||||
spec:
|
spec:
|
||||||
|
nodeSelector:
|
||||||
|
eks.amazonaws.com/nodegroup: cpu
|
||||||
containers:
|
containers:
|
||||||
- name: postgres
|
- name: postgres
|
||||||
image: postgres:15
|
image: postgres:15
|
||||||
|
|
|
@ -5,6 +5,7 @@ metadata:
|
||||||
spec:
|
spec:
|
||||||
accessModes:
|
accessModes:
|
||||||
- ReadWriteOnce
|
- ReadWriteOnce
|
||||||
|
storageClassName: gp2
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
storage: 1Gi
|
storage: 1Gi
|
||||||
|
@ -25,9 +26,11 @@ spec:
|
||||||
app.kubernetes.io/name: llama-stack
|
app.kubernetes.io/name: llama-stack
|
||||||
app.kubernetes.io/component: server
|
app.kubernetes.io/component: server
|
||||||
spec:
|
spec:
|
||||||
|
nodeSelector:
|
||||||
|
eks.amazonaws.com/nodegroup: cpu
|
||||||
containers:
|
containers:
|
||||||
- name: llama-stack
|
- name: llama-stack
|
||||||
image: llamastack/distribution-remote-vllm:latest
|
image: llamastack/distribution-postgres-demo:latest
|
||||||
imagePullPolicy: Always # since we have specified latest instead of a version
|
imagePullPolicy: Always # since we have specified latest instead of a version
|
||||||
env:
|
env:
|
||||||
- name: ENABLE_CHROMADB
|
- name: ENABLE_CHROMADB
|
||||||
|
|
|
@ -17,6 +17,8 @@ spec:
|
||||||
app.kubernetes.io/name: llama-stack
|
app.kubernetes.io/name: llama-stack
|
||||||
app.kubernetes.io/component: ui
|
app.kubernetes.io/component: ui
|
||||||
spec:
|
spec:
|
||||||
|
nodeSelector:
|
||||||
|
eks.amazonaws.com/nodegroup: cpu
|
||||||
containers:
|
containers:
|
||||||
- name: llama-stack-ui
|
- name: llama-stack-ui
|
||||||
image: node:18-alpine
|
image: node:18-alpine
|
||||||
|
|
|
@ -6,6 +6,7 @@ spec:
|
||||||
accessModes:
|
accessModes:
|
||||||
- ReadWriteOnce
|
- ReadWriteOnce
|
||||||
volumeMode: Filesystem
|
volumeMode: Filesystem
|
||||||
|
storageClassName: gp2
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
storage: 50Gi
|
storage: 50Gi
|
||||||
|
@ -25,16 +26,8 @@ spec:
|
||||||
app.kubernetes.io/name: vllm
|
app.kubernetes.io/name: vllm
|
||||||
workload-type: inference
|
workload-type: inference
|
||||||
spec:
|
spec:
|
||||||
affinity:
|
nodeSelector:
|
||||||
podAntiAffinity:
|
eks.amazonaws.com/nodegroup: gpu
|
||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
|
||||||
- labelSelector:
|
|
||||||
matchExpressions:
|
|
||||||
- key: workload-type
|
|
||||||
operator: In
|
|
||||||
values:
|
|
||||||
- inference
|
|
||||||
topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
|
|
||||||
containers:
|
containers:
|
||||||
- name: vllm
|
- name: vllm
|
||||||
image: vllm/vllm-openai:latest
|
image: vllm/vllm-openai:latest
|
||||||
|
@ -49,6 +42,11 @@ spec:
|
||||||
key: token
|
key: token
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 8000
|
- containerPort: 8000
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
nvidia.com/gpu: 1
|
||||||
|
limits:
|
||||||
|
nvidia.com/gpu: 1
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: llama-storage
|
- name: llama-storage
|
||||||
mountPath: /root/.cache/huggingface
|
mountPath: /root/.cache/huggingface
|
||||||
|
|
|
@ -26,16 +26,8 @@ spec:
|
||||||
app.kubernetes.io/name: vllm-safety
|
app.kubernetes.io/name: vllm-safety
|
||||||
workload-type: inference
|
workload-type: inference
|
||||||
spec:
|
spec:
|
||||||
affinity:
|
nodeSelector:
|
||||||
podAntiAffinity:
|
eks.amazonaws.com/nodegroup: gpu
|
||||||
requiredDuringSchedulingIgnoredDuringExecution:
|
|
||||||
- labelSelector:
|
|
||||||
matchExpressions:
|
|
||||||
- key: workload-type
|
|
||||||
operator: In
|
|
||||||
values:
|
|
||||||
- inference
|
|
||||||
topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
|
|
||||||
containers:
|
containers:
|
||||||
- name: vllm-safety
|
- name: vllm-safety
|
||||||
image: vllm/vllm-openai:latest
|
image: vllm/vllm-openai:latest
|
||||||
|
|
|
@ -14,7 +14,6 @@ from llama_stack.distribution.datatypes import KVStoreConfig, RoutableObjectWith
|
||||||
from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
|
from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
|
||||||
from llama_stack.log import get_logger
|
from llama_stack.log import get_logger
|
||||||
from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl
|
from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl
|
||||||
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
|
|
||||||
|
|
||||||
logger = get_logger(__name__, category="core")
|
logger = get_logger(__name__, category="core")
|
||||||
|
|
||||||
|
@ -193,12 +192,13 @@ async def create_dist_registry(
|
||||||
image_name: str,
|
image_name: str,
|
||||||
) -> tuple[CachedDiskDistributionRegistry, KVStore]:
|
) -> tuple[CachedDiskDistributionRegistry, KVStore]:
|
||||||
# instantiate kvstore for storing and retrieving distribution metadata
|
# instantiate kvstore for storing and retrieving distribution metadata
|
||||||
if metadata_store:
|
if not metadata_store:
|
||||||
dist_kvstore = await kvstore_impl(metadata_store)
|
metadata_store = KVStoreConfig(
|
||||||
else:
|
type="sqlite",
|
||||||
dist_kvstore = await kvstore_impl(
|
namespace=None,
|
||||||
SqliteKVStoreConfig(db_path=(DISTRIBS_BASE_DIR / image_name / "kvstore.db").as_posix())
|
db_path=(DISTRIBS_BASE_DIR / image_name / "kvstore.db").as_posix()
|
||||||
)
|
)
|
||||||
|
dist_kvstore = await kvstore_impl(metadata_store)
|
||||||
dist_registry = CachedDiskDistributionRegistry(dist_kvstore)
|
dist_registry = CachedDiskDistributionRegistry(dist_kvstore)
|
||||||
await dist_registry.initialize()
|
await dist_registry.initialize()
|
||||||
return dist_registry, dist_kvstore
|
return dist_registry, dist_kvstore
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue