Merge branch 'main' into nvidia-e2e-notebook

This commit is contained in:
Jash Gulabrai 2025-06-06 11:11:53 -04:00
commit 1a492ad0cc
200 changed files with 8714 additions and 3175 deletions

View file

@ -260,7 +260,41 @@ Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/ContainerfileFROM pyth
You can now edit ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml and run `llama stack run ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml`
```
After this step is successful, you should be able to find the built container image and test it with `llama stack run <path/to/run.yaml>`.
Now set some environment variables for the inference model ID and Llama Stack Port and create a local directory to mount into the container's file system.
```
export INFERENCE_MODEL="llama3.2:3b"
export LLAMA_STACK_PORT=8321
mkdir -p ~/.llama
```
After this step is successful, you should be able to find the built container image and test it with the below Docker command:
```
docker run -d \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
localhost/distribution-ollama:dev \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434
```
Here are the docker flags and their uses:
* `-d`: Runs the container in the detached mode as a background process
* `-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT`: Maps the container port to the host port for accessing the server
* `-v ~/.llama:/root/.llama`: Mounts the local .llama directory to persist configurations and data
* `localhost/distribution-ollama:dev`: The name and tag of the container image to run
* `--port $LLAMA_STACK_PORT`: Port number for the server to listen on
* `--env INFERENCE_MODEL=$INFERENCE_MODEL`: Sets the model to use for inference
* `--env OLLAMA_URL=http://host.docker.internal:11434`: Configures the URL for the Ollama service
:::
::::

View file

@ -0,0 +1,32 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
export POSTGRES_USER=${POSTGRES_USER:-llamastack}
export POSTGRES_DB=${POSTGRES_DB:-llamastack}
export POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-llamastack}
export INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
export SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
set -euo pipefail
set -x
envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
--dry-run=client -o yaml > stack-configmap.yaml
kubectl apply -f stack-configmap.yaml
envsubst < ./stack-k8s.yaml.template | kubectl apply -f -
envsubst < ./ingress-k8s.yaml.template | kubectl apply -f -
envsubst < ./ui-k8s.yaml.template | kubectl apply -f -

View file

@ -0,0 +1,66 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: chromadb-pvc
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 20Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: chromadb
spec:
replicas: 1
selector:
matchLabels:
app: chromadb
template:
metadata:
labels:
app: chromadb
spec:
containers:
- name: chromadb
image: chromadb/chroma:latest
ports:
- containerPort: 6000
env:
- name: CHROMA_HOST
value: "0.0.0.0"
- name: CHROMA_PORT
value: "6000"
- name: PERSIST_DIRECTORY
value: "/chroma/chroma"
- name: CHROMA_DB_IMPL
value: "duckdb+parquet"
resources:
requests:
memory: "512Mi"
cpu: "250m"
limits:
memory: "2Gi"
cpu: "1000m"
volumeMounts:
- name: chromadb-storage
mountPath: /chroma/chroma
volumes:
- name: chromadb-storage
persistentVolumeClaim:
claimName: chromadb-pvc
---
apiVersion: v1
kind: Service
metadata:
name: chromadb
spec:
selector:
app: chromadb
ports:
- protocol: TCP
port: 6000
targetPort: 6000
type: ClusterIP

View file

@ -0,0 +1,17 @@
apiVersion: v1
kind: Service
metadata:
name: llama-stack-service
spec:
type: LoadBalancer
selector:
app.kubernetes.io/name: llama-stack
ports:
- name: llama-stack-api
port: 8321
targetPort: 8321
protocol: TCP
- name: llama-stack-ui
port: 8322
targetPort: 8322
protocol: TCP

View file

@ -0,0 +1,66 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: postgres-pvc
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: postgres
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: postgres
template:
metadata:
labels:
app.kubernetes.io/name: postgres
spec:
containers:
- name: postgres
image: postgres:15
env:
- name: POSTGRES_DB
value: "${POSTGRES_DB}"
- name: POSTGRES_USER
value: "${POSTGRES_USER}"
- name: POSTGRES_PASSWORD
value: "${POSTGRES_PASSWORD}"
- name: PGDATA
value: "/var/lib/postgresql/data/pgdata"
ports:
- containerPort: 5432
resources:
requests:
memory: "512Mi"
cpu: "250m"
limits:
memory: "1Gi"
cpu: "500m"
volumeMounts:
- name: postgres-storage
mountPath: /var/lib/postgresql/data
volumes:
- name: postgres-storage
persistentVolumeClaim:
claimName: postgres-pvc
---
apiVersion: v1
kind: Service
metadata:
name: postgres-server
spec:
selector:
app.kubernetes.io/name: postgres
ports:
- protocol: TCP
port: 5432
targetPort: 5432
type: ClusterIP

View file

@ -0,0 +1,128 @@
apiVersion: v1
data:
stack_run_config.yaml: |
version: '2'
image_name: kubernetes-demo
apis:
- agents
- inference
- safety
- telemetry
- tool_runtime
- vector_io
providers:
inference:
- provider_id: vllm-inference
provider_type: remote::vllm
config:
url: ${env.VLLM_URL:http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:4096}
api_token: ${env.VLLM_API_TOKEN:fake}
tls_verify: ${env.VLLM_TLS_VERIFY:true}
- provider_id: vllm-safety
provider_type: remote::vllm
config:
url: ${env.VLLM_SAFETY_URL:http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:4096}
api_token: ${env.VLLM_API_TOKEN:fake}
tls_verify: ${env.VLLM_TLS_VERIFY:true}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
vector_io:
- provider_id: ${env.ENABLE_CHROMADB+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:}
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: postgres
host: ${env.POSTGRES_HOST:localhost}
port: ${env.POSTGRES_PORT:5432}
db: ${env.POSTGRES_DB:llamastack}
user: ${env.POSTGRES_USER:llamastack}
password: ${env.POSTGRES_PASSWORD:llamastack}
responses_store:
type: postgres
host: ${env.POSTGRES_HOST:localhost}
port: ${env.POSTGRES_PORT:5432}
db: ${env.POSTGRES_DB:llamastack}
user: ${env.POSTGRES_USER:llamastack}
password: ${env.POSTGRES_PASSWORD:llamastack}
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: ${env.OTEL_SERVICE_NAME:}
sinks: ${env.TELEMETRY_SINKS:console}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
config: {}
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
metadata_store:
type: postgres
host: ${env.POSTGRES_HOST:localhost}
port: ${env.POSTGRES_PORT:5432}
db: ${env.POSTGRES_DB:llamastack}
user: ${env.POSTGRES_USER:llamastack}
password: ${env.POSTGRES_PASSWORD:llamastack}
table_name: llamastack_kvstore
inference_store:
type: postgres
host: ${env.POSTGRES_HOST:localhost}
port: ${env.POSTGRES_PORT:5432}
db: ${env.POSTGRES_DB:llamastack}
user: ${env.POSTGRES_USER:llamastack}
password: ${env.POSTGRES_PASSWORD:llamastack}
models:
- metadata:
embedding_dimension: 384
model_id: all-MiniLM-L6-v2
provider_id: sentence-transformers
model_type: embedding
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
- metadata: {}
model_id: ${env.SAFETY_MODEL:meta-llama/Llama-Guard-3-1B}
provider_id: vllm-safety
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
server:
port: 8321
kind: ConfigMap
metadata:
creationTimestamp: null
name: llama-stack-config

View file

@ -0,0 +1,69 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: llama-pvc
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llama-stack-server
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: llama-stack
app.kubernetes.io/component: server
template:
metadata:
labels:
app.kubernetes.io/name: llama-stack
app.kubernetes.io/component: server
spec:
containers:
- name: llama-stack
image: llamastack/distribution-remote-vllm:latest
imagePullPolicy: Always # since we have specified latest instead of a version
env:
- name: ENABLE_CHROMADB
value: "true"
- name: CHROMADB_URL
value: http://chromadb.default.svc.cluster.local:6000
- name: VLLM_URL
value: http://vllm-server.default.svc.cluster.local:8000/v1
- name: VLLM_MAX_TOKENS
value: "3072"
- name: VLLM_SAFETY_URL
value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
- name: POSTGRES_HOST
value: postgres-server.default.svc.cluster.local
- name: POSTGRES_PORT
value: "5432"
- name: VLLM_TLS_VERIFY
value: "false"
- name: INFERENCE_MODEL
value: "${INFERENCE_MODEL}"
- name: SAFETY_MODEL
value: "${SAFETY_MODEL}"
- name: TAVILY_SEARCH_API_KEY
value: "${TAVILY_SEARCH_API_KEY}"
command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]
ports:
- containerPort: 8321
volumeMounts:
- name: llama-storage
mountPath: /root/.llama
- name: llama-config
mountPath: /etc/config
volumes:
- name: llama-storage
persistentVolumeClaim:
claimName: llama-pvc
- name: llama-config
configMap:
name: llama-stack-config

View file

@ -0,0 +1,121 @@
version: '2'
image_name: kubernetes-demo
apis:
- agents
- inference
- safety
- telemetry
- tool_runtime
- vector_io
providers:
inference:
- provider_id: vllm-inference
provider_type: remote::vllm
config:
url: ${env.VLLM_URL:http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:4096}
api_token: ${env.VLLM_API_TOKEN:fake}
tls_verify: ${env.VLLM_TLS_VERIFY:true}
- provider_id: vllm-safety
provider_type: remote::vllm
config:
url: ${env.VLLM_SAFETY_URL:http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:4096}
api_token: ${env.VLLM_API_TOKEN:fake}
tls_verify: ${env.VLLM_TLS_VERIFY:true}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
vector_io:
- provider_id: ${env.ENABLE_CHROMADB+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:}
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: postgres
host: ${env.POSTGRES_HOST:localhost}
port: ${env.POSTGRES_PORT:5432}
db: ${env.POSTGRES_DB:llamastack}
user: ${env.POSTGRES_USER:llamastack}
password: ${env.POSTGRES_PASSWORD:llamastack}
responses_store:
type: postgres
host: ${env.POSTGRES_HOST:localhost}
port: ${env.POSTGRES_PORT:5432}
db: ${env.POSTGRES_DB:llamastack}
user: ${env.POSTGRES_USER:llamastack}
password: ${env.POSTGRES_PASSWORD:llamastack}
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: ${env.OTEL_SERVICE_NAME:}
sinks: ${env.TELEMETRY_SINKS:console}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
config: {}
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
metadata_store:
type: postgres
host: ${env.POSTGRES_HOST:localhost}
port: ${env.POSTGRES_PORT:5432}
db: ${env.POSTGRES_DB:llamastack}
user: ${env.POSTGRES_USER:llamastack}
password: ${env.POSTGRES_PASSWORD:llamastack}
table_name: llamastack_kvstore
inference_store:
type: postgres
host: ${env.POSTGRES_HOST:localhost}
port: ${env.POSTGRES_PORT:5432}
db: ${env.POSTGRES_DB:llamastack}
user: ${env.POSTGRES_USER:llamastack}
password: ${env.POSTGRES_PASSWORD:llamastack}
models:
- metadata:
embedding_dimension: 384
model_id: all-MiniLM-L6-v2
provider_id: sentence-transformers
model_type: embedding
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
- metadata: {}
model_id: ${env.SAFETY_MODEL:meta-llama/Llama-Guard-3-1B}
provider_id: vllm-safety
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
server:
port: 8321

View file

@ -0,0 +1,62 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: llama-stack-ui
labels:
app.kubernetes.io/name: llama-stack
app.kubernetes.io/component: ui
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: llama-stack
app.kubernetes.io/component: ui
template:
metadata:
labels:
app.kubernetes.io/name: llama-stack
app.kubernetes.io/component: ui
spec:
containers:
- name: llama-stack-ui
image: node:18-alpine
command: ["/bin/sh"]
env:
- name: LLAMA_STACK_BACKEND_URL
value: "http://llama-stack-service:8321"
- name: LLAMA_STACK_UI_PORT
value: "8322"
args:
- -c
- |
# Install git (not included in alpine by default)
apk add --no-cache git
# Clone the repository
echo "Cloning repository..."
git clone https://github.com/meta-llama/llama-stack.git /app
# Navigate to the UI directory
echo "Navigating to UI directory..."
cd /app/llama_stack/ui
# Check if package.json exists
if [ ! -f "package.json" ]; then
echo "ERROR: package.json not found in $(pwd)"
ls -la
exit 1
fi
# Install dependencies with verbose output
echo "Installing dependencies..."
npm install --verbose
# Verify next is installed
echo "Checking if next is installed..."
npx next --version || echo "Next.js not found, checking node_modules..."
ls -la node_modules/.bin/ | grep next || echo "No next binary found"
npm run dev
ports:
- containerPort: 8322
workingDir: /app

View file

@ -0,0 +1,71 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: vllm-models
spec:
accessModes:
- ReadWriteOnce
volumeMode: Filesystem
resources:
requests:
storage: 50Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: vllm-server
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: vllm
template:
metadata:
labels:
app.kubernetes.io/name: vllm
workload-type: inference
spec:
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: workload-type
operator: In
values:
- inference
topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
containers:
- name: vllm
image: vllm/vllm-openai:latest
command: ["/bin/sh", "-c"]
args:
- "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6"
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
ports:
- containerPort: 8000
volumeMounts:
- name: llama-storage
mountPath: /root/.cache/huggingface
volumes:
- name: llama-storage
persistentVolumeClaim:
claimName: vllm-models
---
apiVersion: v1
kind: Service
metadata:
name: vllm-server
spec:
selector:
app.kubernetes.io/name: vllm
ports:
- protocol: TCP
port: 8000
targetPort: 8000
type: ClusterIP

View file

@ -0,0 +1,73 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: vllm-models-safety
spec:
accessModes:
- ReadWriteOnce
volumeMode: Filesystem
storageClassName: gp2
resources:
requests:
storage: 30Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: vllm-server-safety
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: vllm-safety
template:
metadata:
labels:
app.kubernetes.io/name: vllm-safety
workload-type: inference
spec:
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: workload-type
operator: In
values:
- inference
topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
containers:
- name: vllm-safety
image: vllm/vllm-openai:latest
command: ["/bin/sh", "-c"]
args: [
"vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.3"
]
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
ports:
- containerPort: 8001
volumeMounts:
- name: llama-storage
mountPath: /root/.cache/huggingface
volumes:
- name: llama-storage
persistentVolumeClaim:
claimName: vllm-models-safety
---
apiVersion: v1
kind: Service
metadata:
name: vllm-server-safety
spec:
selector:
app.kubernetes.io/name: vllm-safety
ports:
- protocol: TCP
port: 8001
targetPort: 8001
type: ClusterIP

View file

@ -18,6 +18,7 @@ The `llamastack/distribution-fireworks` distribution consists of the following p
| agents | `inline::meta-reference` |
| datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` |
| files | `inline::localfs` |
| inference | `remote::fireworks`, `inline::sentence-transformers` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |