mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-04 04:04:14 +00:00
distro templates
This commit is contained in:
parent
c6694beb44
commit
d35e05c63c
13 changed files with 727 additions and 0 deletions
19
docs/docs/distributions/eks/apply.sh
Executable file
19
docs/docs/distributions/eks/apply.sh
Executable file
|
@ -0,0 +1,19 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
||||
K8S_DIR="${SCRIPT_DIR}/../k8s"
|
||||
|
||||
echo "Setting up AWS EKS-specific storage class..."
|
||||
kubectl apply -f gp3-topology-aware.yaml
|
||||
|
||||
echo "Running main Kubernetes deployment..."
|
||||
cd "${K8S_DIR}"
|
||||
./apply.sh "$@"
|
15
docs/docs/distributions/eks/gp3-topology-aware.yaml
Normal file
15
docs/docs/distributions/eks/gp3-topology-aware.yaml
Normal file
|
@ -0,0 +1,15 @@
|
|||
# Set up default storage class on AWS EKS
|
||||
apiVersion: storage.k8s.io/v1
|
||||
kind: StorageClass
|
||||
metadata:
|
||||
name: gp3-topology-aware
|
||||
annotations:
|
||||
storageclass.kubernetes.io/is-default-class: "true"
|
||||
parameters:
|
||||
type: gp3
|
||||
iops: "3000"
|
||||
throughput: "125"
|
||||
provisioner: ebs.csi.aws.com
|
||||
reclaimPolicy: Delete
|
||||
volumeBindingMode: WaitForFirstConsumer
|
||||
allowVolumeExpansion: true
|
63
docs/docs/distributions/k8s/apply.sh
Executable file
63
docs/docs/distributions/k8s/apply.sh
Executable file
|
@ -0,0 +1,63 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
export POSTGRES_USER=llamastack
|
||||
export POSTGRES_DB=llamastack
|
||||
export POSTGRES_PASSWORD=llamastack
|
||||
|
||||
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||
|
||||
# HF_TOKEN should be set by the user; base64 encode it for the secret
|
||||
if [ -n "${HF_TOKEN:-}" ]; then
|
||||
export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
|
||||
else
|
||||
echo "ERROR: HF_TOKEN not set. You need it for vLLM to download models from Hugging Face."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -z "${GITHUB_CLIENT_ID:-}" ]; then
|
||||
echo "ERROR: GITHUB_CLIENT_ID not set. You need it for Github login to work. See the Kubernetes Deployment Guide in the Llama Stack documentation."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -z "${GITHUB_CLIENT_SECRET:-}" ]; then
|
||||
echo "ERROR: GITHUB_CLIENT_SECRET not set. You need it for Github login to work. See the Kubernetes Deployment Guide in the Llama Stack documentation."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -z "${LLAMA_STACK_UI_URL:-}" ]; then
|
||||
echo "ERROR: LLAMA_STACK_UI_URL not set. Should be set to the external URL of the UI (excluding port). You need it for Github login to work. See the Kubernetes Deployment Guide in the Llama Stack documentation."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
|
||||
|
||||
set -euo pipefail
|
||||
set -x
|
||||
|
||||
# Apply the HF token secret if HF_TOKEN is provided
|
||||
if [ -n "${HF_TOKEN:-}" ]; then
|
||||
envsubst < ./hf-token-secret.yaml.template | kubectl apply -f -
|
||||
fi
|
||||
|
||||
envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
|
||||
envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
|
||||
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
|
||||
envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
|
||||
|
||||
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
|
||||
--dry-run=client -o yaml > stack-configmap.yaml
|
||||
|
||||
kubectl apply -f stack-configmap.yaml
|
||||
|
||||
envsubst < ./stack-k8s.yaml.template | kubectl apply -f -
|
||||
envsubst < ./ingress-k8s.yaml.template | kubectl apply -f -
|
||||
|
||||
envsubst < ./ui-k8s.yaml.template | kubectl apply -f -
|
66
docs/docs/distributions/k8s/chroma-k8s.yaml.template
Normal file
66
docs/docs/distributions/k8s/chroma-k8s.yaml.template
Normal file
|
@ -0,0 +1,66 @@
|
|||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: chromadb-pvc
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 20Gi
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: chromadb
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: chromadb
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: chromadb
|
||||
spec:
|
||||
containers:
|
||||
- name: chromadb
|
||||
image: chromadb/chroma:latest
|
||||
ports:
|
||||
- containerPort: 6000
|
||||
env:
|
||||
- name: CHROMA_HOST
|
||||
value: "0.0.0.0"
|
||||
- name: CHROMA_PORT
|
||||
value: "6000"
|
||||
- name: PERSIST_DIRECTORY
|
||||
value: "/chroma/chroma"
|
||||
- name: CHROMA_DB_IMPL
|
||||
value: "duckdb+parquet"
|
||||
resources:
|
||||
requests:
|
||||
memory: "512Mi"
|
||||
cpu: "250m"
|
||||
limits:
|
||||
memory: "2Gi"
|
||||
cpu: "1000m"
|
||||
volumeMounts:
|
||||
- name: chromadb-storage
|
||||
mountPath: /chroma/chroma
|
||||
volumes:
|
||||
- name: chromadb-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: chromadb-pvc
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: chromadb
|
||||
spec:
|
||||
selector:
|
||||
app: chromadb
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 6000
|
||||
targetPort: 6000
|
||||
type: ClusterIP
|
|
@ -0,0 +1,7 @@
|
|||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: hf-token-secret
|
||||
type: Opaque
|
||||
data:
|
||||
token: ${HF_TOKEN_BASE64}
|
17
docs/docs/distributions/k8s/ingress-k8s.yaml.template
Normal file
17
docs/docs/distributions/k8s/ingress-k8s.yaml.template
Normal file
|
@ -0,0 +1,17 @@
|
|||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: llama-stack-service
|
||||
spec:
|
||||
type: LoadBalancer
|
||||
selector:
|
||||
app.kubernetes.io/name: llama-stack
|
||||
ports:
|
||||
- name: llama-stack-api
|
||||
port: 8321
|
||||
targetPort: 8321
|
||||
protocol: TCP
|
||||
- name: llama-stack-ui
|
||||
port: 8322
|
||||
targetPort: 8322
|
||||
protocol: TCP
|
66
docs/docs/distributions/k8s/postgres-k8s.yaml.template
Normal file
66
docs/docs/distributions/k8s/postgres-k8s.yaml.template
Normal file
|
@ -0,0 +1,66 @@
|
|||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: postgres-pvc
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Gi
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: postgres
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: postgres
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: postgres
|
||||
spec:
|
||||
containers:
|
||||
- name: postgres
|
||||
image: postgres:15
|
||||
env:
|
||||
- name: POSTGRES_DB
|
||||
value: "${POSTGRES_DB}"
|
||||
- name: POSTGRES_USER
|
||||
value: "${POSTGRES_USER}"
|
||||
- name: POSTGRES_PASSWORD
|
||||
value: "${POSTGRES_PASSWORD}"
|
||||
- name: PGDATA
|
||||
value: "/var/lib/postgresql/data/pgdata"
|
||||
ports:
|
||||
- containerPort: 5432
|
||||
resources:
|
||||
requests:
|
||||
memory: "512Mi"
|
||||
cpu: "250m"
|
||||
limits:
|
||||
memory: "1Gi"
|
||||
cpu: "500m"
|
||||
volumeMounts:
|
||||
- name: postgres-storage
|
||||
mountPath: /var/lib/postgresql/data
|
||||
volumes:
|
||||
- name: postgres-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: postgres-pvc
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: postgres-server
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: postgres
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 5432
|
||||
targetPort: 5432
|
||||
type: ClusterIP
|
56
docs/docs/distributions/k8s/stack-configmap.yaml
Normal file
56
docs/docs/distributions/k8s/stack-configmap.yaml
Normal file
|
@ -0,0 +1,56 @@
|
|||
apiVersion: v1
|
||||
data:
|
||||
stack_run_config.yaml: "version: '2'\nimage_name: kubernetes-demo\napis:\n- agents\n-
|
||||
inference\n- files\n- safety\n- telemetry\n- tool_runtime\n- vector_io\nproviders:\n
|
||||
\ inference:\n - provider_id: vllm-inference\n provider_type: remote::vllm\n
|
||||
\ config:\n url: ${env.VLLM_URL:=http://localhost:8000/v1}\n max_tokens:
|
||||
${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n tls_verify:
|
||||
${env.VLLM_TLS_VERIFY:=true}\n - provider_id: vllm-safety\n provider_type:
|
||||
remote::vllm\n config:\n url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}\n
|
||||
\ max_tokens: ${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n
|
||||
\ tls_verify: ${env.VLLM_TLS_VERIFY:=true}\n - provider_id: sentence-transformers\n
|
||||
\ provider_type: inline::sentence-transformers\n config: {}\n vector_io:\n
|
||||
\ - provider_id: ${env.ENABLE_CHROMADB:+chromadb}\n provider_type: remote::chromadb\n
|
||||
\ config:\n url: ${env.CHROMADB_URL:=}\n kvstore:\n type: postgres\n
|
||||
\ host: ${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n
|
||||
\ db: ${env.POSTGRES_DB:=llamastack}\n user: ${env.POSTGRES_USER:=llamastack}\n
|
||||
\ password: ${env.POSTGRES_PASSWORD:=llamastack}\n files:\n - provider_id:
|
||||
meta-reference-files\n provider_type: inline::localfs\n config:\n storage_dir:
|
||||
${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}\n metadata_store:\n
|
||||
\ type: sqlite\n db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
|
||||
\ \n safety:\n - provider_id: llama-guard\n provider_type: inline::llama-guard\n
|
||||
\ config:\n excluded_categories: []\n agents:\n - provider_id: meta-reference\n
|
||||
\ provider_type: inline::meta-reference\n config:\n persistence_store:\n
|
||||
\ type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n port:
|
||||
${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user:
|
||||
${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
|
||||
\ responses_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n
|
||||
\ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n
|
||||
\ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
|
||||
\ telemetry:\n - provider_id: meta-reference\n provider_type: inline::meta-reference\n
|
||||
\ config:\n service_name: \"${env.OTEL_SERVICE_NAME:=\\u200B}\"\n sinks:
|
||||
${env.TELEMETRY_SINKS:=console}\n tool_runtime:\n - provider_id: brave-search\n
|
||||
\ provider_type: remote::brave-search\n config:\n api_key: ${env.BRAVE_SEARCH_API_KEY:+}\n
|
||||
\ max_results: 3\n - provider_id: tavily-search\n provider_type: remote::tavily-search\n
|
||||
\ config:\n api_key: ${env.TAVILY_SEARCH_API_KEY:+}\n max_results:
|
||||
3\n - provider_id: rag-runtime\n provider_type: inline::rag-runtime\n config:
|
||||
{}\n - provider_id: model-context-protocol\n provider_type: remote::model-context-protocol\n
|
||||
\ config: {}\nmetadata_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n
|
||||
\ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user:
|
||||
${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
|
||||
\ table_name: llamastack_kvstore\ninference_store:\n type: postgres\n host:
|
||||
${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n
|
||||
\ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\nmodels:\n-
|
||||
metadata:\n embedding_dimension: 384\n model_id: all-MiniLM-L6-v2\n provider_id:
|
||||
sentence-transformers\n model_type: embedding\n- metadata: {}\n model_id: ${env.INFERENCE_MODEL}\n
|
||||
\ provider_id: vllm-inference\n model_type: llm\n- metadata: {}\n model_id:
|
||||
${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\n provider_id: vllm-safety\n
|
||||
\ model_type: llm\nshields:\n- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\nvector_dbs:
|
||||
[]\ndatasets: []\nscoring_fns: []\nbenchmarks: []\ntool_groups:\n- toolgroup_id:
|
||||
builtin::websearch\n provider_id: tavily-search\n- toolgroup_id: builtin::rag\n
|
||||
\ provider_id: rag-runtime\nserver:\n port: 8321\n auth:\n provider_config:\n
|
||||
\ type: github_token\n"
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
creationTimestamp: null
|
||||
name: llama-stack-config
|
69
docs/docs/distributions/k8s/stack-k8s.yaml.template
Normal file
69
docs/docs/distributions/k8s/stack-k8s.yaml.template
Normal file
|
@ -0,0 +1,69 @@
|
|||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: llama-pvc
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 1Gi
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: llama-stack-server
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: llama-stack
|
||||
app.kubernetes.io/component: server
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: llama-stack
|
||||
app.kubernetes.io/component: server
|
||||
spec:
|
||||
containers:
|
||||
- name: llama-stack
|
||||
image: llamastack/distribution-starter:latest
|
||||
imagePullPolicy: Always # since we have specified latest instead of a version
|
||||
env:
|
||||
- name: ENABLE_CHROMADB
|
||||
value: "true"
|
||||
- name: CHROMADB_URL
|
||||
value: http://chromadb.default.svc.cluster.local:6000
|
||||
- name: VLLM_URL
|
||||
value: http://vllm-server.default.svc.cluster.local:8000/v1
|
||||
- name: VLLM_MAX_TOKENS
|
||||
value: "3072"
|
||||
- name: VLLM_SAFETY_URL
|
||||
value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
|
||||
- name: VLLM_TLS_VERIFY
|
||||
value: "false"
|
||||
- name: POSTGRES_HOST
|
||||
value: postgres-server.default.svc.cluster.local
|
||||
- name: POSTGRES_PORT
|
||||
value: "5432"
|
||||
- name: INFERENCE_MODEL
|
||||
value: "${INFERENCE_MODEL}"
|
||||
- name: SAFETY_MODEL
|
||||
value: "${SAFETY_MODEL}"
|
||||
- name: TAVILY_SEARCH_API_KEY
|
||||
value: "${TAVILY_SEARCH_API_KEY}"
|
||||
command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8321"]
|
||||
ports:
|
||||
- containerPort: 8321
|
||||
volumeMounts:
|
||||
- name: llama-storage
|
||||
mountPath: /root/.llama
|
||||
- name: llama-config
|
||||
mountPath: /etc/config
|
||||
volumes:
|
||||
- name: llama-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: llama-pvc
|
||||
- name: llama-config
|
||||
configMap:
|
||||
name: llama-stack-config
|
140
docs/docs/distributions/k8s/stack_run_config.yaml
Normal file
140
docs/docs/distributions/k8s/stack_run_config.yaml
Normal file
|
@ -0,0 +1,140 @@
|
|||
version: '2'
|
||||
image_name: kubernetes-demo
|
||||
apis:
|
||||
- agents
|
||||
- inference
|
||||
- files
|
||||
- safety
|
||||
- telemetry
|
||||
- tool_runtime
|
||||
- vector_io
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: vllm-inference
|
||||
provider_type: remote::vllm
|
||||
config:
|
||||
url: ${env.VLLM_URL:=http://localhost:8000/v1}
|
||||
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
|
||||
api_token: ${env.VLLM_API_TOKEN:=fake}
|
||||
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
|
||||
- provider_id: vllm-safety
|
||||
provider_type: remote::vllm
|
||||
config:
|
||||
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
|
||||
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
|
||||
api_token: ${env.VLLM_API_TOKEN:=fake}
|
||||
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
|
||||
- provider_id: sentence-transformers
|
||||
provider_type: inline::sentence-transformers
|
||||
config: {}
|
||||
vector_io:
|
||||
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
|
||||
provider_type: remote::chromadb
|
||||
config:
|
||||
url: ${env.CHROMADB_URL:=}
|
||||
kvstore:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
files:
|
||||
- provider_id: meta-reference-files
|
||||
provider_type: inline::localfs
|
||||
config:
|
||||
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
|
||||
metadata_store:
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
|
||||
safety:
|
||||
- provider_id: llama-guard
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
excluded_categories: []
|
||||
agents:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
responses_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
telemetry:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
|
||||
sinks: ${env.TELEMETRY_SINKS:=console}
|
||||
tool_runtime:
|
||||
- provider_id: brave-search
|
||||
provider_type: remote::brave-search
|
||||
config:
|
||||
api_key: ${env.BRAVE_SEARCH_API_KEY:+}
|
||||
max_results: 3
|
||||
- provider_id: tavily-search
|
||||
provider_type: remote::tavily-search
|
||||
config:
|
||||
api_key: ${env.TAVILY_SEARCH_API_KEY:+}
|
||||
max_results: 3
|
||||
- provider_id: rag-runtime
|
||||
provider_type: inline::rag-runtime
|
||||
config: {}
|
||||
- provider_id: model-context-protocol
|
||||
provider_type: remote::model-context-protocol
|
||||
config: {}
|
||||
metadata_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
table_name: llamastack_kvstore
|
||||
inference_store:
|
||||
type: postgres
|
||||
host: ${env.POSTGRES_HOST:=localhost}
|
||||
port: ${env.POSTGRES_PORT:=5432}
|
||||
db: ${env.POSTGRES_DB:=llamastack}
|
||||
user: ${env.POSTGRES_USER:=llamastack}
|
||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
||||
models:
|
||||
- metadata:
|
||||
embedding_dimension: 384
|
||||
model_id: all-MiniLM-L6-v2
|
||||
provider_id: sentence-transformers
|
||||
model_type: embedding
|
||||
- metadata: {}
|
||||
model_id: ${env.INFERENCE_MODEL}
|
||||
provider_id: vllm-inference
|
||||
model_type: llm
|
||||
- metadata: {}
|
||||
model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
||||
provider_id: vllm-safety
|
||||
model_type: llm
|
||||
shields:
|
||||
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
||||
vector_dbs: []
|
||||
datasets: []
|
||||
scoring_fns: []
|
||||
benchmarks: []
|
||||
tool_groups:
|
||||
- toolgroup_id: builtin::websearch
|
||||
provider_id: tavily-search
|
||||
- toolgroup_id: builtin::rag
|
||||
provider_id: rag-runtime
|
||||
server:
|
||||
port: 8321
|
||||
auth:
|
||||
provider_config:
|
||||
type: github_token
|
68
docs/docs/distributions/k8s/ui-k8s.yaml.template
Normal file
68
docs/docs/distributions/k8s/ui-k8s.yaml.template
Normal file
|
@ -0,0 +1,68 @@
|
|||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: llama-stack-ui
|
||||
labels:
|
||||
app.kubernetes.io/name: llama-stack
|
||||
app.kubernetes.io/component: ui
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: llama-stack
|
||||
app.kubernetes.io/component: ui
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: llama-stack
|
||||
app.kubernetes.io/component: ui
|
||||
spec:
|
||||
containers:
|
||||
- name: llama-stack-ui
|
||||
image: node:18-alpine
|
||||
command: ["/bin/sh"]
|
||||
env:
|
||||
- name: LLAMA_STACK_BACKEND_URL
|
||||
value: "http://llama-stack-service:8321"
|
||||
- name: LLAMA_STACK_UI_PORT
|
||||
value: "8322"
|
||||
- name: GITHUB_CLIENT_ID
|
||||
value: "${GITHUB_CLIENT_ID}"
|
||||
- name: GITHUB_CLIENT_SECRET
|
||||
value: "${GITHUB_CLIENT_SECRET}"
|
||||
- name: NEXTAUTH_URL
|
||||
value: "${LLAMA_STACK_UI_URL}:8322"
|
||||
args:
|
||||
- -c
|
||||
- |
|
||||
# Install git (not included in alpine by default)
|
||||
apk add --no-cache git
|
||||
|
||||
# Clone the repository
|
||||
echo "Cloning repository..."
|
||||
git clone https://github.com/meta-llama/llama-stack.git /app
|
||||
|
||||
# Navigate to the UI directory
|
||||
echo "Navigating to UI directory..."
|
||||
cd /app/llama_stack/ui
|
||||
|
||||
# Check if package.json exists
|
||||
if [ ! -f "package.json" ]; then
|
||||
echo "ERROR: package.json not found in $(pwd)"
|
||||
ls -la
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Install dependencies with verbose output
|
||||
echo "Installing dependencies..."
|
||||
npm install --verbose
|
||||
|
||||
# Verify next is installed
|
||||
echo "Checking if next is installed..."
|
||||
npx next --version || echo "Next.js not found, checking node_modules..."
|
||||
ls -la node_modules/.bin/ | grep next || echo "No next binary found"
|
||||
|
||||
npm run dev
|
||||
ports:
|
||||
- containerPort: 8322
|
||||
workingDir: /app
|
70
docs/docs/distributions/k8s/vllm-k8s.yaml.template
Normal file
70
docs/docs/distributions/k8s/vllm-k8s.yaml.template
Normal file
|
@ -0,0 +1,70 @@
|
|||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: vllm-models
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
volumeMode: Filesystem
|
||||
resources:
|
||||
requests:
|
||||
storage: 50Gi
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: vllm-server
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: vllm
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: vllm
|
||||
workload-type: inference
|
||||
spec:
|
||||
nodeSelector:
|
||||
eks.amazonaws.com/nodegroup: gpu
|
||||
containers:
|
||||
- name: vllm
|
||||
image: vllm/vllm-openai:latest
|
||||
command: ["/bin/sh", "-c"]
|
||||
args:
|
||||
- "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6 --enable-auto-tool-choice --tool-call-parser llama4_pythonic"
|
||||
env:
|
||||
- name: INFERENCE_MODEL
|
||||
value: "${INFERENCE_MODEL}"
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 1
|
||||
requests:
|
||||
nvidia.com/gpu: 1
|
||||
volumeMounts:
|
||||
- name: llama-storage
|
||||
mountPath: /root/.cache/huggingface
|
||||
volumes:
|
||||
- name: llama-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: vllm-models
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: vllm-server
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: vllm
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8000
|
||||
targetPort: 8000
|
||||
type: ClusterIP
|
71
docs/docs/distributions/k8s/vllm-safety-k8s.yaml.template
Normal file
71
docs/docs/distributions/k8s/vllm-safety-k8s.yaml.template
Normal file
|
@ -0,0 +1,71 @@
|
|||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: vllm-models-safety
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
volumeMode: Filesystem
|
||||
resources:
|
||||
requests:
|
||||
storage: 30Gi
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: vllm-server-safety
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: vllm-safety
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: vllm-safety
|
||||
workload-type: inference
|
||||
spec:
|
||||
nodeSelector:
|
||||
eks.amazonaws.com/nodegroup: gpu
|
||||
containers:
|
||||
- name: vllm-safety
|
||||
image: vllm/vllm-openai:latest
|
||||
command: ["/bin/sh", "-c"]
|
||||
args: [
|
||||
"vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.3"
|
||||
]
|
||||
env:
|
||||
- name: SAFETY_MODEL
|
||||
value: "${SAFETY_MODEL}"
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: hf-token-secret
|
||||
key: token
|
||||
ports:
|
||||
- containerPort: 8001
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: 1
|
||||
requests:
|
||||
nvidia.com/gpu: 1
|
||||
volumeMounts:
|
||||
- name: llama-storage
|
||||
mountPath: /root/.cache/huggingface
|
||||
volumes:
|
||||
- name: llama-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: vllm-models-safety
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: vllm-server-safety
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/name: vllm-safety
|
||||
ports:
|
||||
- protocol: TCP
|
||||
port: 8001
|
||||
targetPort: 8001
|
||||
type: ClusterIP
|
Loading…
Add table
Add a link
Reference in a new issue