diff --git a/docs/source/distributions/k8s/apply.sh b/docs/source/distributions/k8s/apply.sh
new file mode 100644
index 000000000..c8fe2d68a
--- /dev/null
+++ b/docs/source/distributions/k8s/apply.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+export POSTGRES_USER=${POSTGRES_USER:-llamastack}
+export POSTGRES_DB=${POSTGRES_DB:-llamastack}
+export POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-llamastack}
+
+export INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+export SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
+
+# Optional: read by the builtin::websearch tool group; envsubst substitutes an
+# empty string when this is unset, so default it explicitly here.
+export TAVILY_SEARCH_API_KEY=${TAVILY_SEARCH_API_KEY:-}
+
+set -euo pipefail
+set -x
+
+envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
+envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
+envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
+
+kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
+  --dry-run=client -o yaml > stack-configmap.yaml
+
+kubectl apply -f stack-configmap.yaml
+
+envsubst < ./stack-k8s.yaml.template | kubectl apply -f -
diff --git a/docs/source/distributions/k8s/chroma-k8s.yaml.template b/docs/source/distributions/k8s/chroma-k8s.yaml.template
new file mode 100644
index 000000000..c6733c121
--- /dev/null
+++ b/docs/source/distributions/k8s/chroma-k8s.yaml.template
@@ -0,0 +1,67 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: chromadb-pvc
+spec:
+  accessModes:
+    - ReadWriteOnce
+  storageClassName: gp2
+  resources:
+    requests:
+      storage: 20Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: chromadb
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: chromadb
+  template:
+    metadata:
+      labels:
+        app: chromadb
+    spec:
+      containers:
+        - name: chromadb
+          image: chromadb/chroma:latest
+          ports:
+            - containerPort: 6000
+          env:
+            - name: CHROMA_HOST
+              value: "0.0.0.0"
+            - name: CHROMA_PORT
+              value: "6000"
+            - name: PERSIST_DIRECTORY
+              value: "/chroma/chroma"
+            - name: CHROMA_DB_IMPL
+              value: "duckdb+parquet"
+          resources:
+            requests:
+              memory: "512Mi"
+              cpu: "250m"
+            limits:
+              memory: "2Gi"
+              cpu: "1000m"
+          volumeMounts:
+            - name: chromadb-storage
+              mountPath: /chroma/chroma
+      volumes:
+        - name: chromadb-storage
+          persistentVolumeClaim:
+            claimName: chromadb-pvc
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: chromadb
+spec:
+  selector:
+    app: chromadb
+  ports:
+    - protocol: TCP
+      port: 6000
+      targetPort: 6000
+  type: ClusterIP
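Note: apply.sh above does not create the hf-token-secret that the vLLM deployments further down read their Hugging Face token from (key: token). A minimal sketch of provisioning it before running the script, assuming the token is exported as HF_TOKEN:

    kubectl create secret generic hf-token-secret \
      --from-literal=token="${HF_TOKEN}"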
diff --git a/docs/source/distributions/k8s/postgres-k8s.yaml.template b/docs/source/distributions/k8s/postgres-k8s.yaml.template
new file mode 100644
index 000000000..d00af8e21
--- /dev/null
+++ b/docs/source/distributions/k8s/postgres-k8s.yaml.template
@@ -0,0 +1,67 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: postgres-pvc
+spec:
+  accessModes:
+    - ReadWriteOnce
+  storageClassName: gp2
+  resources:
+    requests:
+      storage: 10Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: postgres
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: postgres
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: postgres
+    spec:
+      containers:
+        - name: postgres
+          image: postgres:15
+          env:
+            - name: POSTGRES_DB
+              value: "${POSTGRES_DB}"
+            - name: POSTGRES_USER
+              value: "${POSTGRES_USER}"
+            - name: POSTGRES_PASSWORD
+              value: "${POSTGRES_PASSWORD}"
+            - name: PGDATA
+              value: "/var/lib/postgresql/data/pgdata"
+          ports:
+            - containerPort: 5432
+          resources:
+            requests:
+              memory: "512Mi"
+              cpu: "250m"
+            limits:
+              memory: "1Gi"
+              cpu: "500m"
+          volumeMounts:
+            - name: postgres-storage
+              mountPath: /var/lib/postgresql/data
+      volumes:
+        - name: postgres-storage
+          persistentVolumeClaim:
+            claimName: postgres-pvc
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: postgres-server
+spec:
+  selector:
+    app.kubernetes.io/name: postgres
+  ports:
+    - protocol: TCP
+      port: 5432
+      targetPort: 5432
+  type: ClusterIP
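After the Postgres manifest is applied, connectivity can be sanity-checked from inside the cluster before the stack server starts. A sketch using a throwaway client pod, assuming the default llamastack credentials from apply.sh (pg-check is an arbitrary pod name):

    kubectl run pg-check --rm -it --restart=Never --image=postgres:15 -- \
      env PGPASSWORD=llamastack psql -h postgres-server -U llamastack -d llamastack -c 'SELECT 1'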
diff --git a/docs/source/distributions/k8s/stack-configmap.yaml b/docs/source/distributions/k8s/stack-configmap.yaml
new file mode 100644
index 000000000..fa7bacd8f
--- /dev/null
+++ b/docs/source/distributions/k8s/stack-configmap.yaml
@@ -0,0 +1,128 @@
+apiVersion: v1
+data:
+  stack_run_config.yaml: |
+    version: '2'
+    image_name: kubernetes-demo
+    apis:
+    - agents
+    - inference
+    - safety
+    - telemetry
+    - tool_runtime
+    - vector_io
+    providers:
+      inference:
+      - provider_id: vllm-inference
+        provider_type: remote::vllm
+        config:
+          url: ${env.VLLM_URL:http://localhost:8000/v1}
+          max_tokens: ${env.VLLM_MAX_TOKENS:4096}
+          api_token: ${env.VLLM_API_TOKEN:fake}
+          tls_verify: ${env.VLLM_TLS_VERIFY:true}
+      - provider_id: vllm-safety
+        provider_type: remote::vllm
+        config:
+          url: ${env.VLLM_SAFETY_URL:http://localhost:8000/v1}
+          max_tokens: ${env.VLLM_MAX_TOKENS:4096}
+          api_token: ${env.VLLM_API_TOKEN:fake}
+          tls_verify: ${env.VLLM_TLS_VERIFY:true}
+      - provider_id: sentence-transformers
+        provider_type: inline::sentence-transformers
+        config: {}
+      vector_io:
+      - provider_id: ${env.ENABLE_CHROMADB+chromadb}
+        provider_type: remote::chromadb
+        config:
+          url: ${env.CHROMADB_URL:}
+      safety:
+      - provider_id: llama-guard
+        provider_type: inline::llama-guard
+        config:
+          excluded_categories: []
+      agents:
+      - provider_id: meta-reference
+        provider_type: inline::meta-reference
+        config:
+          persistence_store:
+            type: postgres
+            host: ${env.POSTGRES_HOST:localhost}
+            port: ${env.POSTGRES_PORT:5432}
+            db: ${env.POSTGRES_DB:llamastack}
+            user: ${env.POSTGRES_USER:llamastack}
+            password: ${env.POSTGRES_PASSWORD:llamastack}
+          responses_store:
+            type: postgres
+            host: ${env.POSTGRES_HOST:localhost}
+            port: ${env.POSTGRES_PORT:5432}
+            db: ${env.POSTGRES_DB:llamastack}
+            user: ${env.POSTGRES_USER:llamastack}
+            password: ${env.POSTGRES_PASSWORD:llamastack}
+      telemetry:
+      - provider_id: meta-reference
+        provider_type: inline::meta-reference
+        config:
+          service_name: ${env.OTEL_SERVICE_NAME:}
+          sinks: ${env.TELEMETRY_SINKS:console}
+      tool_runtime:
+      - provider_id: brave-search
+        provider_type: remote::brave-search
+        config:
+          api_key: ${env.BRAVE_SEARCH_API_KEY:}
+          max_results: 3
+      - provider_id: tavily-search
+        provider_type: remote::tavily-search
+        config:
+          api_key: ${env.TAVILY_SEARCH_API_KEY:}
+          max_results: 3
+      - provider_id: rag-runtime
+        provider_type: inline::rag-runtime
+        config: {}
+      - provider_id: model-context-protocol
+        provider_type: remote::model-context-protocol
+        config: {}
+    metadata_store:
+      type: postgres
+      host: ${env.POSTGRES_HOST:localhost}
+      port: ${env.POSTGRES_PORT:5432}
+      db: ${env.POSTGRES_DB:llamastack}
+      user: ${env.POSTGRES_USER:llamastack}
+      password: ${env.POSTGRES_PASSWORD:llamastack}
+      table_name: llamastack_kvstore
+    inference_store:
+      type: postgres
+      host: ${env.POSTGRES_HOST:localhost}
+      port: ${env.POSTGRES_PORT:5432}
+      db: ${env.POSTGRES_DB:llamastack}
+      user: ${env.POSTGRES_USER:llamastack}
+      password: ${env.POSTGRES_PASSWORD:llamastack}
+    models:
+    - metadata:
+        embedding_dimension: 384
+      model_id: all-MiniLM-L6-v2
+      provider_id: sentence-transformers
+      model_type: embedding
+    - metadata: {}
+      model_id: ${env.INFERENCE_MODEL}
+      provider_id: vllm-inference
+      model_type: llm
+    - metadata: {}
+      model_id: ${env.SAFETY_MODEL:meta-llama/Llama-Guard-3-1B}
+      provider_id: vllm-safety
+      model_type: llm
+    shields:
+    - shield_id: ${env.SAFETY_MODEL:meta-llama/Llama-Guard-3-1B}
+    vector_dbs: []
+    datasets: []
+    scoring_fns: []
+    benchmarks: []
+    tool_groups:
+    - toolgroup_id: builtin::websearch
+      provider_id: tavily-search
+    - toolgroup_id: builtin::rag
+      provider_id: rag-runtime
+    server:
+      port: 8321
+kind: ConfigMap
+metadata:
+  creationTimestamp: null
+  name: llama-stack-config
diff --git a/docs/source/distributions/k8s/stack-k8s.yaml.template b/docs/source/distributions/k8s/stack-k8s.yaml.template
new file mode 100644
index 000000000..198e88aed
--- /dev/null
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@@ -0,0 +1,80 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: llama-pvc
+spec:
+  accessModes:
+    - ReadWriteOnce
+  storageClassName: gp2
+  resources:
+    requests:
+      storage: 1Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llama-stack-server
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: llama-stack
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: llama-stack
+    spec:
+      containers:
+        - name: llama-stack
+          image: llamastack/distribution-remote-vllm:latest
+          imagePullPolicy: IfNotPresent
+          env:
+            - name: VLLM_URL
+              value: http://vllm-server.default.svc.cluster.local:8000/v1
+            - name: VLLM_SAFETY_URL
+              value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
+            # Wire the stack to the ChromaDB service from chroma-k8s.yaml.template.
+            - name: ENABLE_CHROMADB
+              value: "true"
+            - name: CHROMADB_URL
+              value: http://chromadb.default.svc.cluster.local:6000
+            - name: POSTGRES_HOST
+              value: postgres-server.default.svc.cluster.local
+            - name: POSTGRES_PORT
+              value: "5432"
+            - name: VLLM_TLS_VERIFY
+              value: "false"
+            - name: INFERENCE_MODEL
+              value: "${INFERENCE_MODEL}"
+            - name: SAFETY_MODEL
+              value: "${SAFETY_MODEL}"
+            - name: TAVILY_SEARCH_API_KEY
+              value: "${TAVILY_SEARCH_API_KEY}"
+          command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]
+          ports:
+            - containerPort: 8321
+          volumeMounts:
+            - name: llama-storage
+              mountPath: /root/.llama
+            - name: llama-config
+              mountPath: /etc/config
+      volumes:
+        - name: llama-storage
+          persistentVolumeClaim:
+            claimName: llama-pvc
+        - name: llama-config
+          configMap:
+            name: llama-stack-config
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: llama-stack-service
+spec:
+  selector:
+    app.kubernetes.io/name: llama-stack
+  ports:
+    - protocol: TCP
+      port: 8321
+      targetPort: 8321
+  type: ClusterIP
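The llama-stack-service above is ClusterIP-only, so the quickest smoke test from a workstation is a port-forward; the /v1/models route used here is an assumption about the stack server's API surface:

    kubectl port-forward service/llama-stack-service 8321:8321 &
    curl -s http://localhost:8321/v1/models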
diff --git a/docs/source/distributions/k8s/stack_run_config.yaml b/docs/source/distributions/k8s/stack_run_config.yaml
new file mode 100644
index 000000000..8e2773dd1
--- /dev/null
+++ b/docs/source/distributions/k8s/stack_run_config.yaml
@@ -0,0 +1,121 @@
+version: '2'
+image_name: kubernetes-demo
+apis:
+- agents
+- inference
+- safety
+- telemetry
+- tool_runtime
+- vector_io
+providers:
+  inference:
+  - provider_id: vllm-inference
+    provider_type: remote::vllm
+    config:
+      url: ${env.VLLM_URL:http://localhost:8000/v1}
+      max_tokens: ${env.VLLM_MAX_TOKENS:4096}
+      api_token: ${env.VLLM_API_TOKEN:fake}
+      tls_verify: ${env.VLLM_TLS_VERIFY:true}
+  - provider_id: vllm-safety
+    provider_type: remote::vllm
+    config:
+      url: ${env.VLLM_SAFETY_URL:http://localhost:8000/v1}
+      max_tokens: ${env.VLLM_MAX_TOKENS:4096}
+      api_token: ${env.VLLM_API_TOKEN:fake}
+      tls_verify: ${env.VLLM_TLS_VERIFY:true}
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
+  vector_io:
+  - provider_id: ${env.ENABLE_CHROMADB+chromadb}
+    provider_type: remote::chromadb
+    config:
+      url: ${env.CHROMADB_URL:}
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: postgres
+        host: ${env.POSTGRES_HOST:localhost}
+        port: ${env.POSTGRES_PORT:5432}
+        db: ${env.POSTGRES_DB:llamastack}
+        user: ${env.POSTGRES_USER:llamastack}
+        password: ${env.POSTGRES_PASSWORD:llamastack}
+      responses_store:
+        type: postgres
+        host: ${env.POSTGRES_HOST:localhost}
+        port: ${env.POSTGRES_PORT:5432}
+        db: ${env.POSTGRES_DB:llamastack}
+        user: ${env.POSTGRES_USER:llamastack}
+        password: ${env.POSTGRES_PASSWORD:llamastack}
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      service_name: ${env.OTEL_SERVICE_NAME:}
+      sinks: ${env.TELEMETRY_SINKS:console}
+  tool_runtime:
+  - provider_id: brave-search
+    provider_type: remote::brave-search
+    config:
+      api_key: ${env.BRAVE_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: tavily-search
+    provider_type: remote::tavily-search
+    config:
+      api_key: ${env.TAVILY_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: rag-runtime
+    provider_type: inline::rag-runtime
+    config: {}
+  - provider_id: model-context-protocol
+    provider_type: remote::model-context-protocol
+    config: {}
+metadata_store:
+  type: postgres
+  host: ${env.POSTGRES_HOST:localhost}
+  port: ${env.POSTGRES_PORT:5432}
+  db: ${env.POSTGRES_DB:llamastack}
+  user: ${env.POSTGRES_USER:llamastack}
+  password: ${env.POSTGRES_PASSWORD:llamastack}
+  table_name: llamastack_kvstore
+inference_store:
+  type: postgres
+  host: ${env.POSTGRES_HOST:localhost}
+  port: ${env.POSTGRES_PORT:5432}
+  db: ${env.POSTGRES_DB:llamastack}
+  user: ${env.POSTGRES_USER:llamastack}
+  password: ${env.POSTGRES_PASSWORD:llamastack}
+models:
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  model_type: embedding
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm-inference
+  model_type: llm
+- metadata: {}
+  model_id: ${env.SAFETY_MODEL:meta-llama/Llama-Guard-3-1B}
+  provider_id: vllm-safety
+  model_type: llm
+shields:
+- shield_id: ${env.SAFETY_MODEL:meta-llama/Llama-Guard-3-1B}
+vector_dbs: []
+datasets: []
+scoring_fns: []
+benchmarks: []
+tool_groups:
+- toolgroup_id: builtin::websearch
  provider_id: tavily-search
+- toolgroup_id: builtin::rag
+  provider_id: rag-runtime
+server:
+  port: 8321
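The vLLM deployments in the final template below pull model weights on first start, which can take several minutes. One way to confirm the inference server is actually answering before debugging the stack on top of it (vLLM's OpenAI-compatible server exposes /v1/models):

    kubectl wait --for=condition=available deployment/vllm-server --timeout=600s
    kubectl port-forward service/vllm-server 8000:8000 &
    curl -s http://localhost:8000/v1/models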
diff --git a/docs/source/distributions/k8s/vllm-k8s.yaml.template b/docs/source/distributions/k8s/vllm-k8s.yaml.template
new file mode 100644
index 000000000..ff060ac41
--- /dev/null
+++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template
@@ -0,0 +1,117 @@
+# NOTE: this template does not do any node mapping or affinity declarations, so
+# the inference and safety servers may land on the same GPU node; the
+# --gpu-memory-utilization fractions below (0.5 + 0.3) assume they can share one GPU.
+# The containers do not request nvidia.com/gpu explicitly, so GPU visibility relies
+# on the cluster's container runtime defaults. Both pods must in any case run on the
+# same node, because they share a single ReadWriteOnce PVC for the model cache.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: vllm-models
+spec:
+  accessModes:
+    - ReadWriteOnce
+  volumeMode: Filesystem
+  storageClassName: gp2
+  resources:
+    requests:
+      storage: 50Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-server
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: vllm
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: vllm
+    spec:
+      containers:
+        - name: vllm
+          image: vllm/vllm-openai:latest
+          command: ["/bin/sh", "-c"]
+          args:
+            - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.5"
+          env:
+            - name: HUGGING_FACE_HUB_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          ports:
+            - containerPort: 8000
+          volumeMounts:
+            - name: llama-storage
+              mountPath: /root/.cache/huggingface
+      volumes:
+        - name: llama-storage
+          persistentVolumeClaim:
+            claimName: vllm-models
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-server
+spec:
+  selector:
+    app.kubernetes.io/name: vllm
+  ports:
+    - protocol: TCP
+      port: 8000
+      targetPort: 8000
+  type: ClusterIP
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-server-safety
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: vllm-safety
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: vllm-safety
+    spec:
+      containers:
+        - name: vllm-safety
+          image: vllm/vllm-openai:latest
+          command: ["/bin/sh", "-c"]
+          args:
+            # vLLM listens on 8000 by default; pass --port 8001 to match the Service below.
+            - "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.3 --port 8001"
+          env:
+            - name: HUGGING_FACE_HUB_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          ports:
+            - containerPort: 8001
+          volumeMounts:
+            - name: llama-storage
+              mountPath: /root/.cache/huggingface
+      volumes:
+        - name: llama-storage
+          persistentVolumeClaim:
+            claimName: vllm-models
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-server-safety
+spec:
+  selector:
+    app.kubernetes.io/name: vllm-safety
+  ports:
+    - protocol: TCP
+      port: 8001
+      targetPort: 8001
+  type: ClusterIP
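For teardown, one option is to mirror apply.sh with kubectl delete; note that deleting the PVCs discards the cached model weights and the Postgres data:

    envsubst < ./stack-k8s.yaml.template | kubectl delete -f -
    kubectl delete configmap llama-stack-config
    envsubst < ./chroma-k8s.yaml.template | kubectl delete -f -
    envsubst < ./postgres-k8s.yaml.template | kubectl delete -f -
    envsubst < ./vllm-k8s.yaml.template | kubectl delete -f -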