mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-02 00:34:44 +00:00)

Commit 1cb9d3bca2 ("second try"), parent 31a15332c4
11 changed files with 237 additions and 64 deletions
docs/source/distributions/k8s/apply.sh

@@ -6,12 +6,18 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+# Check if NGC_API_KEY is provided as argument
+if [ -n "$1" ]; then
+  export NGC_API_KEY=$1
+  echo "Using NGC API key provided as argument."
+fi
+
 export POSTGRES_USER=llamastack
 export POSTGRES_DB=llamastack
 export POSTGRES_PASSWORD=llamastack
 
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+export CODE_MODEL=bigcode/starcoder2-7b
 
 # Set USE_EBS to false if you don't have permission to use EKS EBS
 export USE_EBS=${USE_EBS:-false}
@@ -24,13 +30,16 @@ else
   exit 1
 fi
 
-if [ -z "${GITHUB_CLIENT_ID:-}" ]; then
-  echo "ERROR: GITHUB_CLIENT_ID not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
-  exit 1
-fi
-
-if [ -z "${GITHUB_CLIENT_SECRET:-}" ]; then
-  echo "ERROR: GITHUB_CLIENT_SECRET not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
+# NGC_API_KEY should be set by the user or provided as argument; base64 encode it for the secret
+if [ -n "${NGC_API_KEY:-}" ]; then
+  export NGC_API_KEY_BASE64=$(echo -n "$NGC_API_KEY" | base64)
+  # Create Docker config JSON for NGC image pull
+  NGC_DOCKER_CONFIG="{\"auths\":{\"nvcr.io\":{\"username\":\"\$oauthtoken\",\"password\":\"$NGC_API_KEY\"}}}"
+  export NGC_DOCKER_CONFIG_JSON=$(echo -n "$NGC_DOCKER_CONFIG" | base64)
+else
+  echo "ERROR: NGC_API_KEY not set. You need it for NIM to download models from NVIDIA."
+  echo "Usage: $0 [your-ngc-api-key]"
+  echo "You can either provide your NGC API key as an argument or set it as an environment variable."
   exit 1
 fi
 
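For reference, the hand-assembled dockerconfigjson above is the same credential kubectl can generate itself; a minimal equivalent sketch (secret name taken from the new set-secret.yaml.template; not part of this commit):

```sh
# Sketch: produce an equivalent nvcr.io pull secret with kubectl instead of
# hand-building the auths JSON; --dry-run prints the manifest for review.
kubectl create secret docker-registry ngc-docker-registry \
  --docker-server=nvcr.io \
  --docker-username='$oauthtoken' \
  --docker-password="$NGC_API_KEY" \
  --dry-run=client -o yaml
```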
@@ -41,20 +50,18 @@ fi
 
 
 
+# Apply the HF token secret if HF_TOKEN is provided
+if [ -n "${HF_TOKEN:-}" ]; then
+  envsubst < ./set-secret.yaml.template | kubectl apply -f -
+fi
 
 set -euo pipefail
 set -x
-
-# Apply the HF token secret if HF_TOKEN is provided
-if [ -n "${HF_TOKEN:-}" ]; then
-  envsubst < ./hf-token-secret.yaml.template | kubectl apply -f -
-fi
-
 # Apply templates with appropriate storage configuration based on USE_EBS setting
 if [ "$USE_EBS" = "true" ]; then
   echo "Using EBS storage for persistent volumes"
   envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
-  envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./llama-nim.yaml.template | kubectl apply -f -
   envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
   envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
 

@@ -70,7 +77,7 @@ else
   echo "Using emptyDir for storage (data will not persist across pod restarts)"
   # Process templates to replace EBS storage with emptyDir
   envsubst < ./vllm-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
-  envsubst < ./vllm-safety-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./llama-nim.yaml.template | kubectl apply -f -
   envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
   envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
 
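Taken together, a rough usage sketch for the updated apply flow (assuming the script above is apply.sh in the same docs/source/distributions/k8s directory as delete.sh below; key values are placeholders):

```sh
# Sketch: deploy the stack with the NIM-backed code model.
cd docs/source/distributions/k8s
export HF_TOKEN=hf_...      # placeholder; enables the hf-token secret
export USE_EBS=false        # default: emptyDir storage, no persistence
./apply.sh "$NGC_API_KEY"   # or export NGC_API_KEY beforehand
```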
docs/source/distributions/k8s/delete.sh (14 changes; Normal file → Executable file)
@@ -21,6 +21,14 @@ if [ -n "${HF_TOKEN:-}" ]; then
   export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
 fi
 
+# NGC_API_KEY should be set by the user; base64 encode it for the secret
+if [ -n "${NGC_API_KEY:-}" ]; then
+  export NGC_API_KEY_BASE64=$(echo -n "$NGC_API_KEY" | base64)
+  # Create Docker config JSON for NGC image pull
+  NGC_DOCKER_CONFIG="{\"auths\":{\"nvcr.io\":{\"username\":\"\$oauthtoken\",\"password\":\"$NGC_API_KEY\"}}}"
+  export NGC_DOCKER_CONFIG_JSON=$(echo -n "$NGC_DOCKER_CONFIG" | base64)
+fi
+
 set -euo pipefail
 set -x
 
@@ -45,14 +53,16 @@ envsubst < ./chroma-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 envsubst < ./postgres-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 
 # Delete vllm-safety deployment
-envsubst < ./vllm-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
+envsubst < ./llama-nim.yaml.template | kubectl delete -f - --ignore-not-found=true
 
 # Delete vllm deployment
 envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 
 # Delete the HF token secret if it exists
 if [ -n "${HF_TOKEN:-}" ]; then
-  envsubst < ./hf-token-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
+  envsubst < ./set-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
 fi
 
+# NGC API key secrets are now part of llama-nim.yaml.template and are deleted with it
+
 echo "All LlamaStack Kubernetes resources have been deleted."
docs/source/distributions/k8s/hf-token-secret.yaml.template (deleted)

@@ -1,7 +0,0 @@
-apiVersion: v1
-kind: Secret
-metadata:
-  name: hf-token-secret
-type: Opaque
-data:
-  token: ${HF_TOKEN_BASE64}
docs/source/distributions/k8s/llama-nim.yaml.template

@@ -1,25 +1,25 @@
 # -------------------------------------------------
-# NVIDIA NIM — Llama-3.1 Nemotron-Nano-4B-v1.1
+# NVIDIA NIM - Code
 # -------------------------------------------------
 
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: llama-nano-nim
+  name: llm-nim-code
   labels:
-    app: llama-nano-nim
+    app: llm-nim-code
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: llama-nano-nim
+      app: llm-nim-code
   template:
     metadata:
       labels:
-        app: llama-nano-nim
+        app: llm-nim-code
     spec:
       imagePullSecrets:
-        - name: ngc-secret  # docker-registry secret: nvcr.io / $oauthtoken / <NGC_DOCKER_API_KEY>
+        - name: ngc-docker-registry  # docker-registry secret: nvcr.io / $oauthtoken / <NGC_DOCKER_API_KEY>
       volumes:
         - name: model-cache
           emptyDir:
@@ -27,7 +27,7 @@ spec:
             sizeLimit: 12Gi  # fits the 4B model + tensors; adjust if needed
       containers:
         - name: nim
-          image: nvcr.io/nim/meta/llama-3_1-nemotron-nano-4b-v1_1:1.0.0
+          image: nvcr.io/nim/bigcode/starcoder2-7b:1.8.1
          ports:
            - name: http-openai
              containerPort: 8000

@@ -36,7 +36,7 @@ spec:
              nvidia.com/gpu: 1
          env:
            - name: NIM_MODEL_NAME
-              value: "nvidia/llama-3_1-nemotron-nano-4b-v1_1"
+              value: "nvidia/starcoder2-7b"
            - name: NGC_API_KEY
              valueFrom:
                secretKeyRef:

@@ -49,23 +49,23 @@ spec:
            httpGet:
              path: /v1/models
              port: http-openai
-            initialDelaySeconds: 20
-            periodSeconds: 10
+            initialDelaySeconds: 360
+            periodSeconds: 360
          livenessProbe:
            httpGet:
              path: /v1/health
              port: http-openai
-            initialDelaySeconds: 60
-            periodSeconds: 30
+            initialDelaySeconds: 600
+            periodSeconds: 360
 
 ---
 apiVersion: v1
 kind: Service
 metadata:
-  name: llama-nano-nim
+  name: llm-nim-code
 spec:
   selector:
-    app: llama-nano-nim
+    app: llm-nim-code
   ports:
     - name: http-openai
       port: 8000
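With the probe delays raised to 360-600 seconds above, the NIM can take several minutes to pull and load the model before it reports healthy; a quick manual check (a sketch, assuming the default namespace used elsewhere in this diff):

```sh
# Sketch: forward the NIM service locally and poll the same endpoints
# the probes use.
kubectl port-forward svc/llm-nim-code 8000:8000 &
curl -s http://localhost:8000/v1/health   # liveness path from the probe
curl -s http://localhost:8000/v1/models   # readiness path from the probe
```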
docs/source/distributions/k8s/set-secret.yaml.template (new file, +31)
@@ -0,0 +1,31 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: hf-token-secret
+type: Opaque
+data:
+  token: ${HF_TOKEN_BASE64}
+
+# -------------------------------------------------
+# NGC Docker Registry Secret
+# -------------------------------------------------
+
+apiVersion: apps/v1
+kind: Secret
+metadata:
+  name: ngc-docker-registry
+type: kubernetes.io/dockerconfigjson
+data:
+  .dockerconfigjson: ${NGC_DOCKER_CONFIG_JSON}
+
+# -------------------------------------------------
+# NGC API Secret
+# -------------------------------------------------
+
+apiVersion: apps/v1
+kind: Secret
+metadata:
+  name: ngc-api
+type: Opaque
+data:
+  NGC_API_KEY: ${NGC_API_KEY_BASE64}
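Since the apply script renders this template with envsubst, a client-side dry run is a cheap way to confirm the substituted values parse; note the three Secret documents above are not separated by `---`, which a validation pass like this would likely flag (a sketch):

```sh
# Sketch: render the combined secret template (the *_BASE64 variables are
# exported by the apply/delete scripts) and validate it without applying.
envsubst < ./set-secret.yaml.template | kubectl apply --dry-run=client -f -
```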
docs/source/distributions/k8s/stack-configmap.yaml (new file)

@@ -0,0 +1,125 @@
+apiVersion: v1
+data:
+  stack_run_config.yaml: |
+    version: '2'
+    image_name: kubernetes-demo
+    apis:
+    - agents
+    - inference
+    - safety
+    - telemetry
+    - tool_runtime
+    - vector_io
+    providers:
+      inference:
+      - provider_id: vllm-inference
+        provider_type: remote::vllm
+        config:
+          url: ${env.VLLM_URL:=http://localhost:8000/v1}
+          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+          api_token: ${env.VLLM_API_TOKEN:=fake}
+          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+      - provider_id: nvidia
+        provider_type: remote::nvidia
+        config:
+          url: ${env.NVIDIA_BASE_URL:=http://localhost:8001/v1}
+          api_key: ${env.NVIDIA_API_KEY:=}
+          append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
+      - provider_id: sentence-transformers
+        provider_type: inline::sentence-transformers
+        config: {}
+      vector_io:
+      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
+        provider_type: remote::chromadb
+        config:
+          url: ${env.CHROMADB_URL:=}
+      safety:
+      - provider_id: llama-guard
+        provider_type: inline::llama-guard
+        config:
+          excluded_categories: []
+      agents:
+      - provider_id: meta-reference
+        provider_type: inline::meta-reference
+        config:
+          persistence_store:
+            type: postgres
+            host: ${env.POSTGRES_HOST:=localhost}
+            port: ${env.POSTGRES_PORT:=5432}
+            db: ${env.POSTGRES_DB:=llamastack}
+            user: ${env.POSTGRES_USER:=llamastack}
+            password: ${env.POSTGRES_PASSWORD:=llamastack}
+          responses_store:
+            type: postgres
+            host: ${env.POSTGRES_HOST:=localhost}
+            port: ${env.POSTGRES_PORT:=5432}
+            db: ${env.POSTGRES_DB:=llamastack}
+            user: ${env.POSTGRES_USER:=llamastack}
+            password: ${env.POSTGRES_PASSWORD:=llamastack}
+      telemetry:
+      - provider_id: meta-reference
+        provider_type: inline::meta-reference
+        config:
+          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+          sinks: ${env.TELEMETRY_SINKS:=console}
+      tool_runtime:
+      - provider_id: brave-search
+        provider_type: remote::brave-search
+        config:
+          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
+          max_results: 3
+      - provider_id: tavily-search
+        provider_type: remote::tavily-search
+        config:
+          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
+          max_results: 3
+      - provider_id: rag-runtime
+        provider_type: inline::rag-runtime
+        config: {}
+      - provider_id: model-context-protocol
+        provider_type: remote::model-context-protocol
+        config: {}
+    metadata_store:
+      type: postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+      table_name: llamastack_kvstore
+    inference_store:
+      type: postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+    models:
+    - metadata:
+        embedding_dimension: 384
+      model_id: all-MiniLM-L6-v2
+      provider_id: sentence-transformers
+      model_type: embedding
+    - metadata: {}
+      model_id: ${env.INFERENCE_MODEL}
+      provider_id: vllm-inference
+      model_type: llm
+    - metadata: {}
+      model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
+      provider_id: nvidia
+      model_type: llm
+    vector_dbs: []
+    datasets: []
+    scoring_fns: []
+    benchmarks: []
+    tool_groups:
+    - toolgroup_id: builtin::websearch
+      provider_id: tavily-search
+    - toolgroup_id: builtin::rag
+      provider_id: rag-runtime
+    server:
+      port: 8321
+kind: ConfigMap
+metadata:
+  creationTimestamp: null
+  name: llama-stack-config
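The `creationTimestamp: null` tail is characteristic of a manifest emitted by kubectl's client-side generator; a sketch of regenerating this ConfigMap from the plain run config (output filename is an assumption):

```sh
# Sketch: rebuild the ConfigMap wrapper around stack_run_config.yaml.
kubectl create configmap llama-stack-config \
  --from-file=stack_run_config.yaml \
  --dry-run=client -o yaml > stack-configmap.yaml
```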
docs/source/distributions/k8s/stack-k8s.yaml.template

@@ -28,27 +28,34 @@ spec:
       initContainers:
         - name: wait-for-vllm-server
           image: busybox:1.28
-          command: ['sh', '-c', 'until nc -z vllm-server.default.svc.cluster.local 8000; do echo waiting for vllm-server on port 8000; sleep 2; done;']
-        - name: wait-for-vllm-server-safety
+          command: ['sh', '-c', 'until nc -z vllm-server.default.svc.cluster.local 8001; do echo waiting for vllm-server on port 8000; sleep 2; done;']
+        - name: wait-for-llm-nim-code
           image: busybox:1.28
-          command: ['sh', '-c', 'until nc -z vllm-server-safety.default.svc.cluster.local 8001; do echo waiting for vllm-server-safety on port 8001; sleep 2; done;']
+          command: ['sh', '-c', 'until nc -z llm-nim-code.default.svc.cluster.local 8000; do echo waiting for llm-nim-code on port 8001; sleep 2; done;']
       containers:
         - name: llama-stack
           image: llamastack/distribution-starter:latest
           imagePullPolicy: Always # since we have specified latest instead of a version
+          resources:
+            requests:
+              memory: "512Mi"
+              cpu: "500m"
+              ephemeral-storage: "2Gi"
+            limits:
+              memory: "1Gi"
+              cpu: "1000m"
+              ephemeral-storage: "5Gi"
           env:
             - name: ENABLE_CHROMADB
               value: "true"
             - name: CHROMADB_URL
               value: http://chromadb.default.svc.cluster.local:6000
             - name: VLLM_URL
-              value: http://vllm-server.default.svc.cluster.local:8000/v1
+              value: http://vllm-server.default.svc.cluster.local:8001/v1
             - name: VLLM_MAX_TOKENS
               value: "3072"
             - name: NVIDIA_BASE_URL
-              value: http://llama-nano-nim.default.svc.cluster.local:8000/v1
+              value: http://llm-nim-code.default.svc.cluster.local:8000/v1
-            - name: VLLM_SAFETY_URL
-              value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
             - name: POSTGRES_HOST
               value: postgres-server.default.svc.cluster.local
             - name: POSTGRES_PORT
@@ -57,8 +64,8 @@ spec:
               value: "false"
             - name: INFERENCE_MODEL
               value: "${INFERENCE_MODEL}"
-            - name: SAFETY_MODEL
-              value: "${SAFETY_MODEL}"
+            - name: CODE_MODEL
+              value: "${CODE_MODEL}"
             - name: TAVILY_SEARCH_API_KEY
               value: "${TAVILY_SEARCH_API_KEY}"
           command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]
docs/source/distributions/k8s/stack_run_config.yaml

@@ -16,13 +16,12 @@ providers:
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-  - provider_id: vllm-safety
-    provider_type: remote::vllm
+  - provider_id: nvidia
+    provider_type: remote::nvidia
     config:
-      url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
-      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-      api_token: ${env.VLLM_API_TOKEN:=fake}
-      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+      url: ${env.NVIDIA_BASE_URL:=http://localhost:8001/v1}
+      api_key: ${env.NVIDIA_API_KEY:=}
+      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}
@@ -103,11 +102,9 @@ models:
   provider_id: vllm-inference
   model_type: llm
 - metadata: {}
-  model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
-  provider_id: vllm-safety
+  model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
+  provider_id: nvidia
   model_type: llm
-shields:
-- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
 vector_dbs: []
 datasets: []
 scoring_fns: []
docs/source/distributions/k8s/vllm-k8s.yaml.template

@@ -34,7 +34,7 @@ spec:
           image: vllm/vllm-openai:latest
           command: ["/bin/sh", "-c"]
           args:
-            - "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.7 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 4"
+            - "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.7 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 4 --port 8001"
           env:
             - name: INFERENCE_MODEL
               value: "${INFERENCE_MODEL}"

@@ -44,7 +44,7 @@ spec:
                   name: hf-token-secret
                   key: token
           ports:
-            - containerPort: 8000
+            - containerPort: 8001
           resources:
             limits:
               nvidia.com/gpu: 1

@@ -67,6 +67,6 @@ spec:
     app.kubernetes.io/name: vllm
   ports:
     - protocol: TCP
-      port: 8000
-      targetPort: 8000
+      port: 8001
+      targetPort: 8001
   type: ClusterIP
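With vLLM moved from 8000 to 8001 across the server args, container port, and Service, a quick smoke test of the new port (a sketch, default namespace; service name taken from the init-container target above):

```sh
# Sketch: confirm the vLLM OpenAI endpoint now answers on 8001.
kubectl port-forward svc/vllm-server 8001:8001 &
curl -s http://localhost:8001/v1/models
```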
llama_stack/providers/remote/inference/nvidia/nvidia.py

@@ -273,6 +273,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference, ModelRegistryHelper, Models
         response_format: ResponseFormat | None = None,
         stream: bool | None = False,
         logprobs: LogProbConfig | None = None,
+        suffix: str | None = None,
     ) -> CompletionResponse | AsyncIterator[CompletionResponseStreamChunk]:
         if sampling_params is None:
             sampling_params = SamplingParams()

@@ -293,6 +294,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference, ModelRegistryHelper, Models
                 response_format=response_format,
                 stream=stream,
                 logprobs=logprobs,
+                suffix=suffix,
             ),
             n=1,
         )
llama_stack/providers/remote/inference/nvidia/openai_utils.py

@@ -155,7 +155,8 @@ def convert_completion_request(
 
     if request.logprobs:
         payload.update(logprobs=request.logprobs.top_k)
-
+    if request.suffix:
+        payload.update(suffix=request.suffix)
     if request.sampling_params:
         nvext.update(repetition_penalty=request.sampling_params.repetition_penalty)
 
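The suffix plumbing above is what lets a code model serve fill-in-the-middle completions; a sketch of the request shape it forwards (the model id and port-forward are assumptions; check /v1/models for the id actually served):

```sh
# Sketch: an OpenAI-style completion with a suffix, sent to the code NIM.
kubectl port-forward svc/llm-nim-code 8000:8000 &
curl -s http://localhost:8000/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "bigcode/starcoder2-7b",
        "prompt": "def add(a, b):\n    ",
        "suffix": "\n\nprint(add(2, 3))",
        "max_tokens": 32
      }'
```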