diff --git a/docs/source/distributions/k8s/apply.sh b/docs/source/distributions/k8s/apply.sh
index 11c327821..a38144898 100755
--- a/docs/source/distributions/k8s/apply.sh
+++ b/docs/source/distributions/k8s/apply.sh
@@ -6,12 +6,18 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+# Check if NGC_API_KEY is provided as argument
+if [ -n "$1" ]; then
+  export NGC_API_KEY="$1"
+  echo "Using NGC API key provided as argument."
+fi
+
 export POSTGRES_USER=llamastack
 export POSTGRES_DB=llamastack
 export POSTGRES_PASSWORD=llamastack
 
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+export CODE_MODEL=bigcode/starcoder2-7b
 
 # Set USE_EBS to false if you don't have permission to use EKS EBS
 export USE_EBS=${USE_EBS:-false}
@@ -24,13 +30,16 @@ else
   exit 1
 fi
 
-if [ -z "${GITHUB_CLIENT_ID:-}" ]; then
-  echo "ERROR: GITHUB_CLIENT_ID not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
-  exit 1
-fi
-
-if [ -z "${GITHUB_CLIENT_SECRET:-}" ]; then
-  echo "ERROR: GITHUB_CLIENT_SECRET not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
+# NGC_API_KEY should be set by the user or provided as argument; base64 encode it for the secret
+if [ -n "${NGC_API_KEY:-}" ]; then
+  # strip base64 line wraps so the values stay valid single-line Secret data
+  export NGC_API_KEY_BASE64=$(echo -n "$NGC_API_KEY" | base64 | tr -d '\n')
+  # Create Docker config JSON for NGC image pull
+  NGC_DOCKER_CONFIG="{\"auths\":{\"nvcr.io\":{\"username\":\"\$oauthtoken\",\"password\":\"$NGC_API_KEY\"}}}"
+  export NGC_DOCKER_CONFIG_JSON=$(echo -n "$NGC_DOCKER_CONFIG" | base64 | tr -d '\n')
+else
+  echo "ERROR: NGC_API_KEY not set. You need it for NIM to download models from NVIDIA."
+  echo "Usage: $0 [your-ngc-api-key]"
+  echo "You can either provide your NGC API key as an argument or set it as an environment variable."
   exit 1
 fi
 
@@ -41,20 +50,18 @@ fi
 
+# Apply the HF token secret if HF_TOKEN is provided
+if [ -n "${HF_TOKEN:-}" ]; then
+  envsubst < ./set-secret.yaml.template | kubectl apply -f -
+fi
 set -euo pipefail
 set -x
-
-# Apply the HF token secret if HF_TOKEN is provided
-if [ -n "${HF_TOKEN:-}" ]; then
-  envsubst < ./hf-token-secret.yaml.template | kubectl apply -f -
-fi
-
 # Apply templates with appropriate storage configuration based on USE_EBS setting
 if [ "$USE_EBS" = "true" ]; then
   echo "Using EBS storage for persistent volumes"
   envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
-  envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./llama-nim.yaml.template | kubectl apply -f -
 
   envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
   envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
@@ -70,7 +77,7 @@ else
   echo "Using emptyDir for storage (data will not persist across pod restarts)"
   # Process templates to replace EBS storage with emptyDir
   envsubst < ./vllm-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
-  envsubst < ./vllm-safety-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./llama-nim.yaml.template | kubectl apply -f -
 
   envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
   envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
diff --git a/docs/source/distributions/k8s/delete.sh b/docs/source/distributions/k8s/delete.sh
old mode 100644
new mode 100755
index c095212e5..54afb0bc4
--- a/docs/source/distributions/k8s/delete.sh
+++ b/docs/source/distributions/k8s/delete.sh
@@ -21,6 +21,14 @@ if [ -n "${HF_TOKEN:-}" ]; then
   export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
 fi
 
+# NGC_API_KEY should be set by the user; base64 encode it for the secret
+if [ -n "${NGC_API_KEY:-}" ]; then
+  export NGC_API_KEY_BASE64=$(echo -n "$NGC_API_KEY" | base64 | tr -d '\n')
+  # Create Docker config JSON for NGC image pull
+  NGC_DOCKER_CONFIG="{\"auths\":{\"nvcr.io\":{\"username\":\"\$oauthtoken\",\"password\":\"$NGC_API_KEY\"}}}"
+  export NGC_DOCKER_CONFIG_JSON=$(echo -n "$NGC_DOCKER_CONFIG" | base64 | tr -d '\n')
+fi
+
 set -euo pipefail
 set -x
 
@@ -45,14 +53,16 @@ envsubst < ./chroma-k8s.yaml.template | kubectl delete -f - --ignore-not-found=t
 envsubst < ./postgres-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 
 # Delete vllm-safety deployment
-envsubst < ./vllm-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
+envsubst < ./llama-nim.yaml.template | kubectl delete -f - --ignore-not-found=true
 
 # Delete vllm deployment
 envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 
 # Delete the HF token secret if it exists
 if [ -n "${HF_TOKEN:-}" ]; then
-  envsubst < ./hf-token-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
+  envsubst < ./set-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
 fi
 
+# NGC secrets (ngc-docker-registry, ngc-api) are defined in set-secret.yaml.template and are deleted with it above
+
 echo "All LlamaStack Kubernetes resources have been deleted."
diff --git a/docs/source/distributions/k8s/hf-token-secret.yaml.template b/docs/source/distributions/k8s/hf-token-secret.yaml.template
deleted file mode 100644
index b6db8e7bc..000000000
--- a/docs/source/distributions/k8s/hf-token-secret.yaml.template
+++ /dev/null
@@ -1,7 +0,0 @@
-apiVersion: v1
-kind: Secret
-metadata:
-  name: hf-token-secret
-type: Opaque
-data:
-  token: ${HF_TOKEN_BASE64}
diff --git a/docs/source/distributions/k8s/llama-nim.yaml.template b/docs/source/distributions/k8s/llama-nim.yaml.template
index 775e85629..e384e3635 100644
--- a/docs/source/distributions/k8s/llama-nim.yaml.template
+++ b/docs/source/distributions/k8s/llama-nim.yaml.template
@@ -1,25 +1,25 @@
 # -------------------------------------------------
-# NVIDIA NIM — Llama-3.1 Nemotron-Nano-4B-v1.1
+# NVIDIA NIM - Code
 # -------------------------------------------------
 
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: llama-nano-nim
+  name: llm-nim-code
   labels:
-    app: llama-nano-nim
+    app: llm-nim-code
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: llama-nano-nim
+      app: llm-nim-code
   template:
     metadata:
       labels:
-        app: llama-nano-nim
+        app: llm-nim-code
     spec:
       imagePullSecrets:
-        - name: ngc-secret            # docker-registry secret: nvcr.io / $oauthtoken /
+        - name: ngc-docker-registry   # docker-registry secret: nvcr.io / $oauthtoken /
       volumes:
         - name: model-cache
           emptyDir:
@@ -27,7 +27,7 @@ spec:
             sizeLimit: 12Gi           # fits the 4 B model + tensors; adjust if needed
       containers:
         - name: nim
-          image: nvcr.io/nim/meta/llama-3_1-nemotron-nano-4b-v1_1:1.0.0
+          image: nvcr.io/nim/bigcode/starcoder2-7b:1.8.1
          ports:
            - name: http-openai
              containerPort: 8000
@@ -36,7 +36,7 @@ spec:
              nvidia.com/gpu: 1
          env:
            - name: NIM_MODEL_NAME
-              value: "nvidia/llama-3_1-nemotron-nano-4b-v1_1"
+              value: "nvidia/starcoder2-7b"
            - name: NGC_API_KEY
              valueFrom:
                secretKeyRef:
@@ -49,23 +49,23 @@ spec:
            httpGet:
              path: /v1/models
              port: http-openai
-            initialDelaySeconds: 20
-            periodSeconds: 10
+            initialDelaySeconds: 360
+            periodSeconds: 360
          livenessProbe:
            httpGet:
              path: /v1/health
              port: http-openai
-            initialDelaySeconds: 60
-            periodSeconds: 30
+            initialDelaySeconds: 600
+            periodSeconds: 360
 
 ---
 apiVersion: v1
 kind: Service
 metadata:
-  name: llama-nano-nim
+  name: llm-nim-code
 spec:
   selector:
-    app: llama-nano-nim
+    app: llm-nim-code
   ports:
     - name: http-openai
       port: 8000
diff --git a/docs/source/distributions/k8s/set-secret.yaml.template b/docs/source/distributions/k8s/set-secret.yaml.template
new file mode 100644
index 000000000..e020e3076
--- /dev/null
+++ b/docs/source/distributions/k8s/set-secret.yaml.template
@@ -0,0 +1,31 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: hf-token-secret
+type: Opaque
+data:
+  token: ${HF_TOKEN_BASE64}
+---
+# -------------------------------------------------
+# NGC Docker Registry Secret
+# -------------------------------------------------
+
+apiVersion: v1
+kind: Secret
+metadata:
+  name: ngc-docker-registry
+type: kubernetes.io/dockerconfigjson
+data:
+  .dockerconfigjson: ${NGC_DOCKER_CONFIG_JSON}
+---
+# -------------------------------------------------
+# NGC API Secret
+# -------------------------------------------------
+
+apiVersion: v1
+kind: Secret
+metadata:
+  name: ngc-api
+type: Opaque
+data:
+  NGC_API_KEY: ${NGC_API_KEY_BASE64}
diff --git a/docs/source/distributions/k8s/stack-configmap.yaml b/docs/source/distributions/k8s/stack-configmap.yaml
index e69de29bb..79630795a 100644
--- a/docs/source/distributions/k8s/stack-configmap.yaml
+++ b/docs/source/distributions/k8s/stack-configmap.yaml
@@ -0,0 +1,125 @@
+apiVersion: v1
+data:
+  stack_run_config.yaml: |
+    version: '2'
+    image_name: kubernetes-demo
+    apis:
+    - agents
+    - inference
+    - safety
+    - telemetry
+    - tool_runtime
+    - vector_io
+    providers:
+      inference:
+      - provider_id: vllm-inference
+        provider_type: remote::vllm
+        config:
+          url: ${env.VLLM_URL:=http://localhost:8000/v1}
+          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+          api_token: ${env.VLLM_API_TOKEN:=fake}
+          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+      - provider_id: nvidia
+        provider_type: remote::nvidia
+        config:
+          url: ${env.NVIDIA_BASE_URL:=http://localhost:8001/v1}
+          api_key: ${env.NVIDIA_API_KEY:=}
+          append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
+      - provider_id: sentence-transformers
+        provider_type: inline::sentence-transformers
+        config: {}
+      vector_io:
+      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
+        provider_type: remote::chromadb
+        config:
+          url: ${env.CHROMADB_URL:=}
+      safety:
+      - provider_id: llama-guard
+        provider_type: inline::llama-guard
+        config:
+          excluded_categories: []
+      agents:
+      - provider_id: meta-reference
+        provider_type: inline::meta-reference
+        config:
+          persistence_store:
+            type: postgres
+            host: ${env.POSTGRES_HOST:=localhost}
+            port: ${env.POSTGRES_PORT:=5432}
+            db: ${env.POSTGRES_DB:=llamastack}
+            user: ${env.POSTGRES_USER:=llamastack}
+            password: ${env.POSTGRES_PASSWORD:=llamastack}
+          responses_store:
+            type: postgres
+            host: ${env.POSTGRES_HOST:=localhost}
+            port: ${env.POSTGRES_PORT:=5432}
+            db: ${env.POSTGRES_DB:=llamastack}
+            user: ${env.POSTGRES_USER:=llamastack}
+            password: ${env.POSTGRES_PASSWORD:=llamastack}
+      telemetry:
+      - provider_id: meta-reference
+        provider_type: inline::meta-reference
+        config:
+          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+          sinks: ${env.TELEMETRY_SINKS:=console}
+      tool_runtime:
+      - provider_id: brave-search
+        provider_type: remote::brave-search
+        config:
+          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
+          max_results: 3
+      - provider_id: tavily-search
+        provider_type: remote::tavily-search
+        config:
+          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
+          max_results: 3
+      - provider_id: rag-runtime
+        provider_type: inline::rag-runtime
+        config: {}
+      - provider_id: model-context-protocol
+        provider_type: remote::model-context-protocol
+        config: {}
+    metadata_store:
+      type: postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+      table_name: llamastack_kvstore
+    inference_store:
+      type: postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+    models:
+    - metadata:
+        embedding_dimension: 384
+      model_id: all-MiniLM-L6-v2
+      provider_id: sentence-transformers
+      model_type: embedding
+    - metadata: {}
+      model_id: ${env.INFERENCE_MODEL}
+      provider_id: vllm-inference
+      model_type: llm
+    - metadata: {}
+      model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
+      provider_id: nvidia
+      model_type: llm
+    vector_dbs: []
+    datasets: []
+    scoring_fns: []
+    benchmarks: []
+    tool_groups:
+    - toolgroup_id: builtin::websearch
+      provider_id: tavily-search
+    - toolgroup_id: builtin::rag
+      provider_id: rag-runtime
+    server:
+      port: 8321
+kind: ConfigMap
+metadata:
+  creationTimestamp: null
+  name: llama-stack-config
diff --git a/docs/source/distributions/k8s/stack-k8s.yaml.template b/docs/source/distributions/k8s/stack-k8s.yaml.template
index 0d6aba6f5..1ac8d743e 100644
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@@ -28,27 +28,34 @@ spec:
       initContainers:
       - name: wait-for-vllm-server
         image: busybox:1.28
-        command: ['sh', '-c', 'until nc -z vllm-server.default.svc.cluster.local 8000; do echo waiting for vllm-server on port 8000; sleep 2; done;']
-      - name: wait-for-vllm-server-safety
+        command: ['sh', '-c', 'until nc -z vllm-server.default.svc.cluster.local 8001; do echo waiting for vllm-server on port 8001; sleep 2; done;']
+      - name: wait-for-llm-nim-code
         image: busybox:1.28
-        command: ['sh', '-c', 'until nc -z vllm-server-safety.default.svc.cluster.local 8001; do echo waiting for vllm-server-safety on port 8001; sleep 2; done;']
+        command: ['sh', '-c', 'until nc -z llm-nim-code.default.svc.cluster.local 8000; do echo waiting for llm-nim-code on port 8000; sleep 2; done;']
       containers:
       - name: llama-stack
         image: llamastack/distribution-starter:latest
         imagePullPolicy: Always # since we have specified latest instead of a version
+        resources:
+          requests:
+            memory: "512Mi"
+            cpu: "500m"
+            ephemeral-storage: "2Gi"
+          limits:
+            memory: "1Gi"
+            cpu: "1000m"
+            ephemeral-storage: "5Gi"
         env:
         - name: ENABLE_CHROMADB
           value: "true"
         - name: CHROMADB_URL
           value: http://chromadb.default.svc.cluster.local:6000
         - name: VLLM_URL
-          value: http://vllm-server.default.svc.cluster.local:8000/v1
+          value: http://vllm-server.default.svc.cluster.local:8001/v1
         - name: VLLM_MAX_TOKENS
           value: "3072"
         - name: NVIDIA_BASE_URL
-          value: http://llama-nano-nim.default.svc.cluster.local:8000/v1
-        - name: VLLM_SAFETY_URL
-          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
+          value: http://llm-nim-code.default.svc.cluster.local:8000/v1
         - name: POSTGRES_HOST
           value: postgres-server.default.svc.cluster.local
         - name: POSTGRES_PORT
@@ -57,8 +64,8 @@ spec:
           value: "false"
         - name: INFERENCE_MODEL
           value: "${INFERENCE_MODEL}"
-        - name: SAFETY_MODEL
-          value: "${SAFETY_MODEL}"
+        - name: CODE_MODEL
+          value: "${CODE_MODEL}"
         - name: TAVILY_SEARCH_API_KEY
           value: "${TAVILY_SEARCH_API_KEY}"
         command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]
diff --git a/docs/source/distributions/k8s/stack_run_config.yaml b/docs/source/distributions/k8s/stack_run_config.yaml
index 23993ca5d..30c73e0ea 100644
--- a/docs/source/distributions/k8s/stack_run_config.yaml
+++ b/docs/source/distributions/k8s/stack_run_config.yaml
@@ -16,13 +16,12 @@ providers:
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-  - provider_id: vllm-safety
-    provider_type: remote::vllm
+  - provider_id: nvidia
+    provider_type: remote::nvidia
     config:
-      url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
-      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-      api_token: ${env.VLLM_API_TOKEN:=fake}
-      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+      url: ${env.NVIDIA_BASE_URL:=http://localhost:8001/v1}
+      api_key: ${env.NVIDIA_API_KEY:=}
+      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}
@@ -103,11 +102,9 @@ models:
   provider_id: vllm-inference
   model_type: llm
 - metadata: {}
-  model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
-  provider_id: vllm-safety
+  model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
+  provider_id: nvidia
   model_type: llm
-shields:
-- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
 vector_dbs: []
 datasets: []
 scoring_fns: []
diff --git a/docs/source/distributions/k8s/vllm-k8s.yaml.template b/docs/source/distributions/k8s/vllm-k8s.yaml.template
index efbdcfdde..2dcf286e1 100644
--- a/docs/source/distributions/k8s/vllm-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template
@@ -34,7 +34,7 @@ spec:
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args:
-          - "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.7 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 4"
+          - "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.7 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 4 --port 8001"
         env:
         - name: INFERENCE_MODEL
           value: "${INFERENCE_MODEL}"
@@ -44,7 +44,7 @@ spec:
               name: hf-token-secret
               key: token
         ports:
-        - containerPort: 8000
+        - containerPort: 8001
         resources:
           limits:
             nvidia.com/gpu: 1
@@ -67,6 +67,6 @@ spec:
     app.kubernetes.io/name: vllm
   ports:
   - protocol: TCP
-    port: 8000
-    targetPort: 8000
+    port: 8001
+    targetPort: 8001
   type: ClusterIP
diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py
index d07df0eef..edff14cc2 100644
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@@ -273,6 +273,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference, ModelRegistryHelper, Models
         response_format: ResponseFormat | None = None,
         stream: bool | None = False,
         logprobs: LogProbConfig | None = None,
+        suffix: str | None = None,
     ) -> CompletionResponse | AsyncIterator[CompletionResponseStreamChunk]:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -293,6 +294,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference, ModelRegistryHelper, Models
                 response_format=response_format,
                 stream=stream,
                 logprobs=logprobs,
+                suffix=suffix,
             ),
             n=1,
         )
diff --git a/llama_stack/providers/remote/inference/nvidia/openai_utils.py b/llama_stack/providers/remote/inference/nvidia/openai_utils.py
index 0b0d7fcf3..57e93183d 100644
--- a/llama_stack/providers/remote/inference/nvidia/openai_utils.py
+++ b/llama_stack/providers/remote/inference/nvidia/openai_utils.py
@@ -155,7 +155,8 @@ def convert_completion_request(
 
     if request.logprobs:
         payload.update(logprobs=request.logprobs.top_k)
 
-
+    if request.suffix:
+        payload.update(suffix=request.suffix)
     if request.sampling_params:
         nvext.update(repetition_penalty=request.sampling_params.repetition_penalty)