diff --git a/docs/source/distributions/k8s/apply.sh b/docs/source/distributions/k8s/apply.sh
index 11c327821..a38144898 100755
--- a/docs/source/distributions/k8s/apply.sh
+++ b/docs/source/distributions/k8s/apply.sh
@@ -6,12 +6,18 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+# Check if NGC_API_KEY is provided as argument
+if [ -n "$1" ]; then
+  export NGC_API_KEY="$1"
+  echo "Using NGC API key provided as argument."
+fi
+
 export POSTGRES_USER=llamastack
 export POSTGRES_DB=llamastack
 export POSTGRES_PASSWORD=llamastack
 
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+export CODE_MODEL=bigcode/starcoder2-7b
 
 # Set USE_EBS to false if you don't have permission to use EKS EBS
 export USE_EBS=${USE_EBS:-false}
@@ -24,13 +30,16 @@ else
   exit 1
 fi
 
-if [ -z "${GITHUB_CLIENT_ID:-}" ]; then
-  echo "ERROR: GITHUB_CLIENT_ID not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
-  exit 1
-fi
-
-if [ -z "${GITHUB_CLIENT_SECRET:-}" ]; then
-  echo "ERROR: GITHUB_CLIENT_SECRET not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
+# NGC_API_KEY should be set by the user or provided as argument; base64 encode it for the secret
+if [ -n "${NGC_API_KEY:-}" ]; then
+  # strip base64 line wraps so the values stay valid single-line Secret data
+  export NGC_API_KEY_BASE64=$(echo -n "$NGC_API_KEY" | base64 | tr -d '\n')
+  # Create Docker config JSON for NGC image pull
+  NGC_DOCKER_CONFIG="{\"auths\":{\"nvcr.io\":{\"username\":\"\$oauthtoken\",\"password\":\"$NGC_API_KEY\"}}}"
+  export NGC_DOCKER_CONFIG_JSON=$(echo -n "$NGC_DOCKER_CONFIG" | base64 | tr -d '\n')
+else
+  echo "ERROR: NGC_API_KEY not set. You need it for NIM to download models from NVIDIA."
+  echo "Usage: $0 [your-ngc-api-key]"
+  echo "You can either provide your NGC API key as an argument or set it as an environment variable."
   exit 1
 fi
 
@@ -41,20 +50,18 @@ fi
 
+# Apply the HF token secret if HF_TOKEN is provided
+if [ -n "${HF_TOKEN:-}" ]; then
+  envsubst < ./set-secret.yaml.template | kubectl apply -f -
+fi
 set -euo pipefail
 set -x
-
-# Apply the HF token secret if HF_TOKEN is provided
-if [ -n "${HF_TOKEN:-}" ]; then
-  envsubst < ./hf-token-secret.yaml.template | kubectl apply -f -
-fi
-
 # Apply templates with appropriate storage configuration based on USE_EBS setting
 if [ "$USE_EBS" = "true" ]; then
   echo "Using EBS storage for persistent volumes"
   envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
-  envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./llama-nim.yaml.template | kubectl apply -f -
 
   envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
   envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
@@ -70,7 +77,7 @@ else
   echo "Using emptyDir for storage (data will not persist across pod restarts)"
   # Process templates to replace EBS storage with emptyDir
   envsubst < ./vllm-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
-  envsubst < ./vllm-safety-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./llama-nim.yaml.template | kubectl apply -f -
 
   envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
   envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
diff --git a/docs/source/distributions/k8s/delete.sh b/docs/source/distributions/k8s/delete.sh
old mode 100644
new mode 100755
index c095212e5..54afb0bc4
--- a/docs/source/distributions/k8s/delete.sh
+++ b/docs/source/distributions/k8s/delete.sh
@@ -21,6 +21,14 @@ if [ -n "${HF_TOKEN:-}" ]; then
   export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
 fi
 
+# NGC_API_KEY should be set by the user; base64 encode it for the secret
+if [ -n "${NGC_API_KEY:-}" ]; then
+  export NGC_API_KEY_BASE64=$(echo -n "$NGC_API_KEY" | base64 | tr -d '\n')
+  # Create Docker config JSON for NGC image pull
+  NGC_DOCKER_CONFIG="{\"auths\":{\"nvcr.io\":{\"username\":\"\$oauthtoken\",\"password\":\"$NGC_API_KEY\"}}}"
+  export NGC_DOCKER_CONFIG_JSON=$(echo -n "$NGC_DOCKER_CONFIG" | base64 | tr -d '\n')
+fi
+
 set -euo pipefail
 set -x
 
@@ -45,14 +53,16 @@ envsubst < ./chroma-k8s.yaml.template | kubectl delete -f - --ignore-not-found=t
 envsubst < ./postgres-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 
 # Delete vllm-safety deployment
-envsubst < ./vllm-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
+envsubst < ./llama-nim.yaml.template | kubectl delete -f - --ignore-not-found=true
 
 # Delete vllm deployment
 envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 
 # Delete the HF token secret if it exists
 if [ -n "${HF_TOKEN:-}" ]; then
-  envsubst < ./hf-token-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
+  envsubst < ./set-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
 fi
 
+# NGC secrets (ngc-docker-registry, ngc-api) are defined in set-secret.yaml.template and are deleted with it above
+
 echo "All LlamaStack Kubernetes resources have been deleted."
diff --git a/docs/source/distributions/k8s/hf-token-secret.yaml.template b/docs/source/distributions/k8s/hf-token-secret.yaml.template
deleted file mode 100644
index b6db8e7bc..000000000
--- a/docs/source/distributions/k8s/hf-token-secret.yaml.template
+++ /dev/null
@@ -1,7 +0,0 @@
-apiVersion: v1
-kind: Secret
-metadata:
-  name: hf-token-secret
-type: Opaque
-data:
-  token: ${HF_TOKEN_BASE64}
diff --git a/docs/source/distributions/k8s/llama-nim.yaml.template b/docs/source/distributions/k8s/llama-nim.yaml.template
index 775e85629..e384e3635 100644
--- a/docs/source/distributions/k8s/llama-nim.yaml.template
+++ b/docs/source/distributions/k8s/llama-nim.yaml.template
@@ -1,25 +1,25 @@
 # -------------------------------------------------
-# NVIDIA NIM — Llama-3.1 Nemotron-Nano-4B-v1.1
+# NVIDIA NIM - Code
 # -------------------------------------------------
 
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: llama-nano-nim
+  name: llm-nim-code
   labels:
-    app: llama-nano-nim
+    app: llm-nim-code
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: llama-nano-nim
+      app: llm-nim-code
   template:
     metadata:
       labels:
-        app: llama-nano-nim
+        app: llm-nim-code
     spec:
       imagePullSecrets:
-        - name: ngc-secret            # docker-registry secret: nvcr.io / $oauthtoken /
+        - name: ngc-docker-registry   # docker-registry secret: nvcr.io / $oauthtoken /
       volumes:
         - name: model-cache
           emptyDir:
@@ -27,7 +27,7 @@ spec:
             sizeLimit: 12Gi           # fits the 4 B model + tensors; adjust if needed
       containers:
         - name: nim
-          image: nvcr.io/nim/meta/llama-3_1-nemotron-nano-4b-v1_1:1.0.0
+          image: nvcr.io/nim/bigcode/starcoder2-7b:1.8.1
          ports:
            - name: http-openai
              containerPort: 8000
@@ -36,7 +36,7 @@ spec:
              nvidia.com/gpu: 1
          env:
            - name: NIM_MODEL_NAME
-              value: "nvidia/llama-3_1-nemotron-nano-4b-v1_1"
+              value: "nvidia/starcoder2-7b"
            - name: NGC_API_KEY
              valueFrom:
                secretKeyRef:
@@ -49,23 +49,23 @@ spec:
            httpGet:
              path: /v1/models
              port: http-openai
-            initialDelaySeconds: 20
-            periodSeconds: 10
+            initialDelaySeconds: 360
+            periodSeconds: 360
          livenessProbe:
            httpGet:
              path: /v1/health
              port: http-openai
-            initialDelaySeconds: 60
-            periodSeconds: 30
+            initialDelaySeconds: 600
+            periodSeconds: 360
 
 ---
 apiVersion: v1
 kind: Service
 metadata:
-  name: llama-nano-nim
+  name: llm-nim-code
 spec:
   selector:
-    app: llama-nano-nim
+    app: llm-nim-code
   ports:
     - name: http-openai
       port: 8000
diff --git a/docs/source/distributions/k8s/set-secret.yaml.template b/docs/source/distributions/k8s/set-secret.yaml.template
new file mode 100644
index 000000000..e020e3076
--- /dev/null
+++ b/docs/source/distributions/k8s/set-secret.yaml.template
@@ -0,0 +1,31 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: hf-token-secret
+type: Opaque
+data:
+  token: ${HF_TOKEN_BASE64}
+---
+# -------------------------------------------------
+# NGC Docker Registry Secret
+# -------------------------------------------------
+
+apiVersion: v1
+kind: Secret
+metadata:
+  name: ngc-docker-registry
+type: kubernetes.io/dockerconfigjson
+data:
+  .dockerconfigjson: ${NGC_DOCKER_CONFIG_JSON}
+---
+# -------------------------------------------------
+# NGC API Secret
+# -------------------------------------------------
+
+apiVersion: v1
+kind: Secret
+metadata:
+  name: ngc-api
+type: Opaque
+data:
+  NGC_API_KEY: ${NGC_API_KEY_BASE64}
diff --git a/docs/source/distributions/k8s/stack-configmap.yaml b/docs/source/distributions/k8s/stack-configmap.yaml
index e69de29bb..79630795a 100644
--- a/docs/source/distributions/k8s/stack-configmap.yaml
+++ b/docs/source/distributions/k8s/stack-configmap.yaml
@@ -0,0 +1,125 @@
+apiVersion: v1
+data:
+  stack_run_config.yaml: |
+    version: '2'
+    image_name: kubernetes-demo
+    apis:
+    - agents
+    - inference
+    - safety
+    - telemetry
+    - tool_runtime
+    - vector_io
+    providers:
+      inference:
+      - provider_id: vllm-inference
+        provider_type: remote::vllm
+        config:
+          url: ${env.VLLM_URL:=http://localhost:8000/v1}
+          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+          api_token: ${env.VLLM_API_TOKEN:=fake}
+          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+      - provider_id: nvidia
+        provider_type: remote::nvidia
+        config:
+          url: ${env.NVIDIA_BASE_URL:=http://localhost:8001/v1}
+          api_key: ${env.NVIDIA_API_KEY:=}
+          append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
+      - provider_id: sentence-transformers
+        provider_type: inline::sentence-transformers
+        config: {}
+      vector_io:
+      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
+        provider_type: remote::chromadb
+        config:
+          url: ${env.CHROMADB_URL:=}
+      safety:
+      - provider_id: llama-guard
+        provider_type: inline::llama-guard
+        config:
+          excluded_categories: []
+      agents:
+      - provider_id: meta-reference
+        provider_type: inline::meta-reference
+        config:
+          persistence_store:
+            type: postgres
+            host: ${env.POSTGRES_HOST:=localhost}
+            port: ${env.POSTGRES_PORT:=5432}
+            db: ${env.POSTGRES_DB:=llamastack}
+            user: ${env.POSTGRES_USER:=llamastack}
+            password: ${env.POSTGRES_PASSWORD:=llamastack}
+          responses_store:
+            type: postgres
+            host: ${env.POSTGRES_HOST:=localhost}
+            port: ${env.POSTGRES_PORT:=5432}
+            db: ${env.POSTGRES_DB:=llamastack}
+            user: ${env.POSTGRES_USER:=llamastack}
+            password: ${env.POSTGRES_PASSWORD:=llamastack}
+      telemetry:
+      - provider_id: meta-reference
+        provider_type: inline::meta-reference
+        config:
+          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+          sinks: ${env.TELEMETRY_SINKS:=console}
+      tool_runtime:
+      - provider_id: brave-search
+        provider_type: remote::brave-search
+        config:
+          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
+          max_results: 3
+      - provider_id: tavily-search
+        provider_type: remote::tavily-search
+        config:
+          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
+          max_results: 3
+      - provider_id: rag-runtime
+        provider_type: inline::rag-runtime
+        config: {}
+      - provider_id: model-context-protocol
+        provider_type: remote::model-context-protocol
+        config: {}
+    metadata_store:
+      type: postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+      table_name: llamastack_kvstore
+    inference_store:
+      type: postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+    models:
+    - metadata:
+        embedding_dimension: 384
+      model_id: all-MiniLM-L6-v2
+      provider_id: sentence-transformers
+      model_type: embedding
+    - metadata: {}
+      model_id: ${env.INFERENCE_MODEL}
+      provider_id: vllm-inference
+      model_type: llm
+    - metadata: {}
+      model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
+      provider_id: nvidia
+      model_type: llm
+    vector_dbs: []
+    datasets: []
+    scoring_fns: []
+    benchmarks: []
+    tool_groups:
+    - toolgroup_id: builtin::websearch
+      provider_id: tavily-search
+    - toolgroup_id: builtin::rag
+      provider_id: rag-runtime
+    server:
+      port: 8321
+kind: ConfigMap
+metadata:
+  creationTimestamp: null
+  name: llama-stack-config
diff --git a/docs/source/distributions/k8s/stack-k8s.yaml.template b/docs/source/distributions/k8s/stack-k8s.yaml.template
index 0d6aba6f5..1ac8d743e 100644
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@@ -28,27 +28,34 @@ spec:
       initContainers:
       - name: wait-for-vllm-server
         image: busybox:1.28
-        command: ['sh', '-c', 'until nc -z vllm-server.default.svc.cluster.local 8000; do echo waiting for vllm-server on port 8000; sleep 2; done;']
-      - name: wait-for-vllm-server-safety
+        command: ['sh', '-c', 'until nc -z vllm-server.default.svc.cluster.local 8001; do echo waiting for vllm-server on port 8001; sleep 2; done;']
+      - name: wait-for-llm-nim-code
         image: busybox:1.28
-        command: ['sh', '-c', 'until nc -z vllm-server-safety.default.svc.cluster.local 8001; do echo waiting for vllm-server-safety on port 8001; sleep 2; done;']
+        command: ['sh', '-c', 'until nc -z llm-nim-code.default.svc.cluster.local 8000; do echo waiting for llm-nim-code on port 8000; sleep 2; done;']
       containers:
       - name: llama-stack
         image: llamastack/distribution-starter:latest
         imagePullPolicy: Always # since we have specified latest instead of a version
+        resources:
+          requests:
+            memory: "512Mi"
+            cpu: "500m"
+            ephemeral-storage: "2Gi"
+          limits:
+            memory: "1Gi"
+            cpu: "1000m"
+            ephemeral-storage: "5Gi"
         env:
         - name: ENABLE_CHROMADB
           value: "true"
         - name: CHROMADB_URL
           value: http://chromadb.default.svc.cluster.local:6000
         - name: VLLM_URL
-          value: http://vllm-server.default.svc.cluster.local:8000/v1
+          value: http://vllm-server.default.svc.cluster.local:8001/v1
         - name: VLLM_MAX_TOKENS
           value: "3072"
         - name: NVIDIA_BASE_URL
-          value: http://llama-nano-nim.default.svc.cluster.local:8000/v1
-        - name: VLLM_SAFETY_URL
-          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
+          value: http://llm-nim-code.default.svc.cluster.local:8000/v1
         - name: POSTGRES_HOST
           value: postgres-server.default.svc.cluster.local
         - name: POSTGRES_PORT
@@ -57,8 +64,8 @@ spec:
           value: "false"
         - name: INFERENCE_MODEL
           value: "${INFERENCE_MODEL}"
-        - name: SAFETY_MODEL
-          value: "${SAFETY_MODEL}"
+        - name: CODE_MODEL
+          value: "${CODE_MODEL}"
         - name: TAVILY_SEARCH_API_KEY
           value: "${TAVILY_SEARCH_API_KEY}"
         command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]
diff --git a/docs/source/distributions/k8s/stack_run_config.yaml b/docs/source/distributions/k8s/stack_run_config.yaml
index 23993ca5d..30c73e0ea 100644
--- a/docs/source/distributions/k8s/stack_run_config.yaml
+++ b/docs/source/distributions/k8s/stack_run_config.yaml
@@ -16,13 +16,12 @@ providers:
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-  - provider_id: vllm-safety
-    provider_type: remote::vllm
+  - provider_id: nvidia
+    provider_type: remote::nvidia
     config:
-      url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
-      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-      api_token: ${env.VLLM_API_TOKEN:=fake}
-      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+      url: ${env.NVIDIA_BASE_URL:=http://localhost:8001/v1}
+      api_key: ${env.NVIDIA_API_KEY:=}
+      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}
@@ -103,11 +102,9 @@ models:
   provider_id: vllm-inference
   model_type: llm
 - metadata: {}
-  model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
-  provider_id: vllm-safety
+  model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
+  provider_id: nvidia
   model_type: llm
-shields:
-- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
 vector_dbs: []
 datasets: []
 scoring_fns: []
diff --git a/docs/source/distributions/k8s/vllm-k8s.yaml.template b/docs/source/distributions/k8s/vllm-k8s.yaml.template
index efbdcfdde..2dcf286e1 100644
--- a/docs/source/distributions/k8s/vllm-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template
@@ -34,7 +34,7 @@ spec:
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args:
-          - "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.7 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 4"
+          - "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.7 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 4 --port 8001"
         env:
         - name: INFERENCE_MODEL
           value: "${INFERENCE_MODEL}"
@@ -44,7 +44,7 @@ spec:
               name: hf-token-secret
               key: token
         ports:
-        - containerPort: 8000
+        - containerPort: 8001
         resources:
           limits:
             nvidia.com/gpu: 1
@@ -67,6 +67,6 @@ spec:
     app.kubernetes.io/name: vllm
   ports:
   - protocol: TCP
-    port: 8000
-    targetPort: 8000
+    port: 8001
+    targetPort: 8001
   type: ClusterIP
diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py
index d07df0eef..edff14cc2 100644
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@@ -273,6 +273,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference, ModelRegistryHelper, Models
         response_format: ResponseFormat | None = None,
         stream: bool | None = False,
         logprobs: LogProbConfig | None = None,
+        suffix: str | None = None,
     ) -> CompletionResponse | AsyncIterator[CompletionResponseStreamChunk]:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -293,6 +294,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference, ModelRegistryHelper, Models
                 response_format=response_format,
                 stream=stream,
                 logprobs=logprobs,
+                suffix=suffix,
             ),
             n=1,
         )
diff --git a/llama_stack/providers/remote/inference/nvidia/openai_utils.py b/llama_stack/providers/remote/inference/nvidia/openai_utils.py
index 0b0d7fcf3..57e93183d 100644
--- a/llama_stack/providers/remote/inference/nvidia/openai_utils.py
+++ b/llama_stack/providers/remote/inference/nvidia/openai_utils.py
@@ -155,7 +155,8 @@ def convert_completion_request(
 
     if request.logprobs:
         payload.update(logprobs=request.logprobs.top_k)
 
-
+    if request.suffix:
+        payload.update(suffix=request.suffix)
     if request.sampling_params:
         nvext.update(repetition_penalty=request.sampling_params.repetition_penalty)