Merge branch 'main' into content-extension

2025-10-05 12:21:52 +00:00 · 2025-08-13 14:04:47 -06:00 · 2025-08-13 14:04:47 -06:00 · 84a26339c8
commit 84a26339c8
parent 2fbddb4beb 5b312a80b9
73 changed files with 2416 additions and 506 deletions
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -8293,28 +8293,60 @@
                        "type": "array",
                        "items": {
                            "type": "object",
-                            "additionalProperties": {
-                                "oneOf": [
-                                    {
-                                        "type": "null"
+                            "properties": {
+                                "attributes": {
+                                    "type": "object",
+                                    "additionalProperties": {
+                                        "oneOf": [
+                                            {
+                                                "type": "null"
+                                            },
+                                            {
+                                                "type": "boolean"
+                                            },
+                                            {
+                                                "type": "number"
+                                            },
+                                            {
+                                                "type": "string"
+                                            },
+                                            {
+                                                "type": "array"
+                                            },
+                                            {
+                                                "type": "object"
+                                            }
+                                        ]
                                    },
-                                    {
-                                        "type": "boolean"
-                                    },
-                                    {
-                                        "type": "number"
-                                    },
-                                    {
-                                        "type": "string"
-                                    },
-                                    {
-                                        "type": "array"
-                                    },
-                                    {
-                                        "type": "object"
-                                    }
-                                ]
-                            }
+                                    "description": "(Optional) Key-value attributes associated with the file"
+                                },
+                                "file_id": {
+                                    "type": "string",
+                                    "description": "Unique identifier of the file containing the result"
+                                },
+                                "filename": {
+                                    "type": "string",
+                                    "description": "Name of the file containing the result"
+                                },
+                                "score": {
+                                    "type": "number",
+                                    "description": "Relevance score for this search result (between 0 and 1)"
+                                },
+                                "text": {
+                                    "type": "string",
+                                    "description": "Text content of the search result"
+                                }
+                            },
+                            "additionalProperties": false,
+                            "required": [
+                                "attributes",
+                                "file_id",
+                                "filename",
+                                "score",
+                                "text"
+                            ],
+                            "title": "OpenAIResponseOutputMessageFileSearchToolCallResults",
+                            "description": "Search results returned by the file search operation."
                        },
                        "description": "(Optional) Search results returned by the file search operation"
                    }
@ -8515,6 +8547,13 @@
                            "$ref": "#/components/schemas/OpenAIResponseInputTool"
                        }
                    },
+                    "include": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        },
+                        "description": "(Optional) Additional fields to include in the response."
+                    },
                    "max_infer_iters": {
                        "type": "integer"
                    }
@ -16571,7 +16610,7 @@
                        "additionalProperties": {
                            "type": "number"
                        },
-                        "description": "A list of the categories along with their scores as predicted by model. Required set of categories that need to be in response - violence - violence/graphic - harassment - harassment/threatening - hate - hate/threatening - illicit - illicit/violent - sexual - sexual/minors - self-harm - self-harm/intent - self-harm/instructions"
+                        "description": "A list of the categories along with their scores as predicted by model."
                    },
                    "user_message": {
                        "type": "string"
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -6021,14 +6021,44 @@ components:
          type: array
          items:
            type: object
-            additionalProperties:
-              oneOf:
-                - type: 'null'
-                - type: boolean
-                - type: number
-                - type: string
-                - type: array
-                - type: object
+            properties:
+              attributes:
+                type: object
+                additionalProperties:
+                  oneOf:
+                    - type: 'null'
+                    - type: boolean
+                    - type: number
+                    - type: string
+                    - type: array
+                    - type: object
+                description: >-
+                  (Optional) Key-value attributes associated with the file
+              file_id:
+                type: string
+                description: >-
+                  Unique identifier of the file containing the result
+              filename:
+                type: string
+                description: Name of the file containing the result
+              score:
+                type: number
+                description: >-
+                  Relevance score for this search result (between 0 and 1)
+              text:
+                type: string
+                description: Text content of the search result
+            additionalProperties: false
+            required:
+              - attributes
+              - file_id
+              - filename
+              - score
+              - text
+            title: >-
+              OpenAIResponseOutputMessageFileSearchToolCallResults
+            description: >-
+              Search results returned by the file search operation.
          description: >-
            (Optional) Search results returned by the file search operation
      additionalProperties: false
@ -6188,6 +6218,12 @@ components:
          type: array
          items:
            $ref: '#/components/schemas/OpenAIResponseInputTool'
+        include:
+          type: array
+          items:
+            type: string
+          description: >-
+            (Optional) Additional fields to include in the response.
        max_infer_iters:
          type: integer
      additionalProperties: false
@ -12314,10 +12350,6 @@ components:
            type: number
          description: >-
            A list of the categories along with their scores as predicted by model.
-            Required set of categories that need to be in response - violence - violence/graphic
-            - harassment - harassment/threatening - hate - hate/threatening - illicit
-            - illicit/violent - sexual - sexual/minors - self-harm - self-harm/intent
-            - self-harm/instructions
        user_message:
          type: string
        metadata:
--- a/docs/source/apis/external.md
+++ b/docs/source/apis/external.md
@ -111,7 +111,7 @@ name = "llama-stack-api-weather"
 version = "0.1.0"
 description = "Weather API for Llama Stack"
 readme = "README.md"
-requires-python = ">=3.10"
+requires-python = ">=3.12"
 dependencies = ["llama-stack", "pydantic"]

 [build-system]
@ -231,7 +231,7 @@ name = "llama-stack-provider-kaze"
 version = "0.1.0"
 description = "Kaze weather provider for Llama Stack"
 readme = "README.md"
-requires-python = ">=3.10"
+requires-python = ">=3.12"
 dependencies = ["llama-stack", "pydantic", "aiohttp"]

 [build-system]
--- a/docs/source/building_applications/responses_vs_agents.md
+++ b/docs/source/building_applications/responses_vs_agents.md
@ -2,7 +2,9 @@

 Llama Stack (LLS) provides two different APIs for building AI applications with tool calling capabilities: the **Agents API** and the **OpenAI Responses API**. While both enable AI systems to use tools, and maintain full conversation history, they serve different use cases and have distinct characteristics.

-> **Note:** For simple and basic inferencing, you may want to use the [Chat Completions API](https://llama-stack.readthedocs.io/en/latest/providers/index.html#chat-completions) directly, before progressing to Agents or Responses API.
+```{note}
+For simple and basic inferencing, you may want to use the [Chat Completions API](https://llama-stack.readthedocs.io/en/latest/providers/index.html#chat-completions) directly, before progressing to Agents or Responses API.
+```

 ## Overview

--- a/docs/source/building_applications/tools.md
+++ b/docs/source/building_applications/tools.md
@ -76,7 +76,9 @@ Features:
 - Context retrieval with token limits


-> **Note:** By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.
+```{note}
+By default, the Llama Stack `run.yaml` defines toolgroups for web search, Wolfram Alpha, and RAG, which are provided by the tavily-search, wolfram-alpha, and rag providers.
+```

 ## Model Context Protocol (MCP)

--- a/docs/source/contributing/index.md
+++ b/docs/source/contributing/index.md
@ -2,17 +2,6 @@
 ```{include} ../../../CONTRIBUTING.md
 ```

-## Testing
-
-See the [Test Page](testing.md) which describes how to test your changes.
-```{toctree}
-:maxdepth: 1
-:hidden:
-:caption: Testing
-
-testing
-```
-
 ## Adding a New Provider

 See the [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.
@ -27,3 +16,14 @@ See the [External Provider Page](../providers/external/index.md) which describes
 new_api_provider
 new_vector_database
 ```
+
+## Testing
+
+See the [Test Page](testing.md) which describes how to test your changes.
+```{toctree}
+:maxdepth: 1
+:hidden:
+:caption: Testing
+
+testing
+```
--- a/docs/source/distributions/k8s-benchmark/apply.sh
+++ b/docs/source/distributions/k8s-benchmark/apply.sh
@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+# Deploys the benchmark-specific components on top of the base k8s deployment (../k8s/apply.sh).
+
+# Abort on the first failing command, on use of an unset variable, and on a
+# failure anywhere in a pipeline (e.g. `kubectl create ... | kubectl apply`).
+set -euo pipefail
+
+# Every manifest and script below is referenced by a relative path, so run from
+# the directory containing this script regardless of where it was invoked.
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+# Settings for the mock OpenAI server: listening port and the delay it sleeps
+# after each streamed chunk.
+export MOCK_INFERENCE_PORT=8080
+export STREAM_DELAY_SECONDS=0.005
+
+export POSTGRES_USER=llamastack
+export POSTGRES_DB=llamastack
+export POSTGRES_PASSWORD=llamastack
+
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+
+export MOCK_INFERENCE_MODEL=mock-inference
+
+# Use llama-stack-benchmark-service as the benchmark server
+export LOCUST_HOST=http://llama-stack-benchmark-service:8323
+export LOCUST_BASE_PATH=/v1/openai/v1
+
+# Alternative: point Locust directly at the vLLM server instead
+# export LOCUST_HOST=http://vllm-server:8000
+# export LOCUST_BASE_PATH=/v1
+
+
+# Model id the Locust workers put in each request (see locust-k8s.yaml).
+export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL
+
+# Trace the deployment commands that follow.
+set -x
+
+# Deploy benchmark-specific components
+# Deploy OpenAI mock server
+kubectl create configmap openai-mock --from-file=openai-mock-server.py \
+  --dry-run=client -o yaml | kubectl apply --validate=false -f -
+
+# envsubst fills the ${VAR} placeholders in the manifest from the exports above.
+envsubst < openai-mock-deployment.yaml | kubectl apply --validate=false -f -
+
+# Create configmap with our custom stack config
+kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
+  --dry-run=client -o yaml > stack-configmap.yaml
+
+kubectl apply --validate=false -f stack-configmap.yaml
+
+# Deploy our custom llama stack server (overriding the base one)
+envsubst < stack-k8s.yaml.template | kubectl apply --validate=false -f -
+
+# Deploy Locust load testing
+kubectl create configmap locust-script --from-file=locustfile.py \
+  --dry-run=client -o yaml | kubectl apply --validate=false -f -
+
+envsubst < locust-k8s.yaml | kubectl apply --validate=false -f -
--- a/docs/source/distributions/k8s-benchmark/locust-k8s.yaml
+++ b/docs/source/distributions/k8s-benchmark/locust-k8s.yaml
@ -0,0 +1,131 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: locust-master
+  labels:
+    app: locust
+    role: master
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: locust
+      role: master
+  template:
+    metadata:
+      labels:
+        app: locust
+        role: master
+    spec:
+      containers:
+      - name: locust-master
+        image: locustio/locust:2.31.8
+        ports:
+        - containerPort: 8089  # Web UI
+        - containerPort: 5557  # Master communication
+        env:
+        - name: LOCUST_HOST
+          value: "${LOCUST_HOST}"
+        - name: LOCUST_LOCUSTFILE
+          value: "/locust/locustfile.py"
+        - name: LOCUST_WEB_HOST
+          value: "0.0.0.0"
+        - name: LOCUST_MASTER
+          value: "true"
+        - name: LOCUST_BASE_PATH
+          value: "${LOCUST_BASE_PATH}"
+        - name: INFERENCE_MODEL
+          value: "${BENCHMARK_INFERENCE_MODEL}"
+        volumeMounts:
+        - name: locust-script
+          mountPath: /locust
+        command: ["locust"]
+        args:
+        - "--master"
+        - "--web-host=0.0.0.0"
+        - "--web-port=8089"
+        - "--host=${LOCUST_HOST}"
+        - "--locustfile=/locust/locustfile.py"
+      volumes:
+      - name: locust-script
+        configMap:
+          name: locust-script
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: locust-worker
+  labels:
+    app: locust
+    role: worker
+spec:
+  replicas: 2  # Start with 2 workers, can be scaled up
+  selector:
+    matchLabels:
+      app: locust
+      role: worker
+  template:
+    metadata:
+      labels:
+        app: locust
+        role: worker
+    spec:
+      containers:
+      - name: locust-worker
+        image: locustio/locust:2.31.8
+        env:
+        - name: LOCUST_HOST
+          value: "${LOCUST_HOST}"
+        - name: LOCUST_LOCUSTFILE
+          value: "/locust/locustfile.py"
+        - name: LOCUST_MASTER_HOST
+          value: "locust-master-service"
+        - name: LOCUST_MASTER_PORT
+          value: "5557"
+        - name: INFERENCE_MODEL
+          value: "${BENCHMARK_INFERENCE_MODEL}"
+        - name: LOCUST_BASE_PATH
+          value: "${LOCUST_BASE_PATH}"
+        volumeMounts:
+        - name: locust-script
+          mountPath: /locust
+        command: ["locust"]
+        args:
+        - "--worker"
+        - "--master-host=locust-master-service"
+        - "--master-port=5557"
+        - "--locustfile=/locust/locustfile.py"
+      volumes:
+      - name: locust-script
+        configMap:
+          name: locust-script
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: locust-master-service
+spec:
+  selector:
+    app: locust
+    role: master
+  ports:
+  - name: web-ui
+    port: 8089
+    targetPort: 8089
+  - name: master-comm
+    port: 5557
+    targetPort: 5557
+  type: ClusterIP
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: locust-web-ui
+spec:
+  selector:
+    app: locust
+    role: master
+  ports:
+  - port: 8089
+    targetPort: 8089
+  type: ClusterIP  # Keep internal, use port-forward to access
--- a/docs/source/distributions/k8s-benchmark/locustfile.py
+++ b/docs/source/distributions/k8s-benchmark/locustfile.py
@ -0,0 +1,78 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""
+Locust load testing script that exercises an OpenAI-compatible streaming chat completions endpoint (Llama Stack by default).
+"""
+
+import random
+from locust import HttpUser, task, between
+import os
+
+# URL path prefix prepended to "/chat/completions"; overridable via LOCUST_BASE_PATH.
+base_path = os.getenv("LOCUST_BASE_PATH", "/v1/openai/v1")
+
+# Model id sent in each request's "model" field; None when INFERENCE_MODEL is unset.
+MODEL_ID = os.getenv("INFERENCE_MODEL")
+
+class LlamaStackUser(HttpUser):
+    """Simulated client that issues streaming chat-completion requests.
+
+    Requests are POSTed to ``{base_path}/chat/completions`` on the host Locust
+    was configured with.
+    """
+
+    # Near-zero think time so each simulated user sends requests back to back.
+    wait_time = between(0.0, 0.0001)
+
+    def on_start(self):
+        """Prepare the request headers and the pool of test conversations."""
+        # No auth required for benchmark server
+        self.headers = {
+            "Content-Type": "application/json"
+        }
+
+        # Test messages of varying lengths
+        self.test_messages = [
+            [{"role": "user", "content": "Hi"}],
+            [{"role": "user", "content": "What is the capital of France?"}],
+            [{"role": "user", "content": "Explain quantum physics in simple terms."}],
+            [{"role": "user", "content": "Write a short story about a robot learning to paint."}],
+            [
+                {"role": "user", "content": "What is machine learning?"},
+                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
+                {"role": "user", "content": "Can you give me a practical example?"}
+            ]
+        ]
+
+    @task(weight=100)
+    def chat_completion_streaming(self):
+        """Send one streaming chat completion and check that data chunks arrive.
+
+        This is the only task defined on this user, so it accounts for all of
+        the requests the user sends. The request is reported as a failure when
+        the status is not 200, when reading the stream raises, or when no
+        ``data: `` chunk arrives before the ``data: [DONE]`` sentinel.
+        """
+        messages = random.choice(self.test_messages)
+        payload = {
+            "model": MODEL_ID,
+            "messages": messages,
+            "stream": True,
+            "max_tokens": 100
+        }
+
+        with self.client.post(
+            f"{base_path}/chat/completions",
+            headers=self.headers,
+            json=payload,
+            stream=True,
+            catch_response=True
+        ) as response:
+            if response.status_code != 200:
+                response.failure(f"HTTP {response.status_code}: {response.text}")
+                return
+
+            chunks_received = 0
+            try:
+                for line in response.iter_lines():
+                    if not line:
+                        continue
+                    line_str = line.decode('utf-8')
+                    if not line_str.startswith('data: '):
+                        continue
+                    # The [DONE] sentinel only terminates the stream; counting it
+                    # would let a response with no content chunks pass.
+                    if line_str.strip() == 'data: [DONE]':
+                        break
+                    chunks_received += 1
+
+                if chunks_received > 0:
+                    response.success()
+                else:
+                    response.failure("No streaming chunks received")
+            except Exception as e:
+                response.failure(f"Streaming error: {e}")
--- a/docs/source/distributions/k8s-benchmark/openai-mock-deployment.yaml
+++ b/docs/source/distributions/k8s-benchmark/openai-mock-deployment.yaml
@ -0,0 +1,52 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: openai-mock
+  labels:
+    app: openai-mock
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: openai-mock
+  template:
+    metadata:
+      labels:
+        app: openai-mock
+    spec:
+      containers:
+      - name: openai-mock
+        image: python:3.12-slim
+        ports:
+        - containerPort: ${MOCK_INFERENCE_PORT}
+        env:
+        - name: PORT
+          value: "${MOCK_INFERENCE_PORT}"
+        - name: MOCK_MODELS
+          value: "${MOCK_INFERENCE_MODEL}"
+        - name: STREAM_DELAY_SECONDS
+          value: "${STREAM_DELAY_SECONDS}"
+        command: ["sh", "-c"]
+        args:
+        - |
+          pip install flask &&
+          python /app/openai-mock-server.py --port ${MOCK_INFERENCE_PORT}
+        volumeMounts:
+        - name: openai-mock-script
+          mountPath: /app
+      volumes:
+      - name: openai-mock-script
+        configMap:
+          name: openai-mock
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: openai-mock-service
+spec:
+  selector:
+    app: openai-mock
+  ports:
+  # Track the port the mock container listens on instead of hard-coding 8080;
+  # this manifest is rendered with envsubst, and the stack config reaches the
+  # service at openai-mock-service:${MOCK_INFERENCE_PORT}.
+  - port: ${MOCK_INFERENCE_PORT}
+    targetPort: ${MOCK_INFERENCE_PORT}
+  type: ClusterIP
--- a/docs/source/distributions/k8s-benchmark/openai-mock-server.py
+++ b/docs/source/distributions/k8s-benchmark/openai-mock-server.py
@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""
+OpenAI-compatible mock server that returns:
+- Hardcoded /models response for consistent validation
+- Valid OpenAI-formatted chat completion responses with dynamic content
+"""
+
+from flask import Flask, request, jsonify, Response
+import time
+import random
+import uuid
+import json
+import argparse
+import os
+
+app = Flask(__name__)
+
+# Models from environment variables
+def get_models():
+    """Build the OpenAI-style model list served by the mock.
+
+    Model ids are read from the comma-separated ``MOCK_MODELS`` environment
+    variable; surrounding whitespace is stripped and blank entries are dropped.
+
+    Returns:
+        dict: ``{"object": "list", "data": [...]}`` with one model entry per id.
+        The list is never empty: when ``MOCK_MODELS`` is unset, empty, or holds
+        only blank entries, it contains the single id ``"mock-inference"``.
+    """
+    default_model = "mock-inference"
+    models_str = os.getenv("MOCK_MODELS", default_model)
+    model_ids = [m.strip() for m in models_str.split(",") if m.strip()]
+    if not model_ids:
+        # A set-but-blank variable (e.g. "" or ",") would otherwise yield no
+        # models, and callers index ['data'][0] to pick a default.
+        model_ids = [default_model]
+
+    return {
+        "object": "list",
+        "data": [
+            {
+                "id": model_id,
+                "object": "model",
+                "created": 1234567890,
+                "owned_by": "vllm"
+            }
+            for model_id in model_ids
+        ]
+    }
+
+def generate_random_text(length=50):
+    """Return ``length`` words joined by single spaces.
+
+    Words are drawn at random, with replacement, from a small fixed vocabulary.
+    """
+    vocabulary = (
+        "Hello there I'm an AI assistant ready to help you "
+        "with your questions and tasks today Let me know what "
+        "you'd like to discuss or explore together I can assist "
+        "with various topics including coding writing analysis and more"
+    ).split()
+    picked = random.choices(vocabulary, k=length)
+    return " ".join(picked)
+
+@app.route('/models', methods=['GET'])
+def list_models():
+    """Serve the model list built by get_models() as JSON, logging the ids returned."""
+    catalogue = get_models()
+    model_ids = [entry['id'] for entry in catalogue['data']]
+    print(f"[MOCK] Returning models: {model_ids}")
+    return jsonify(catalogue)
+
+@app.route('/chat/completions', methods=['POST'])
+def chat_completions():
+    """Return OpenAI-formatted chat completion responses.
+
+    Reads ``model``, ``messages`` and ``stream`` from the JSON request body and
+    dispatches to the streaming or non-streaming handler. When ``model`` is
+    absent, the first configured mock model is used.
+
+    Returns:
+        The handler's response, or a 400 JSON error when the body is not a
+        JSON object.
+    """
+    # silent=True yields None for a missing or malformed JSON body instead of
+    # raising, so every bad body is rejected by the single check below.
+    data = request.get_json(silent=True)
+    if not isinstance(data, dict):
+        error = {
+            "error": {
+                "message": "Request body must be a JSON object",
+                "type": "invalid_request_error"
+            }
+        }
+        return jsonify(error), 400
+
+    if 'model' in data:
+        model = data['model']
+    else:
+        # Only look up the default when needed, and tolerate an empty list.
+        available = get_models()['data']
+        model = available[0]['id'] if available else "mock-inference"
+    messages = data.get('messages', [])
+    stream = data.get('stream', False)
+
+    print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")
+
+    if stream:
+        return handle_streaming_completion(model, messages)
+    else:
+        return handle_non_streaming_completion(model, messages)
+
+def handle_non_streaming_completion(model, messages):
+    """Build a complete (non-streamed) ``chat.completion`` JSON response.
+
+    Args:
+        model: Model id echoed back in the response.
+        messages: Request messages; only their ``content`` is read, to
+            estimate the prompt size.
+
+    Returns:
+        The ``jsonify``-ed completion with a random assistant reply and usage counts.
+    """
+    reply = generate_random_text(random.randint(20, 80))
+
+    # Whitespace-separated word counts stand in for token counts.
+    prompt_tokens = 0
+    for message in messages:
+        prompt_tokens += len(str(message.get('content', '')).split())
+    completion_tokens = len(reply.split())
+
+    usage = {
+        "prompt_tokens": prompt_tokens,
+        "completion_tokens": completion_tokens,
+        "total_tokens": prompt_tokens + completion_tokens
+    }
+    assistant_choice = {
+        "index": 0,
+        "message": {"role": "assistant", "content": reply},
+        "finish_reason": "stop"
+    }
+
+    return jsonify({
+        "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
+        "object": "chat.completion",
+        "created": int(time.time()),
+        "model": model,
+        "choices": [assistant_choice],
+        "usage": usage
+    })
+
+def handle_streaming_completion(model, messages):
+    """Stream a randomly generated assistant reply as Server-Sent Events.
+
+    The stream consists of an initial role chunk, one chunk per word, a final
+    chunk carrying ``finish_reason: "stop"``, and the ``data: [DONE]`` sentinel.
+
+    Args:
+        model: Model id echoed back in every chunk.
+        messages: Request messages; accepted for parity with the non-streaming
+            handler but not used to build the reply.
+
+    Returns:
+        A Flask ``Response`` with mimetype ``text/event-stream``.
+    """
+    def generate_stream():
+        # All chunks of one completion share a single id and creation time,
+        # as in the OpenAI streaming format.
+        completion_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
+        created = int(time.time())
+        # Configurable delay to simulate realistic streaming; read once per
+        # request rather than once per word.
+        stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005"))
+
+        def sse_chunk(delta, finish_reason=None):
+            """Serialize one chat.completion.chunk as an SSE ``data:`` event."""
+            choice = {"index": 0, "delta": delta}
+            if finish_reason is not None:
+                choice["finish_reason"] = finish_reason
+            payload = {
+                "id": completion_id,
+                "object": "chat.completion.chunk",
+                "created": created,
+                "model": model,
+                "choices": [choice]
+            }
+            return f"data: {json.dumps(payload)}\n\n"
+
+        # Generate response text
+        words = generate_random_text(random.randint(30, 100)).split()
+
+        # Send initial chunk
+        yield sse_chunk({"role": "assistant", "content": ""})
+
+        # Send word by word; every word but the last keeps its trailing space
+        last_index = len(words) - 1
+        for i, word in enumerate(words):
+            yield sse_chunk({"content": f"{word} " if i < last_index else word})
+            time.sleep(stream_delay)
+
+        # Send final chunk
+        yield sse_chunk({"content": ""}, finish_reason="stop")
+        yield "data: [DONE]\n\n"
+
+    return Response(
+        generate_stream(),
+        mimetype='text/event-stream',
+        headers={
+            'Cache-Control': 'no-cache',
+            'Connection': 'keep-alive',
+            'Access-Control-Allow-Origin': '*',
+        }
+    )
+
+@app.route('/health', methods=['GET'])
+def health():
+    """Health endpoint: always reports the mock server as healthy."""
+    status_payload = {"status": "healthy", "type": "openai-mock"}
+    return jsonify(status_payload)
+
+if __name__ == '__main__':
+    # Read the listening port from the command line, then print a startup
+    # banner and serve on all interfaces.
+    cli = argparse.ArgumentParser(description='OpenAI-compatible mock server')
+    cli.add_argument(
+        '--port',
+        type=int,
+        default=8081,
+        help='Port to run the server on (default: 8081)',
+    )
+    port = cli.parse_args().port
+
+    served_ids = [m['id'] for m in get_models()['data']]
+    banner = (
+        "Starting OpenAI-compatible mock server...",
+        f"- /models endpoint with: {served_ids}",
+        "- OpenAI-formatted chat/completion responses with dynamic content",
+        "- Streaming support with valid SSE format",
+        f"- Listening on: http://0.0.0.0:{port}",
+    )
+    for banner_line in banner:
+        print(banner_line)
+    app.run(host='0.0.0.0', port=port, debug=False)
--- a/docs/source/distributions/k8s-benchmark/stack-configmap.yaml
+++ b/docs/source/distributions/k8s-benchmark/stack-configmap.yaml
@ -0,0 +1,143 @@
+apiVersion: v1
+data:
+  stack_run_config.yaml: |
+    version: '2'
+    image_name: kubernetes-benchmark-demo
+    apis:
+    - agents
+    - inference
+    - safety
+    - telemetry
+    - tool_runtime
+    - vector_io
+    providers:
+      inference:
+      - provider_id: vllm-inference
+        provider_type: remote::vllm
+        config:
+          url: ${env.VLLM_URL:=http://localhost:8000/v1}
+          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+          api_token: ${env.VLLM_API_TOKEN:=fake}
+          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+      - provider_id: vllm-safety
+        provider_type: remote::vllm
+        config:
+          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
+          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+          api_token: ${env.VLLM_API_TOKEN:=fake}
+          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+      - provider_id: mock-vllm-inference
+        provider_type: remote::vllm
+        config:
+          url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
+          max_tokens: 4096
+          api_token: fake
+          tls_verify: false
+      - provider_id: sentence-transformers
+        provider_type: inline::sentence-transformers
+        config: {}
+      vector_io:
+      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
+        provider_type: remote::chromadb
+        config:
+          url: ${env.CHROMADB_URL:=}
+          kvstore:
+            type: postgres
+            host: ${env.POSTGRES_HOST:=localhost}
+            port: ${env.POSTGRES_PORT:=5432}
+            db: ${env.POSTGRES_DB:=llamastack}
+            user: ${env.POSTGRES_USER:=llamastack}
+            password: ${env.POSTGRES_PASSWORD:=llamastack}
+      safety:
+      - provider_id: llama-guard
+        provider_type: inline::llama-guard
+        config:
+          excluded_categories: []
+      agents:
+      - provider_id: meta-reference
+        provider_type: inline::meta-reference
+        config:
+          persistence_store:
+            type: postgres
+            host: ${env.POSTGRES_HOST:=localhost}
+            port: ${env.POSTGRES_PORT:=5432}
+            db: ${env.POSTGRES_DB:=llamastack}
+            user: ${env.POSTGRES_USER:=llamastack}
+            password: ${env.POSTGRES_PASSWORD:=llamastack}
+          responses_store:
+            type: postgres
+            host: ${env.POSTGRES_HOST:=localhost}
+            port: ${env.POSTGRES_PORT:=5432}
+            db: ${env.POSTGRES_DB:=llamastack}
+            user: ${env.POSTGRES_USER:=llamastack}
+            password: ${env.POSTGRES_PASSWORD:=llamastack}
+      telemetry:
+      - provider_id: meta-reference
+        provider_type: inline::meta-reference
+        config:
+          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+          sinks: ${env.TELEMETRY_SINKS:=console}
+      tool_runtime:
+      - provider_id: brave-search
+        provider_type: remote::brave-search
+        config:
+          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
+          max_results: 3
+      - provider_id: tavily-search
+        provider_type: remote::tavily-search
+        config:
+          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
+          max_results: 3
+      - provider_id: rag-runtime
+        provider_type: inline::rag-runtime
+        config: {}
+      - provider_id: model-context-protocol
+        provider_type: remote::model-context-protocol
+        config: {}
+    metadata_store:
+      type: postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+      table_name: llamastack_kvstore
+    inference_store:
+      type: postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+    models:
+    - metadata:
+        embedding_dimension: 384
+      model_id: all-MiniLM-L6-v2
+      provider_id: sentence-transformers
+      model_type: embedding
+    - model_id: ${env.INFERENCE_MODEL}
+      provider_id: vllm-inference
+      model_type: llm
+    - model_id: ${env.SAFETY_MODEL}
+      provider_id: vllm-safety
+      model_type: llm
+    - model_id: ${env.MOCK_INFERENCE_MODEL}
+      provider_id: mock-vllm-inference
+      model_type: llm
+    shields:
+    - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
+    vector_dbs: []
+    datasets: []
+    scoring_fns: []
+    benchmarks: []
+    tool_groups:
+    - toolgroup_id: builtin::websearch
+      provider_id: tavily-search
+    - toolgroup_id: builtin::rag
+      provider_id: rag-runtime
+    server:
+      port: 8323
+kind: ConfigMap
+metadata:
+  creationTimestamp: null
+  name: llama-stack-config
--- a/docs/source/distributions/k8s-benchmark/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s-benchmark/stack-k8s.yaml.template
@ -0,0 +1,87 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: llama-benchmark-pvc
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 1Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llama-stack-benchmark-server
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: llama-stack-benchmark
+      app.kubernetes.io/component: server
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: llama-stack-benchmark
+        app.kubernetes.io/component: server
+    spec:
+      containers:
+      - name: llama-stack-benchmark
+        image: llamastack/distribution-starter:latest
+        imagePullPolicy: Always # since we have specified latest instead of a version
+        env:
+        - name: ENABLE_CHROMADB
+          value: "true"
+        - name: CHROMADB_URL
+          value: http://chromadb.default.svc.cluster.local:6000
+        - name: POSTGRES_HOST
+          value: postgres-server.default.svc.cluster.local
+        - name: POSTGRES_PORT
+          value: "5432"
+        - name: INFERENCE_MODEL
+          value: "${INFERENCE_MODEL}"
+        - name: SAFETY_MODEL
+          value: "${SAFETY_MODEL}"
+        - name: TAVILY_SEARCH_API_KEY
+          value: "${TAVILY_SEARCH_API_KEY}"
+        - name: MOCK_INFERENCE_PORT
+          value: "${MOCK_INFERENCE_PORT}"
+        - name: VLLM_URL
+          value: http://vllm-server.default.svc.cluster.local:8000/v1
+        - name: VLLM_MAX_TOKENS
+          value: "3072"
+        - name: VLLM_SAFETY_URL
+          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
+        - name: VLLM_TLS_VERIFY
+          value: "false"
+        - name: MOCK_INFERENCE_MODEL
+          value: "${MOCK_INFERENCE_MODEL}"
+        command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
+        ports:
+          - containerPort: 8323
+        volumeMounts:
+          - name: llama-storage
+            mountPath: /root/.llama
+          - name: llama-config
+            mountPath: /etc/config
+      volumes:
+      - name: llama-storage
+        persistentVolumeClaim:
+          claimName: llama-benchmark-pvc
+      - name: llama-config
+        configMap:
+          name: llama-stack-config
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: llama-stack-benchmark-service
+spec:
+  selector:
+    app.kubernetes.io/name: llama-stack-benchmark
+    app.kubernetes.io/component: server
+  ports:
+  - name: http
+    port: 8323
+    targetPort: 8323
+  type: ClusterIP
--- a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml
+++ b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml
@ -0,0 +1,136 @@
+version: '2'
+image_name: kubernetes-benchmark-demo
+apis:
+- agents
+- inference
+- safety
+- telemetry
+- tool_runtime
+- vector_io
+providers:
+  inference:
+  - provider_id: vllm-inference
+    provider_type: remote::vllm
+    config:
+      url: ${env.VLLM_URL:=http://localhost:8000/v1}
+      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+      api_token: ${env.VLLM_API_TOKEN:=fake}
+      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+  - provider_id: vllm-safety
+    provider_type: remote::vllm
+    config:
+      url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
+      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+      api_token: ${env.VLLM_API_TOKEN:=fake}
+      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+  - provider_id: mock-vllm-inference
+    provider_type: remote::vllm
+    config:
+      url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
+      max_tokens: 4096
+      api_token: fake
+      tls_verify: false
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
+  vector_io:
+  - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
+    provider_type: remote::chromadb
+    config:
+      url: ${env.CHROMADB_URL:=}
+      kvstore:
+        type: postgres
+        host: ${env.POSTGRES_HOST:=localhost}
+        port: ${env.POSTGRES_PORT:=5432}
+        db: ${env.POSTGRES_DB:=llamastack}
+        user: ${env.POSTGRES_USER:=llamastack}
+        password: ${env.POSTGRES_PASSWORD:=llamastack}
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: postgres
+        host: ${env.POSTGRES_HOST:=localhost}
+        port: ${env.POSTGRES_PORT:=5432}
+        db: ${env.POSTGRES_DB:=llamastack}
+        user: ${env.POSTGRES_USER:=llamastack}
+        password: ${env.POSTGRES_PASSWORD:=llamastack}
+      responses_store:
+        type: postgres
+        host: ${env.POSTGRES_HOST:=localhost}
+        port: ${env.POSTGRES_PORT:=5432}
+        db: ${env.POSTGRES_DB:=llamastack}
+        user: ${env.POSTGRES_USER:=llamastack}
+        password: ${env.POSTGRES_PASSWORD:=llamastack}
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+      sinks: ${env.TELEMETRY_SINKS:=console}
+  tool_runtime:
+  - provider_id: brave-search
+    provider_type: remote::brave-search
+    config:
+      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
+      max_results: 3
+  - provider_id: tavily-search
+    provider_type: remote::tavily-search
+    config:
+      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
+      max_results: 3
+  - provider_id: rag-runtime
+    provider_type: inline::rag-runtime
+    config: {}
+  - provider_id: model-context-protocol
+    provider_type: remote::model-context-protocol
+    config: {}
+metadata_store:
+  type: postgres
+  host: ${env.POSTGRES_HOST:=localhost}
+  port: ${env.POSTGRES_PORT:=5432}
+  db: ${env.POSTGRES_DB:=llamastack}
+  user: ${env.POSTGRES_USER:=llamastack}
+  password: ${env.POSTGRES_PASSWORD:=llamastack}
+  table_name: llamastack_kvstore
+inference_store:
+  type: postgres
+  host: ${env.POSTGRES_HOST:=localhost}
+  port: ${env.POSTGRES_PORT:=5432}
+  db: ${env.POSTGRES_DB:=llamastack}
+  user: ${env.POSTGRES_USER:=llamastack}
+  password: ${env.POSTGRES_PASSWORD:=llamastack}
+models:
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  model_type: embedding
+- model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm-inference
+  model_type: llm
+- model_id: ${env.SAFETY_MODEL}
+  provider_id: vllm-safety
+  model_type: llm
+- model_id: ${env.MOCK_INFERENCE_MODEL}
+  provider_id: mock-vllm-inference
+  model_type: llm
+shields:
+- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
+vector_dbs: []
+datasets: []
+scoring_fns: []
+benchmarks: []
+tool_groups:
+- toolgroup_id: builtin::websearch
+  provider_id: tavily-search
+- toolgroup_id: builtin::rag
+  provider_id: rag-runtime
+server:
+  port: 8323
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@ -40,19 +40,19 @@ spec:
          value: "3072"
        - name: VLLM_SAFETY_URL
          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
+        - name: VLLM_TLS_VERIFY
+          value: "false"
        - name: POSTGRES_HOST
          value: postgres-server.default.svc.cluster.local
        - name: POSTGRES_PORT
          value: "5432"
-        - name: VLLM_TLS_VERIFY
-          value: "false"
        - name: INFERENCE_MODEL
          value: "${INFERENCE_MODEL}"
        - name: SAFETY_MODEL
          value: "${SAFETY_MODEL}"
        - name: TAVILY_SEARCH_API_KEY
          value: "${TAVILY_SEARCH_API_KEY}"
-        command: ["python", "-m", "llama_stack.core.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]
+        command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8321"]
        ports:
          - containerPort: 8321
        volumeMounts:
--- a/docs/source/providers/external/external-providers-guide.md
+++ b/docs/source/providers/external/external-providers-guide.md
@ -226,7 +226,7 @@ uv init
 name = "llama-stack-provider-ollama"
 version = "0.1.0"
 description = "Ollama provider for Llama Stack"
-requires-python = ">=3.10"
+requires-python = ">=3.12"
 dependencies = ["llama-stack", "pydantic", "ollama", "aiohttp"]
 ```

--- a/docs/source/providers/vector_io/inline_meta-reference.md
+++ b/docs/source/providers/vector_io/inline_meta-reference.md
@ -21,5 +21,7 @@ kvstore:

 ## Deprecation Notice

-⚠️ **Warning**: Please use the `inline::faiss` provider instead.
+```{warning}
+Please use the `inline::faiss` provider instead.
+```

--- a/docs/source/providers/vector_io/inline_sqlite_vec.md
+++ b/docs/source/providers/vector_io/inline_sqlite_vec.md
@ -25,5 +25,7 @@ kvstore:

 ## Deprecation Notice

-⚠️ **Warning**: Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.
+```{warning}
+Please use the `inline::sqlite-vec` provider instead (note the hyphen rather than the underscore).
+```

--- a/docs/source/providers/vector_io/remote_milvus.md
+++ b/docs/source/providers/vector_io/remote_milvus.md
@ -204,7 +204,10 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi
 | `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
 | `config` | `dict` | No | {} | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |

-> **Note**: This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.
+```{note}
+This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.
+```
+

 ## Sample Configuration

--- a/docs/source/references/llama_cli_reference/download_models.md
+++ b/docs/source/references/llama_cli_reference/download_models.md
@ -128,7 +128,9 @@ llama download --source huggingface --model-id Prompt-Guard-86M --ignore-pattern

 **Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).

-> **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
+```{tip}
+By default, `llama download` runs with `--ignore-patterns *.safetensors`, since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
+```

 ## List the downloaded models

--- a/docs/source/references/llama_cli_reference/index.md
+++ b/docs/source/references/llama_cli_reference/index.md
@ -152,7 +152,9 @@ llama download --source huggingface --model-id Prompt-Guard-86M --ignore-pattern

 **Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).

-> **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
+```{tip}
+By default, `llama download` runs with `--ignore-patterns *.safetensors`, since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
+```

 ## List the downloaded models